diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 00000000..ffcc0d16 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,18 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +indent_style = space +trim_trailing_whitespace = true + +[site/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +indent_style = unset +trim_trailing_whitespace = unset + +[*.{proto,yaml,yml}] +indent_size = 2 diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..6c94fd85 --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +ignore = E203, E266, E501, W503, F403, F401 +max-line-length = 88 +select = B,C,E,F,W,T4,B9 diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..8f8c97e6 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +proto/buf.lock linguist-generated=true diff --git a/.github/actions/dev-tool-python/action.yml b/.github/actions/dev-tool-python/action.yml new file mode 100644 index 00000000..f16b8774 --- /dev/null +++ b/.github/actions/dev-tool-python/action.yml @@ -0,0 +1,24 @@ +name: 'Install Python' +inputs: + python-version: + required: true + default: '3.9' +runs: + using: "composite" + steps: + - name: Set up Python ${{ inputs.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ inputs.python-version }} + - uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + - name: Install dependencies + shell: bash + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + working-directory: ${{env.working-directory}} diff --git a/.github/workflows/c.yml b/.github/workflows/c.yml new file mode 100644 index 00000000..6aa7f629 --- /dev/null +++ b/.github/workflows/c.yml @@ -0,0 +1,36 @@ +name: C + +on: + pull_request: + push: + branches: [ main ] + +jobs: + test: + name: Test + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - name: Configure + run: mkdir -p build && cd build && cmake ../c -DSUBSTRAIT_VALIDATOR_BUILD_TESTS=ON + - name: Build + run: cmake --build build + - name: Test + run: ctest --output-on-failure --test-dir build + + style: + name: Style + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: jidicula/clang-format-action@v4.5.0 + with: + clang-format-version: '13' + check-path: c diff --git a/.github/workflows/misc.yml b/.github/workflows/misc.yml new file mode 100644 index 00000000..dbd1083d --- /dev/null +++ b/.github/workflows/misc.yml @@ -0,0 +1,23 @@ +name: Misc + +on: + pull_request: + push: + branches: [ main ] + +jobs: + license: + name: Check license headers + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Check License Header + uses: enarx/spdx@master + with: + licenses: Apache-2.0 MIT + + editorconfig: + name: Check editorconfig + runs-on: ubuntu-latest + steps: + - uses: editorconfig-checker/action-editorconfig-checker@v1 diff --git a/.github/workflows/proto.yml b/.github/workflows/proto.yml new file mode 100644 index 00000000..a9537e72 --- /dev/null +++ b/.github/workflows/proto.yml @@ -0,0 +1,30 @@ +name: Protobuf + +on: + pull_request: + push: + branches: [ main ] + +jobs: + style: + name: Style + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: 
bufbuild/buf-setup-action@v1.4.0 + - run: buf format --diff --exit-code + + check: + name: Check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: arduino/setup-protoc@v1 + - uses: bufbuild/buf-setup-action@v0.7.0 + - uses: bufbuild/buf-lint-action@v1 + - name: Compile protobuf + run: buf generate diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml new file mode 100644 index 00000000..cdc7ea16 --- /dev/null +++ b/.github/workflows/python.yml @@ -0,0 +1,113 @@ +name: Python + +on: + pull_request: + push: + branches: [ main ] + +jobs: + build: + name: Test + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + type: [wheel] + include: + - os: ubuntu-latest + type: sdist + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - name: Install sdist-only dependencies + if: ${{ matrix.type == 'sdist' }} + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - name: sdist-only build prep + if: ${{ matrix.type == 'sdist' }} + working-directory: rs + run: cargo check + # ^ this ensures that its build.rs is run, which is all we need. Sadly + # there doesn't seem to be a way to do this that doesn't pull in all + # normal dependencies: https://github.com/rust-lang/cargo/issues/7178 + - name: Install build dependencies + run: python3 -m pip install --upgrade pip maturin + - name: Prepare build environment + working-directory: py + run: python3 prepare_build.py populate + - name: Create sdist + if: ${{ matrix.type == 'sdist' }} + uses: messense/maturin-action@v1 + with: + command: sdist + args: -o dist -m py/Cargo.toml + - name: Build manylinux wheels + if: ${{ matrix.type == 'wheel' && matrix.os == 'ubuntu-latest' }} + uses: messense/maturin-action@v1 + with: + manylinux: auto + command: build + args: --release --no-sdist -o dist -m py/Cargo.toml + - name: Build Windows wheels + if: ${{ matrix.type == 'wheel' && matrix.os == 'windows-latest' }} + uses: messense/maturin-action@v1 + with: + command: build + args: --release --no-sdist -o dist -m py/Cargo.toml + - name: Build MacOS wheels + if: ${{ matrix.type == 'wheel' && matrix.os == 'macos-latest' }} + uses: messense/maturin-action@v1 + with: + command: build + args: --release --no-sdist -o dist --universal2 -m py/Cargo.toml + - name: Install runtime dependencies + run: python3 -m pip install --upgrade protobuf pytest click pyyaml jdot + - name: Install generated sdist + if: ${{ matrix.type == 'sdist' }} + run: python3 -m pip install dist/substrait_validator-*.tar.gz + - name: Install generated wheel + if: ${{ matrix.type == 'wheel' }} + run: python3 -m pip install --no-index --find-links=dist substrait-validator + - name: Test + working-directory: py/tests + run: python3 -m pytest + - name: Upload wheels + if: "startsWith(github.ref, 'refs/tags/')" + uses: actions/upload-artifact@v2 + with: + name: wheels + path: dist + + release: + name: Release + runs-on: ubuntu-latest + if: "startsWith(github.ref, 'refs/tags/')" + needs: [ build ] + steps: + - uses: actions/download-artifact@v2 + with: + name: wheels + - name: Publish to PyPI + uses: messense/maturin-action@v1 + env: + MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} + with: + command: upload + args: --skip-existing * + + fmt-lint: + name: Style & lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Install dependencies + run: python3 -m pip install --upgrade pip 
black==22.3.0 flake8==4.0.1 + - name: Black + run: python3 -m black --diff --check . + - name: Flake8 + run: python3 -m flake8 . diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 00000000..aa5edbeb --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,103 @@ +name: Rust + +on: + pull_request: + push: + branches: [ main ] + +jobs: + check: + name: Check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - uses: Swatinem/rust-cache@v1 + - name: Check + run: cargo check --all-features + + test: + name: Test + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - uses: Swatinem/rust-cache@v1 + - name: Run unit tests + run: cargo test --all-features + - name: Install test runner dependencies + run: python3 -m pip install --upgrade pip protobuf pyyaml + - name: Run validation tests + # No need to run validation tests for all operating systems, and Linux + # runners are the fastest of the bunch. + if: ${{ matrix.os == 'ubuntu-latest' }} + working-directory: tests + run: python3 runner.py run --no-html + + style: + name: Style + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + components: rustfmt + - uses: Swatinem/rust-cache@v1 + - name: Rustfmt + run: cargo fmt --all -- --check + + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + # Clippy 1.60 crashes on the codebase, see + # https://github.com/rust-lang/rust-clippy/issues/8527 + toolchain: "1.59.0" + override: true + components: clippy + - uses: Swatinem/rust-cache@v1 + - name: Clippy + run: cargo clippy --all-features -- -D warnings + + doc: + name: Doc + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - uses: Swatinem/rust-cache@v1 + - name: Doc + run: RUSTDOCFLAGS="-Dwarnings" cargo doc --workspace --all-features diff --git a/.github/workflows/yaml.yml b/.github/workflows/yaml.yml new file mode 100644 index 00000000..bc7132b4 --- /dev/null +++ b/.github/workflows/yaml.yml @@ -0,0 +1,15 @@ +name: YAML + +on: + pull_request: + push: + branches: [ main ] + +jobs: + lint: + name: Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Run yamllint + run: yamllint . 
diff --git a/.gitignore b/.gitignore index 088ba6ba..78bf0d69 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,8 @@ -# Generated by Cargo -# will have compiled files and executables +**/*.rs.bk +**/target +**/.gradle +**/.idea +**/build +gen /target/ - -# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries -# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html -Cargo.lock - -# These are backup files generated by rustfmt **/*.rs.bk diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..592be67a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "substrait"] + path = substrait + url = https://github.com/substrait-io/substrait.git diff --git a/.licenserc.yaml b/.licenserc.yaml new file mode 100644 index 00000000..70a60952 --- /dev/null +++ b/.licenserc.yaml @@ -0,0 +1,13 @@ +header: + license: + spdx-id: Apache-2.0 + + paths: + - 'proto/substrait/**' + - 'derive/**' + - 'rs/**' + - 'py/**' + - 'c/**' + - 'tests/**' + + comment: never diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..4c93f9ed --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,29 @@ +repos: +- repo: https://github.com/doublify/pre-commit-rust + rev: v1.0 + hooks: + - id: fmt + - id: clippy + - id: cargo-check +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v13.0.1 + hooks: + - id: clang-format + types_or: [c, c++] +- repo: https://github.com/nametake/pre-commit-buf + rev: v2.0.0 + hooks: + - id: buf-lint +- repo: https://github.com/adrienverge/yamllint.git + rev: v1.26.0 + hooks: + - id: yamllint + args: [-c=.yamllint.yaml] +- repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black +- repo: https://gitlab.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 diff --git a/.yamllint.yaml b/.yamllint.yaml new file mode 100644 index 00000000..35e3e8ea --- /dev/null +++ b/.yamllint.yaml @@ -0,0 +1,9 @@ +rules: + line-length: + max: 120 + brackets: + forbid: false + min-spaces-inside: 0 + max-spaces-inside: 1 + min-spaces-inside-empty: 0 + max-spaces-inside-empty: 0 diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..657b9220 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,1446 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "ahash" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +dependencies = [ + "getrandom", + "once_cell", + "serde", + "version_check", +] + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "anyhow" +version = "1.0.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "159bb86af3a200e19a068f4224eae4c8bb2d0fa054c7e5d1cacd5cef95e684cd" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "base64" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" + +[[package]] +name = "bit-set" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e11e16035ea35e4e5997b393eacbf6f63983188f7a2ad25bfb13465f5ad59de" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytecount" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" + +[[package]] +name = "bytes" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" + +[[package]] +name = "cbindgen" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51e3973b165dc0f435831a9e426de67e894de532754ff7a3f307c03ee5dec7dc" +dependencies = [ + "clap", + "heck 0.3.3", + "indexmap", + "log", + "proc-macro2", + "quote", + "serde", + "serde_json", + "syn", + "tempfile", + "toml", +] + +[[package]] +name = "cc" +version = "1.0.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "time 0.1.43", + "winapi", +] + +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaa7bd5fb665c6864b5f963dd9097905c54125909c7aa94c9e18507cdbe6c53" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" +dependencies = [ + "cfg-if", + "lazy_static", +] + +[[package]] +name = "curl" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7de97b894edd5b5bcceef8b78d7da9b75b1d2f2f9a910569d0bde3dd31d84939" +dependencies = [ + "curl-sys", + "libc", + "openssl-probe", + "openssl-sys", + "schannel", + "socket2", + "winapi", +] + +[[package]] +name = "curl-sys" +version = "0.4.52+curl-7.81.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b8c2d1023ea5fded5b7b892e4b8e95f70038a421126a056761a84246a28971" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", + "winapi", +] + +[[package]] +name = "dunce" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "453440c271cf5577fd2a40e4942540cb7d0d2f85e27c8d07dd0023c925a67541" + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "fancy-regex" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d6b8560a05112eb52f04b00e5d3790c0dd75d9d980eb8a122fb23b92a623ccf" +dependencies = [ + "bit-set", + "regex", +] + +[[package]] +name = "fastrand" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf" +dependencies = [ + "instant", +] + +[[package]] +name = "fixedbitset" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "279fb028e20b3c4c320317955b77c5e0c9701f05a1d309905d6fc702cdc5053e" + +[[package]] +name = "float-pretty-print" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cae1cdc50a756244008a19d313827537e5e18d55f76779e8d5f9aa00769ca231" + +[[package]] +name = "fnv" +version = "1.0.7" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +dependencies = [ + "matches", + "percent-encoding", +] + +[[package]] +name = "fraction" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aba3510011eee8825018be07f08d9643421de007eaf62a3bde58d89b058abfa7" +dependencies = [ + "lazy_static", + "num", +] + +[[package]] +name = "getrandom" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d39cd93900197114fa1fcb7ae84ca742095eed9442088988ae74fa744e930e77" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "heck" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "idna" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "indoc" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47741a8bc60fb26eb8d6e0238bbb26d8575ff623fdc97b1a2c00c050b9684ed8" +dependencies = [ + "indoc-impl", + "proc-macro-hack", +] + +[[package]] +name = "indoc-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce046d161f000fffde5f432a0d034d0341dc152643b2598ed5bfce44c4f3a8f0" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", + "unindent", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "iso8601" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a59a3f2be6271b2a844cd0dd13bf8ccc88a9540482d872c7ce58ab1c4db9fab" +dependencies = [ + "nom", +] + +[[package]] +name = "itertools" +version = "0.8.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" + +[[package]] +name = "jsonschema" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4be404426c47c9b868fc9b6ddda07f84e2885d12b17066036717db2cd4e5d77" +dependencies = [ + "ahash", + "anyhow", + "base64", + "bytecount", + "fancy-regex", + "fraction", + "iso8601", + "itoa", + "lazy_static", + "memchr", + "num-cmp", + "parking_lot 0.12.0", + "percent-encoding", + "regex", + "serde", + "serde_json", + "time 0.3.7", + "url", + "uuid", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.119" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bf2e165bb3457c8e098ea76f3e3bc9db55f87aa90d52d0e6be741470916aaa4" + +[[package]] +name = "libz-sys" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linked-hash-map" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" + +[[package]] +name = "lock_api" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88943dd7ef4a2e5a4bfa2753aaab3013e34ce2533d1996fb18ef591e315e2b3b" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "matches" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +dependencies = [ + "autocfg", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "nom" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" +dependencies = [ + "memchr", + "minimal-lexical", + "version_check", +] + +[[package]] +name = "num" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8536030f9fea7127f841b45bb6243b27255787fb4eb83958aa1ef9d2fdc0c36" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-cmp" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63335b2e2c34fae2fb0aa2cecfd9f0832a1e24b3b32ecec612c3426d46dc8aaa" + +[[package]] +name = "num-complex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6b19411a9719e753aff12e5187b74d60d3dc449ec3f4dc21e3989c3f554bc95" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-derive" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "876a53fff98e03a936a674b29568b0e605f06b29372c2489ff4de23f1949743d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c000134b5dbf44adc5cb772486d335293351644b801551abe8f75c84cfa4aef" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "num_threads" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97ba99ba6393e2c3734791401b66902d981cb03bf190af674ca69949b6d5fb15" +dependencies = [ + "libc", +] + +[[package]] +name = "once_cell" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-sys" +version = "0.9.72" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7e46109c383602735fa0a2e48dd2b7c892b048e1bf69e5c3b1d804b7d9c203cb" +dependencies = [ + "autocfg", + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.5", +] + +[[package]] +name = "parking_lot" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.1", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-sys", +] + +[[package]] +name = "paste" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45ca20c77d80be666aef2b45486da86238fabe33e38306bd3118fe4af33fa880" +dependencies = [ + "paste-impl", + "proc-macro-hack", +] + +[[package]] +name = "paste-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a7db200b97ef370c8e6de0088252f7e0dfff7d047a28528e47456c0fc98b6" +dependencies = [ + "proc-macro-hack", +] + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "petgraph" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pkg-config" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe" + +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + +[[package]] +name = "proc-macro2" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "prost" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5" +dependencies = [ + "bytes", + "heck 0.3.3", + "itertools 0.10.3", + "lazy_static", + "log", + "multimap", + "petgraph", + "prost", + "prost-types", + "regex", + "tempfile", + "which", +] + +[[package]] +name = 
"prost-derive" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe" +dependencies = [ + "anyhow", + "itertools 0.10.3", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a" +dependencies = [ + "bytes", + "prost", +] + +[[package]] +name = "pyo3" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cf01dbf1c05af0a14c7779ed6f3aa9deac9c3419606ac9de537a2d649005720" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "parking_lot 0.11.2", + "paste", + "pyo3-build-config", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf9e4d128bfbddc898ad3409900080d8d5095c379632fbbfbb9c8cfb1fb852b" +dependencies = [ + "once_cell", +] + +[[package]] +name = "pyo3-macros" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67701eb32b1f9a9722b4bc54b548ff9d7ebfded011c12daece7b9063be1fd755" +dependencies = [ + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f44f09e825ee49a105f2c7b23ebee50886a9aee0746f4dd5a704138a64b0218a" +dependencies = [ + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "rustversion" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" + +[[package]] +name = "ryu" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f05ba609c234e60bee0d547fe94a4c7e9da733d1c962cf6e59efa4cd9c8bc75" +dependencies = [ + "lazy_static", + "winapi", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "serde" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "smallvec" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" + +[[package]] +name = "socket2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "strum" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" + +[[package]] +name = "strum_macros" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" +dependencies = [ + "heck 0.3.3", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "substrait-validator" +version = "0.0.1" +dependencies = [ + "base64", + "chrono", + "curl", + "float-pretty-print", + "glob", + "heck 0.4.0", + "itertools 0.8.2", + "jsonschema", + "num-derive", + "num-traits", + "once_cell", + "percent-encoding", + "prost", + "prost-build", + "prost-types", + "regex", + "serde_json", + "strum", + "strum_macros", + "substrait-validator-derive", + "thiserror", + "uriparse", + "url", + "walkdir", + "yaml-rust", +] + +[[package]] +name = "substrait-validator-c" +version = "0.0.1" +dependencies = [ + "cbindgen", + "libc", + "substrait-validator", + "thiserror", +] + +[[package]] +name = "substrait-validator-derive" +version = "0.0.1" +dependencies = [ + "heck 
0.4.0", + "quote", + "syn", +] + +[[package]] +name = "substrait-validator-py" +version = "0.0.1" +dependencies = [ + "dunce", + "prost-build", + "pyo3", + "substrait-validator", + "walkdir", +] + +[[package]] +name = "syn" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "tempfile" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" +dependencies = [ + "cfg-if", + "fastrand", + "libc", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "test-runner" +version = "0.0.1" +dependencies = [ + "glob", + "prost-build", + "rayon", + "serde", + "serde_json", + "substrait-validator", + "walkdir", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "time" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "time" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "004cbc98f30fa233c61a38bc77e96a9106e65c88f2d3bef182ae952027e5753d" +dependencies = [ + "libc", + "num_threads", + "time-macros", +] + +[[package]] +name = "time-macros" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25eb0ca3468fc0acc11828786797f6ef9aa1555e4a211a60d64cc8e4d1be47d6" + +[[package]] +name = "tinyvec" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c1c1d5a42b6245520c249549ec267180beaffcc0615401ac8e31853d4b6d8d2" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "toml" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +dependencies = [ + "serde", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a01404663e3db436ed2746d9fefef640d868edae3cceb81c3b8d5732fda678f" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = 
"1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" + +[[package]] +name = "unicode-width" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "unindent" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "514672a55d7380da379785a4d70ca8386c8883ff7eaae877be4d2081cebe73d8" + +[[package]] +name = "uriparse" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0200d0fc04d809396c2ad43f3c95da3582a2556eba8d453c1087f4120ee352ff" +dependencies = [ + "fnv", + "lazy_static", +] + +[[package]] +name = "url" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +dependencies = [ + "form_urlencoded", + "idna", + "matches", + "percent-encoding", +] + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "which" +version = "4.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" +dependencies = [ + "either", + "lazy_static", + "libc", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] 
+name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3df6e476185f92a12c072be4a189a0210dcdcf512a1891d6dff9edb874deadc6" +dependencies = [ + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_msvc" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8e92753b1c443191654ec532f14c199742964a061be25d77d7a96f09db20bf5" + +[[package]] +name = "windows_i686_gnu" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a711c68811799e017b6038e0922cb27a5e2f43a2ddb609fe0b6f3eeda9de615" + +[[package]] +name = "windows_i686_msvc" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c11bb1a02615db74680b32a68e2d61f553cc24c4eb5b4ca10311740e44172" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c912b12f7454c6620635bbff3450962753834be2a594819bd5e945af18ec64bc" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "504a2476202769977a040c6364301a3f65d0cc9e3fb08600b2bda150a0488316" + +[[package]] +name = "yaml-rust" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" +dependencies = [ + "linked-hash-map", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..c026aa4b --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,9 @@ +[workspace] + +members = [ + "derive", + "rs", + "c", + "py", + "tests" +] diff --git a/LICENSE b/LICENSE index 261eeb9e..67db8588 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,4 @@ + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -172,30 +173,3 @@ defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..0f93e115
--- /dev/null
+++ b/README.md
@@ -0,0 +1,152 @@
+Substrait validator
+===================
+
+This repository contains a validator for
+[Substrait](https://github.com/substrait-io/substrait) plans. It's written in
+Rust, but bindings are available for Python and C. Other languages may use the
+C API via their respective foreign function interface systems.
+
+Command-line interface
+----------------------
+
+The easiest way to play around with the validator is via the command-line
+interface provided by the Python `substrait-validator` module. At the time of
+writing, the package is not yet available on PyPI, but it should be easy
+enough to build from source (see the `py` subdirectory). After installing,
+you should be able to run:
+
+```console
+user@host:~$ substrait-validator
+Missing input file. Try --help for usage information.
+```
+
+If that doesn't work, try `python3 -m substrait-validator`.
+
+Without any options, the validator will decode the given input file based on
+the format implied by the file extension, validate the plan, print any
+diagnostics encountered, and fail with code 1 if the validator determines
+that the plan is invalid. Here's a valid YAML plan as a starting point for
+playing around with it:
+
+```yaml
+relations:
+- rel:
+    read:
+      namedTable:
+        names:
+        - person
+      baseSchema:
+        names:
+        - name
+        struct:
+          nullability: NULLABILITY_REQUIRED
+          types:
+          - string:
+              nullability: NULLABILITY_REQUIRED
+```
+
+When you save that as a `.yaml` file and pass it to the validator, it will
+simply exit with code 0 without printing anything. Of course, it's more
+interesting to try a plan that *isn't* valid, but we'll leave that as an
+exercise for the reader.
+
+It's also more interesting to have the validator tell you how it interpreted
+the plan. Let's change the command line to do that:
+
+```console
+user@host:~$ substrait-validator input.yaml --out-file output.html --mode ignore
+```
+
+This generates `output.html`, a self-contained HTML file describing the plan.
+
+Just like the input file, the output file format is derived from the file
+extension, so the `.html` part is significant. If you don't want to rely on
+this, you can also just specify the formats you want manually using
+`--in-type` and `--out-type`.
+
+`--mode ignore` tells the validator to emit a file and exit with code 0
+regardless of the validation result. The full list of modes is:
+
+ - `strict`: fail unless the plan was proven to be valid;
+ - `loose` (default): fail if the plan was proven to be invalid;
+ - `ignore`: ignore the validation result, though the plan still needs some
+   level of sanity to succeed; for example, the file must exist and must
+   decode according to the specified file format; and
+ - `convert`: don't run validation at all; simply convert between different
+   representations of the given `substrait.Plan` message. For example, you
+   can use this to convert between the binary protobuf serialization format
+   and any of the text-based formats supported by the validator.
+
+Note that, without `--mode convert`, the output message type will be
+`substrait.validator.ParseResult` rather than `substrait.Plan` if you use any
+of the protobuf-like serialization formats. This message type is a meta
+description of the incoming `substrait.Plan` message, with all the
+information gathered by the validator annotated to the nodes. The HTML format
+is pretty much just a pretty-printed version of this format. More information
+about this type is available in the associated `.proto` file.
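+
+As a quick sketch of `--mode convert` in action (the `plan.bin` input file is
+hypothetical, but the flags are the ones described above), converting a
+binary `substrait.Plan` to YAML would look roughly like this:
+
+```console
+user@host:~$ substrait-validator plan.bin --out-file plan.yaml --mode convert
+```
+
+As before, the input and output formats are derived from the file extensions;
+pass `--in-type` and `--out-type` explicitly if you'd rather not rely on that.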
+For more information, use the `--help` option.
+
+Library usage
+-------------
+
+For library usage information, refer to the readme files for the language
+that you want to use the library from.
+
+Diagnostics
+-----------
+
+The primary output of the validator (beyond its validity verdict) is a list
+of diagnostics. In fact, the validator derives its verdict from this list.
+Each diagnostic consists of the following bits of information:
+
+ - a severity, being either info, warning, or error;
+ - a classification, represented using a 4-digit diagnostic code;
+ - a cause description; and
+ - a path into the protobuf/YAML tree, pointing to the node that the
+   diagnostic originated from.
+
+The severity levels strictly map as follows:
+
+ - an error means that something is invalid;
+ - a warning means that something may or may not be invalid (i.e. validity
+   could not be determined for some reason); and
+ - an info message has no effect on validity.
+
+Once the validator has gathered all diagnostics, the validity of the plan is
+simply determined by the above mapping applied to the highest severity level
+encountered.
+
+Note that the command-line interface specifically could be said to have an
+extra "fatal" level. Fatal diagnostics are not diagnostics in the strict
+sense, as they are not validation results; they simply indicate that the CLI
+returned a non-zero exit code and why.
+
+Severity levels can be clamped to a certain range on a per-classification
+basis. This allows you to, for example, disable warnings of a certain type by
+clamping them down to info when you know that those particular warnings are
+not of interest to your application, or raise the severity to error if you
+want the validator to be extra pedantic about something. Because the
+validator derives its verdict from the highest-severity diagnostic
+encountered, clamping severity levels may also change the verdict.
+
+You can request the list of diagnostic codes from the command-line interface
+using the `--help-diagnostics` flag:
+
+```console
+user@host:~$ substrait-validator --help-diagnostics
+The following diagnostic codes are defined:
+
+0000 (Unclassified): unclassified diagnostic.
+  |- 0001 (NotYetImplemented): not yet implemented.
+  |- 0002 (IllegalValue): illegal value.
+...
+```
+
+Diagnostic codes are organized in a tree. When you configure the severity
+range of a diagnostic code with children, its children will inherit this
+configuration, unless they themselves are also explicitly configured. For
+example, you can disable all warnings and errors except for those
+corresponding to one particular diagnostic by clamping code 0000 down to info
+only, and then overriding the configuration for the diagnostic you're
+interested in back to the full info-to-error range.
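+
+As an illustration of that last example, such an invocation could look
+something like the sketch below. Note that the `--diagnostic-level` flag name
+is hypothetical here; check `--help` for the exact option your version
+provides.
+
+```console
+user@host:~$ # --diagnostic-level is a hypothetical flag name; see --help
+user@host:~$ substrait-validator input.yaml \
+      --diagnostic-level 0000 info info \
+      --diagnostic-level 0002 info error
+```
+
+The first option clamps all diagnostics down to info; the second restores the
+full info-to-error range for code 0002 (IllegalValue) and its children.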
diff --git a/RELEASE.md b/RELEASE.md
new file mode 100644
index 00000000..6b064cc1
--- /dev/null
+++ b/RELEASE.md
@@ -0,0 +1,76 @@
+Release process
+===============
+
+Note: this is only intended for maintainers. See `README.md` for general
+usage information.
+
+Incrementing version numbers
+----------------------------
+
+There are version numbers all over the place, though some of them aren't that
+important:
+
+ - `derive/Cargo.toml` and its reference as dependency in `rs/Cargo.toml`:
+   these two version numbers must be kept in sync, but only need to be
+   incremented when anything changes in `substrait-validator-derive`.
+ - `rs/Cargo.toml` and its references as dependencies in `py/Cargo.toml`,
+   `c/Cargo.toml`, and `tests/Cargo.toml`, as well as in `rs/README.md` for
+   the Cargo dependency copy-paste snippet: these must be kept in sync and
+   incremented when the `substrait-validator` sources, the protobuf files, OR
+   the YAML schema files are updated.
+ - `py/Cargo.toml` and `py/pyproject.toml`: must be kept in sync, and must be
+   incremented whenever the `substrait-validator` crate is updated OR the
+   Python bindings are modified.
+ - `c/Cargo.toml`: not very important, as it should always be built from
+   source by corrosion, but good to synchronize with the version of the main
+   crate.
+ - `tests/Cargo.toml`: can be ignored.
+
+The relation of the `substrait-validator` crate version to the Substrait
+specification version is TBD.
+
+Pushing to crates.io
+--------------------
+
+Note in advance: the crates in the `py`, `c`, and `tests` directories should
+NOT be pushed to `crates.io`:
+
+ - the Python bindings crate is either embedded as sources in Python source
+   distributions or is shipped pre-built from the git repo in binary wheels;
+ - the C bindings should be built by CMake/Corrosion after it obtains the
+   complete git repo or a tarball thereof; and
+ - the `tests` crate is just a test runner that serves no purpose outside of
+   this repository.
+
+Only the crates in the `derive` and `rs` directories, respectively
+`substrait-validator-derive` and `substrait-validator`, should be released.
+
+The release steps are as follows.
+
+ - Update version numbers (see section above).
+ - If `substrait-validator-derive` changed, release it per normal procedures.
+ - Remove the `rs/src/resources` directory, if one exists.
+ - Run `cargo build` locally for `substrait-validator` to recreate the above
+   directory using the protobuf and schema files from outside the validator
+   folder.
+ - Run `cargo package`. Verify that it ONLY complains about files in
+   `src/resources` not being committed yet. This is unavoidable without
+   checking in the protobuf files in multiple places.
+ - Release `substrait-validator` per normal procedures, but using
+   `--allow-dirty` to suppress the above.
+
+Pushing to PyPI
+---------------
+
+The release steps are as follows, though they should probably be performed by
+CI to use the appropriate environment.
+
+ - Update version numbers (see section above).
+ - Remove the `rs/src/resources` directory, if one exists.
+ - Run `cargo build` to recreate the above directory using the protobuf and
+   schema files from outside the validator folder.
+ - Run `python3 prepare_build.py clean`.
+ - Run `python3 prepare_build.py populate`. This makes a local copy of the
+   protobuf files for inclusion in an sdist.
+ - Run `maturin sdist` to build the source distribution.
+ - Run `maturin build` in the appropriate environments to build binary
+   distributions.
diff --git a/buf.gen.yaml b/buf.gen.yaml
new file mode 100644
index 00000000..d413e58c
--- /dev/null
+++ b/buf.gen.yaml
@@ -0,0 +1,10 @@
+version: v1
+plugins:
+  - name: cpp
+    out: gen/proto/cpp
+  - name: csharp
+    out: gen/proto/csharp
+  - name: java
+    out: gen/proto/java
+  - name: python
+    out: gen/proto/python
diff --git a/buf.work.yaml b/buf.work.yaml
new file mode 100644
index 00000000..4ca5887e
--- /dev/null
+++ b/buf.work.yaml
@@ -0,0 +1,4 @@
+version: v1
+directories:
+  - proto
+  - substrait/proto
diff --git a/c/.gitignore b/c/.gitignore
new file mode 100644
index 00000000..1ece1ed6
--- /dev/null
+++ b/c/.gitignore
@@ -0,0 +1,2 @@
+/include/
+/build/
diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt
new file mode 100644
index 00000000..82b3f6e6
--- /dev/null
+++ b/c/CMakeLists.txt
@@ -0,0 +1,59 @@
+cmake_minimum_required(VERSION 3.11)
+project(substrait-validator)
+
+include(FetchContent)
+
+# Use Corrosion to make a shared library with target name
+# "substrait-validator-c". Note that static linking is not (easily) possible
+# due to the many dependencies introduced by libcurl.
+FetchContent_Declare(
+  Corrosion
+  GIT_REPOSITORY https://github.com/corrosion-rs/corrosion.git
+  GIT_TAG v0.1.0
+)
+FetchContent_MakeAvailable(Corrosion)
+corrosion_import_crate(
+  MANIFEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/Cargo.toml
+)
+
+# Add the include directory with the header file generated by build.rs.
+target_include_directories(
+  substrait-validator-c
+  INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include
+)
+
+# Handle testing.
+option(
+  SUBSTRAIT_VALIDATOR_BUILD_TESTS
+  "Whether to build tests for the Substrait validator C bindings"
+  OFF
+)
+if(SUBSTRAIT_VALIDATOR_BUILD_TESTS)
+
+  # GoogleTest requires at least C++11
+  set(CMAKE_CXX_STANDARD 11)
+
+  include(FetchContent)
+  FetchContent_Declare(
+    googletest
+    URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
+  )
+  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+  FetchContent_MakeAvailable(googletest)
+
+  enable_testing()
+
+  add_executable(
+    substrait-validator-c-test
+    ${CMAKE_CURRENT_SOURCE_DIR}/tests/test.cc
+  )
+  target_link_libraries(
+    substrait-validator-c-test
+    gtest_main
+    substrait-validator-c
+  )
+
+  include(GoogleTest)
+  gtest_discover_tests(substrait-validator-c-test)
+
+endif()
diff --git a/c/Cargo.toml b/c/Cargo.toml
new file mode 100644
index 00000000..0e9b476a
--- /dev/null
+++ b/c/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "substrait-validator-c"
+version = "0.0.1"
+edition = "2021"
+license = "Apache-2.0"
+
+[lib]
+crate-type = ["cdylib", "staticlib"]
+doc = false
+
+[build-dependencies]
+cbindgen = "0.20.0"
+
+[dependencies]
+substrait-validator = { path = "../rs", version = "0.0.1" }
+libc = "0.2"
+thiserror = "1.0"
diff --git a/c/README.md b/c/README.md
new file mode 100644
index 00000000..312310a4
--- /dev/null
+++ b/c/README.md
@@ -0,0 +1,55 @@
+# C bindings for validator
+
+This directory contains a Rust/cbindgen project to generate C bindings for
+the validator crate.
+
+## Installation
+
+No binaries are published yet.
+
+### Building manually
+
+To build manually, you will need:
+
+ - [rust](https://www.rust-lang.org/tools/install)
+
+At which point you can run:
+
+```console
+user@host:/path/to/substrait-validator/c$ cargo build --release
+```
+
+This will generate a static and shared library at
+`/path/to/substrait-validator/target/release/libsubstrait_validator_c.[a|so|lib|dll|dylib]`,
+and a header at `/path/to/substrait-validator/c/include`.
+
+### Building using CMake
+
+You can also build via CMake, and in doing so use the validator from within a
+CMake-based project. You should be able to simply add this directory as a
+subdirectory and link against the `substrait-validator-c` target. This will
+refer to the static or shared library based on `BUILD_SHARED_LIBS`.
+
+You can also run tests as follows:
+
+```console
+user@host:/path/to/substrait-validator/c$ mkdir build
+user@host:/path/to/substrait-validator/c$ cd build
+user@host:/path/to/substrait-validator/c/build$ cmake .. -DSUBSTRAIT_VALIDATOR_BUILD_TESTS=ON
+...
+user@host:/path/to/substrait-validator/c/build$ cmake --build .
+...
+user@host:/path/to/substrait-validator/c/build$ ctest .
+Test project /path/to/substrait-validator/c/build
+    Start 1: BasicTest.BasicTest
+1/1 Test #1: BasicTest.BasicTest ..............   Passed    0.00 sec
+
+100% tests passed, 0 tests failed out of 1
+
+Total Test time (real) =   0.00 sec
+```
+
+## Usage
+
+The generated header file includes docstrings that should be fairly
+self-explanatory.
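+
+As a minimal sketch of the parse/check/export lifecycle (the function names
+below come from `src/lib.rs`; the handle type names are assumptions based on
+the cbindgen renames in `build.rs`):
+
+```c
+#include <stdint.h>
+#include <stdio.h>
+#include "substrait_validator.h"
+
+int main(void) {
+  uint8_t plan[] = {0}; // placeholder; real substrait.Plan bytes go here
+
+  // Parse and validate with the default configuration (null config handle).
+  substrait_validator_result_handle *result =
+      substrait_validator_parse(plan, sizeof(plan), NULL);
+  if (result == NULL) {
+    fprintf(stderr, "%s\n", substrait_validator_get_last_error());
+    return 1;
+  }
+
+  // Positive = proven valid, zero = maybe valid, negative = proven invalid.
+  int verdict = substrait_validator_check(result);
+
+  // Print the diagnostics, one per line, then free the exported buffer.
+  uint64_t size = 0;
+  uint8_t *diags = substrait_validator_export_diagnostics(result, &size);
+  if (diags != NULL) {
+    printf("%s", (const char *)diags);
+    substrait_validator_free_exported(diags);
+  }
+
+  substrait_validator_free(result);
+  return verdict < 0 ? 1 : 0;
+}
+```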
diff --git a/c/build.rs b/c/build.rs
new file mode 100644
index 00000000..9fbb2f53
--- /dev/null
+++ b/c/build.rs
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: Apache-2.0
+
+use std::env;
+
+fn main() {
+    let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+
+    let mut config = cbindgen::Config {
+        cpp_compat: true,
+        language: cbindgen::Language::C,
+        ..Default::default()
+    };
+    config.export.prefix = Some("substrait_validator_".to_string());
+    config
+        .export
+        .rename
+        .insert("ConfigHandle".to_string(), "config_handle".to_string());
+    config
+        .export
+        .rename
+        .insert("ResultHandle".to_string(), "result_handle".to_string());
+    config
+        .export
+        .rename
+        .insert("Resolver".to_string(), "resolver".to_string());
+    config
+        .export
+        .rename
+        .insert("Deleter".to_string(), "deleter".to_string());
+    config.header = Some("// SPDX-License-Identifier: Apache-2.0".to_string());
+
+    cbindgen::Builder::new()
+        .with_crate(crate_dir)
+        .with_config(config)
+        .generate()
+        .expect("Unable to generate bindings")
+        .write_to_file("include/substrait_validator.h");
+}
diff --git a/c/src/lib.rs b/c/src/lib.rs
new file mode 100644
index 00000000..af46872e
--- /dev/null
+++ b/c/src/lib.rs
@@ -0,0 +1,796 @@
+// SPDX-License-Identifier: Apache-2.0
+
+// Functions dereferencing raw pointers are kind of par for the course in a C
+// interface, and if we have to mark effectively all functions unsafe here, we
+// can no longer selectively place unsafe {} blocks (there is no way to mark a
+// function as unsafe to use without implicitly allowing unsafe code to be used
+// in its implementation).
+#![allow(clippy::not_unsafe_ptr_arg_deref)]
+
+use std::cell::RefCell;
+
+thread_local! {
+    /// Most recent error message, stored in thread-local storage for
+    /// thread-safety.
+    pub static LAST_ERROR: RefCell<std::ffi::CString> =
+        RefCell::new(std::ffi::CString::default());
+}
+
+/// Sets the most recent error message.
+fn set_last_error<S: AsRef<str>>(s: S) {
+    LAST_ERROR.with(|f| {
+        *f.borrow_mut() = std::ffi::CString::new(s.as_ref()).unwrap_or_default();
+    });
+}
+
+/// Returns the most recent error message. Note that the returned pointer is
+/// only valid until the next call that the current thread makes to this
+/// library.
+#[no_mangle]
+pub extern "C" fn substrait_validator_get_last_error() -> *const libc::c_char {
+    LAST_ERROR.with(|f| {
+        let reference = f.borrow();
+        reference.as_bytes_with_nul().as_ptr() as *const libc::c_char
+    })
+}
+
+/// Parser/validator configuration handle.
+pub struct ConfigHandle {
+    pub config: substrait_validator::Config,
+}
+
+/// Creates a parser/validator configuration structure.
+#[no_mangle]
+pub extern "C" fn substrait_validator_config_new() -> *mut ConfigHandle {
+    // Create a box to store the return value handle on the heap.
+    let handle = Box::new(ConfigHandle {
+        config: substrait_validator::Config::new(),
+    });
+
+    // Convert the box to its raw pointer and relinquish ownership.
+    Box::into_raw(handle)
+}
+
+/// Frees memory associated with a configuration handle. No-op if given a
+/// nullptr.
+#[no_mangle]
+pub extern "C" fn substrait_validator_config_free(handle: *mut ConfigHandle) {
+    // Ignore null pointers.
+    if handle.is_null() {
+        return;
+    }
+
+    // UNSAFE: recover the box that we created the handle with and drop it.
+    // Assumes that the pointer was created by substrait_validator_config_new().
+    let config = unsafe { Box::from_raw(handle) };
+    drop(config);
+}
+
+/// Queries which diagnostic codes are defined. If buf is non-null and size is
+/// nonzero, up to size entries in buf are filled with valid diagnostic codes.
+/// Regardless of how many entries were populated, the number of defined
+/// diagnostic codes is returned.
+#[no_mangle]
+pub extern "C" fn substrait_validator_diag_codes(buf: *mut u32, size: usize) -> usize {
+    if !buf.is_null() && size > 0 {
+        // UNSAFE: assumes that buf is properly aligned, that there is
+        // read/write access to a region of size u32s from buf onwards, and
+        // that nothing else is mutating the buffer.
+        let slice = unsafe { std::slice::from_raw_parts_mut(buf, size) };
+
+        for (code, class) in slice
+            .iter_mut()
+            .zip(substrait_validator::iter_diagnostics())
+        {
+            *code = class.code();
+        }
+    }
+
+    substrait_validator::iter_diagnostics().count()
+}
+
+/// For the given diagnostic code, returns the code for the group it belongs
+/// to. Configuring a level override for the parent of a group of diagnostic
+/// codes will set the default override for all diagnostics contained within
+/// that group.
+#[no_mangle]
+pub extern "C" fn substrait_validator_diag_parent(code: u32) -> u32 {
+    substrait_validator::Classification::parent(code)
+}
+
+/// Returns the name of the given diagnostic code. If buf is non-null and size
+/// is nonzero, up to size-1 characters in buf are filled with this name,
+/// followed by a null termination character. The null termination character is
+/// considered to be part of size. If buf is non-null, size is nonzero, and
+/// code is valid, it is always written, even if this means that the name is
+/// cut short. Bytes in buf beyond the resulting string length but within the
+/// size limit may be clobbered.
+///
+/// If code is valid, the function returns the minimum buffer size needed to
+/// contain the complete name (being its string length + 1), regardless of the
+/// supplied buffer. If code is invalid, 0 is returned, and an error message
+/// can be retrieved with substrait_validator_get_last_error().
+#[no_mangle]
+pub extern "C" fn substrait_validator_diag_name(
+    code: u32,
+    buf: *mut libc::c_char,
+    size: usize,
+) -> usize {
+    if let Some(class) = substrait_validator::Classification::from_code(code) {
+        let name = class.name();
+        let name_bytes = name.as_bytes();
+
+        if !buf.is_null() && size > 0 {
+            // UNSAFE: assumes that buf is properly aligned, that there is
+            // read/write access to a region of size bytes from buf onwards,
+            // and that nothing else is mutating the buffer.
+            let slice = unsafe { std::slice::from_raw_parts_mut(buf as *mut u8, size) };
+
+            // Try to write the name followed by a 0 to the first size-1 bytes
+            // of the buffer.
+            for (buf_byte, name_byte) in slice[..size - 1]
+                .iter_mut()
+                .zip(name_bytes.iter().cloned().chain(std::iter::once(0)))
+            {
+                *buf_byte = name_byte;
+            }
+
+            // Pessimistically always write a 0 to the last byte of the buffer,
+            // even though we may already have written an early termination
+            // character.
+            slice[size - 1] = 0;
+        }
+
+        // Return the minimum buffer size.
+        name_bytes.len() + 1
+    } else {
+        set_last_error(format!("{code} is not a valid diagnostic code"));
+        0
+    }
+}
+
+/// Returns the description of the given diagnostic code. If buf is non-null
+/// and size is nonzero, up to size-1 characters in buf are filled with this
+/// description, followed by a null termination character. The null
+/// termination character is considered to be part of size. If buf is
+/// non-null, size is nonzero, and code is valid, it is always written, even
+/// if this means that the description is cut short. Bytes in buf beyond the
+/// resulting string length but within the size limit may be clobbered.
+///
+/// If code is valid, the function returns the minimum buffer size needed to
+/// contain the complete description (being its string length + 1), regardless
+/// of the supplied buffer. If code is invalid, 0 is returned, and an error
+/// message can be retrieved with substrait_validator_get_last_error().
+#[no_mangle]
+pub extern "C" fn substrait_validator_diag_desc(
+    code: u32,
+    buf: *mut libc::c_char,
+    size: usize,
+) -> usize {
+    if let Some(class) = substrait_validator::Classification::from_code(code) {
+        let description = class.description();
+        let description_bytes = description.as_bytes();
+
+        if !buf.is_null() && size > 0 {
+            // UNSAFE: assumes that buf is properly aligned, that there is
+            // read/write access to a region of size bytes from buf onwards,
+            // and that nothing else is mutating the buffer.
+            let slice = unsafe { std::slice::from_raw_parts_mut(buf as *mut u8, size) };
+
+            // Try to write the description followed by a 0 to the first
+            // size-1 bytes of the buffer.
+            for (buf_byte, name_byte) in slice[..size - 1]
+                .iter_mut()
+                .zip(description_bytes.iter().cloned().chain(std::iter::once(0)))
+            {
+                *buf_byte = name_byte;
+            }
+
+            // Pessimistically always write a 0 to the last byte of the buffer,
+            // even though we may already have written an early termination
+            // character.
+            slice[size - 1] = 0;
+        }
+
+        // Return the minimum buffer size.
+        description_bytes.len() + 1
+    } else {
+        set_last_error(format!("{code} is not a valid diagnostic code"));
+        0
+    }
+}
+
+/// Instructs the validator to ignore protobuf fields that it doesn't know
+/// about yet (i.e., that have been added to the Substrait protobuf
+/// descriptions, but haven't yet been implemented in the validator) if the
+/// fields are set to their default value. If this option isn't set, or if an
+/// unknown field is not set to its default value, a warning is emitted.
+///
+/// Returns whether the function was successful. If false is returned, retrieve
+/// the error message with substrait_validator_get_last_error().
+#[no_mangle]
+pub extern "C" fn substrait_validator_config_ignore_unknown_fields(
+    config: *mut ConfigHandle,
+) -> bool {
+    // Check for null.
+    if config.is_null() {
+        set_last_error("received null configuration handle");
+        return false;
+    }
+
+    // UNSAFE: unpack configuration handle.
Assumes that the pointer was + // created by substrait_validator_config_new(), or behavior is undefined. + let config = unsafe { &mut (*config).config }; + + // Update configuration and return success. + config.ignore_unknown_fields(); + true +} + +/// Explicitly allows a protobuf message type for use in advanced extensions, +/// despite the fact that the validator can't validate it. If an advanced +/// extension is encountered that isn't explicitly allowed, a warning is +/// emitted. The type URL pattern may include * and ? wildcards for glob-like +/// matching (see https://docs.rs/glob/latest/glob/struct.Pattern.html for the +/// complete syntax). +/// +/// Returns whether the function was successful. If false is returned, retrieve +/// the error message with substrait_validator_get_last_error(). +#[no_mangle] +pub extern "C" fn substrait_validator_config_allow_proto_any_url( + config: *mut ConfigHandle, + pattern: *const libc::c_char, +) -> bool { + // Check for nulls. + if config.is_null() { + set_last_error("received null configuration handle"); + return false; + } + if pattern.is_null() { + set_last_error("received null pattern"); + return false; + } + + // UNSAFE: unpack configuration handle. Assumes that the pointer was + // created by substrait_validator_config_new(), or behavior is undefined. + let config = unsafe { &mut (*config).config }; + + // UNSAFE: unpack pattern string. Assumes that the pointer points to a + // null-terminated string. + let pattern = unsafe { std::ffi::CStr::from_ptr(pattern) }; + + // Parse the pattern. + let pattern = match pattern.to_str() { + Ok(u) => u, + Err(e) => { + set_last_error(format!("received invalid pattern: {e}")); + return false; + } + }; + let pattern = match substrait_validator::Pattern::new(pattern) { + Ok(p) => p, + Err(e) => { + set_last_error(format!("received invalid pattern: {e}")); + return false; + } + }; + + // Update configuration and return success. + config.allow_proto_any_url(pattern); + true +} + +/// Converts a positive/zero/negative integer into Info/Warning/Error +/// respectively. +fn int_to_level(x: i32) -> substrait_validator::Level { + match x { + 1..=i32::MAX => substrait_validator::Level::Info, + 0 => substrait_validator::Level::Warning, + i32::MIN..=-1 => substrait_validator::Level::Error, + } +} + +/// Sets a minimum and/or maximum error level for the given class of diagnostic +/// messages. Any previous settings for this class are overridden. The levels +/// are encoded as integers, where any positive value means info, zero means +/// warning, and negative means error. +/// +/// Returns whether the function was successful. If false is returned, retrieve +/// the error message with substrait_validator_get_last_error(). +#[no_mangle] +pub extern "C" fn substrait_validator_config_override_diagnostic_level( + config: *mut ConfigHandle, + class: u32, + minimum: i32, + maximum: i32, +) -> bool { + // Check for null. + if config.is_null() { + set_last_error("received null configuration handle"); + return false; + } + + // UNSAFE: unpack configuration handle. Assumes that the pointer was + // created by substrait_validator_config_new(), or behavior is undefined. + let config = unsafe { &mut (*config).config }; + + // Parse the diagnostic class/code. + let class = match substrait_validator::Classification::from_code(class) { + Some(c) => c, + None => { + set_last_error(format!("unknown diagnostic class {class}")); + return false; + } + }; + + // Parse the minimum and maximum levels. 
+    let minimum = int_to_level(minimum);
+    let maximum = int_to_level(maximum);
+
+    // Update configuration and return success.
+    config.override_diagnostic_level(class, minimum, maximum);
+    true
+}
+
+/// Overrides the resolution behavior for (YAML) URIs matching the given
+/// pattern. The pattern may include * and ? wildcards for glob-like matching
+/// (see https://docs.rs/glob/latest/glob/struct.Pattern.html for the complete
+/// syntax). If resolve_as is null, the URI will not be resolved; otherwise, it
+/// will be resolved as if the URI in the plan had been that string.
+///
+/// Returns whether the function was successful. If false is returned, retrieve
+/// the error message with substrait_validator_get_last_error().
+#[no_mangle]
+pub extern "C" fn substrait_validator_config_override_uri(
+    config: *mut ConfigHandle,
+    pattern: *const libc::c_char,
+    resolve_as: *const libc::c_char,
+) -> bool {
+    // Check for nulls.
+    if config.is_null() {
+        set_last_error("received null configuration handle");
+        return false;
+    }
+    if pattern.is_null() {
+        set_last_error("received null pattern");
+        return false;
+    }
+
+    // UNSAFE: unpack configuration handle. Assumes that the pointer was
+    // created by substrait_validator_config_new(), or behavior is undefined.
+    let config = unsafe { &mut (*config).config };
+
+    // UNSAFE: unpack pattern string. Assumes that the pointer points to a
+    // null-terminated string.
+    let pattern = unsafe { std::ffi::CStr::from_ptr(pattern) };
+
+    // Parse the pattern.
+    let pattern = match pattern.to_str() {
+        Ok(p) => p,
+        Err(e) => {
+            set_last_error(format!("received invalid pattern: {e}"));
+            return false;
+        }
+    };
+    let pattern = match substrait_validator::Pattern::new(pattern) {
+        Ok(p) => p,
+        Err(e) => {
+            set_last_error(format!("received invalid pattern: {e}"));
+            return false;
+        }
+    };
+
+    // Unpack and parse resolve_as.
+    let resolve_as = if resolve_as.is_null() {
+        None
+    } else {
+        // UNSAFE: unpack resolve_as string. Assumes that the pointer points to
+        // a null-terminated string.
+        let resolve_as = unsafe { std::ffi::CStr::from_ptr(resolve_as) };
+
+        Some(match resolve_as.to_str() {
+            Ok(p) => p,
+            Err(e) => {
+                set_last_error(format!("received invalid replacement URI: {e}"));
+                return false;
+            }
+        })
+    };
+
+    // Update configuration and return success.
+    config.override_uri(pattern, resolve_as);
+    true
+}
+
+/// Callback function for deleting a buffer allocated by the user application.
+pub type Deleter = Option<
+    unsafe extern "C" fn(deleter_user: *mut libc::c_void, buf: *const u8, size: usize),
+>;
+
+/// (YAML) URI resolution callback function.
+///
+/// The first argument (uri) is set to a null-terminated UTF-8 string
+/// representing the URI that is to be resolved. If resolution succeeds, the
+/// function must return the binary result buffer via buf and size and return
+/// true. If it fails, it should instead write a UTF-8 error message to this
+/// buffer (but it may also set buf to nullptr or leave it unchanged) and
+/// return false.
+///
+/// The buffer must remain valid only until the validator library returns
+/// control to the application. Thus, the application may keep track of the
+/// current buffer via thread-local storage or a global. It may also assign a
+/// deleter function to the deleter parameter, which will be called by the
+/// validator library when it is done with the buffer. deleter_user may be
+/// used to pass additional contextual information to the deleter; it is not
+/// used by the validator library for any purpose other than calling the
+/// deleter function.
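+///
+/// As an illustrative sketch, an application-side resolver written in C might
+/// look like this (`read_whole_file` is a hypothetical helper, and the
+/// `substrait_validator_deleter` typedef name assumes the cbindgen renames
+/// configured in build.rs):
+///
+/// ```c
+/// static void my_deleter(void *user, const uint8_t *buf, size_t size) {
+///     (void)user;
+///     (void)size;
+///     free((void *)buf);
+/// }
+///
+/// static bool my_resolver(const char *uri, const uint8_t **buf, size_t *size,
+///                         substrait_validator_deleter *deleter,
+///                         void **deleter_user) {
+///     uint8_t *data = read_whole_file(uri, size); // hypothetical helper
+///     if (data == NULL) return false;
+///     *buf = data;
+///     *deleter = my_deleter;
+///     return true;
+/// }
+/// ```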
+///
+/// All output parameters will be set to zero by the validator library before
+/// the callback is called.
+pub type Resolver = Option<
+    unsafe extern "C" fn(
+        uri: *const libc::c_char,
+        buf: *mut *const u8,
+        size: *mut usize,
+        deleter: *mut Deleter,
+        deleter_user: *mut *mut libc::c_void,
+    ) -> bool,
+>;
+
+/// Wraps a buffer returned by Resolver.
+struct ApplicationBuffer {
+    pub buf: *const u8,
+    pub size: usize,
+    pub deleter: Deleter,
+    pub deleter_user: *mut libc::c_void,
+}
+
+impl Default for ApplicationBuffer {
+    fn default() -> Self {
+        Self {
+            buf: std::ptr::null(),
+            size: 0,
+            deleter: None,
+            deleter_user: std::ptr::null_mut(),
+        }
+    }
+}
+
+impl Drop for ApplicationBuffer {
+    fn drop(&mut self) {
+        if let Some(deleter) = self.deleter {
+            // UNSAFE: assumes that the deleter function passed by the user is
+            // valid.
+            unsafe { deleter(self.deleter_user, self.buf, self.size) }
+        }
+    }
+}
+
+impl AsRef<[u8]> for ApplicationBuffer {
+    fn as_ref(&self) -> &[u8] {
+        // UNSAFE: assumes that the pointer to the buffer returned by the
+        // application is non-null, and that everything from that byte up to
+        // that byte plus self.size bytes can be dereferenced.
+        unsafe { std::slice::from_raw_parts(self.buf, self.size) }
+    }
+}
+
+/// Rust representation of an error returned by the Resolver callback function.
+#[derive(Debug, thiserror::Error)]
+struct ApplicationError {
+    msg: String,
+}
+
+impl ApplicationError {
+    fn new<S: Into<String>>(msg: S) -> Self {
+        ApplicationError { msg: msg.into() }
+    }
+}
+
+impl From<ApplicationBuffer> for ApplicationError {
+    fn from(buf: ApplicationBuffer) -> Self {
+        ApplicationError {
+            msg: match std::str::from_utf8(buf.as_ref()) {
+                Ok(e) => e.to_string(),
+                Err(e) => format!("unknown error (failed to decode error message: {e})"),
+            },
+        }
+    }
+}
+
+impl std::fmt::Display for ApplicationError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.msg)
+    }
+}
+
+/// Registers a URI resolution function with this configuration. If the given
+/// function fails, any previously registered function will be used as a
+/// fallback.
+///
+/// See the documentation for the substrait_validator_resolver typedef for
+/// more information about the semantics of the callback function.
+///
+/// Returns whether the function was successful. If false is returned, retrieve
+/// the error message with substrait_validator_get_last_error().
+#[no_mangle]
+pub extern "C" fn substrait_validator_config_uri_resolver(
+    config: *mut ConfigHandle,
+    resolver: Resolver,
+) -> bool {
+    // Check for nulls.
+    if config.is_null() {
+        set_last_error("received null configuration handle");
+        return false;
+    }
+
+    // UNSAFE: unpack configuration handle. Assumes that the pointer was
+    // created by substrait_validator_config_new(), or behavior is undefined.
+    let config = unsafe { &mut (*config).config };
+
+    // Unpack resolution function.
+    let resolver = match resolver {
+        Some(r) => r,
+        None => {
+            set_last_error("received null resolution function pointer");
+            return false;
+        }
+    };
+
+    // Update configuration and return success.
+    config.add_uri_resolver(move |uri| {
+        let uri = match std::ffi::CString::new(uri) {
+            Ok(u) => u,
+            Err(_) => {
+                return Err(ApplicationError::new(
+                    "cannot resolve URI with embedded nul characters",
+                ))
+            }
+        };
+        let mut buffer = ApplicationBuffer::default();
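+
+        // Note that default() above zeroes all of the out-parameters, which
+        // is the contract documented on the Resolver typedef.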
+        // UNSAFE: assumes that the resolver function passed by the user is
+        // valid.
+        let result = unsafe {
+            resolver(
+                uri.as_ptr(),
+                &mut buffer.buf,
+                &mut buffer.size,
+                &mut buffer.deleter,
+                &mut buffer.deleter_user,
+            )
+        };
+
+        if result {
+            if buffer.buf.is_null() {
+                Err(ApplicationError::new(
+                    "URI resolver callback returned success but also a null buffer",
+                ))
+            } else {
+                Ok(buffer)
+            }
+        } else if buffer.buf.is_null() {
+            Err(ApplicationError::new("URI resolver callback failed"))
+        } else {
+            Err(ApplicationError::from(buffer))
+        }
+    });
+    true
+}
+
+/// Parse/validation result handle.
+pub struct ResultHandle {
+    pub result: substrait_validator::ParseResult,
+}
+
+/// Parses the given byte buffer as a substrait.Plan message, using the given
+/// configuration. If a null pointer is passed for the configuration, the
+/// default configuration is used.
+///
+/// Returns a handle to the parse result. This handle must be freed using
+/// substrait_validator_free() when it is no longer needed. Fails and returns
+/// nullptr only if the incoming buffer is nullptr; any other failure to parse
+/// or validate the buffer is embedded in the handle.
+#[no_mangle]
+pub extern "C" fn substrait_validator_parse(
+    data: *const u8,
+    size: u64,
+    config: *const ConfigHandle,
+) -> *mut ResultHandle {
+    // Catch null pointers.
+    if data.is_null() {
+        set_last_error("received null input buffer");
+        return std::ptr::null_mut();
+    }
+
+    // UNSAFE: convert the incoming buffer information into a slice.
+    let data = unsafe { std::slice::from_raw_parts(data, size.try_into().unwrap()) };
+
+    // Perform the actual parsing.
+    let result = if config.is_null() {
+        substrait_validator::parse(data, &substrait_validator::Config::default())
+    } else {
+        // UNSAFE: unpack configuration handle. Assumes that the pointer was
+        // created by substrait_validator_config_new(), or behavior is undefined.
+        substrait_validator::parse(data, unsafe { &(*config).config })
+    };
+
+    // Create a box to store the return value handle on the heap.
+    let handle = Box::new(ResultHandle { result });
+
+    // Convert the box to its raw pointer and relinquish ownership.
+    Box::into_raw(handle)
+}
+
+/// Frees memory associated with a parse result handle. No-op if given a
+/// nullptr.
+#[no_mangle]
+pub extern "C" fn substrait_validator_free(handle: *mut ResultHandle) {
+    // Ignore null pointers.
+    if handle.is_null() {
+        return;
+    }
+
+    // UNSAFE: recover the box that we created the handle with and drop it.
+    // Assumes that the pointer was created by substrait_validator_parse().
+    let handle = unsafe { Box::from_raw(handle) };
+    drop(handle);
+}
+
+/// Returns whether the given parse result handle refers to a valid (positive
+/// return value), invalid (negative return value), or possibly valid plan
+/// (0 return value).
+#[no_mangle]
+pub extern "C" fn substrait_validator_check(handle: *const ResultHandle) -> i32 {
+    // UNSAFE: dereference the result handle. Assumes that the pointer was
+    // created by substrait_validator_parse(), or that it is null (in which
+    // case the null is detected and handled safely).
+    let handle = unsafe { handle.as_ref() };
+    if handle.is_none() {
+        return -1;
+    }
+    let result = &handle.as_ref().unwrap().result;
+
+    // Perform the check.
+    match result.check() {
+        substrait_validator::Validity::Valid => 1,
+        substrait_validator::Validity::MaybeValid => 0,
+        substrait_validator::Validity::Invalid => -1,
+    }
+}
+
+/// The guts for the export functions.
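+///
+/// Exported buffer layout (the pointer handed to the caller points just past
+/// the 16-byte header):
+///
+///   bytes [0..8):  vector length   (u64, native endian)
+///   bytes [8..16): vector capacity (u64, native endian)
+///   bytes [16..):  payload, followed by a terminating null byte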
+fn export(
+    format: substrait_validator::export::Format,
+    handle: *const ResultHandle,
+    size: *mut u64,
+) -> *mut u8 {
+    // UNSAFE: dereference the result handle. Assumes that the pointer was
+    // created by substrait_validator_parse(), or that it is null (in which
+    // case the null is detected and handled safely).
+    let handle = unsafe { handle.as_ref() };
+    if handle.is_none() {
+        set_last_error("received null handle");
+        return std::ptr::null_mut();
+    }
+    let result = &handle.as_ref().unwrap().result;
+
+    // Create a byte vector as output. The first 16 bytes are reserved: we'll
+    // store the length and capacity of the vector in there, and advance the
+    // pointer beyond this length before passing the data to the user. This
+    // allows us to fully recover the vector from just the returned pointer
+    // later, which we need in order to drop it safely.
+    let mut data: Vec<u8> = vec![0; 16];
+
+    // Perform the actual export function.
+    if let Err(e) = result.export(&mut data, format) {
+        set_last_error(e.to_string());
+        return std::ptr::null_mut();
+    }
+
+    // UNSAFE: pass the length to the user, if they wanted to know about it.
+    // Assumes that the size pointer, if non-null, points to a writable and
+    // appropriately aligned memory location.
+    if let Some(size) = unsafe { size.as_mut() } {
+        *size = (data.len() - 16).try_into().unwrap();
+    }
+
+    // Append a null character, to prevent pain and misery if the user treats
+    // the buffer as a null-terminated string.
+    data.push(0);
+
+    // Save the length and capacity of the vector to the start of said
+    // vector, so we can recover them later.
+    let len: u64 = data.len().try_into().unwrap();
+    data[..8].clone_from_slice(&len.to_ne_bytes());
+    let capacity: u64 = data.capacity().try_into().unwrap();
+    data[8..16].clone_from_slice(&capacity.to_ne_bytes());
+
+    // Get the pointer to the vector, and relinquish ownership.
+    let ptr = data.as_mut_ptr();
+    std::mem::forget(data);
+
+    // UNSAFE: advance the pointer beyond the bytes that we're using to store
+    // the size of the vector. This assumes that advancing by 16 bytes doesn't
+    // advance beyond the end of the buffer, which should not be possible, as
+    // the buffer is at least 17 bytes long (8 bytes length, 8 bytes capacity,
+    // and null termination byte).
+    unsafe { ptr.add(16) }
+}
+
+/// Frees memory associated with an exported buffer. No-op if given a nullptr.
+#[no_mangle]
+pub extern "C" fn substrait_validator_free_exported(data: *mut u8) {
+    // Don't do anything if the user passed nullptr.
+    if data.is_null() {
+        return;
+    }
+
+    // UNSAFE: recover the pointer to the vector data. Assumes that the pointer
+    // was (ultimately) created using export(), in which case this just
+    // reverses the pointer arithmetic done at the end of its body.
+    let buffer_ptr = unsafe { data.sub(16) };
+
+    // UNSAFE: recover the vector length from the first 8 bytes. Assumes that
+    // these 8 bytes are readable.
+    let length_ptr = buffer_ptr;
+    let length = u64::from_ne_bytes(
+        unsafe { std::slice::from_raw_parts(length_ptr, 8) }
+            .try_into()
+            .unwrap(),
+    );
+    let length = usize::try_from(length).unwrap();
+
+    // UNSAFE: recover the vector capacity from the next 8 bytes. Assumes that
+    // these 8 bytes are readable.
+    let capacity_ptr = unsafe { buffer_ptr.add(8) };
+    let capacity = u64::from_ne_bytes(
+        unsafe { std::slice::from_raw_parts(capacity_ptr, 8) }
+            .try_into()
+            .unwrap(),
+    );
+    let capacity = usize::try_from(capacity).unwrap();
+
+    // UNSAFE: recover the vector and drop it. Assumes that the recovered
+    // pointer, length, and capacity do indeed form the raw parts of a valid
+    // Vec.
+    let vec = unsafe { Vec::from_raw_parts(buffer_ptr, length, capacity) };
+    drop(vec);
+}
+
+/// Converts the given parse result to a multiline, null-terminated string,
+/// where each line represents a diagnostic message. If size is non-null, the
+/// length of the string (excluding null-termination byte) will be written to
+/// it. The function will return nullptr upon failure, in which case
+/// substrait_validator_get_last_error() can be used to retrieve an error
+/// message. If the function succeeds, the returned pointer must eventually be
+/// freed using substrait_validator_free_exported() in order to not leak
+/// memory.
+#[no_mangle]
+pub extern "C" fn substrait_validator_export_diagnostics(
+    handle: *const ResultHandle,
+    size: *mut u64,
+) -> *mut u8 {
+    export(
+        substrait_validator::export::Format::Diagnostics,
+        handle,
+        size,
+    )
+}
+
+/// Same as substrait_validator_export_diagnostics(), but instead returns a
+/// buffer filled with an HTML-based human-readable description of the parsed
+/// plan.
+#[no_mangle]
+pub extern "C" fn substrait_validator_export_html(
+    handle: *const ResultHandle,
+    size: *mut u64,
+) -> *mut u8 {
+    export(substrait_validator::export::Format::Html, handle, size)
+}
+
+/// Same as substrait_validator_export_diagnostics(), but instead returns a
+/// substrait.validator.Node message in its binary serialization format. The
+/// buffer is null-terminated, but note that protobuf serialization is a binary
+/// format, so you'll need to use the size argument to get an accurate size.
+#[no_mangle]
+pub extern "C" fn substrait_validator_export_proto(
+    handle: *const ResultHandle,
+    size: *mut u64,
+) -> *mut u8 {
+    export(substrait_validator::export::Format::Proto, handle, size)
+}
diff --git a/c/tests/test.cc b/c/tests/test.cc
new file mode 100644
index 00000000..c9d3933c
--- /dev/null
+++ b/c/tests/test.cc
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <gtest/gtest.h>
+#include <cstring>
+#include <substrait_validator.h>
+
+TEST(BasicTest, BasicTest) {
+
+  // To not depend on the Substrait format, just throw garbage at the parser.
+  // It should immediately fail to parse that, of course, but we can still
+  // do some basic interface testing that way.
+  std::string nonsense = "protobuf bytes normally go here";
+
+  // Try parsing nonsense.
+  auto handle = substrait_validator_parse(
+      reinterpret_cast<const uint8_t *>(nonsense.c_str()), nonsense.size(),
+      nullptr);
+  ASSERT_NE(handle, nullptr);
+
+  // That should fail.
+  EXPECT_EQ(substrait_validator_check(handle), -1);
+
+  // Try getting a list of error messages.
+  uint64_t data_size = 0;
+  auto data_ptr = substrait_validator_export_diagnostics(handle, &data_size);
+
+  // Those messages should still be valid after freeing the handle.
+  substrait_validator_free(handle);
+
+  // Check sanity.
+  ASSERT_NE(data_ptr, nullptr);
+  EXPECT_GT(data_size, 0);
+  EXPECT_EQ(strlen(reinterpret_cast<const char *>(data_ptr)), data_size);
+  EXPECT_EQ(reinterpret_cast<const char *>(data_ptr),
+            std::string("Error at plan: failed to decode Protobuf message: "
+                        "invalid wire type value: 7 (code 1001)\n"));
+
+  // Free the buffer.
+ substrait_validator_free_exported(data_ptr); +} diff --git a/derive/Cargo.toml b/derive/Cargo.toml new file mode 100644 index 00000000..7a8af000 --- /dev/null +++ b/derive/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "substrait-validator-derive" +description = "Procedural macros for substrait-validator" +homepage = "https://substrait.io/" +repository = "https://github.com/substrait-io/substrait" +readme = "README.md" +version = "0.0.1" +edition = "2021" +license = "Apache-2.0" + +[lib] +proc-macro = true + +[dependencies] +quote = "1.0" +syn = "1.0" +heck = "0.4" diff --git a/derive/README.md b/derive/README.md new file mode 100644 index 00000000..4ebb4ce8 --- /dev/null +++ b/derive/README.md @@ -0,0 +1,9 @@ +Procedural macro crate for substrait-validator +============================================== + +This crate defines some `#[derive]` macros for +[substrait-validator](https://crates.io/crates/substrait-validator), +specifically for the types generated by `prost-build`. This is needed because +`prost-build` on its own doesn't generate any introspection-like information +for the protobuf structures, such as message type names as strings, which we +want to be able to use in our parse tree. diff --git a/derive/src/lib.rs b/derive/src/lib.rs new file mode 100644 index 00000000..38dc2e39 --- /dev/null +++ b/derive/src/lib.rs @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Procedural macro crate for `substrait-validator-core`. +//! +//! The derive macros defined here are essentially an ugly workaround for the +//! lack of any protobuf introspection functionality provided by prost. +//! Basically, they take (the AST of) the code generated by prost and try to +//! recover the needed protobuf message metadata from there. Things would have +//! been a *LOT* simpler and a *LOT* less brittle if prost would simply +//! provide this information via traits of its own, but alas, there doesn't +//! seem to be a way to do this without forking prost, and introspection +//! seems to be a non-goal of that project. +//! +//! Besides being ugly, this method is rather brittle and imprecise when it +//! comes to recovering field names, due to the various case conversions +//! automatically done by protoc and prost. Some known issues are: +//! +//! - The recovered type name for messages defined within messages uses +//! incorrect case conventions, as the procedural macros have no way +//! of distinguishing packages from message definition scopes in the +//! type path. +//! - If the .proto source files use unexpected case conventions for +//! various things, the resulting case conventions for types, field names, +//! oneof variants, and enum variants will be wrong. +//! - Whenever the .proto source files name a field using something that is +//! a reserved word in Rust (notably `type`), prost will use a raw +//! identifier to represent the name. This syntax is currently not filtered +//! out from the recovered names, so a field named `type` becomes `r#type`. +//! This is probably not a fundamental problem, though. +//! +//! Ultimately, however, these names are only used for diagnostic messages and +//! the likes. In the worst case, the above inconsistencies may confuse the +//! user, but they should not affect the valid/invalid/maybe-valid result of +//! the validator or cause compile- or runtime errors. 
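+//!
+//! For reference, the derive is meant to be applied to prost-generated types,
+//! roughly like so (a sketch only; the real annotation is presumably attached
+//! via prost-build's type-attribute mechanism in the main crate):
+//!
+//! ```ignore
+//! #[derive(ProtoMeta)]
+//! pub struct Plan {
+//!     pub relations: Vec<PlanRel>,
+//! }
+//! ```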
+ +extern crate proc_macro; + +use heck::{ToShoutySnakeCase, ToSnakeCase}; +use proc_macro::TokenStream; +use quote::quote; + +/// Converts a Rust identifier string generated via stringify!() to the +/// original identifier by "cooking" raw identifiers. +fn cook_ident(ident: &syn::Ident) -> String { + let ident = ident.to_string(); + if let Some((_, keyword)) = ident.split_once('#') { + keyword.to_string() + } else { + ident + } +} + +#[doc(hidden)] +#[proc_macro_derive(ProtoMeta, attributes(proto_meta))] +pub fn proto_meta(input: TokenStream) -> TokenStream { + proto_meta_derive(syn::parse_macro_input!(input)) +} + +fn proto_meta_derive(ast: syn::DeriveInput) -> TokenStream { + match ast.data { + syn::Data::Struct(ref struct_data) => proto_meta_derive_message(&ast, struct_data), + syn::Data::Enum(ref enum_data) => match enum_data.variants.iter().next().unwrap().fields { + syn::Fields::Unit => { + for variant in enum_data.variants.iter() { + if !matches!(variant.fields, syn::Fields::Unit) { + panic!("all variants of a protobuf oneof enum must have a single, unnamed field"); + } + } + + proto_meta_derive_enum(&ast, enum_data) + } + syn::Fields::Unnamed(..) => { + for variant in enum_data.variants.iter() { + if let syn::Fields::Unnamed(fields) = &variant.fields { + if fields.unnamed.len() != 1 { + panic!("all variants of a protobuf oneof enum must have a single, unnamed field"); + } + } else { + panic!("all variants of a protobuf oneof enum must have a single, unnamed field"); + } + } + + proto_meta_derive_oneof(&ast, enum_data) + } + _ => panic!("enum with named elements don't map to protobuf constructs"), + }, + syn::Data::Union(_) => panic!("unions don't map to protobuf constructs"), + } +} + +enum FieldType { + Optional, + BoxedOptional, + Repeated, + Primitive, +} + +fn is_repeated(typ: &syn::Type) -> FieldType { + if let syn::Type::Path(path) = typ { + if let Some(last) = path.path.segments.last() { + if last.ident == "Option" { + if let syn::PathArguments::AngleBracketed(ref args) = last.arguments { + if let syn::GenericArgument::Type(syn::Type::Path(path2)) = + args.args.first().unwrap() + { + if path2.path.segments.last().unwrap().ident == "Box" { + return FieldType::BoxedOptional; + } else { + return FieldType::Optional; + } + } + } + panic!("Option without type argument?"); + } else if last.ident == "Vec" { + if let syn::PathArguments::AngleBracketed(ref args) = last.arguments { + if let syn::GenericArgument::Type(syn::Type::Path(path2)) = + args.args.first().unwrap() + { + if path2.path.segments.last().unwrap().ident == "u8" { + return FieldType::Primitive; + } else { + return FieldType::Repeated; + } + } + } + panic!("Vec without type argument?"); + } + } + } + FieldType::Primitive +} + +fn proto_meta_derive_message(ast: &syn::DeriveInput, data: &syn::DataStruct) -> TokenStream { + let name = &ast.ident; + let name_str = cook_ident(name); + let (impl_generics, ty_generics, where_clause) = ast.generics.split_for_impl(); + + let parse_unknown_matches: Vec<_> = data + .fields + .iter() + .map(|field| { + if let Some(ident) = &field.ident { + let ident_str = cook_ident(ident); + let action = match is_repeated(&field.ty) { + FieldType::Optional => quote! { + crate::parse::traversal::push_proto_field( + y, + &self.#ident.as_ref(), + #ident_str, + true, + |_, _| Ok(()), + ); + }, + FieldType::BoxedOptional => quote! { + crate::parse::traversal::push_proto_field( + y, + &self.#ident, + #ident_str, + true, + |_, _| Ok(()), + ); + }, + FieldType::Repeated => quote! 
{ + crate::parse::traversal::push_proto_repeated_field( + y, + &self.#ident.as_ref(), + #ident_str, + true, + |_, _| Ok(()), + |_, _, _, _, _| (), + ); + }, + FieldType::Primitive => quote! { + use crate::input::traits::ProtoPrimitive; + if !y.config.ignore_unknown_fields || !self.#ident.proto_primitive_is_default() { + crate::parse::traversal::push_proto_field( + y, + &Some(&self.#ident), + #ident_str, + true, + |_, _| Ok(()), + ); + } + }, + }; + quote! { + if !y.field_parsed(#ident_str) { + unknowns = true; + #action + } + } + } else { + panic!("protobuf message fields must have names"); + } + }) + .collect(); + + quote!( + impl #impl_generics crate::input::traits::ProtoMessage for #name #ty_generics #where_clause { + fn proto_message_type() -> &'static str { + use ::once_cell::sync::Lazy; + static TYPE_NAME: Lazy<::std::string::String> = Lazy::new(|| { + crate::input::proto::cook_path(module_path!(), #name_str) + }); + &TYPE_NAME + } + } + + impl #impl_generics crate::input::traits::InputNode for #name #ty_generics #where_clause { + fn type_to_node() -> crate::output::tree::Node { + use crate::input::traits::ProtoMessage; + crate::output::tree::NodeType::ProtoMessage(Self::proto_message_type()).into() + } + + fn data_to_node(&self) -> crate::output::tree::Node { + use crate::input::traits::ProtoMessage; + crate::output::tree::NodeType::ProtoMessage(Self::proto_message_type()).into() + } + + fn oneof_variant(&self) -> Option<&'static str> { + None + } + + fn parse_unknown( + &self, + y: &mut crate::parse::context::Context<'_>, + ) -> bool { + let mut unknowns = false; + #(#parse_unknown_matches)* + unknowns + } + } + ) + .into() +} + +fn proto_meta_derive_oneof(ast: &syn::DeriveInput, data: &syn::DataEnum) -> TokenStream { + let name = &ast.ident; + let (impl_generics, ty_generics, where_clause) = ast.generics.split_for_impl(); + + let variant_matches: Vec<_> = data + .variants + .iter() + .map(|variant| { + let ident = &variant.ident; + let proto_name = cook_ident(ident).to_snake_case(); + quote! { #name::#ident (_) => #proto_name } + }) + .collect(); + + let node_matches: Vec<_> = data + .variants + .iter() + .map(|variant| { + let ident = &variant.ident; + quote! { #name::#ident (x) => x.data_to_node() } + }) + .collect(); + + let parse_unknown_matches: Vec<_> = data + .variants + .iter() + .map(|variant| { + let ident = &variant.ident; + quote! 
{ #name::#ident (x) => x.parse_unknown(y) }
+        })
+        .collect();
+
+    quote!(
+        impl #impl_generics crate::input::traits::ProtoOneOf for #name #ty_generics #where_clause {
+            fn proto_oneof_variant(&self) -> &'static str {
+                match self {
+                    #(#variant_matches),*
+                }
+            }
+        }
+
+        impl #impl_generics crate::input::traits::InputNode for #name #ty_generics #where_clause {
+            fn type_to_node() -> crate::output::tree::Node {
+                crate::output::tree::NodeType::ProtoMissingOneOf.into()
+            }
+
+            fn data_to_node(&self) -> crate::output::tree::Node {
+                match self {
+                    #(#node_matches),*
+                }
+            }
+
+            fn oneof_variant(&self) -> Option<&'static str> {
+                use crate::input::traits::ProtoOneOf;
+                Some(self.proto_oneof_variant())
+            }
+
+            fn parse_unknown(
+                &self,
+                y: &mut crate::parse::context::Context<'_>,
+            ) -> bool {
+                match self {
+                    #(#parse_unknown_matches),*
+                }
+            }
+        }
+    )
+    .into()
+}
+
+fn proto_meta_derive_enum(ast: &syn::DeriveInput, data: &syn::DataEnum) -> TokenStream {
+    let name = &ast.ident;
+    let name_str = cook_ident(name);
+    let (impl_generics, ty_generics, where_clause) = ast.generics.split_for_impl();
+
+    let upper_name = name_str.to_shouty_snake_case();
+
+    let variant_names: Vec<_> = data
+        .variants
+        .iter()
+        .map(|variant| {
+            let ident = &variant.ident;
+            let proto_name = format!(
+                "{}_{}",
+                upper_name,
+                cook_ident(ident).to_shouty_snake_case()
+            );
+            (ident, proto_name)
+        })
+        .collect();
+
+    let variant_matches: Vec<_> = variant_names
+        .iter()
+        .map(|(ident, proto_name)| {
+            quote! { #name::#ident => #proto_name }
+        })
+        .collect();
+
+    let (_, first_variant_name) = &variant_names[0];
+
+    quote!(
+        impl #impl_generics crate::input::traits::ProtoEnum for #name #ty_generics #where_clause {
+            fn proto_enum_type() -> &'static str {
+                use ::once_cell::sync::Lazy;
+                static TYPE_NAME: Lazy<::std::string::String> = Lazy::new(|| {
+                    crate::input::proto::cook_path(module_path!(), #name_str)
+                });
+                &TYPE_NAME
+            }
+
+            fn proto_enum_default_variant() -> &'static str {
+                #first_variant_name
+            }
+
+            fn proto_enum_variant(&self) -> &'static str {
+                match self {
+                    #(#variant_matches),*
+                }
+            }
+
+            fn proto_enum_from_i32(x: i32) -> Option<Self> {
+                Self::from_i32(x)
+            }
+        }
+    )
+    .into()
+}
diff --git a/proto/buf.lock b/proto/buf.lock
new file mode 100644
index 00000000..c91b5810
--- /dev/null
+++ b/proto/buf.lock
@@ -0,0 +1,2 @@
+# Generated by buf. DO NOT EDIT.
+version: v1
diff --git a/proto/buf.yaml b/proto/buf.yaml
new file mode 100644
index 00000000..8b9ee4f7
--- /dev/null
+++ b/proto/buf.yaml
@@ -0,0 +1,11 @@
+version: v1
+name: buf.build/substrait-io/substrait
+lint:
+  use:
+    - DEFAULT
+  ignore_only:
+    PACKAGE_VERSION_SUFFIX:
+      - substrait
+breaking:
+  use:
+    - FILE
diff --git a/proto/substrait/validator/validator.proto b/proto/substrait/validator/validator.proto
new file mode 100644
index 00000000..0af8796a
--- /dev/null
+++ b/proto/substrait/validator/validator.proto
@@ -0,0 +1,489 @@
+// SPDX-License-Identifier: Apache-2.0
+syntax = "proto3";
+
+package substrait.validator;
+
+import "google/protobuf/any.proto";
+
+option csharp_namespace = "Substrait.Validator.Protobuf";
+option java_multiple_files = true;
+option java_package = "io.substrait.validator.proto";
+
+// One of the functions of the validator is to convert Substrait plans to a
+// format that is easier to consume for software geared toward making
+// human-readable representations of Substrait.
The validator has a few +// builtin text-based exporters, but it can also emit the complete parse +// result via the binary serialization of this message type. +message ParseResult { + // Root node of the parse result tree. + Node root = 1; +} + +// Nodes of the validator parse result tree. +// +// Note that, unlike substrait.Plan and its children, the nodes in this tree +// are intentionally devoid of typing information: all nodes are of type Node. +// The purpose of this is to allow a consumer of these trees to walk over the +// entire tree without needing in-depth knowledge of how Substrait works (and, +// with that, to decouple them from changes to the Substrait specification): +// they are intended as an intermediate format for converting Substrait plans +// into more human-friendly representations after all, not for programmatically +// dealing with the semantics of Substrait itself. That's what the validator is +// for, in this case. +// +// In particular, gathering all diagnostics emitted by the validator only +// requires the consumer to use the Node, Node.Data, Node.Child, and of course +// the Diagnostic message types. +// +// In case the consumer does need additional information from the original +// substrait.Plan, every node can be related back to its corresponding message +// via the path information associated with the nodes. +message Node { + // The type of node. + oneof node_type { + // This node represents a protobuf message. The fields are described using + // Field, RepeatedField, and OneOfField messages in data. + ProtoMessage proto_message = 1; + + // This node represents a protobuf primitive or enum. + ProtoPrimitive proto_primitive = 2; + + // This node is inserted as a placeholder when a required oneof field was + // not populated in the input. + Empty proto_missing_oneof = 3; + + // Special case of proto_primitive for references to anchors defined + // elsewhere in the plan. + NodeReference node_reference = 4; + + // Special case for proto_primitive for references to YAML files via a URI. + // If resolved, the keys in the toplevel YAML map are represented using + // Field messages in data. + YamlReference yaml_reference = 5; + + // This node represents a YAML map/object. The keys are represented using + // Field messages in data. + Empty yaml_map = 6; + + // This node represents a YAML array. The elements are represented using + // ArrayElement messages in data. + Empty yaml_array = 7; + + // This node represents a YAML primitive. + PrimitiveData yaml_primitive = 8; + } + + // Semantic classification of this node. + Class class = 13; + + // Semantic classification of a node. + enum Class { + CLASS_UNSPECIFIED = 0; + + // This node represents a data type. + CLASS_TYPE = 1; + + // This node represents an expression. + CLASS_EXPRESSION = 2; + + // This node represents a relation. + CLASS_RELATION = 3; + } + + // Optional brief description of the node. Should not contain newlines or + // other non-span formatting information. + Comment brief = 14; + + // Optional summary of the node. Unlike brief, this may contain + // paragraph-level formatting information. + Comment summary = 15; + + // For the following types of nodes, the validator will try to do type + // resolution: + // - type-like nodes resolve to said type; + // - expression-like nodes resolve to the type returned by the expression; + // - relation-like nodes resolve to the schema (as a named struct) returned + // by the relation. 
+  // This field will be populated for such nodes even if resolution fails, to
+  // indicate that there is supposed to be a type. In that case, the type kind
+  // will be set to "unresolved." The field will not be populated for nodes
+  // that don't have a logical Substrait type.
+  DataType data_type = 16;
+
+  // Data associated with this node.
+  repeated Data data = 31;
+
+  // Data associated with the node. Note that some variants are illegal based
+  // on the node type (for example, a primitive does not have fields, so it
+  // makes no sense for Field data to appear).
+  message Data {
+    oneof kind {
+      // Represents a child node in the tree.
+      Child child = 1;
+
+      // Represents a diagnostic message.
+      Diagnostic diagnostic = 2;
+
+      // Represents an (intermediate) data type.
+      DataType data_type = 3;
+
+      // Unstructured additional information about the node or something in it.
+      Comment comment = 4;
+    }
+  }
+
+  // Representation of a child node in the tree.
+  message Child {
+    // Path element identifying the relation of this child node to its
+    // parent.
+    Path.Element path = 1;
+
+    // The child node.
+    Node node = 2;
+
+    // Whether the validator recognized/expected the field or element that
+    // this child represents. Fields/elements may be unrecognized simply
+    // because validation is not implemented for them yet. In any case, this
+    // flag indicates that the subtree represented by this node could not be
+    // validated.
+    bool recognized = 3;
+  }
+
+  // Information about a protobuf message.
+  message ProtoMessage {
+    // The full protobuf path for the type, for example "substrait.Plan".
+    string path = 1;
+  }
+
+  // Information about a protobuf primitive.
+  message ProtoPrimitive {
+    // Logically compatible protobuf name of the primitive type, for example
+    // uint32 for any 32-bit unsigned data storage type.
+    string path = 1;
+
+    // Value of the primitive.
+    PrimitiveData data = 2;
+  }
+
+  // Information about the reference part of a reference/anchor pair.
+  message NodeReference {
+    // Integer value of the reference and anchor.
+    uint64 value = 1;
+
+    // Absolute path to the referenced node, i.e. the node containing the
+    // anchor field.
+    Path path = 2;
+  }
+
+  // Information about a reference to a YAML file.
+  message YamlReference {
+    // URI to the YAML file.
+    string uri = 1;
+  }
+
+  // Value for a primitive data element.
+  message PrimitiveData {
+    // Note: to represent a YAML null, this field is simply not populated.
+    oneof data {
+      bool boolean = 1;
+      uint64 unsigned = 2;
+      int64 signed = 3;
+      double real = 4;
+      string unicode = 5;
+      bytes binary = 6;
+      string variant = 7;
+      google.protobuf.Any any = 8;
+    }
+  }
+}
+
+// An absolute path to a node in the tree.
+message Path {
+  // Name of the root node. Currently always set to `plan`.
+  string root = 1;
+
+  // Elements of the path. The first element selects a child node of the root
+  // node, the second selects one of its children, etc.
+  repeated Element elements = 2;
+
+  // Path element structure.
+  message Element {
+    oneof kind {
+      Field field = 1;
+      RepeatedField repeated_field = 2;
+      OneOfField oneof_field = 3;
+      ArrayElement array_element = 4;
+    }
+  }
+
+  // Path element used for protobuf fields and YAML maps.
+  // Canonically represented as `.<field>` if field matches
+  // [a-zA-Z_][a-zA-Z0-9_]*, or as `."<field>"` using \\ and \" escape
+  // sequences if not (note that this can only happen for YAML map keys).
+  message Field {
+    string field = 1;
+  }
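+
+  // As an illustrative (hypothetical) example of the canonical notation, a
+  // complete path combining these element kinds might render as:
+  //
+  //   plan.relations[0].rel_type{read}.base_schema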
+
+  // Path element structure.
+  message Element {
+    oneof kind {
+      Field field = 1;
+      RepeatedField repeated_field = 2;
+      OneOfField oneof_field = 3;
+      ArrayElement array_element = 4;
+    }
+  }
+
+  // Path element used for protobuf fields and YAML maps.
+  // Canonically represented as `.<field>` if field matches
+  // [a-zA-Z_][a-zA-Z0-9_]*, or as `."<field>"` using \\ and \" escape
+  // sequences if not (note that this can only happen for YAML map keys).
+  message Field {
+    string field = 1;
+  }
+
+  // Path element used for protobuf repeated field elements.
+  // Canonically represented as `.<field>[<index>]`.
+  message RepeatedField {
+    string field = 1;
+    uint64 index = 2;
+  }
+
+  // Path element used for protobuf oneof fields.
+  // Canonically represented as `.<field>{<variant>}`.
+  message OneOfField {
+    string field = 1;
+    string variant = 2;
+  }
+
+  // Path element used for YAML arrays.
+  // Canonically represented as `[<index>]`.
+  message ArrayElement {
+    uint64 index = 2;
+  }
+}
+
+// Representation of a diagnostic message.
+message Diagnostic {
+  // The original error level/severity for this diagnostic.
+  Level original_level = 1;
+
+  // The machine-readable cause code for this diagnostic.
+  uint32 cause = 3;
+
+  // The error level/severity for this diagnostic after adjustment according
+  // to the validator configuration.
+  Level adjusted_level = 2;
+
+  // The human-readable message for this diagnostic.
+  string msg = 4;
+
+  // A path associated with this diagnostic. This is usually the path for
+  // the node it is associated with, but not necessarily: for example, a
+  // diagnostic message relating to a duplicate definition may refer back
+  // to the previous or first definition.
+  Path path = 5;
+
+  // Error level.
+  enum Level {
+    LEVEL_UNSPECIFIED = 0;
+
+    // Information diagnostic. Has no bearing on the validity of the plan.
+    LEVEL_INFO = 1;
+
+    // Warning diagnostic. The presence of warning diagnostics indicates
+    // that the plan may or may not be valid, for example because the
+    // validator was unable to access a referenced YAML file, or because
+    // enhancements using protobuf's Any type were used.
+    LEVEL_WARNING = 2;
+
+    // Error diagnostic. The presence of error diagnostics indicates that
+    // the plan is invalid.
+    LEVEL_ERROR = 3;
+  }
+}
+
+// Representation of a comment made by the validator that is only intended
+// to be interpreted by people.
+message Comment {
+  // Comments consist of one or more "elements," defining formatting
+  // information.
+  repeated Element elements = 1;
+
+  // A comment element.
+  message Element {
+    oneof kind {
+      // A span of text.
+      Span span = 1;
+
+      // A newline, i.e. the next span should start on the next line.
+      Empty new_line = 2;
+
+      // Opens a new unordered list. The next span is the start of the text for
+      // the next item. list_next elements are used to advance to the next list
+      // item; newlines can be used to add paragraphs without bullet points.
+      // Each list_open should be matched with a list_close. Lists may be
+      // nested.
+      Empty list_open = 3;
+
+      // Advances to the next list item.
+      Empty list_next = 4;
+
+      // Closes the current list.
+      Empty list_close = 5;
+    }
+  }
+
+  // A span of text.
+  message Span {
+    // Text for this span. Should not include newlines.
+    string text = 1;
+
+    // Specified if this span of text should link to something.
+    oneof link {
+      // Link to a path in the tree.
+      Path path = 2;
+
+      // Link to a web page.
+      string url = 3;
+    }
+  }
+}
+
+// Representation of a resolved data type.
+message DataType {
+  // Type class.
+  Class class = 1;
+
+  // Nullability.
+  bool nullable = 8;
+
+  // Type variation, if any.
+  oneof variation {
+    UserDefinedVariation user_defined_variation = 9;
+    Empty unresolved_variation = 15;
+  }
+
+  // Type parameters for non-simple types.
+  repeated Parameter parameters = 16;
+
+  // A type class.
+  message Class {
+    oneof kind {
+      Simple simple = 1;
+      Compound compound = 2;
+      UserDefinedType user_defined_type = 3;
+      Empty unresolved_type = 7;
+    }
+  }
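+
+  // For example, a VARCHAR<40> would be represented with class set to
+  // COMPOUND_VAR_CHAR and a single unsigned integer parameter of 40
+  // (illustrative; see Parameter below).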
+
+  // Enumeration of simple types. Message numbers correspond to the ones in
+  // substrait.Type. Note that UNSPECIFIED should never be emitted by the
+  // validator.
+  enum Simple {
+    SIMPLE_UNSPECIFIED = 0;
+    SIMPLE_BOOLEAN = 1;
+    SIMPLE_I8 = 2;
+    SIMPLE_I16 = 3;
+    SIMPLE_I32 = 5;
+    SIMPLE_I64 = 7;
+    SIMPLE_FP32 = 10;
+    SIMPLE_FP64 = 11;
+    SIMPLE_STRING = 12;
+    SIMPLE_BINARY = 13;
+    SIMPLE_TIMESTAMP = 14;
+    SIMPLE_DATE = 16;
+    SIMPLE_TIME = 17;
+    SIMPLE_INTERVAL_YEAR = 19;
+    SIMPLE_INTERVAL_DAY = 20;
+    SIMPLE_TIMESTAMP_TZ = 29;
+    SIMPLE_UUID = 32;
+  }
+
+  // Enumeration of compound types. Message numbers correspond to the ones in
+  // substrait.Type. Note that UNSPECIFIED should never be emitted by the
+  // validator.
+  enum Compound {
+    COMPOUND_UNSPECIFIED = 0;
+    COMPOUND_FIXED_CHAR = 21;
+    COMPOUND_VAR_CHAR = 22;
+    COMPOUND_FIXED_BINARY = 23;
+    COMPOUND_DECIMAL = 24;
+    COMPOUND_STRUCT = 25;
+    COMPOUND_NAMED_STRUCT = 26;
+    COMPOUND_LIST = 27;
+    COMPOUND_MAP = 28;
+  }
+
+  // Information about a user-defined type.
+  message UserDefinedType {
+    // URI of the YAML file that the type is (supposed to be) defined in, if
+    // known.
+    string uri = 1;
+
+    // Name of the type within the scope of that YAML file.
+    string name = 2;
+
+    // Type definition information from the YAML file, if resolution
+    // succeeded.
+    Definition definition = 3;
+
+    // Type definition information from a YAML file for a user-defined type.
+    message Definition {
+      // The primitive structure of the type.
+      repeated Element structure = 1;
+    }
+
+    // Primitive structure element for a user-defined type.
+    message Element {
+      // Name of the element.
+      string name = 1;
+
+      // Type of data.
+      Simple kind = 2;
+    }
+  }
+
+  // Information about a type variation.
+  message UserDefinedVariation {
+    // URI of the YAML file that the type variation is (supposed to be) defined
+    // in, if known.
+    string uri = 1;
+
+    // Name of the type variation within the scope of that YAML file.
+    string name = 2;
+
+    // Type variation definition information from the YAML file, if resolution
+    // succeeded.
+    Definition definition = 3;
+
+    // Definition information from a YAML file for a user-defined type
+    // variation.
+    message Definition {
+      // Base type.
+      oneof base_type {
+        Class physical = 1;
+        UserDefinedVariation logical = 2;
+        Empty unresolved = 7;
+      }
+
+      // Function behavior for this type variation.
+      FunctionBehavior function_behavior = 8;
+    }
+
+    // Function behavior for a type variation.
+    enum FunctionBehavior {
+      FUNCTION_BEHAVIOR_UNSPECIFIED = 0;
+      FUNCTION_BEHAVIOR_INHERITS = 1;
+      FUNCTION_BEHAVIOR_SEPARATE = 2;
+    }
+  }
+
+  // Type parameter.
+  message Parameter {
+    // Type of parameter.
+    oneof kind {
+      // Anonymous data type parameter, for example the T in LIST<T>.
+      DataType data_type = 1;
+
+      // Named data type parameter, for example N:T in NSTRUCT<N:T>.
+      Named named_type = 2;
+
+      // Unsigned integer parameter, for example the L in VARCHAR<L>.
+      uint64 unsigned = 3;
+    }
+  }
+
+  // A named type, used for NSTRUCT (meta)types.
+  message Named {
+    // Name of the struct element.
+    string name = 1;
+
+    // Data type of the struct element.
+    DataType data_type = 2;
+  }
+}
+
+// Used for oneof field variants that have no data associated with them.
+message Empty {}
diff --git a/py/.gitignore b/py/.gitignore
new file mode 100644
index 00000000..a0bd3051
--- /dev/null
+++ b/py/.gitignore
@@ -0,0 +1,79 @@
+/target
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+.pytest_cache/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+.venv/
+env/
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+include/
+man/
+venv/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+pip-selfcheck.json
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# Rope
+.ropeproject
+
+# Django stuff:
+*.log
+*.pot
+
+.DS_Store
+
+# Sphinx documentation
+docs/_build/
+
+# PyCharm
+.idea/
+
+# VSCode
+.vscode/
+
+# Pyenv
+.python-version
+
+# Generated files
+/text/
+/proto/
+/LICENSE
+/protoc_out/
+/substrait_validator/substrait/
diff --git a/py/Cargo.toml b/py/Cargo.toml
new file mode 100644
index 00000000..7bd45c67
--- /dev/null
+++ b/py/Cargo.toml
@@ -0,0 +1,38 @@
+[package]
+name = "substrait-validator-py"
+version = "0.0.1"
+edition = "2018"
+license = "Apache-2.0"
+include = [
+    "/LICENSE",
+    "/README.md",
+    "/build.rs",
+    "/pyproject.toml",
+    "/proto",
+    "/text",
+    "/src",
+    "/substrait_validator/*.py",
+    "/substrait_validator_build/*.py",
+    "/tests"
+]
+
+[lib]
+crate-type = ["cdylib"]
+
+# NOTE: we need to set the crate name to substrait_validator, because maturin
+# seems to insist on also using this name for the Python module, and it'd be
+# rather unfortunate if the Python module would need to be named
+# substrait_validator_py...
+name = "substrait_validator"
+
+# cargo doc especially can't deal with name collisions.
+doc = false
+
+[dependencies]
+substrait-validator = { path = "../rs", version = "0.0.1" }
+pyo3 = { version = "0.15.1", features = ["extension-module"] }
+
+[build-dependencies]
+prost-build = "0.9"
+walkdir = "2"
+dunce = "1"
diff --git a/py/README.md b/py/README.md
new file mode 100644
index 00000000..9f624c8e
--- /dev/null
+++ b/py/README.md
@@ -0,0 +1,64 @@
+# Python bindings for validator
+
+This directory contains a Rust/PyO3 project to generate Python bindings for the
+validator library.
+
+## Installation
+
+No wheels are published yet, so you have to build manually. Running something
+along the lines of `pip install .` should work. You should only need to have a
+[rust](https://www.rust-lang.org/tools/install) compiler installed.
+
+If you want to do an editable install, you must run
+`./prepare_build.py populate` first.
+
+## Building wheels and source distributions
+
+You can build wheels and source distributions using
+[maturin](https://github.com/PyO3/maturin), specifically using the `build` and
+`sdist` commands. However, before you can do this, you must run
+`./prepare_build.py populate`. This makes local copies of some files in the
+repository that live outside of this subdirectory, such as the protobuf
+description files. When you use `pip` or some other tool based on
+`pyproject.toml`, this will be done automatically via build system hooks, but
+unfortunately maturin doesn't itself provide hooks with which this can be
+automated.
+
+## Running tests
+
+You can test the module using `pytest` after you install it.
+
+## Command-line usage
+
+The module exposes a command-line program named `substrait-validator` for
+running the validator manually. You can also use the tool to convert between
+various serialization formats of the `substrait.Plan` message. Run
+`substrait-validator --help` for more information.
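+
+For example, a couple of typical invocations might look like this (the file
+names are illustrative):
+
+```
+# Validate a binary plan, writing diagnostics to stderr:
+substrait-validator plan.proto
+
+# Convert a JSON plan to its binary protobuf representation:
+substrait-validator plan.json --mode convert -O plan.proto
+```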
+
+## Library usage
+
+The library essentially provides a bunch of type conversion functions at
+module scope to convert between the various representations of a Substrait
+plan, including the result of the validator. The most important functions are
+arguably `check_plan_valid(plan, config=None)` and
+`check_plan_not_invalid(plan, config=None)`, which run validation on the given
+plan and, if the plan is not strictly or loosely valid respectively, throw a
+Python exception corresponding to the first diagnostic of the highest severity
+returned by the validator. That is, `check_plan_valid` will throw an exception
+if the plan could not be proven to be valid, while `check_plan_not_invalid`
+will only throw if it could be proven to be invalid.
+
+The `plan` argument can be a number of things:
+
+ - `bytes`: treated as a binary serialization of `substrait.Plan`.
+ - `str`: treated as a protobuf JSON serialization of `substrait.Plan`.
+ - `dict`: treated as the above using Python's data model (JSON objects map
+   to `dict`s, JSON arrays map to `list`s).
+ - `substrait_validator.substrait.Plan`: a previously deserialized plan.
+ - `substrait_validator.ResultHandle`: a previously validated plan.
+
+`config` can be `None`/unspecified, or can be set to a
+`substrait_validator.Config` object to configure the validator with.
+
+For more information, use Python's `help()` function.
diff --git a/py/build.rs b/py/build.rs
new file mode 100644
index 00000000..9036d873
--- /dev/null
+++ b/py/build.rs
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: Apache-2.0
+
+use std::collections::HashSet;
+use std::env;
+use std::ffi::OsStr;
+use std::ffi::OsString;
+use std::fs;
+use std::io::BufRead;
+use std::io::BufReader;
+use std::io::Write;
+use std::path::PathBuf;
+use std::process::Command;
+use walkdir::WalkDir;
+
+fn main() {
+    // Directory that the proto files are stored in. If the local_dependencies
+    // directory exists, we're building from an sdist package, in which case
+    // the proto files should have been copied to a local directory.
+    let input_paths = if std::path::Path::new("local_dependencies").exists() {
+        vec!["proto"]
+    } else {
+        vec!["../proto", "../substrait/proto"]
+    };
+
+    // Ensure the above paths are relative to the Cargo.toml directory.
+    let pwd = env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR");
+    let input_paths = input_paths.iter().map(|p| PathBuf::from(&pwd).join(p));
+
+    // Output directory for protoc. This is a temporary directory: it will be
+    // completely deleted and then reconstructed. Afterward, the build script
+    // will patch the files in here and then move them to python_out.
+    let intermediate_path = "protoc_out";
+
+    // Where the final Python files will be moved to.
+    let output_path = "substrait_validator";
+
+    // The Python module prefix to patch into use statements of the files
+    // generated by protobuf.
+    let python_prefix = "substrait_validator.";
+
+    // Canonicalize all paths to prevent ambiguity.
+    let input_paths = input_paths
+        .map(|p| dunce::canonicalize(p).unwrap())
+        .collect::<Vec<_>>();
+    let workdir = std::env::current_dir().unwrap();
+    let intermediate_path = workdir.join(intermediate_path);
+    let output_path = workdir.join(output_path);
+
+    // Gather all .proto files.
+    let proto_files = input_paths
+        .iter()
+        .flat_map(|p| {
+            WalkDir::new(&p)
+                .into_iter()
+                .filter_map(|e| e.ok())
+                .filter(|e| {
+                    e.path().extension() == Some(OsStr::new("proto"))
+                        && e.metadata().unwrap().is_file()
+                })
+                .map(|e| dunce::canonicalize(e.into_path()).unwrap())
+        })
+        .collect::<Vec<_>>();
+
+    // Inform cargo that changes to the .proto files require a rerun.
+    for path in &proto_files {
+        println!("cargo:rerun-if-changed={}", path.display());
+    }
+
+    // Clean and recreate output directory.
+    fs::remove_dir_all(&intermediate_path).ok();
+    fs::create_dir_all(&intermediate_path).expect("failed to create protoc output directory");
+
+    // Run protoc.
+    let mut cmd = Command::new(prost_build::protoc());
+    for input_path in input_paths.iter() {
+        let mut proto_path_arg = OsString::new();
+        proto_path_arg.push("--proto_path=");
+        proto_path_arg.push(&input_path);
+        cmd.arg(proto_path_arg);
+    }
+    let mut python_out_arg = OsString::new();
+    python_out_arg.push("--python_out=");
+    python_out_arg.push(&intermediate_path);
+    cmd.arg(python_out_arg);
+    cmd.args(proto_files.iter());
+    let output = cmd.output().expect("failed to run protoc");
+    if !output.status.success() {
+        eprintln!("cmd: {:?}", cmd.get_program());
+        for arg in cmd.get_args() {
+            eprintln!("arg: {:?}", arg);
+        }
+        panic!("{:?}", output);
+    }
+
+    // Gather all Python files generated by protoc.
+    let intermediate_files: Vec<_> = WalkDir::new(&intermediate_path)
+        .into_iter()
+        .filter_map(|e| e.ok())
+        .filter(|e| {
+            e.path().extension() == Some(OsStr::new("py")) && e.metadata().unwrap().is_file()
+        })
+        .map(|e| dunce::canonicalize(e.into_path()).unwrap())
+        .collect();
+
+    // Patch the files.
+    let mut output_dirs = HashSet::new();
+    for intermediate_file in intermediate_files {
+        // Determine the output filename.
+        let output_file = output_path.join(
+            intermediate_file
+                .strip_prefix(&intermediate_path)
+                .expect("intermediate file is not based in the expected directory"),
+        );
+
+        // Determine the output directory, ensure that it exists, and create
+        // an __init__.py for it if we haven't already.
+        let mut path = output_file.to_path_buf();
+        path.pop();
+        if output_dirs.insert(path.clone()) {
+            fs::create_dir_all(&path).expect("failed to create output directory");
+            path.push("__init__.py");
+            fs::File::create(path).expect("failed to create __init__.py");
+        }
+
+        // Copy and patch the file.
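+        // For example, a protoc-generated line such as
+        //   `from substrait.validator import ...`
+        // is rewritten below to
+        //   `from substrait_validator.substrait.validator import ...`
+        // so that the generated modules resolve within this package
+        // (illustrative; `from google ...` imports are deliberately left
+        // untouched).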
+ let intermediate = + fs::File::open(&intermediate_file).expect("failed to open intermediate file"); + let mut output = fs::File::create(&output_file).expect("failed to create output file"); + for line in BufReader::new(intermediate).lines() { + let line = line.expect("failed to read from intermediate file"); + let line = if line.starts_with("from ") && !line.starts_with("from google") { + format!("from {}{}", python_prefix, &line[5..]) + } else { + line + }; + writeln!(output, "{}", line).unwrap(); + } + } +} diff --git a/py/prepare_build.py b/py/prepare_build.py new file mode 100755 index 00000000..18b91b89 --- /dev/null +++ b/py/prepare_build.py @@ -0,0 +1,34 @@ +#!/usr/bin/python +# SPDX-License-Identifier: Apache-2.0 + +import os +import sys +import substrait_validator_build + + +def eprint(*args): + print(*args, file=sys.stderr) + + +if __name__ == "__main__": + if len(sys.argv) != 2: + eprint("Usage: {} [populate|clean]".format(sys.argv[0])) + eprint() + eprint( + "Populates or removes local copies of Substrait files needed for the build" + ) + eprint("that are stored outside of this subdirectory.") + sys.exit(1) + + if sys.argv[1] == "populate": + os.chdir(os.path.dirname(os.path.abspath(__file__))) + substrait_validator_build.populate() + sys.exit(0) + + if sys.argv[1] == "clean": + os.chdir(os.path.dirname(os.path.abspath(__file__))) + substrait_validator_build.clean() + sys.exit(0) + + eprint("Unknown command: {}".format(sys.argv[1])) + sys.exit(1) diff --git a/py/pyproject.toml b/py/pyproject.toml new file mode 100644 index 00000000..8481d2c7 --- /dev/null +++ b/py/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["maturin>=0.12,<0.13"] +build-backend = "substrait_validator_build" +backend-path = ["."] + +[project] +name = "substrait-validator" +version = "0.0.1" +description = "Validator for Substrait query plans" +readme = "README.md" +license = {file = "LICENSE"} +keywords = ["substrait"] +requires-python = ">=3.6" +classifiers = [ + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "protobuf > 3.19.3", + "click >= 8", + "pyyaml >= 6", + "jdot >= 0.5" +] + +[project.optional-dependencies] +test = [ + "pytest < 5.0.0", +] + +[project.urls] +homepage = "https://substrait.io/" +repository = "https://github.com/substrait-io/substrait-validator" + +[project.scripts] +substrait-validator = "substrait_validator:cli" diff --git a/py/src/lib.rs b/py/src/lib.rs new file mode 100644 index 00000000..20852b4a --- /dev/null +++ b/py/src/lib.rs @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: Apache-2.0 + +// This happens in PyO3 generated code, and there doesn't seem to be a more +// narrow scope that this can be disabled in (clippy seems a bit confused about +// the code causing the warning, in general). +#![allow(clippy::needless_option_as_deref)] + +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyDict, PyTuple}; + +/// Represents a validator/parser configuration. 
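+///
+/// Illustrative Python-side usage, via the wrapper class defined in
+/// `substrait_validator/__init__.py` (the `plan` value is a placeholder):
+///
+///     cfg = substrait_validator.Config()
+///     cfg.ignore_unknown_fields()
+///     substrait_validator.check_plan_valid(plan, cfg)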
+#[pyclass]
+struct Config {
+    config: substrait_validator::Config,
+}
+
+#[pymethods]
+impl Config {
+    #[new]
+    pub fn new() -> Self {
+        Config {
+            config: substrait_validator::Config::new(),
+        }
+    }
+
+    /// Instructs the validator to ignore protobuf fields that it doesn't know
+    /// about yet (i.e., that have been added to the Substrait protobuf
+    /// descriptions, but haven't yet been implemented in the validator) if the
+    /// fields are set to their default value. If this option isn't set, or if
+    /// an unknown field is not set to its default value, a warning is emitted.
+    pub fn ignore_unknown_fields(&mut self) {
+        self.config.ignore_unknown_fields = true;
+    }
+
+    /// Explicitly allows a protobuf message type to be used in advanced
+    /// extensions, despite the fact that the validator can't validate it. If
+    /// an advanced extension is encountered that isn't explicitly allowed, a
+    /// warning is emitted. The pattern may include * and ? wildcards for
+    /// glob-like matching (see
+    /// https://docs.rs/glob/latest/glob/struct.Pattern.html for the complete
+    /// syntax).
+    pub fn allow_proto_any_url(&mut self, pattern: &str) -> PyResult<()> {
+        let pattern = match substrait_validator::Pattern::new(pattern) {
+            Ok(p) => p,
+            Err(e) => {
+                return Err(PyValueError::new_err(format!(
+                    "invalid pattern {pattern:?}: {e}"
+                )));
+            }
+        };
+        self.config.allow_proto_any_url(pattern);
+        Ok(())
+    }
+
+    /// Sets a minimum and/or maximum error level for the given class of
+    /// diagnostic messages. Any previous settings for this class are
+    /// overridden.
+    pub fn override_diagnostic_level(
+        &mut self,
+        class: u32,
+        minimum: &str,
+        maximum: &str,
+    ) -> PyResult<()> {
+        fn str_to_level(level: &str) -> PyResult<substrait_validator::Level> {
+            match level {
+                "info" => Ok(substrait_validator::Level::Info),
+                "warning" => Ok(substrait_validator::Level::Warning),
+                "error" => Ok(substrait_validator::Level::Error),
+                level => Err(PyValueError::new_err(format!(
+                    "invalid level {level:?}; must be \"info\", \"warning\", or \"error\""
+                ))),
+            }
+        }
+        let class = match substrait_validator::Classification::from_code(class) {
+            Some(c) => c,
+            None => {
+                return Err(PyValueError::new_err(format!(
+                    "unknown diagnostic class {class}"
+                )))
+            }
+        };
+        let minimum = str_to_level(minimum)?;
+        let maximum = str_to_level(maximum)?;
+        self.config
+            .override_diagnostic_level(class, minimum, maximum);
+        Ok(())
+    }
+
+    /// Overrides the resolution behavior for (YAML) URIs matching the given
+    /// pattern. The pattern may include * and ? wildcards for glob-like
+    /// matching (see https://docs.rs/glob/latest/glob/struct.Pattern.html
+    /// for the complete syntax). If resolve_as is None, the URI will not
+    /// be resolved; otherwise it should be a string representing the URI it
+    /// should be resolved as.
+    pub fn override_uri(&mut self, pattern: &str, resolve_as: Option<&str>) -> PyResult<()> {
+        let pattern = match substrait_validator::Pattern::new(pattern) {
+            Ok(p) => p,
+            Err(e) => {
+                return Err(PyValueError::new_err(format!(
+                    "invalid pattern {pattern:?}: {e}"
+                )));
+            }
+        };
+        self.config.override_uri(pattern, resolve_as);
+        Ok(())
+    }
+
+    /// Registers a URI resolution function with this configuration. If
+    /// the given function fails, any previously registered function will be
+    /// used as a fallback. The callback function must take a single string
+    /// argument and return a bytes object, or throw an exception on failure.
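+    ///
+    /// For example, the Python wrapper's add_urllib_resolver() effectively
+    /// registers:
+    ///
+    ///     cfg.add_uri_resolver(lambda uri: urllib.request.urlopen(uri).read())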
+    pub fn add_uri_resolver(&mut self, callback: PyObject) {
+        self.config
+            .add_uri_resolver(move |uri| -> Result<Vec<u8>, PyErr> {
+                pyo3::Python::with_gil(|py| {
+                    Ok(callback
+                        .call1(py, (uri,))?
+                        .as_ref(py)
+                        .downcast::<PyBytes>()?
+                        .as_bytes()
+                        .to_owned())
+                })
+            })
+    }
+}
+
+/// Represents a Substrait plan parse tree, as parsed by the validator.
+///
+/// To construct a parse tree (and in doing so, validate the Substrait plan),
+/// simply pass a bytes object containing the substrait.Plan message to the
+/// constructor. Note that this "never fails": any failures to parse the
+/// bytes object will be embedded as diagnostics in the ResultHandle object.
+/// This allows multiple error messages to be contained within the object. Use
+/// check(), check_valid(), or check_not_invalid() to check validity.
+#[pyclass]
+struct ResultHandle {
+    root: substrait_validator::ParseResult,
+}
+
+#[pymethods]
+impl ResultHandle {
+    #[new]
+    pub fn new(data: &[u8], config: Option<&Config>) -> Self {
+        Self {
+            root: if let Some(config) = config {
+                substrait_validator::parse(data, &config.config)
+            } else {
+                substrait_validator::parse(data, &substrait_validator::Config::default())
+            },
+        }
+    }
+
+    /// Checks the validity of the plan passed to this ParseResult during
+    /// construction. Returns -1 for invalid plans, 0 for possibly valid
+    /// plans (i.e. the validator was unable to prove validity either way),
+    /// or 1 for valid plans.
+    pub fn check(&self) -> i32 {
+        match self.root.check() {
+            substrait_validator::Validity::Valid => 1,
+            substrait_validator::Validity::MaybeValid => 0,
+            substrait_validator::Validity::Invalid => -1,
+        }
+    }
+
+    /// Throws a ValueError exception containing the first error or warning
+    /// encountered in the plan if the plan was not proven to be valid by the
+    /// validator.
+    pub fn check_valid(&self) -> PyResult<()> {
+        if let Some(diag) = self.root.get_diagnostic() {
+            if diag.adjusted_level >= substrait_validator::Level::Warning {
+                return Err(PyValueError::new_err(diag.to_string()));
+            }
+        }
+        Ok(())
+    }
+
+    /// Throws a ValueError exception containing the first error encountered
+    /// in the plan if the plan was proven to be invalid by the validator.
+    pub fn check_not_invalid(&self) -> PyResult<()> {
+        if let Some(diag) = self.root.get_diagnostic() {
+            if diag.adjusted_level >= substrait_validator::Level::Error {
+                return Err(PyValueError::new_err(diag.to_string()));
+            }
+        }
+        Ok(())
+    }
+
+    /// Exports all diagnostic messages contained in this parse result as a
+    /// multiline string.
+    pub fn export_diagnostics(&self) -> PyResult<String> {
+        let mut result: Vec<u8> = vec![];
+        self.root.export(
+            &mut result,
+            substrait_validator::export::Format::Diagnostics,
+        )?;
+        let result = String::from_utf8(result)?;
+        Ok(result)
+    }
+
+    /// Exports the parse tree as an HTML multiline string, intended for
+    /// debugging.
+    pub fn export_html(&self) -> PyResult<String> {
+        let mut result: Vec<u8> = vec![];
+        self.root
+            .export(&mut result, substrait_validator::export::Format::Html)?;
+        let result = String::from_utf8(result)?;
+        Ok(result)
+    }
+
+    /// Exports the entire parse tree as a substrait.validator.ParseResult
+    /// protobuf message, using binary serialization.
+    pub fn export_proto(&self, py: Python) -> PyResult<PyObject> {
+        let mut result = vec![];
+        self.root
+            .export(&mut result, substrait_validator::export::Format::Proto)?;
+        let result = PyBytes::new(py, &result).into();
+        Ok(result)
+    }
+}
+
+/// Rust-native module for the validator.
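+///
+/// The `substrait_validator` Python package wraps this module in its
+/// `__init__.py`, re-exporting ResultHandle and wrapping Config with a more
+/// Pythonic interface.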
+#[pymodule]
+fn substrait_validator(_py: Python, m: &PyModule) -> PyResult<()> {
+    /// Returns a dictionary mapping all diagnostic codes currently defined
+    /// to three-tuples consisting of:
+    /// - the name of the diagnostic as a str;
+    /// - its description as a str; and
+    /// - the diagnostic code of its parent as an integer, or None for code 0.
+    #[pyfn(m)]
+    #[pyo3(name = "get_diagnostic_codes")]
+    fn get_diagnostic_codes_py(py: Python) -> PyResult<PyObject> {
+        let dict = PyDict::new(py);
+        for class in substrait_validator::iter_diagnostics() {
+            dict.set_item(
+                class.code(),
+                PyTuple::new(
+                    py,
+                    [
+                        class.name().to_object(py),
+                        class.description().to_object(py),
+                        if class.code() == 0 {
+                            py.None()
+                        } else {
+                            substrait_validator::Classification::parent(class.code()).to_object(py)
+                        },
+                    ],
+                ),
+            )?;
+        }
+        Ok(dict.into())
+    }
+
+    m.add_class::<Config>()?;
+    m.add_class::<ResultHandle>()?;
+    Ok(())
+}
diff --git a/py/substrait_validator/__init__.py b/py/substrait_validator/__init__.py
new file mode 100644
index 00000000..6645fba6
--- /dev/null
+++ b/py/substrait_validator/__init__.py
@@ -0,0 +1,748 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import sys
+import json
+import jdot
+import yaml
+import click
+import urllib.request
+from io import BytesIO
+from typing import Iterable
+from google.protobuf import json_format
+from google.protobuf.message import DecodeError as ProtoDecodeError
+from .substrait_validator import ResultHandle, Config as _Config, get_diagnostic_codes
+from .substrait.plan_pb2 import Plan
+from .substrait.validator.validator_pb2 import ParseResult, Diagnostic, Path
+
+
+_JDOT_MACROS = """@macros
+.field .selection { .directReference .structField .field ?v .rootReference {} }
+.field0 .selection { .directReference .structField {} .rootReference {} }
+.nullable .nullability "NULLABILITY_NULLABLE"
+.required .nullability "NULLABILITY_REQUIRED"
+
+@output
+"""
+
+
+def _jdot_coder() -> jdot.JdotCoder:
+    coder = jdot.JdotCoder()
+    coder.decode(_JDOT_MACROS)
+    return coder
+
+
+def _jdot_dumps(data) -> str:
+    return _JDOT_MACROS + _jdot_coder().encode(
+        data, formatter=jdot.formatter.JdotFormatter()
+    )
+
+
+def _jdot_loads(data: str):
+    return _jdot_coder().decode(data)
+
+
+def _populate_config(cfg):
+    """We can't derive from _Config to add the add_urllib_resolver() function,
+    so we'll just have to monkey-patch it."""
+
+    def generate_method(cls, name, fn):
+        def x(self, *args, **kwargs):
+            return fn(self._config, *args, **kwargs)
+
+        x.__name__ = name
+        x.__doc__ = fn.__doc__
+        setattr(cls, name, x)
+
+    for name in dir(_Config):
+        if name.startswith("_"):
+            continue
+        f = getattr(_Config, name)
+        if not callable(f):
+            continue
+        generate_method(cfg, name, f)
+    cfg.__doc__ = _Config.__doc__
+    return cfg
+
+
+@_populate_config
+class Config:
+    def __init__(self):
+        self._config = _Config()
+
+    @staticmethod
+    def _unwrap(config):
+        if isinstance(config, Config):
+            return config._config
+        elif isinstance(config, _Config):
+            return config
+        elif config is None:
+            return None
+        else:
+            raise TypeError("unsupported type: {}".format(type(config)))
+
+    def add_urllib_resolver(self):
+        """Adds a URI resolver based on urllib."""
+
+        def urllib_resolver(uri):
+            return urllib.request.urlopen(uri).read()
+
+        self._config.add_uri_resolver(urllib_resolver)
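+
+
+# Illustrative example: configure URI resolution, then validate a plan
+# (`plan_bytes` stands in for a serialized substrait.Plan):
+#
+#   cfg = Config()
+#   cfg.add_urllib_resolver()
+#   check_plan_valid(plan_bytes, cfg)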
+
+
+def load_plan_from_proto(data: bytes) -> Plan:
+    """Load a Substrait plan from its protobuf serialization."""
+    if not isinstance(data, bytes):
+        raise TypeError("unsupported type: {}".format(type(data)))
+    plan = Plan()
+    plan.ParseFromString(data)
+    return plan
+
+
+def load_plan_from_json(data: str) -> Plan:
+    """Load a Substrait plan from its JSON string representation."""
+    if not isinstance(data, str):
+        raise TypeError("unsupported type: {}".format(type(data)))
+    return json_format.Parse(data, Plan())
+
+
+def load_plan_from_dict(data: dict) -> Plan:
+    """Load a Substrait plan from its Python object JSON representation."""
+    if not isinstance(data, dict):
+        raise TypeError("unsupported type: {}".format(type(data)))
+    return load_plan_from_json(json.dumps(data))
+
+
+def load_plan_from_yaml(data: str) -> Plan:
+    """Load a Substrait plan from YAML data mimicking the structure of
+    its JSON string representation."""
+    if not isinstance(data, str):
+        raise TypeError("unsupported type: {}".format(type(data)))
+    return load_plan_from_dict(yaml.safe_load(data))
+
+
+def load_plan_from_jdot(data: str) -> Plan:
+    """Load a Substrait plan from JDOT data mimicking the structure of
+    its JSON string representation."""
+    if not isinstance(data, str):
+        raise TypeError("unsupported type: {}".format(type(data)))
+    return load_plan_from_dict(_jdot_loads(data))
+
+
+def load_plan(data) -> Plan:
+    """Loads a plan from its binary protobuf serialization (bytes input),
+    a JSON string (string input), or a dictionary representation of such a
+    JSON string (dict input). If data is already a Plan, this function is a
+    no-op and simply returns its input."""
+    if isinstance(data, Plan):
+        return data
+    elif isinstance(data, bytes):
+        return load_plan_from_proto(data)
+    elif isinstance(data, str):
+        return load_plan_from_json(data)
+    elif isinstance(data, dict):
+        return load_plan_from_dict(data)
+    else:
+        raise TypeError("unsupported type: {}".format(type(data)))
+
+
+def parse_plan(plan, config=None) -> ParseResult:
+    """Parses the given plan with the validator. plan can be anything
+    supported by load_plan(), a Plan object, or a ResultHandle object. This is
+    just an alternate name for plan_to_parse_result()."""
+    return plan_to_parse_result(plan, config)
+
+
+def plan_to_proto(plan) -> bytes:
+    """Converts a plan to its binary protobuf serialization. plan can be
+    anything supported by load_plan()."""
+    return load_plan(plan).SerializeToString()
+
+
+def plan_to_json(plan) -> str:
+    """Converts a plan to its JSON serialization, returned as a string. plan
+    can be anything supported by load_plan()."""
+    return json_format.MessageToJson(load_plan(plan))
+
+
+def plan_to_dict(plan) -> dict:
+    """Converts a plan to its JSON serialization, returned as a dict. plan can
+    be anything supported by load_plan()."""
+    return json_format.MessageToDict(load_plan(plan))
+
+
+def plan_to_yaml(plan) -> str:
+    """Converts a plan to the YAML equivalent of its JSON serialization,
+    returned as a string. plan can be anything supported by load_plan()."""
+    return yaml.safe_dump(plan_to_dict(plan))
+
+
+def plan_to_jdot(plan) -> str:
+    """Converts a plan to the JDOT equivalent of its JSON serialization,
+    returned as a string. plan can be anything supported by load_plan()."""
+    return _jdot_dumps(plan_to_dict(plan))
+
+
+def plan_to_result_handle(plan, config=None) -> ResultHandle:
+    """Parses a Substrait plan using the validator, and returns its result
+    handle object. plan can be anything supported by load_plan(). If the
+    input is already a ResultHandle, it is returned as-is."""
+    if isinstance(plan, ResultHandle):
+        return plan
+    if isinstance(plan, bytes):
+        data = plan
+    else:
+        data = plan_to_proto(plan)
+    return ResultHandle(data, Config._unwrap(config))
+
+
+def plan_to_parse_result(plan, config=None) -> ParseResult:
+    """Parses the given plan with the validator, and returns its parse result.
+    plan can be anything supported by load_plan(), a Plan object, or a
+    ResultHandle object."""
+    result = ParseResult()
+    result.ParseFromString(plan_to_parse_result_proto(plan, config))
+    return result
+
+
+def plan_to_parse_result_proto(plan, config=None) -> bytes:
+    """Same as parse_plan(), but returns the binary serialization of the
+    parse result. This is faster if you don't plan to use the serialization
+    from Python."""
+    return plan_to_result_handle(plan, config).export_proto()
+
+
+def plan_to_diagnostics(plan, config=None) -> Iterable[Diagnostic]:
+    """Converts a plan to an iterable of Diagnostics. plan can be anything
+    supported by plan_to_result_handle()."""
+
+    def walk(node):
+        for data in node.data:
+            if data.HasField("child"):
+                for diagnostic in walk(data.child.node):
+                    yield diagnostic
+            elif data.HasField("diagnostic"):
+                yield data.diagnostic
+
+    return walk(plan_to_parse_result(plan, config).root)
+
+
+def plan_to_diagnostics_str(plan, config=None) -> str:
+    """Converts a plan to a multiline string representing the diagnostic
+    messages returned by the validator for that plan. plan can be anything
+    supported by plan_to_result_handle()."""
+    return plan_to_result_handle(plan, config).export_diagnostics()
+
+
+def plan_to_html(plan, config=None) -> str:
+    """Generates an HTML page for the given plan to serve as documentation
+    while debugging. plan can be anything supported by
+    plan_to_result_handle()."""
+    return plan_to_result_handle(plan, config).export_html()
+
+
+def check_plan(plan, config=None) -> int:
+    """Returns 1 if the given plan is valid, -1 if it is invalid, or 0 if the
+    validator cannot determine validity. plan can be anything supported by
+    load_plan(), a Plan object, or a ResultHandle object."""
+    return plan_to_result_handle(plan, config).check()
+
+
+def check_plan_valid(plan, config=None):
+    """Throws a ValueError exception containing the first error or warning
+    encountered in the plan if the validator cannot prove correctness of
+    the given plan. plan can be anything supported by load_plan(), a Plan
+    object, or a ResultHandle object."""
+    plan_to_result_handle(plan, config).check_valid()
+
+
+def check_plan_not_invalid(plan, config=None):
+    """Throws a ValueError exception containing the first error encountered in
+    the plan if the validator can prove that the given plan is invalid. plan
+    can be anything supported by load_plan(), a Plan object, or a ResultHandle
+    object."""
+    plan_to_result_handle(plan, config).check_not_invalid()
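+
+
+# Illustrative example of how the check functions compose (`plan_bytes` stands
+# in for a serialized substrait.Plan):
+#
+#   handle = plan_to_result_handle(plan_bytes)
+#   check_plan(handle)              # -1 invalid, 0 unproven, 1 valid
+#   check_plan_valid(handle)        # raises unless the plan is proven valid
+#   check_plan_not_invalid(handle)  # raises only if proven invalid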
+
+
+def path_to_string(path: Path) -> str:
+    """Converts a substrait.validator.Path message to a string."""
+    elements = [path.root]
+    for element in path.elements:
+        if element.HasField("field"):
+            elements.append(f".{element.field.field}")
+        elif element.HasField("repeated_field"):
+            elements.append(
+                f".{element.repeated_field.field}[{element.repeated_field.index}]"
+            )
+        elif element.HasField("oneof_field"):
+            elements.append(
+                f".{element.oneof_field.field}<{element.oneof_field.variant}>"
+            )
+        elif element.HasField("array_element"):
+            elements.append(f"[{element.array_element.index}]")
+        else:
+            raise ValueError("invalid path element")
+    return "".join(elements)
+
+
+@click.command()
+@click.argument("infile", required=False)
+@click.option(
+    "--in-type",
+    type=click.Choice(["ext", "proto", "json", "yaml", "jdot"], case_sensitive=False),
+    default="ext",
+    help=(
+        'Input file type. "ext" uses the extension of the input '
+        'file, defaulting to "proto" if there is none.'
+    ),
+)
+@click.option(
+    "--verbosity",
+    "-v",
+    type=click.Choice(
+        ["info", "warn", "error", "fatal", "quiet"], case_sensitive=False
+    ),
+    default="warn",
+    help="Specifies the verbosity for writing diagnostics to stderr.",
+)
+@click.option(
+    "--out-file",
+    "-O",
+    default=None,
+    help='Output file. "-" may be used to select stdout.',
+)
+@click.option(
+    "--out-type",
+    type=click.Choice(
+        ["ext", "diag", "html", "proto", "json", "yaml", "jdot"], case_sensitive=False
+    ),
+    default="ext",
+    help=(
+        'Output file type. "ext" uses the extension of the output '
+        'file, defaulting to "diag" if there is none.'
+    ),
+)
+@click.option(
+    "--mode",
+    "-m",
+    type=click.Choice(["convert", "ignore", "loose", "strict"], case_sensitive=False),
+    default="loose",
+    help=(
+        'Validation mode. "convert" disables all but protobuf\'s '
+        "internal validation, and can be used to convert between "
+        'different representations of substrait.Plan. "ignore" '
+        "runs validation, but ignores the result (i.e. the "
+        "program always returns 0 and emits an output file if "
+        'requested). "loose" fails only if the validator can '
+        'prove that the plan is invalid. "strict" fails if it '
+        "cannot prove that it is valid."
+    ),
+)
+@click.option(
+    "--ignore-unknown-fields",
+    is_flag=True,
+    help=(
+        "Do not generate warnings for unknown protobuf fields "
+        "that are set to their protobuf-defined default value."
+    ),
+)
+@click.option(
+    "--allow-proto-any",
+    multiple=True,
+    help=(
+        "Explicitly allow the given protobuf type URL(s) to be "
+        "used in protobuf Any messages. Supports glob syntax."
+    ),
+)
+@click.option(
+    "--diagnostic-level",
+    nargs=3,
+    multiple=True,
+    help=(
+        "Clamps the error level of diagnostics with diagnostic "
+        "code or class [0] to at least [1] and at most [2]. "
+        "For example, --diagnostic-level 1 warn error will "
+        "override the level of info diagnostics with code 1 "
+        "to warning, leaving the other levels unchanged."
+    ),
+)
+@click.option(
+    "--override-uri",
+    nargs=2,
+    multiple=True,
+    help=(
+        "Overrides URIs in the plan that match [0] with [1]. Set "
+        '[1] to "-" to disable resolution of matching URIs. '
+        "Supports glob syntax. For example, "
+        '"--override-uri http://* -" disables resolution via '
+        "http."
+    ),
+)
+@click.option(
+    "--use-urllib/--no-use-urllib",
+    default=True,
+    help=(
+        "Enable URI resolution via urllib. Enabled by default. "
+        "If disabled, only file:// URIs will resolve (after "
+        "application of any --override-uri options)."
+    ),
+)
+@click.option(
+    "--help-diagnostics",
+    is_flag=True,
+    help="Show a list of all known diagnostic codes and exit.",
+)
+def cli(  # noqa: C901
+    infile,
+    in_type,
+    out_file,
+    out_type,
+    mode,
+    verbosity,
+    ignore_unknown_fields,
+    allow_proto_any,
+    diagnostic_level,
+    override_uri,
+    use_urllib,
+    help_diagnostics,
+):
+    """Validate or convert the substrait.Plan represented by INFILE (or stdin
+    using "-").
+
+    The following formats are supported:
+
+    \b
+    - proto: binary serialization format of protobuf.
+    - json: JSON serialization format of protobuf.
+    - yaml: like JSON, but represented as YAML.
+    - jdot: like JSON, but represented as JDOT (still highly experimental,
+      see https://github.com/saulpw/jdot).
+    - diag*: list of validator diagnostic messages.
+    - html*: all information known about the plan in HTML format.
+
+    *output-only, and not supported in -mconvert mode.
+
+    When validation is enabled, the output message type will be
+    substrait.validator.ParseResult. If you just want to convert between
+    different representations of the substrait.Plan message, use -mconvert.
+    """
+
+    # Define various helper functions and constants.
+    INFO = Diagnostic.Level.LEVEL_INFO
+    WARN = Diagnostic.Level.LEVEL_WARNING
+    ERROR = Diagnostic.Level.LEVEL_ERROR
+    FATAL = ERROR + 1
+    QUIET = FATAL + 1
+
+    def level_str_to_int(level):
+        """Converts a string representation of an error level or verbosity to
+        its internal integer representation."""
+        return {
+            "info": INFO,
+            "warn": WARN,
+            "error": ERROR,
+            "fatal": FATAL,
+            "quiet": QUIET,
+        }[level]
+
+    def emit_diagnostic(level, msg, code=None, source=None, original_level=None):
+        """Emits a diagnostic message to stderr."""
+
+        # Only print the diagnostic if the configured verbosity is high enough.
+        if level < verbosity_level:
+            return
+
+        # Determine the original error level.
+        if original_level is None:
+            original_level = level
+
+        # Format the level.
+        formatted = [
+            {
+                FATAL: click.style("Fatal error", fg="red", bold=True),
+                ERROR: click.style("Error", fg="red", bold=True),
+                WARN: click.style("Warning", fg="yellow", bold=False),
+                INFO: click.style("Info", fg="green", bold=False),
+            }[level]
+        ]
+
+        # Format extra information written within parentheses.
+        parens = []
+        if original_level != level:
+            if original_level > level:
+                mod = "reduced from "
+            else:
+                mod = "promoted from "
+            mod += {
+                FATAL: "fatal",
+                ERROR: "error",
+                WARN: "warning",
+                INFO: "info",
+            }[original_level]
+            parens.append(mod)
+        if code is not None:
+            parens.append(f"code {code:04d}")
+        if parens:
+            formatted.append(" ({})".format(", ".join(parens)))
+        formatted.append(":\n")
+
+        # Append source information, if known.
+        if source is not None:
+            formatted.append(f" at {source}:\n")
+
+        # Append the actual message.
+        formatted.append(" ")
+        formatted.append("\n ".join(str(msg).split("\n")))
+        formatted.append("\n")
+
+        # Print the formatted diagnostic.
+        click.echo("".join(formatted), err=True)
+
+    def fatal(*args, **kwargs):
+        """Shorthand for emit_diagnostic(FATAL, ...)
followed by exiting with + code 1.""" + emit_diagnostic(FATAL, *args, **kwargs) + sys.exit(1) + + def deduce_format(fil, typ, remap): + """Deduces the file format for fil with type hint typ using the rules + in remap.""" + if typ == "ext": + if fil is None: + typ = remap["DEFAULT"] + else: + _, *ext = fil.rsplit(".", maxsplit=1) + if ext: + typ = ext[0].lower() + typ = remap.get(typ, remap["DEFAULT"]) + return typ + + def emit_output(data): + """Emits the given output data as specified on the command line.""" + # Encode text formats as unicode. + if not isinstance(data, bytes): + data = data.encode("utf-8") + + # Write to the output. + if out_file == "-": + try: + count = sys.stdout.buffer.write(data) + except IOError as e: + fatal(f"failed to write to stdout: {e}") + elif out_file is not None: + try: + with open(out_file, "wb") as f: + count = f.write(data) + except IOError as e: + fatal(f"failed to write output file: {e}") + else: + return + if count < len(data): + fatal("failed to write all output") + + def emit_proto(out_message): + """Emits the given protobuf message as specified on the command + line.""" + + # Convert to appropriate data format. + if out_type == "proto": + emit_output(out_message.SerializeToString()) + elif out_type == "json": + emit_output(json_format.MessageToJson(out_message)) + else: + out_dict = json_format.MessageToDict(out_message) + if out_type == "yaml": + emit_output(yaml.safe_dump(out_dict)) + elif out_type == "jdot": + emit_output(_jdot_dumps(out_dict)) + else: + fatal(f"cannot emit protobuf message in {out_type} format") + + # Print diagnostic code help if requested. + if help_diagnostics: + click.echo("The following diagnostic codes are defined:\n") + diags = {} + for code, (name, desc, parent) in sorted(get_diagnostic_codes().items()): + diag = (code, name, desc, []) + diags[code] = diag + if parent is not None: + diags[parent][3].append(diag) + + def print_diag(diag, first_prefix="", next_prefix=""): + code, name, desc, children = diag + click.echo(f"{first_prefix}{code:04d} ({name}): {desc}.") + for child in children[:-1]: + print_diag(child, f"{next_prefix} |- ", f"{next_prefix} | ") + if children: + print_diag(children[-1], f"{next_prefix} '- ", f"{next_prefix} ") + + print_diag(diags[0]) + sys.exit(0) + + # Parse verbosity level. + verbosity_level = level_str_to_int(verbosity) + + # Check input file. + in_file = infile + if in_file is None: + click.echo("Missing input file. Try --help for usage information.", err=True) + sys.exit(2) + + # Handle automatic format deduction. + in_type = deduce_format( + in_file, + in_type, + { + "DEFAULT": "proto", + "json": "json", + "yaml": "yaml", + "jdot": "jdot", + }, + ) + out_type = deduce_format( + out_file, + out_type, + { + "DEFAULT": "proto", + "json": "json", + "yaml": "yaml", + "jdot": "jdot", + "txt": "diag", + "html": "html", + "htm": "html", + }, + ) + + # Read input file. + if in_file == "-": + try: + in_data = sys.stdin.buffer.read() + except IOError as e: + fatal(f"failed to read from stdin: {e}") + else: + try: + with open(in_file, "rb") as f: + in_data = f.read() + except IOError as e: + fatal(f"failed to read input file: {e}") + + # Parse input format. + if in_type == "proto": + + # Convert the plan directly. + try: + in_plan = load_plan_from_proto(in_data) + except ProtoDecodeError as e: + fatal(e) + + else: + + # Remaining formats are UTF-8 encoded. 
+        try:
+            in_str = in_data.decode("utf8")
+        except UnicodeError as e:
+            fatal(f"failed to decode input file: {e}")
+
+        # Convert from different variations of the JSON object model.
+        if in_type == "json":
+            try:
+                in_dict = json.loads(in_str)
+            except json.decoder.JSONDecodeError as e:
+                fatal(f"failed to decode input file: {e}")
+        elif in_type == "yaml":
+            try:
+                in_dict = yaml.safe_load(in_str)
+            except yaml.YAMLError as e:
+                fatal(f"failed to decode input file: {e}")
+        elif in_type == "jdot":
+            try:
+                in_dict = _jdot_loads(in_str)
+            except jdot.decoder.DecodeException as e:
+                fatal(f"failed to decode input file: {e}")
+        else:
+            raise NotImplementedError(in_type)
+
+        # The outermost structure must be a dict for anything to work at all.
+        if not isinstance(in_dict, dict):
+            fatal("toplevel structure of decoded JSON-like input is not an object")
+
+        # Convert the dict representation of the JSON object model to the
+        # protobuf message wrapper.
+        try:
+            in_plan = load_plan_from_dict(in_dict)
+        except json_format.ParseError as e:
+            fatal(e)
+
+    # Handle convert-only mode.
+    if mode == "convert":
+        emit_proto(in_plan)
+        return 0
+
+    # Construct parser/validator configuration.
+    config = Config()
+    if ignore_unknown_fields:
+        config.ignore_unknown_fields()
+    for pattern in allow_proto_any:
+        try:
+            config.allow_proto_any_url(pattern)
+        except ValueError as e:
+            fatal(e)
+    for code, minimum, maximum in diagnostic_level:
+        try:
+            code = int(code, 10)
+            if code < 0:
+                raise ValueError()
+            minimum = minimum.lower()
+            if minimum == "warn":
+                minimum = "warning"
+            maximum = maximum.lower()
+            if maximum == "warn":
+                maximum = "warning"
+            config.override_diagnostic_level(code, minimum, maximum)
+        except ValueError as e:
+            fatal(e)
+    for pattern, resolve_as in override_uri:
+        if resolve_as == "-":
+            resolve_as = None
+        try:
+            config.override_uri(pattern, resolve_as)
+        except ValueError as e:
+            fatal(e)
+    if use_urllib:
+        config.add_urllib_resolver()
+
+    # Run the parser/validator.
+    result = plan_to_result_handle(in_plan, config)
+
+    # Emit diagnostics to stderr.
+    for diagnostic in plan_to_diagnostics(result):
+        emit_diagnostic(
+            msg=diagnostic.msg,
+            code=diagnostic.cause,
+            source=path_to_string(diagnostic.path),
+            level=diagnostic.adjusted_level,
+            original_level=diagnostic.original_level,
+        )
+
+    # Check validity.
+    validity = check_plan(result)
+    if mode == "loose":
+        if validity < 0:
+            fatal("plan is invalid")
+    elif mode == "strict":
+        if validity < 1:
+            fatal("failed to prove that plan is valid")
+    elif mode != "ignore":
+        raise ValueError("mode")
+
+    # Emit output file.
+ if out_type == "diag": + emit_output(plan_to_diagnostics_str(result)) + elif out_type == "html": + emit_output(plan_to_html(result)) + else: + emit_proto(plan_to_parse_result(result)) + + return 0 + + +if __name__ == "__main__": + cli() diff --git a/py/substrait_validator_build/__init__.py b/py/substrait_validator_build/__init__.py new file mode 100644 index 00000000..bae30bde --- /dev/null +++ b/py/substrait_validator_build/__init__.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 + +from maturin import * +import shutil +import os + + +_PATHS = [ + (os.path.join("..", "proto"), "proto"), + (os.path.join("..", "substrait", "proto"), "proto"), + (os.path.join("..", "substrait", "text"), "text"), + (os.path.join("..", "LICENSE"), "LICENSE"), + (None, "protoc_out"), + (None, "substrait_validator/substrait"), + (None, "substrait_validator/__pycache__"), +] + + +def clean(): + for _, path in _PATHS: + if os.path.isdir(path): + shutil.rmtree(path) + elif os.path.isfile(path): + os.unlink(path) + + +def _copytree(source, dest): + if os.path.isdir(source): + if not os.path.isdir(dest): + os.makedirs(dest) + files = os.listdir(source) + for f in files: + _copytree(os.path.join(source, f), os.path.join(dest, f)) + else: + shutil.copyfile(source, dest) + + +def populate(): + clean() + for source, dest in _PATHS: + if source is not None: + _copytree(source, dest) + + +def _prepare(): + # If the local_dependencies directory exists, pip is building the package + # from a source distribution. In that case, the build environment is + # already as it should be. + if not os.path.isdir("local_dependencies"): + populate() + + +_maturin_prepare_metadata_for_build_wheel = ( + prepare_metadata_for_build_wheel # noqa: F405 +) + + +def prepare_metadata_for_build_wheel(*args, **kwargs): + _prepare() + return _maturin_prepare_metadata_for_build_wheel(*args, **kwargs) + + +_maturin_build_wheel = build_wheel # noqa: F405 + + +def build_wheel(*args, **kwargs): + _prepare() + return _maturin_build_wheel(*args, **kwargs) + + +_maturin_build_sdist = build_sdist # noqa: F405 + + +def build_sdist(*args, **kwargs): + _prepare() + return _maturin_build_sdist(*args, **kwargs) diff --git a/py/tests/data.py b/py/tests/data.py new file mode 100644 index 00000000..e70e4684 --- /dev/null +++ b/py/tests/data.py @@ -0,0 +1,887 @@ +# SPDX-License-Identifier: Apache-2.0 + +BASIC_PLAN = """ +{ + "extensionUris": [], + "extensions": [], + "relations": [ + { + "rel": { + "project": { + "input": { + "read": { + "common": { + "direct": {} + }, + "projection": { + "select": { + "structItems": [ + { + "field": 0 + }, + { + "field": 1 + } + ] + }, + "maintainSingularStruct": false + }, + "namedTable": { + "names": [ + "person" + ] + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + } + } + } + ] + } + } + } + ], + "expectedTypeUrls": [] +} +""" + +COMPLEX_PLAN = """ +{ + "extensionUris": [{ + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, { + "extensionUriAnchor": 4, + "uri": "/functions_arithmetic_decimal.yaml" + }, { + "extensionUriAnchor": 3, + "uri": "/functions_datetime.yaml" + }, { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + }], + "extensions": [{ + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 1, + "name": "and:bool" + } + }, { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 2, + 
"name": "equal:any1_any1" + } + }, { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "lt:date_date" + } + }, { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 4, + "name": "gt:date_date" + } + }, { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 5, + "name": "multiply:opt_decimal_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 6, + "name": "subtract:opt_decimal_decimal" + } + }, { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 7, + "name": "sum:opt_decimal" + } + }], + "relations": [{ + "root": { + "input": { + "fetch": { + "common": { + "direct": { + } + }, + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [4, 5, 6, 7] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [33, 34, 35, 36] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["C_CUSTKEY", "C_NAME", "C_ADDRESS", "C_NATIONKEY", "C_PHONE", "C_ACCTBAL", "C_MKTSEGMENT", "C_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 117, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["CUSTOMER"] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["O_ORDERKEY", "O_CUSTKEY", "O_ORDERSTATUS", "O_TOTALPRICE", "O_ORDERDATE", "O_ORDERPRIORITY", "O_CLERK", "O_SHIPPRIORITY", "O_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "i32": { + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["ORDERS"] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": ["L_ORDERKEY", "L_PARTKEY", "L_SUPPKEY", "L_LINENUMBER", "L_QUANTITY", "L_EXTENDEDPRICE", "L_DISCOUNT", "L_TAX", "L_RETURNFLAG", "L_LINESTATUS", "L_SHIPDATE", "L_COMMITDATE", "L_RECEIPTDATE", "L_SHIPINSTRUCT", "L_SHIPMODE", "L_COMMENT"], + "struct": { + "types": [{ + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "namedTable": { + "names": ["LINEITEM"] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 1, + "args": [{ + "scalarFunction": { + "functionReference": 2, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + }, { + "cast": { + "type": { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "HOUSEHOLD", + "nullable": false + } + } + } + }], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "scalarFunction": { + "functionReference": 2, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + 
"selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + }], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + } + } + }, { + "scalarFunction": { + "functionReference": 2, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 17 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + }], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + } + } + }, { + "scalarFunction": { + "functionReference": 3, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 12 + } + }, + "rootReference": { + } + } + }, { + "literal": { + "date": 9214, + "nullable": false + } + }], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }, { + "scalarFunction": { + "functionReference": 4, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 27 + } + }, + "rootReference": { + } + } + }, { + "literal": { + "date": 9214, + "nullable": false + } + }], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + } + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 17 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 12 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 15 + } + }, + "rootReference": { + } + } + }, { + "scalarFunction": { + "functionReference": 5, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + }, { + "scalarFunction": { + "functionReference": 6, + "args": [{ + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false + } + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 23 + } + }, + "rootReference": { + } + } + }], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }] + } + }, + "groupings": [{ + "groupingExpressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }] + }, { + "groupingExpressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }] + 
}], + "measures": [{ + "measure": { + "functionReference": 7, + "args": [{ + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + } + }] + } + }, + "expressions": [{ + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }] + } + }, + "sorts": [{ + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + }, { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }] + } + }, + "offset": "0", + "count": "10" + } + }, + "names": ["L_ORDERKEY", "REVENUE", "O_ORDERDATE", "O_SHIPPRIORITY"] + } + }], + "expectedTypeUrls": [] +} +""" + +BASIC_YAML = """--- +types: + - name: point + structure: + latitude: i32 + longitude: i32 + - name: line + structure: + start: point + end: point +""" diff --git a/py/tests/test_api.py b/py/tests/test_api.py new file mode 100644 index 00000000..e987fd07 --- /dev/null +++ b/py/tests/test_api.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 + +import substrait_validator as sv +import pytest +from data import BASIC_PLAN, BASIC_YAML + + +def test_proto_roundtrip(): + """Round-trip test a basic Plan using the protobuf wrapper functions.""" + original_plan = sv.load_plan(BASIC_PLAN) + assert type(original_plan) is sv.Plan + + # Round-trip via binary representation. + data = sv.plan_to_proto(original_plan) + assert type(data) is bytes + round_tripped_plan = sv.load_plan(data) + assert round_tripped_plan == original_plan + + # Round-trip via JSON string. + data = sv.plan_to_json(original_plan) + assert type(data) is str + round_tripped_plan = sv.load_plan(data) + assert round_tripped_plan == original_plan + + # Round-trip via JSON dict. + data = sv.plan_to_dict(original_plan) + assert type(data) is dict + round_tripped_plan = sv.load_plan(data) + assert round_tripped_plan == original_plan + + # Round-trip via YAML. + data = sv.plan_to_yaml(original_plan) + assert type(data) is str + round_tripped_plan = sv.load_plan_from_yaml(data) + assert round_tripped_plan == original_plan + + # Round-trip via JDOT. + data = sv.plan_to_jdot(original_plan) + assert type(data) is str + round_tripped_plan = sv.load_plan_from_jdot(data) + assert round_tripped_plan == original_plan + + # Check identity. 
+ round_tripped_plan = sv.load_plan(original_plan) + assert round_tripped_plan == original_plan + + +def test_parsing(): + """Test the parsing function.""" + result = sv.plan_to_parse_result(BASIC_PLAN) + assert type(result) == sv.ParseResult + + root = sv.parse_plan(BASIC_PLAN) + assert type(root) == sv.ParseResult + + root = sv.plan_to_parse_result(BASIC_PLAN) + assert type(root) == sv.ParseResult + + +def test_export_html(): + """Test the HTML export function.""" + html = sv.plan_to_html(BASIC_PLAN) + assert type(html) == str + lines = list(filter(bool, html.split("\n"))) + assert lines[0] == "" + assert lines[-1] == "" + + +def test_export_diags(): + """Test the diagnostics export functions.""" + diags = sv.plan_to_diagnostics_str(BASIC_PLAN) + assert type(diags) == str + + diags = list(sv.plan_to_diagnostics(BASIC_PLAN)) + for diag in diags: + assert type(diag) == sv.Diagnostic + + +def test_valid_invalid(): + """Test the plan validity functions.""" + # Override all diagnostics to info, so the plan is considered valid. + config = sv.Config() + config.override_diagnostic_level(0, "info", "info") + plan = sv.plan_to_result_handle(BASIC_PLAN, config) + assert sv.check_plan(plan) == 1 + sv.check_plan_valid(plan) + sv.check_plan_not_invalid(plan) + + # Override all diagnostics to warning, so the validity is considered to be + # unknown. + config = sv.Config() + config.override_diagnostic_level(0, "warning", "warning") + plan = sv.plan_to_result_handle(BASIC_PLAN, config) + assert sv.check_plan(plan) == 0 + with pytest.raises(ValueError): + sv.check_plan_valid(plan) + sv.check_plan_not_invalid(plan) + + # Override all diagnostics to error, so the plan is considered to be + # invalid. + config = sv.Config() + config.override_diagnostic_level(0, "error", "error") + plan = sv.plan_to_result_handle(BASIC_PLAN, config) + assert sv.check_plan(plan) == -1 + with pytest.raises(ValueError): + sv.check_plan_valid(plan) + with pytest.raises(ValueError): + sv.check_plan_not_invalid(plan) + + +def test_resolver_callback(): + """Tests whether the YAML URI resolver callback works.""" + + def resolver(s): + if s == "test:hello": + return BASIC_YAML.encode("utf-8") + raise ValueError("unknown URI") + + config = sv.Config() + + # Disable "not yet implemented" warnings. + config.override_diagnostic_level(1, "info", "info") + + # Disable missing root relation error, so we don't have to supply one. + config.override_diagnostic_level(5001, "info", "info") + + # Add the resolver. + config.add_uri_resolver(resolver) + + sv.check_plan_valid( + { + "extensionUris": [ + { + "extension_uri_anchor": 1, + "uri": "test:hello", + } + ] + }, + config, + ) + + with pytest.raises( + ValueError, + match=r"failed to resolve YAML: ValueError: unknown URI \(code 2002\)", + ): + sv.check_plan_valid( + { + "extensionUris": [ + { + "extension_uri_anchor": 1, + "uri": "test:bye", + } + ] + }, + config, + ) diff --git a/py/tests/test_cli.py b/py/tests/test_cli.py new file mode 100644 index 00000000..15398a13 --- /dev/null +++ b/py/tests/test_cli.py @@ -0,0 +1,271 @@ +# SPDX-License-Identifier: Apache-2.0 + +from click.testing import CliRunner +from substrait_validator import cli +from data import BASIC_PLAN, COMPLEX_PLAN +import tempfile +import json +import pprint +from os.path import join as pjoin +from os.path import isfile +import platform + + +def run(*args): + return CliRunner().invoke(cli, args) + + +def test_no_args(): + result = run() + assert result.exit_code == 2 + assert "Missing input file." 
in result.output + + +def test_mconvert_auto(): + """Test -mconvert with automatic format deduction from file extensions.""" + with tempfile.TemporaryDirectory() as tmp: + with open(pjoin(tmp, "plan.json"), "w") as f: + f.write(BASIC_PLAN) + + def convert(src, dest): + assert ( + run(pjoin(tmp, src), "-O", pjoin(tmp, dest), "-mconvert").exit_code == 0 + ) + + convert("plan.json", "plan.proto") + + with open(pjoin(tmp, "plan.proto"), "rb") as f: + a = f.read() + + convert("plan.proto", "plan.yaml") + convert("plan.yaml", "plan.jdot") + convert("plan.jdot", "plan.json") + convert("plan.json", "plan.bin") + + with open(pjoin(tmp, "plan.bin"), "rb") as f: + b = f.read() + + assert a == b + + +def test_mconvert_manual(): + """Test -mconvert with manually specified input and output formats.""" + with tempfile.TemporaryDirectory() as tmp: + with open(pjoin(tmp, "data"), "w") as f: + f.write(BASIC_PLAN) + + def convert(in_type, out_type): + assert ( + run( + pjoin(tmp, "data"), + "-O", + pjoin(tmp, "data"), + "-mconvert", + "--in-type", + in_type, + "--out-type", + out_type, + ).exit_code + == 0 + ) + + convert("json", "proto") + + with open(pjoin(tmp, "data"), "rb") as f: + a = f.read() + + convert("proto", "yaml") + convert("yaml", "jdot") + convert("jdot", "json") + convert("json", "proto") + + with open(pjoin(tmp, "data"), "rb") as f: + b = f.read() + + assert a == b + + +def test_mconvert_complex(): + """Test -mconvert with a complex plan.""" + with tempfile.TemporaryDirectory() as tmp: + with open(pjoin(tmp, "data"), "w") as f: + f.write(COMPLEX_PLAN) + + def convert(in_type, out_type): + assert ( + run( + pjoin(tmp, "data"), + "-O", + pjoin(tmp, "data"), + "-mconvert", + "--in-type", + in_type, + "--out-type", + out_type, + ).exit_code + == 0 + ) + + convert("json", "proto") + + with open(pjoin(tmp, "data"), "rb") as f: + a = f.read() + + convert("proto", "yaml") + convert("yaml", "jdot") + convert("jdot", "json") + convert("json", "proto") + + with open(pjoin(tmp, "data"), "rb") as f: + b = f.read() + + assert a == b + + +def test_valid_invalid(): + """Test exit code based on validity for various modes using diagnostic + level overrides to force an outcome.""" + with tempfile.TemporaryDirectory() as tmp: + with open(pjoin(tmp, "plan.json"), "w") as f: + f.write(BASIC_PLAN) + + # Test all corner cases. + def x(mode, level): + return run( + pjoin(tmp, "plan.json"), + "-m", + mode, + "--diagnostic-level", + "0", + level, + level, + ).exit_code + + assert x("ignore", "error") == 0 + assert x("loose", "error") == 1 + assert x("loose", "warning") == 0 + assert x("strict", "warning") == 1 + assert x("strict", "info") == 0 + + # Default should be -mloose. + def x(level): + return run( + pjoin(tmp, "plan.json"), "--diagnostic-level", "0", level, level + ).exit_code + + assert x("info") == 0 + assert x("warning") == 0 + assert x("error") == 1 + + +def test_verbosity(): + """Test verbosity using diagnostic level overrides.""" + with tempfile.TemporaryDirectory() as tmp: + with open(pjoin(tmp, "plan.json"), "w") as f: + f.write(BASIC_PLAN) + + # Test all corner cases.
+ def x(verbosity, level): + return run( + pjoin(tmp, "plan.json"), + "-v", + verbosity, + "--diagnostic-level", + "0", + level, + level, + ).output.split(maxsplit=1)[:1] + + assert x("quiet", "error") == [] + assert x("fatal", "error") == ["Fatal"] + assert x("error", "error") == ["Error"] + assert x("error", "warn") == [] + assert x("warn", "warn") == ["Warning"] + assert x("warn", "info") == [] + assert x("info", "info") == ["Info"] + + +def test_export(): + """Test export logic.""" + with tempfile.TemporaryDirectory() as tmp: + with open(pjoin(tmp, "plan.json"), "w") as f: + f.write(BASIC_PLAN) + + def x(output, level): + return run( + pjoin(tmp, "plan.json"), + "-O", + pjoin(tmp, output), + "--diagnostic-level", + "0", + level, + level, + ).exit_code + + def y(output): + assert x(output, "error") == 1 + assert not isfile(pjoin(tmp, output)) + assert x(output, "info") == 0 + with open(pjoin(tmp, output), "rb") as f: + return f.read() + + assert y("output.proto")[0] == 10 + assert y("output.json").startswith(b'{\n "root":') + assert y("output.yaml").startswith(b"root:") + assert y("output.jdot").startswith(b"@macros") + assert b"" in y("output.html") + assert y("output.txt").startswith(b"Info") + + +def test_uri_resolution(): + """Test URI resolution logic.""" + with tempfile.TemporaryDirectory() as tmp: + with open(pjoin(tmp, "plan.json"), "w") as f: + f.write( + json.dumps( + { + "extensionUris": [ + { + "extension_uri_anchor": 1, + "uri": "https://raw.githubusercontent.com/substrait-io/substrait/82078995c19faa9d4e53a90cd66800c26d88f970/extensions/extension_types.yaml", + } + ] + } + ) + ) + + # Obtain a valid file:// URL for the above JSON file as well. + if platform.system() == "Windows": + local_url = "file:///" + pjoin(tmp, "plan.json").replace("\\", "/") + else: + local_url = "file://" + pjoin(tmp, "plan.json") + + def x(*args): + return run( + pjoin(tmp, "plan.json"), + "-verror", # verbosity error + "--diagnostic-level", + "2002", + "error", + "error", # YAML resolution failure -> error + "--diagnostic-level", + "0", + "info", + "info", # all other diagnostics -> info + *args + ).exit_code + + # Actual remote lookup. + assert x() == 0 + + # Disable remote lookups, so we expect a failure (not file://). + assert x("--no-use-urllib") == 1 + + # Try file:// protocol instead. This one is handled by the Rust + # fallback resolution logic. Note that plan.json is obviously not + # valid YAML, but all diagnostics not related to URI resolution are + # overridden to info, so we don't have to care. + assert x("--no-use-urllib", "--override-uri", "*", local_url) == 0 + + # urllib should also support file://. + assert x("--use-urllib", "--override-uri", "*", local_url) == 0 diff --git a/rs/Cargo.toml b/rs/Cargo.toml new file mode 100644 index 00000000..319cef84 --- /dev/null +++ b/rs/Cargo.toml @@ -0,0 +1,86 @@ +[package] +name = "substrait-validator" +description = "Substrait validator" +homepage = "https://substrait.io/" +repository = "https://github.com/substrait-io/substrait" +readme = "README.md" +version = "0.0.1" +edition = "2021" +license = "Apache-2.0" +include = ["src", "build.rs", "README.md"] + +[dependencies] + +# Prost is used to deal with protobuf serialization and deserialization. +prost = "0.9" +prost-types = "0.9" + +# Prost doesn't generate any introspection stuff, so we hack that stuff in with +# our own procedural macros. 
+substrait-validator-derive = { path = "../derive", version = "0.0.1" } + +# Google/protobuf has a funny idea about case conventions (it converts them all +# over the place) and prost remaps to Rust's conventions to boot. So, to +# recover the original names as much as possible, we need some case conversion +# of our own. +heck = "0.4" + +# Used for deserializing the YAML extension files. Note that we're +# not using serde_yaml here because serde_yaml is just a wrapper around +# yaml-rust, and we don't use any of serde's derive logic. +yaml-rust = "0.4" + +# The schema for the extension files uses jsonschema syntax. The Rust crate for +# schema validation with this format uses serde_json types as input for +# both the schema and the input, so we need to depend on that as well, even +# though we don't actually do any JSON serialization and deserialization. +jsonschema = { version = "=0.15.0", default-features = false } +serde_json = "1" + +# Used for checking identifier syntax (could be removed if regexes don't end up +# being useful elsewhere). +regex = "1.5" + +# Used for checking URI syntax. +uriparse = "0.6" + +# Used for only compiling regexes and the extension file schema once. +once_cell = "1.9" + +# Various small helper crates for the diagnostic/error enums. +thiserror = "1.0" +strum = "0.23" +strum_macros = "0.23" +num-traits = "0.2" +num-derive = "0.3" + +# For intersperse(). +itertools = "0.8" + +# Glob patterns are used in the configuration structure and to +# syntax-check URI glob patterns in the Substrait plans. +glob = "0.3" + +# Used to resolve YAML URIs. If the curl feature is disabled, only file:// +# URLs will work without adding a custom resolver. +url = "2.2" +curl = { version = "0.4", optional = true } + +# Used for interpretation and conversion of various date/time-related literals +# in plans. +chrono = "0.4" + +# Used for pretty-printing floating point literal values. +float-pretty-print = "0.1" + +# Used by the HTML exporter. +base64 = "0.13" +percent-encoding = "2.1" + +[build-dependencies] + +# Used for generating Rust structs from the protobuf definitions. +prost-build = "0.9" + +# Used to automatically find all protobuf files. +walkdir = "2" diff --git a/rs/README.md b/rs/README.md new file mode 100644 index 00000000..1ea785f6 --- /dev/null +++ b/rs/README.md @@ -0,0 +1,29 @@ +Substrait query plan validator +============================== + +This crate implements a validator for [Substrait](https://substrait.io/) query +plans. + +``` +[dependencies] +substrait-validator = "0.0.1" +``` + +YAML file resolution +-------------------- + +One of the complexities of validating Substrait plans is resolving the YAML +extension files. By default, the crate only supports `file://...` URLs, but +often, the YAML files will be stored remotely. To make handling this easier, +you can enable [curl](https://crates.io/crates/curl) as an optional +dependency: + +``` +[dependencies] +substrait-validator = { version = "0.0.1", features = ["curl"] } +``` + +This adds the `substrait_validator::Config::add_curl_yaml_uri_resolver()` +method, which will use `libcurl` to resolve the files, thus supporting all the +common protocols (http, https, ftp, etc.). The downside is that the curl crate +depends on system libraries.
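+
+For illustration, wiring this up might look like the following sketch. Only
+`add_curl_yaml_uri_resolver()` is taken from the description above; the
+`Config` constructor and the commented-out `parse` entry point are
+assumptions, not verbatim API:
+
+```
+let mut config = substrait_validator::Config::new();
+config.add_curl_yaml_uri_resolver();
+// e.g.: let result = substrait_validator::parse(&plan_bytes, &config);
+```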
diff --git a/rs/build.rs b/rs/build.rs new file mode 100644 index 00000000..ee614584 --- /dev/null +++ b/rs/build.rs @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: Apache-2.0 + +use std::env; +use std::ffi::OsStr; +use std::fs; +use std::io::Error; +use std::io::ErrorKind; +use std::io::Result; +use std::path::Path; +use std::path::PathBuf; + +/// Copies the file at src_tree/path to dest_tree/path if it's newer. +/// Automatically creates parent directories in dest as needed. +fn synchronize(src_tree: &Path, dest_tree: &Path, path: &Path) -> Result<()> { + // Construct paths. + let src = src_tree.join(path); + let dest = dest_tree.join(path); + + // Inform cargo that we should re-run if src changes. + println!("cargo:rerun-if-changed={}", src.display()); + + // Ensure that the source exists. + if !src.exists() { + return Err(Error::new(ErrorKind::Other, "source file not found")); + } + + // Check if destination already exists. + if dest.exists() { + // Check if it's at least as new as the source; in that case + // we don't have to copy it again. + if dest.metadata()?.modified()? >= src.metadata()?.modified()? { + return Ok(()); + } + } else { + // Check if the destination directory exists, and if not, create it. + if let Some(parent) = dest.parent() { + if !parent.is_dir() { + fs::create_dir_all(parent)?; + } + } + } + + // Copy the file. + std::fs::copy(&src, &dest)?; + + Ok(()) +} + +/// Returns all protobuf files in the given directory. +fn find_proto_files(proto_path: &Path) -> Vec<PathBuf> { + walkdir::WalkDir::new(proto_path) + .into_iter() + .filter_map(|e| e.ok()) + .filter(|e| { + e.path().extension() == Some(OsStr::new("proto")) && e.metadata().unwrap().is_file() + }) + .map(|e| e.into_path()) + .collect() +} + +fn main() -> Result<()> { + // Determine the directory of Cargo.toml for this crate. + let manifest_dir = + PathBuf::from(&env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set")); + let resource_dir = manifest_dir.join("src/resources"); + + // Determine whether we're building from the git repository or from a + // crate file. If the former, we first synchronize our src/resources + // directory with the rest of the repository. + if manifest_dir.join("in-git-repo").exists() { + let validator_git_dir = manifest_dir.join(".."); + let substrait_git_dir = validator_git_dir.join("substrait"); + + // Synchronize the YAML extension file schema. + synchronize( + &substrait_git_dir, + &resource_dir, + &PathBuf::from("text/simple_extensions_schema.yaml"), + )?; + + // Synchronize the protobuf files from the main repository. + for proto_file in find_proto_files(&substrait_git_dir.join("proto")) { + synchronize( + &substrait_git_dir, + &resource_dir, + proto_file + .strip_prefix(&substrait_git_dir) + .expect("failed to strip prefix"), + )?; + } + + // Synchronize the validator-specific protobuf files. + for proto_file in find_proto_files(&validator_git_dir.join("proto")) { + synchronize( + &validator_git_dir, + &resource_dir, + proto_file + .strip_prefix(&validator_git_dir) + .expect("failed to strip prefix"), + )?; + } + } + + // Find all protobuf files in our resource directory. We just synchronized + // these files if we're building from git. + let proto_path = PathBuf::from(&resource_dir).join("proto"); + let proto_files: Vec<_> = find_proto_files(&proto_path); + + // Compile the protobuf files using prost.
+ let mut config = prost_build::Config::new(); + config.type_attribute(".", "#[derive(::substrait_validator_derive::ProtoMeta)]"); + config.compile_protos(&proto_files, &[&proto_path.display().to_string()])?; + + // Inform cargo that changes to the .proto files require a rerun. + for path in &proto_files { + println!("cargo:rerun-if-changed={}", path.display()); + } + + Ok(()) +} diff --git a/rs/in-git-repo b/rs/in-git-repo new file mode 100644 index 00000000..283ec634 --- /dev/null +++ b/rs/in-git-repo @@ -0,0 +1,4 @@ +This file serves as a marker for build.rs that it's building from within the +git repository, which will make it synchronize the protobuf and schema files +with the repo locations. It is not included in the release package, so a build +from crates.io will not try to look for files in parent directories. diff --git a/rs/src/.gitignore b/rs/src/.gitignore new file mode 100644 index 00000000..cb6eb2cc --- /dev/null +++ b/rs/src/.gitignore @@ -0,0 +1 @@ +/resources/ diff --git a/rs/src/export/diagnostics.rs b/rs/src/export/diagnostics.rs new file mode 100644 index 00000000..074e16df --- /dev/null +++ b/rs/src/export/diagnostics.rs @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! This module provides a basic export format that's just a listing of the +//! diagnostic messages contained in the tree. + +use crate::output::parse_result; + +/// Export the diagnostic messages of the tree as a multiline string. +pub fn export( + out: &mut T, + _root_name: &'static str, + result: &parse_result::ParseResult, +) -> std::io::Result<()> { + for diag in result.root.iter_diagnostics() { + writeln!(out, "{diag}")?; + } + Ok(()) +} diff --git a/rs/src/export/html/fa-solid-900.woff2 b/rs/src/export/html/fa-solid-900.woff2 new file mode 100644 index 00000000..360ba115 Binary files /dev/null and b/rs/src/export/html/fa-solid-900.woff2 differ diff --git a/rs/src/export/html/fa-solid-900.woff2.LICENSE.txt b/rs/src/export/html/fa-solid-900.woff2.LICENSE.txt new file mode 100644 index 00000000..87c82e3e --- /dev/null +++ b/rs/src/export/html/fa-solid-900.woff2.LICENSE.txt @@ -0,0 +1,92 @@ +Copyright (c) 2022 Fonticons, Inc. (https://fontawesome.com) +with Reserved Font Name: "Font Awesome". + +This Font Software is licensed under the SIL Open Font License, Version 1.1. +This license is copied below, and is also available with a FAQ at: +http://scripts.sil.org/OFL + +SIL OPEN FONT LICENSE +Version 1.1 - 26 February 2007 + +PREAMBLE +The goals of the Open Font License (OFL) are to stimulate worldwide +development of collaborative font projects, to support the font creation +efforts of academic and linguistic communities, and to provide a free and +open framework in which fonts may be shared and improved in partnership +with others. + +The OFL allows the licensed fonts to be used, studied, modified and +redistributed freely as long as they are not sold by themselves. The +fonts, including any derivative works, can be bundled, embedded, +redistributed and/or sold with any software provided that any reserved +names are not used by derivative works. The fonts and derivatives, +however, cannot be released under any other type of license. The +requirement for fonts to remain under this license does not apply +to any document created using the fonts or their derivatives. + +DEFINITIONS +"Font Software" refers to the set of files released by the Copyright +Holder(s) under this license and clearly marked as such. This may +include source files, build scripts and documentation. 
+ +"Reserved Font Name" refers to any names specified as such after the +copyright statement(s). + +"Original Version" refers to the collection of Font Software components as +distributed by the Copyright Holder(s). + +"Modified Version" refers to any derivative made by adding to, deleting, +or substituting — in part or in whole — any of the components of the +Original Version, by changing formats or by porting the Font Software to a +new environment. + +"Author" refers to any designer, engineer, programmer, technical +writer or other person who contributed to the Font Software. + +PERMISSION & CONDITIONS +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Font Software, to use, study, copy, merge, embed, modify, +redistribute, and sell modified and unmodified copies of the Font +Software, subject to the following conditions: + +1) Neither the Font Software nor any of its individual components, +in Original or Modified Versions, may be sold by itself. + +2) Original or Modified Versions of the Font Software may be bundled, +redistributed and/or sold with any software, provided that each copy +contains the above copyright notice and this license. These can be +included either as stand-alone text files, human-readable headers or +in the appropriate machine-readable metadata fields within text or +binary files as long as those fields can be easily viewed by the user. + +3) No Modified Version of the Font Software may use the Reserved Font +Name(s) unless explicit written permission is granted by the corresponding +Copyright Holder. This restriction only applies to the primary font name as +presented to the users. + +4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font +Software shall not be used to promote, endorse or advertise any +Modified Version, except to acknowledge the contribution(s) of the +Copyright Holder(s) and the Author(s) or with their explicit written +permission. + +5) The Font Software, modified or unmodified, in part or in whole, +must be distributed entirely under this license, and must not be +distributed under any other license. The requirement for fonts to +remain under this license does not apply to any document created +using the Font Software. + +TERMINATION +This license becomes null and void if any of the above conditions are +not met. + +DISCLAIMER +THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE +COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL +DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM +OTHER DEALINGS IN THE FONT SOFTWARE. diff --git a/rs/src/export/html/mod.rs b/rs/src/export/html/mod.rs new file mode 100644 index 00000000..7d5e4ccd --- /dev/null +++ b/rs/src/export/html/mod.rs @@ -0,0 +1,682 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! This module provides a human-readable export format based on HTML. 
+ +use crate::output::comment; +use crate::output::data_type; +use crate::output::data_type::ParameterInfo; +use crate::output::diagnostic; +use crate::output::parse_result; +use crate::output::path; +use crate::output::tree; +use std::sync::Arc; + +const HEADER1: &str = concat!( + r#" + + + + + + + +"# + ); + +const FOOTER: &str = r#" + + + +"#; + +/// All the error levels for nodes that we have different formatting for in +/// the context of HTML output. +#[derive(PartialOrd, Ord, PartialEq, Eq)] +enum Level { + /// Subtree is valid. + Ok, + + /// There are descendent nodes with warnings. + ChildWarning, + + /// The current node has warnings. + Warning, + + /// There are descendent nodes with errors. + ChildError, + + /// The current node has errors. + Error, +} + +impl From<diagnostic::Level> for Level { + fn from(level: diagnostic::Level) -> Self { + match level { + diagnostic::Level::Info => Level::Ok, + diagnostic::Level::Warning => Level::Warning, + diagnostic::Level::Error => Level::Error, + } + } +} + +impl Level { + pub fn class(&self) -> &'static str { + match self { + Level::Ok => "ok", + Level::ChildWarning => "warn_child", + Level::Warning => "warn_here", + Level::ChildError => "error_child", + Level::Error => "error_here", + } + } +} + +/// Escapes HTML text or parameter values using character entities. +fn html_escape<S: AsRef<str>>(text: S) -> String { + let text = text.as_ref(); + let mut result = String::with_capacity(text.len()); + for c in text.chars() { + match c { + '&' => result += "&amp;", + '<' => result += "&lt;", + '>' => result += "&gt;", + '"' => result += "&quot;", + '\'' => result += "&#39;", + c => result.push(c), + } + } + result +} + +/// Encodes part of a URL using percent escape sequences. +fn url_encode<S: AsRef<str>>(text: S) -> String { + use std::fmt::Write; + let text = text.as_ref(); + let mut result = String::with_capacity(text.len()); + for c in text.chars() { + if c.is_alphanumeric() || "-._~!$&'()*+,;=:@".contains(c) { + result.push(c); + } else { + let mut buf = [0; 4]; + for b in c.encode_utf8(&mut buf).as_bytes() { + write!(result, "%{:02x}", *b).unwrap(); + } + } + } + result +} + +/// Encodes a node path using () instead of [] and {}. Such paths should +/// still be unambiguous, and should be more readable than their +/// percent-encoded variants (only round parentheses are unreserved in URLs). +fn path_encode<S: AsRef<str>>(text: S) -> String { + text.as_ref() + .chars() + .map(|c| match c { + '[' => '(', + ']' => ')', + '<' => '(', + '>' => ')', + c => c, + }) + .collect() +} + +/// Formats a path to a node or diagnostic. +fn format_path(path: &path::PathBuf, index: Option<usize>) -> String { + if let Some(index) = index { + format!("{path}:{index}") + } else { + path.to_string() + } +} + +/// Formats the parameters of an <a> tag to a node or diagnostic. +fn format_reference_parameters(path: &path::PathBuf, index: Option<usize>) -> String { + let path = format_path(path, index); + format!( + "href=\"#{}\" title=\"{}\"", + html_escape(url_encode(path_encode(&path))), + html_escape(&path) + ) +} + +/// Formats a link to a node (index = None) +/// or diagnostic (index = Some(index of NodeData entry)). +fn format_reference<S: std::fmt::Display>( + text: S, + path: &path::PathBuf, + index: Option<usize>, +) -> String { + format!("<a {}>{text}</a>", format_reference_parameters(path, index)) +} + +/// Formats an anchor/permalink tag for a node (index = None) +/// or diagnostic (index = Some(index of NodeData entry)).
+fn format_anchor(path: &path::PathBuf, index: Option<usize>) -> String { + format!( + "<a class=\"anchor\" {}></a>", + format_reference_parameters(path, index) + ) +} + +/// Formats the id parameter for a div/details tag for a node (index = None) +/// or diagnostic (index = Some(index of NodeData entry)). +fn format_id(path: &path::PathBuf, index: Option<usize>) -> String { + format!( + "id=\"{}\"", + html_escape(url_encode(path_encode(format_path(path, index)))) + ) +} + +/// Creates a span with the given class name. The text is HTML-escaped. +fn format_span<S: ToString>(class: &'static str, text: S) -> String { + format!( + "<span class=\"{class}\">{}</span>", + html_escape(text.to_string()) + ) +} + +/// Creates a span with the given class name. +fn format_span_html<S: std::fmt::Display>(class: &'static str, html: S) -> String { + format!("<span class=\"{class}\">{}</span>", html) +} + +/// Formats a diagnostic message box. path should be the node that the +/// diagnostic is defined in, and index should be its index within Node::data. +/// with_id specifies whether the HTML id parameter should be included. +fn format_diagnostic( + diag: &diagnostic::Diagnostic, + path: &path::PathBuf, + index: usize, + with_id: bool, + with_path: bool, +) -> String { + let cause = format_span( + "cause", + if with_path { + diag.to_string() + } else { + format!("{:#}", diag) + }, + ); + let cause = if &diag.path == path { + cause + } else { + format_reference(cause, &diag.path, None) + }; + let id = if with_id { + let mut id = format_id(path, Some(index)); + id.push(' '); + id + } else { + String::new() + }; + let anchor = format_anchor(path, Some(index)); + + let class = match diag.adjusted_level { + diagnostic::Level::Info => "diag_info", + diagnostic::Level::Warning => "diag_warn", + diagnostic::Level::Error => "diag_error", + }; + + format!("<div {id}class=\"card {class}\">\n{cause}\n{anchor}\n</div>") +} + +/// Format a flattened list of diagnostic cards. +fn format_diagnostics(path: &path::Path, node: &tree::Node) -> (Vec<String>, diagnostic::Level) { + let mut html = vec![]; + let mut level = diagnostic::Level::Info; + for (index, data) in node.data.iter().enumerate() { + match data { + tree::NodeData::Child(child) => { + let (sub_html, sub_level) = + format_diagnostics(&path.with(child.path_element.clone()), &child.node); + html.extend(sub_html); + level = std::cmp::max(level, sub_level); + } + tree::NodeData::Diagnostic(diag) => { + html.push(format_diagnostic( + diag, + &path.to_path_buf(), + index, + false, + true, + )); + level = std::cmp::max(level, diag.adjusted_level); + } + _ => {} + } + } + (html, level) +} + +/// Formats a comment span. +fn format_comment_span(span: &comment::Span) -> String { + match &span.link { + None => html_escape(&span.text), + Some(comment::Link::Path(path)) => format_reference(html_escape(&span.text), path, None), + Some(comment::Link::Url(url)) => format!( + "<a href=\"{}\">{}</a>", + html_escape(url), + html_escape(&span.text) + ), + } +} + +/// Formats a comment using HTML markup. +fn format_comment(comment: &comment::Comment) -> String { + let mut result = String::new(); + let mut p_open = false; + for element in comment.elements().iter() { + match element { + comment::Element::Span(span) => { + if !p_open { + result += "<p>"; + p_open = true; + } + result += &format_comment_span(span); + } + comment::Element::NewLine => { + if p_open { + result += "</p>"; + p_open = false; + } + } + comment::Element::ListOpen => { + if p_open { + result += "</p>"; + p_open = false; + } + result += "<ul><li>"; + } + comment::Element::ListNext => { + if p_open { + result += "</p>"; + p_open = false; + } + result += "</li><li>"; + } + comment::Element::ListClose => { + if p_open { + result += "</p>"; + p_open = false; + } + result += "</li></ul>"; + } + } + } + if p_open { + result += "</p>"; + } + result +} + +/// Formats a brief comment using HTML markup. +fn format_brief(brief: &comment::Brief) -> String { + let mut result = String::new(); + for span in brief.spans().iter() { + result += &format_comment_span(span); + } + result +} + +// Format the relation trees. +fn format_relation_tree( + path: &path::Path, + node: &tree::Node, + index: &mut usize, + is_root: bool, + in_expression: bool, +) -> Vec<String> { + let mut html = vec![]; + + let text = node + .brief + .as_ref() + .map(format_brief) + .unwrap_or_else(|| String::from("unknown")); + let is_relation = matches!(node.class, tree::Class::Relation); + let is_expression = matches!(node.class, tree::Class::Expression); + + if is_relation { + if is_root { + html.push("
".to_string()); + html.push(format!( + "Query/relation graph #{}", + *index + )); + html.push("
  • Sink
      ".to_string()); + }; + html.push(format!( + "
    • {text} ({})", + if in_expression { + "subquery" + } else { + "data_source" + }, + format_reference("link", &path.to_path_buf(), None) + )); + } + + let mut has_children = false; + for data in node.data.iter() { + if let tree::NodeData::Child(child) = data { + let sub_html = format_relation_tree( + &path.with(child.path_element.clone()), + &child.node, + index, + is_root && !is_relation, + (in_expression && !is_relation) || is_expression, + ); + if !sub_html.is_empty() { + if is_relation && !has_children { + html.push("
        ".to_string()); + } + has_children = true; + html.extend(sub_html); + } + } + } + + if is_relation { + if has_children { + html.push("
      ".to_string()); + } + html.push("
    • ".to_string()); + if is_root { + html.push("
".to_string()); + html.push("
".to_string()); + *index += 1; + } + } + + html +} + +// Format a data type parameter card. +fn format_data_type_card(content: &str) -> String { + format!( + "
<div class=\"card data_type\">\n{}\n</div>", + html_escape(content), + ) +} + +// Format a data type. +fn format_data_type(prefix: &str, data_type: &Arc<data_type::DataType>) -> Vec<String> { + let mut html = vec![]; + + if data_type.parameters().is_empty() { + html.push(format_data_type_card(&format!("{prefix}: {:#}", data_type))); + } else { + html.push("<details class=\"data_type\">\n".to_string()); + html.push(format!("<summary>{prefix}: {}</summary>", html_escape(data_type.to_string()))); + html.push("".to_string()); + for (index, parameter) in data_type.parameters().iter().enumerate() { + let name = data_type + .class() + .parameter_name(index) + .unwrap_or_else(|| "?".to_string()); + match parameter { + data_type::Parameter::Type(t) => { + html.extend(format_data_type(&format!(".{name}"), t)) + } + data_type::Parameter::NamedType(n, t) => { + html.extend(format_data_type(&format!(".{n}"), t)) + } + data_type::Parameter::Unsigned(i) => { + html.push(format_data_type_card(&format!(".{name}: {i}"))) + } + } + } + html.push("</details>".to_string()); + } + + html +} + +// Format the node tree. +fn format_node_tree( + path: &path::Path, + unknown_subtree: bool, + node: &tree::Node, +) -> (Vec<String>, Level) { + // Get the HTML ID for this card. + let pathbuf = path.to_path_buf(); + let id = format_id(&pathbuf, None); + + // Format the card header. + let brief = if let Some(brief) = &node.brief { + format_span_html("brief", format_brief(brief)) + } else { + String::from("") + }; + let value = match &node.node_type { + tree::NodeType::ProtoMessage(proto_type) => { + format!("{brief} {}", format_span("type", proto_type)) + } + tree::NodeType::ProtoPrimitive(proto_type, data) => { + format!( + "= {} {brief} {}", + format_span("value", data), + format_span("type", proto_type) + ) + } + tree::NodeType::ProtoMissingOneOf => "?".to_string(), + tree::NodeType::NodeReference(num, target) => format_reference( + format!( + "= {} {brief} {}", + format_span("value", num), + format_span("type", "uint32, reference") + ), + &target.path, + None, + ), + tree::NodeType::YamlReference(yaml) => { + format!( + "= {} {brief} {}", + format_span("value", &yaml.uri), + format_span("type", "string, resolved to YAML") + ) + } + tree::NodeType::YamlMap => format!("{brief} {}", format_span("type", "YAML map")), + tree::NodeType::YamlArray => format!("{brief} {}", format_span("type", "YAML array")), + tree::NodeType::YamlPrimitive(data) => format!("= {}{brief}", format_span("value", data)), + }; + let header = format!( + "{} {value} {}", + format_span("field", path.end_to_string()), + format_anchor(&pathbuf, None) + ); + + // If the node doesn't have any additional data associated with it, output + // a normal <div> rather than a <details> card. + if node.data.is_empty() && node.summary.is_none() { + let class = if unknown_subtree { "unknown" } else { "ok" }; + return ( + vec![format!("<div class=\"card {class}\">{header}</div>")], + Level::Ok, + ); + } + + // Gather child nodes here. The first entry of the html Vec is reserved for + // the open tags, which we don't have all the information for just yet. + let mut html = vec![String::new()]; + let mut level = Level::Ok; + + // Add the summary. + if let Some(ref summary) = node.summary { + html.push(format_comment(summary)); + } + + // Iterate over node data here, recursively entering children. + for (index, data) in node.data.iter().enumerate() { + match data { + tree::NodeData::Child(child) => { + let (sub_html, sub_level) = format_node_tree( + &path.with(child.path_element.clone()), + !child.recognized, + &child.node, + ); + html.extend(sub_html); + level = std::cmp::max(level, sub_level); + } + tree::NodeData::Diagnostic(diag) => { + html.push(format_diagnostic( + diag, + &pathbuf, + index, + true, + diag.path != pathbuf, + )); + level = std::cmp::max(level, diag.adjusted_level.into()); + } + tree::NodeData::DataType(data_type) => { + html.extend(format_data_type( + if matches!(node.class, tree::Class::Relation) { + "Schema" + } else { + "Data type" + }, + data_type, + )); + } + tree::NodeData::Comment(comment) => { + html.push("<div class=\"comment\">\n".to_string()); + html.push(format_comment(comment)); + html.push("\n</div>".to_string()); + } + } + } + + // Add the surrounding <details> tags now that we have the error level + // information we needed. + let class = if unknown_subtree { + "unknown" + } else { + level.class() + }; + html[0] = format!("<details {id}class=\"{class}\">\n<summary>\n{header}\n</summary>"); + html.push("</details>".to_string()); + + // Determine the minimum error level for the parent. + let level = match level { + Level::Error => Level::ChildError, + Level::Warning => Level::ChildWarning, + x => x, + }; + + (html, level) +} + +/// Export the tree in HTML format, with as many details as possible, and as +/// human-readable as possible. Purely intended for debugging. +pub fn export<T: std::io::Write>( + out: &mut T, + root_name: &'static str, + result: &parse_result::ParseResult, +) -> std::io::Result<()> { + let path = path::Path::Root(root_name); + + // Generate and write header. + let font_awesome_b64 = base64::encode(FONT_AWESOME); + write!(out, "{HEADER1}{}{HEADER2}", font_awesome_b64)?; + + // Emit the node graph. + writeln!(out, "
")?; + writeln!(out, "Relation graphs")?; + writeln!( + out, + "
Note: data flows upwards in these graphs.
" + )?; + let mut index = 0; + for s in format_relation_tree(&path, &result.root, &mut index, true, false) { + writeln!(out, "{s}")?; + } + writeln!(out, "
")?; + + // Emit diagnostics summary. + let (diag_html, level) = format_diagnostics(&path, &result.root); + let validity_class = match level { + diagnostic::Level::Info => "valid", + diagnostic::Level::Warning => "maybe_valid", + diagnostic::Level::Error => "invalid", + }; + let validity_summary = match level { + diagnostic::Level::Info => "This plan is VALID", + diagnostic::Level::Warning => "The validator was unable to determine validity", + diagnostic::Level::Error => "This plan is INVALID", + }; + writeln!( + out, + "
", + Level::from(level).class() + )?; + writeln!( + out, + "{validity_summary}" + )?; + if diag_html.is_empty() { + writeln!( + out, + "
No diagnostics were reported.
" + )?; + } else { + for s in diag_html { + writeln!(out, "{s}")?; + } + } + writeln!(out, "
")?; + + // Emit protobuf-level raw node tree. + for s in format_node_tree(&path, false, &result.root).0 { + writeln!(out, "{s}")?; + } + + write!(out, "{FOOTER}") +} diff --git a/rs/src/export/html/style.css b/rs/src/export/html/style.css new file mode 100644 index 00000000..96dc1747 --- /dev/null +++ b/rs/src/export/html/style.css @@ -0,0 +1,333 @@ +body { + font-family: sans-serif; +} + +details, +div.card { + border: 1px solid; + border-color: rgba(0, 0, 0, .3); + color: rgba(0, 0, 0, .8); + border-radius: 4px; + margin-top: .2em; +} + +details:hover, +div.card:hover { + border-color: #000; + color: rgba(0, 0, 0, .9); +} + +details:target, +div.card:target { + box-shadow: 0 0 .3em .2em rgba(0, 0, 0, 0.3); + border-color: #000; + color: rgba(0, 0, 0, .9); +} + +details { + padding: .2em .5em 0; +} + +summary { + margin: -.2em -.5em 0; + padding: .2em .5em; +} + +details[open] { + padding: .2em .5em; +} + +details[open] > summary { + border-bottom: 1px solid rgba(0, 0, 0, .3); + margin-bottom: .2em; +} + +div.card { + padding: .2em .5em; +} + +details.ok { + background-color: #dfd; +} + +details.warn_child { + background-color: #fed; +} + +details.warn_here { + background-color: #fdb; +} + +details.error_child { + background-color: #fdd; +} + +details.error_here { + background-color: #fbb; +} + +details.unknown, +div.unknown { + background-color: #ddd; +} + +details.data_type { + background-color: #def; +} + +details.data_type > summary::before { + font: normal 900 1em "Font Awesome 6 Free"; + color: #048; + content: "\f0db"; + padding-right: .2em; +} + +div.data_type { + background-color: #bdf; +} + +div.data_type::before { + font: normal 900 1em "Font Awesome 6 Free"; + color: #048; + content: "\f0db"; + padding-right: .2em; +} + +div.comment { + background-color: #bfd; +} + +div.comment::before { + font: normal 900 1em "Font Awesome 6 Free"; + color: #084; + content: "\f249"; + padding-right: .2em; + float: left; +} + +div.comment > p { + margin: 0 0 0.2em; +} + +details > p { + margin: 0 0 0.2em; +} + +details.relation_tree { + background-color: #bdf; +} + +details.relation_tree > summary::before { + font: normal 900 1em "Font Awesome 6 Free"; + color: #048; + content: "\f0e8"; + padding-right: .2em; +} + +div.diag_info { + background-color: #9f9; + color: #333; +} + +div.diag_info::before, +summary.valid::before { + font: normal 900 1em "Font Awesome 6 Free"; + color: #080; + content: "\f058"; +} + +span.valid { + color: #080; + font-weight: bold; +} + +div.diag_warn { + background-color: #fc9; + color: #333; + font-weight: bold; +} + +div.diag_warn::before, +summary.maybe_valid::before { + font: normal 900 1em "Font Awesome 6 Free"; + color: #840; + content: "\f059"; +} + +div.diag_error { + background-color: #f99; + color: #000; + font-weight: bold; +} + +div.diag_error::before, +summary.invalid::before { + font: normal 900 1em "Font Awesome 6 Free"; + color: #800; + content: "\f00d"; +} + +span.invalid { + color: #c00; + font-weight: bold; +} + +a.anchor { + opacity: 0.4; + text-decoration: none; + float: right; +} + +a.anchor:hover { + opacity: 1.0; +} + +a.anchor::before { + font: normal 900 1em "Font Awesome 6 Free"; + color: #000; + content: "\f0c1"; +} + +details:target, +div.card:target { + animation: highlight 1000ms ease-out; +} + +@keyframes highlight { + 0% { box-shadow: 0 0 2em 1em rgba(0, 0, 0, 0.3); } + 50% { box-shadow: 0 0 2em 1em rgba(0, 0, 0, 0.3); } + 100% { } +} + +span.field { + font-weight: bold; + color: #333; +} + +span.value { + font-weight: bold; + color: 
#000; +} + +span.brief { + font-style: italic; + color: #000; +} + +span.type { + font-style: italic; + font-size: 80%; + color: #555; +} + +span.cause { + font-weight: normal; +} + +div.note { + font-style: italic; + color: #555; +} + +.tree, +.tree ul, +.tree li { + list-style: none; + margin: 0; + padding: 0; + position: relative; +} + +.tree { + margin: 0 auto 1em; + text-align: center; +} + +.tree, +.tree ul { + display: table; +} + +.tree ul { + width: 100%; +} + +.tree li { + display: table-cell; + padding: 1.5em 0 0; + vertical-align: top; +} + +/* _________ */ +.tree li:before { + outline: solid 1px #666; + content: ""; + left: 0; + position: absolute; + right: 0; + top: 0; +} + +.tree li:first-child:before { + left: 50%; +} + +.tree li:last-child:before { + right: 50%; +} + +.tree span { + border: solid 0.1em #666; + border-radius: 0.2em; + display: inline-block; + margin: 0 0.2em 0.5em; + padding: 0.2em 0.5em; + position: relative; +} + +/* | */ +.tree ul:before { + outline: solid 1px #555; + content: ""; + height: 0.5em; + left: 50%; + position: absolute; +} + +.tree span:before { + margin-left: -1px; + padding-left: 0.2em; + font-size: 100%; + content: ""; + height: 1.5em; + left: 50%; + position: absolute; +} + +.tree span.data_source:before { + border-left: solid 2px #555; +} + +.tree span.subquery:before { + border-left: dotted 2px #555; +} + +.tree ul:before { + top: -0.5em; +} + +.tree span:before { + top: -1.55em; +} + +/* The root node doesn't connect upwards */ +.tree > li { + margin-top: 0; +} + +.tree > li:before, +.tree > li:after, +.tree > li > span:before { + outline: none !important; + border: none !important; +} diff --git a/rs/src/export/mod.rs b/rs/src/export/mod.rs new file mode 100644 index 00000000..2a865667 --- /dev/null +++ b/rs/src/export/mod.rs @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module dealing with serializing a [ParseResult](parse_result::ParseResult) +//! to a byte stream in various formats. + +mod diagnostics; +mod html; +mod proto; + +use crate::output::parse_result; + +/// Supported output formats for exporting. +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Format { + /// Emit a newline-separated, flattened list of diagnostics. + Diagnostics, + + /// Emit a HTML page with detailed information about the parsed plan. + Html, + + /// Emit all parse information as a substrait.validator.Node protobuf + /// message, using binary serialization. + Proto, +} + +/// Exports the given doctree with the given format to the given output. +pub fn export( + out: &mut T, + format: Format, + root_name: &'static str, + result: &parse_result::ParseResult, +) -> std::io::Result<()> { + match format { + Format::Diagnostics => diagnostics::export(out, root_name, result), + Format::Html => html::export(out, root_name, result), + Format::Proto => proto::export(out, root_name, result), + } +} diff --git a/rs/src/export/proto.rs b/rs/src/export/proto.rs new file mode 100644 index 00000000..4343abe0 --- /dev/null +++ b/rs/src/export/proto.rs @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! This module provides an export format based on protobuf, to represent the +//! output tree as accurately as possible. +//! +//! This is primarily intended to be used to cross programming language +//! boundaries for the validator output, whenever the simplified formats are +//! not comprehensive enough. The Python bindings specifically make extensive +//! use of this. 
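+//!
+//! As a sketch of the consuming side (illustrative: the buffer handling is an
+//! assumption, while `export` and the `validator::ParseResult` message are
+//! the ones used by this module), the emitted bytes can be decoded again with
+//! prost:
+//!
+//! ```text
+//! use prost::Message;
+//! let mut buf: Vec<u8> = Vec::new();
+//! export(&mut buf, "plan", &parse_result)?;
+//! let tree = validator::ParseResult::decode(buf.as_slice())?;
+//! ```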
+ +use crate::input::proto::substrait::validator; +use crate::output::comment; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::output::extension; +use crate::output::parse_result; +use crate::output::path; +use crate::output::primitive_data; +use crate::output::tree; +use prost::Message; + +impl From<&parse_result::ParseResult> for validator::ParseResult { + fn from(result: &parse_result::ParseResult) -> Self { + Self { + root: Some((&result.root).into()), + } + } +} + +impl From<&tree::Node> for validator::Node { + fn from(node: &tree::Node) -> Self { + Self { + node_type: Some((&node.node_type).into()), + class: (&node.class).into(), + brief: node.brief.as_ref().map(|x| x.into()), + summary: node.summary.as_ref().map(|x| x.into()), + data_type: node.data_type.as_ref().map(|x| x.as_ref().into()), + data: node.data.iter().map(|x| x.into()).collect(), + } + } +} + +impl From<&tree::Class> for i32 { + fn from(class: &tree::Class) -> Self { + match class { + tree::Class::Misc => validator::node::Class::Unspecified, + tree::Class::Type => validator::node::Class::Type, + tree::Class::Expression => validator::node::Class::Expression, + tree::Class::Relation => validator::node::Class::Relation, + } + .into() + } +} + +impl From<&tree::NodeData> for validator::node::Data { + fn from(node: &tree::NodeData) -> Self { + Self { + kind: Some(match node { + tree::NodeData::Child(child) => validator::node::data::Kind::Child(child.into()), + tree::NodeData::Diagnostic(diagnostic) => { + validator::node::data::Kind::Diagnostic(diagnostic.into()) + } + tree::NodeData::DataType(data_type) => { + validator::node::data::Kind::DataType(data_type.as_ref().into()) + } + tree::NodeData::Comment(comment) => { + validator::node::data::Kind::Comment(comment.into()) + } + }), + } + } +} + +impl From<&tree::Child> for validator::node::Child { + fn from(node: &tree::Child) -> Self { + Self { + path: Some((&node.path_element).into()), + node: Some(node.node.as_ref().into()), + recognized: node.recognized, + } + } +} + +impl From<&diagnostic::Diagnostic> for validator::Diagnostic { + fn from(node: &diagnostic::Diagnostic) -> Self { + Self { + original_level: (&node.original_level).into(), + adjusted_level: (&node.adjusted_level).into(), + cause: node.cause.classification.into(), + msg: node.cause.to_string(), + path: Some((&node.path).into()), + } + } +} + +impl From<&diagnostic::Level> for i32 { + fn from(node: &diagnostic::Level) -> Self { + match node { + diagnostic::Level::Error => validator::diagnostic::Level::Error, + diagnostic::Level::Warning => validator::diagnostic::Level::Warning, + diagnostic::Level::Info => validator::diagnostic::Level::Info, + } + .into() + } +} + +impl From<&comment::Comment> for validator::Comment { + fn from(node: &comment::Comment) -> Self { + Self { + elements: node.elements().iter().map(|x| x.into()).collect(), + } + } +} + +impl From<&comment::Brief> for validator::Comment { + fn from(node: &comment::Brief) -> Self { + Self { + elements: node + .spans() + .iter() + .map(|x| validator::comment::Element { + kind: Some(validator::comment::element::Kind::Span(x.into())), + }) + .collect(), + } + } +} + +impl From<&comment::Element> for validator::comment::Element { + fn from(node: &comment::Element) -> Self { + validator::comment::Element { + kind: Some(match node { + comment::Element::Span(span) => { + validator::comment::element::Kind::Span(span.into()) + } + comment::Element::NewLine => { + validator::comment::element::Kind::NewLine(validator::Empty {}) + } + 
comment::Element::ListOpen => { + validator::comment::element::Kind::ListOpen(validator::Empty {}) + } + comment::Element::ListNext => { + validator::comment::element::Kind::ListNext(validator::Empty {}) + } + comment::Element::ListClose => { + validator::comment::element::Kind::ListClose(validator::Empty {}) + } + }), + } + } +} + +impl From<&comment::Span> for validator::comment::Span { + fn from(node: &comment::Span) -> Self { + Self { + text: node.text.to_string(), + link: node.link.as_ref().map(|x| x.into()), + } + } +} + +impl From<&comment::Link> for validator::comment::span::Link { + fn from(node: &comment::Link) -> Self { + match node { + comment::Link::Path(path) => validator::comment::span::Link::Path(path.into()), + comment::Link::Url(url) => validator::comment::span::Link::Url(url.into()), + } + } +} + +impl From<&tree::NodeType> for validator::node::NodeType { + fn from(node: &tree::NodeType) -> Self { + match node { + tree::NodeType::ProtoMessage(proto_type) => { + validator::node::NodeType::ProtoMessage(validator::node::ProtoMessage { + path: proto_type.to_string(), + }) + } + tree::NodeType::ProtoPrimitive(proto_type, data) => { + validator::node::NodeType::ProtoPrimitive(validator::node::ProtoPrimitive { + path: proto_type.to_string(), + data: Some(data.into()), + }) + } + tree::NodeType::ProtoMissingOneOf => { + validator::node::NodeType::ProtoMissingOneof(validator::Empty::default()) + } + tree::NodeType::NodeReference(anchor, node) => { + validator::node::NodeType::NodeReference(validator::node::NodeReference { + value: *anchor, + path: Some((&node.path).into()), + }) + } + tree::NodeType::YamlReference(info) => { + validator::node::NodeType::YamlReference(validator::node::YamlReference { + uri: info.uri.name().unwrap_or_default().to_string(), + }) + } + tree::NodeType::YamlMap => { + validator::node::NodeType::YamlMap(validator::Empty::default()) + } + tree::NodeType::YamlArray => { + validator::node::NodeType::YamlArray(validator::Empty::default()) + } + tree::NodeType::YamlPrimitive(data) => { + validator::node::NodeType::YamlPrimitive(data.into()) + } + } + } +} + +impl From<&primitive_data::PrimitiveData> for validator::node::PrimitiveData { + fn from(node: &primitive_data::PrimitiveData) -> Self { + Self { + data: match node { + primitive_data::PrimitiveData::Null => None, + primitive_data::PrimitiveData::Bool(x) => { + Some(validator::node::primitive_data::Data::Boolean(*x)) + } + primitive_data::PrimitiveData::Unsigned(x) => { + Some(validator::node::primitive_data::Data::Unsigned(*x)) + } + primitive_data::PrimitiveData::Signed(x) => { + Some(validator::node::primitive_data::Data::Signed(*x)) + } + primitive_data::PrimitiveData::Float(x) => { + Some(validator::node::primitive_data::Data::Real(*x)) + } + primitive_data::PrimitiveData::String(x) => Some( + validator::node::primitive_data::Data::Unicode(x.to_string()), + ), + primitive_data::PrimitiveData::Bytes(x) => { + Some(validator::node::primitive_data::Data::Binary(x.clone())) + } + primitive_data::PrimitiveData::Enum(x) => Some( + validator::node::primitive_data::Data::Variant(x.to_string()), + ), + primitive_data::PrimitiveData::Any(x) => { + Some(validator::node::primitive_data::Data::Any(x.clone())) + } + }, + } + } +} + +impl From<&path::PathBuf> for validator::Path { + fn from(node: &path::PathBuf) -> Self { + Self { + root: node.root.to_string(), + elements: node.elements.iter().map(|x| x.into()).collect(), + } + } +} + +impl From<&path::PathElement> for validator::path::Element { + fn from(node: 
&path::PathElement) -> Self { + Self { + kind: Some(match node { + path::PathElement::Field(field) => { + validator::path::element::Kind::Field(validator::path::Field { + field: field.to_string(), + }) + } + path::PathElement::Repeated(field, index) => { + validator::path::element::Kind::RepeatedField(validator::path::RepeatedField { + field: field.to_string(), + index: (*index).try_into().unwrap(), + }) + } + path::PathElement::Variant(field, variant) => { + validator::path::element::Kind::OneofField(validator::path::OneOfField { + field: field.to_string(), + variant: variant.to_string(), + }) + } + path::PathElement::Index(index) => { + validator::path::element::Kind::ArrayElement(validator::path::ArrayElement { + index: (*index).try_into().unwrap(), + }) + } + }), + } + } +} + +impl From<&data_type::DataType> for validator::DataType { + fn from(node: &data_type::DataType) -> Self { + Self { + class: Some(node.class().into()), + nullable: node.nullable(), + variation: node.variation().as_ref().map(|x| x.as_ref().into()), + parameters: node.parameters().iter().map(|x| x.into()).collect(), + } + } +} + +impl From<&data_type::Class> for validator::data_type::Class { + fn from(node: &data_type::Class) -> Self { + validator::data_type::Class { + kind: Some(match node { + data_type::Class::Simple(simple) => { + validator::data_type::class::Kind::Simple(simple.into()) + } + data_type::Class::Compound(compound) => { + validator::data_type::class::Kind::Compound(compound.into()) + } + data_type::Class::UserDefined(user_defined) => { + validator::data_type::class::Kind::UserDefinedType(user_defined.as_ref().into()) + } + data_type::Class::Unresolved => { + validator::data_type::class::Kind::UnresolvedType(validator::Empty {}) + } + }), + } + } +} + +impl From<&data_type::Simple> for i32 { + fn from(node: &data_type::Simple) -> Self { + match node { + data_type::Simple::Boolean => validator::data_type::Simple::Boolean, + data_type::Simple::I8 => validator::data_type::Simple::I8, + data_type::Simple::I16 => validator::data_type::Simple::I16, + data_type::Simple::I32 => validator::data_type::Simple::I32, + data_type::Simple::I64 => validator::data_type::Simple::I64, + data_type::Simple::Fp32 => validator::data_type::Simple::Fp32, + data_type::Simple::Fp64 => validator::data_type::Simple::Fp64, + data_type::Simple::String => validator::data_type::Simple::String, + data_type::Simple::Binary => validator::data_type::Simple::Binary, + data_type::Simple::Timestamp => validator::data_type::Simple::Timestamp, + data_type::Simple::TimestampTz => validator::data_type::Simple::TimestampTz, + data_type::Simple::Date => validator::data_type::Simple::Date, + data_type::Simple::Time => validator::data_type::Simple::Time, + data_type::Simple::IntervalYear => validator::data_type::Simple::IntervalYear, + data_type::Simple::IntervalDay => validator::data_type::Simple::IntervalDay, + data_type::Simple::Uuid => validator::data_type::Simple::Uuid, + } + .into() + } +} + +impl From<&data_type::Compound> for i32 { + fn from(node: &data_type::Compound) -> Self { + match node { + data_type::Compound::FixedChar => validator::data_type::Compound::FixedChar, + data_type::Compound::VarChar => validator::data_type::Compound::VarChar, + data_type::Compound::FixedBinary => validator::data_type::Compound::FixedBinary, + data_type::Compound::Decimal => validator::data_type::Compound::Decimal, + data_type::Compound::Struct => validator::data_type::Compound::Struct, + data_type::Compound::NamedStruct => 
validator::data_type::Compound::NamedStruct,
+            data_type::Compound::List => validator::data_type::Compound::List,
+            data_type::Compound::Map => validator::data_type::Compound::Map,
+        }
+        .into()
+    }
+}
+
+impl From<&extension::Reference<extension::DataType>> for validator::data_type::UserDefinedType {
+    fn from(node: &extension::Reference<extension::DataType>) -> Self {
+        Self {
+            uri: node.uri.name().unwrap_or_default().to_string(),
+            name: node.name.name().unwrap_or_default().to_string(),
+            definition: node.definition.as_ref().map(|x| x.as_ref().into()),
+        }
+    }
+}
+
+impl From<&extension::DataType> for validator::data_type::user_defined_type::Definition {
+    fn from(node: &extension::DataType) -> Self {
+        Self {
+            structure: node
+                .structure
+                .iter()
+                .map(
+                    |(name, simple)| validator::data_type::user_defined_type::Element {
+                        name: name.to_string(),
+                        kind: simple.into(),
+                    },
+                )
+                .collect(),
+        }
+    }
+}
+
+impl From<&extension::Reference<extension::TypeVariation>> for validator::data_type::Variation {
+    fn from(node: &extension::Reference<extension::TypeVariation>) -> Self {
+        if let Some(ref definition) = node.definition {
+            validator::data_type::Variation::UserDefinedVariation(
+                validator::data_type::UserDefinedVariation {
+                    uri: node.uri.name().unwrap_or_default().to_string(),
+                    name: node.name.name().unwrap_or_default().to_string(),
+                    definition: Some(Box::new(definition.as_ref().into())),
+                },
+            )
+        } else {
+            validator::data_type::Variation::UnresolvedVariation(validator::Empty {})
+        }
+    }
+}
+
+impl From<&extension::TypeVariation> for validator::data_type::user_defined_variation::Definition {
+    fn from(node: &extension::TypeVariation) -> Self {
+        Self {
+            base_type: None,
+            function_behavior: (&node.function_behavior).into(),
+        }
+    }
+}
+
+impl From<&extension::FunctionBehavior> for i32 {
+    fn from(node: &extension::FunctionBehavior) -> Self {
+        match node {
+            extension::FunctionBehavior::Inherits => {
+                validator::data_type::user_defined_variation::FunctionBehavior::Inherits
+            }
+            extension::FunctionBehavior::Separate => {
+                validator::data_type::user_defined_variation::FunctionBehavior::Separate
+            }
+        }
+        .into()
+    }
+}
+
+impl From<&data_type::Parameter> for validator::data_type::Parameter {
+    fn from(node: &data_type::Parameter) -> Self {
+        Self {
+            kind: Some(match node {
+                data_type::Parameter::Type(data_type) => {
+                    validator::data_type::parameter::Kind::DataType(data_type.as_ref().into())
+                }
+                data_type::Parameter::NamedType(name, data_type) => {
+                    validator::data_type::parameter::Kind::NamedType(validator::data_type::Named {
+                        name: name.to_string(),
+                        data_type: Some(data_type.as_ref().into()),
+                    })
+                }
+                data_type::Parameter::Unsigned(unsigned) => {
+                    validator::data_type::parameter::Kind::Unsigned(*unsigned)
+                }
+            }),
+        }
+    }
+}
+
+/// Export the complete parse tree in protobuf substrait.validator.Node format.
+pub fn export<T: std::io::Write>(
+    out: &mut T,
+    _root_name: &'static str,
+    result: &parse_result::ParseResult,
+) -> std::io::Result<()> {
+    let root = validator::ParseResult::from(result);
+    let buf = root.encode_to_vec();
+    if out.write(&buf)? < buf.len() {
+        Err(std::io::Error::new(
+            std::io::ErrorKind::Other,
+            "failed to write all bytes",
+        ))
+    } else {
+        Ok(())
+    }
+}
diff --git a/rs/src/input/config.rs b/rs/src/input/config.rs
new file mode 100644
index 00000000..0f6dd99f
--- /dev/null
+++ b/rs/src/input/config.rs
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! This module provides the configuration structure for the validator.
+//!
+//! This structure, [`Config`], is to be constructed by the application using
+//! the validator to configure it. Alternatively, the default configuration
+//! can be constructed by using the [`std::default::Default`] trait.
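+//!
+//! As an illustrative sketch only (combining methods defined below):
+//!
+//! ```ignore
+//! let mut config = substrait_validator::Config::new();
+//! config.ignore_unknown_fields();
+//! config.override_diagnostic_level(
+//!     substrait_validator::Classification::Unclassified,
+//!     substrait_validator::Level::Info,
+//!     substrait_validator::Level::Warning,
+//! );
+//! ```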
+
+use crate::output::diagnostic;
+pub use glob;
+use std::collections::HashMap;
+
+/// Trait object representing some immutable binary data.
+pub type BinaryData = Box<dyn AsRef<[u8]>>;
+
+/// Trait object representing some error data.
+pub type ErrorData = Box<dyn std::error::Error>;
+
+/// Callback function type for resolving/downloading URIs.
+pub type UriResolver =
+    Box<dyn Fn(&str) -> std::result::Result<BinaryData, ErrorData> + Send>;
+
+/// Attempts to resolve and fetch the data for the given URI using libcurl,
+/// allowing the validator to handle remote YAML extension URLs with most
+/// protocols.
+#[cfg(feature = "curl")]
+fn resolve_with_curl(uri: &str) -> Result<Vec<u8>, curl::Error> {
+    let mut binary_data: Vec<u8> = vec![];
+    let mut curl_handle = curl::easy::Easy::new();
+    curl_handle.url(uri)?;
+    {
+        let mut transfer = curl_handle.transfer();
+        transfer.write_function(|buf| {
+            binary_data.extend_from_slice(buf);
+            Ok(buf.len())
+        })?;
+        transfer.perform()?;
+    }
+    Ok(binary_data)
+}
+
+/// Configuration structure.
+#[derive(Default)]
+pub struct Config {
+    /// When set, do not generate warnings for unknown protobuf fields that are
+    /// set to their protobuf-defined default value.
+    pub ignore_unknown_fields: bool,
+
+    /// Protobuf message URLs that are explicitly allowed for use in "any"
+    /// messages, i.e. that the caller warrants the existence of in the
+    /// consumer that the plan is validated for.
+    pub allowed_proto_any_urls: Vec<glob::Pattern>,
+
+    /// Allows the level of diagnostic messages to be overridden based on their
+    /// classification/code. The logic for this is as follows:
+    ///
+    /// - if an entry exists for the classification of the incoming diagnostic,
+    ///   override its error level to at most the second argument, and then to
+    ///   at least the first argument. Otherwise,
+    /// - if an entry exists for the group of said classification, use its
+    ///   level limits instead. Otherwise,
+    /// - if an entry exists for Unclassified (code 0), use its level limits
+    ///   instead. Otherwise, do not adjust the level.
+    ///
+    /// Note that setting an entry to (Info, Error) leaves the diagnostic
+    /// level unchanged.
+    pub diagnostic_level_overrides:
+        HashMap<diagnostic::Classification, (diagnostic::Level, diagnostic::Level)>,
+
+    /// Allows URIs from the plan to be remapped (Some(mapping)) or ignored
+    /// (None). All resolution can effectively be disabled by just adding a
+    /// rule that maps * to None. Furthermore, in the absence of a custom
+    /// uri_resolver function, this can be used to remap URIs to
+    /// pre-downloaded files.
+    pub uri_overrides: Vec<(glob::Pattern, Option<String>)>,
+
+    /// Optional callback function for resolving URIs. If specified, all
+    /// URIs (after processing uri_overrides) are resolved using this
+    /// function. The function takes the URI as its argument, and should either
+    /// return the download contents as a [`BinaryData`] object or return an
+    /// [`ErrorData`] error. If no downloader is specified, only file:// URLs
+    /// with an absolute path are supported.
+    pub uri_resolver: Option<UriResolver>,
+}
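+
+// Illustrative walk-through of the level-override lookup described above
+// (the classification named here is real, the table itself is hypothetical):
+// given {TypeMismatchedParameters: (Warning, Warning)}, an Error diagnostic
+// of that class is demoted to Warning and an Info diagnostic is promoted to
+// Warning, while a class without an entry falls back to its group's entry,
+// then to Unclassified (code 0), and is otherwise left unchanged.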
+
+impl Config {
+    /// Creates a default configuration.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Instructs the validator to ignore protobuf fields that it doesn't know
+    /// about yet (i.e., that have been added to the Substrait protobuf
+    /// descriptions, but haven't yet been implemented in the validator) if the
+    /// fields are set to their default value. If this option isn't set, or if
+    /// an unknown field is not set to its default value, a warning is emitted.
+    pub fn ignore_unknown_fields(&mut self) {
+        self.ignore_unknown_fields = true;
+    }
+
+    /// Explicitly allows a protobuf message type to be used in advanced
+    /// extensions, despite the fact that the validator can't validate it. If
+    /// an advanced extension is encountered that isn't explicitly allowed, a
+    /// warning is emitted.
+    pub fn allow_proto_any_url(&mut self, pattern: glob::Pattern) {
+        self.allowed_proto_any_urls.push(pattern);
+    }
+
+    /// Sets a minimum and/or maximum error level for the given class of
+    /// diagnostic messages. Any previous settings for this class are
+    /// overridden.
+    pub fn override_diagnostic_level(
+        &mut self,
+        class: diagnostic::Classification,
+        minimum: diagnostic::Level,
+        maximum: diagnostic::Level,
+    ) {
+        self.diagnostic_level_overrides
+            .insert(class, (minimum, maximum));
+    }
+
+    /// Overrides the resolution behavior for (YAML) URIs matching the given
+    /// pattern. If resolve_as is None, the URI will not be resolved;
+    /// if it is Some(s), it will be resolved as if the URI in the plan had
+    /// been s.
+    pub fn override_uri<S: Into<String>>(
+        &mut self,
+        pattern: glob::Pattern,
+        resolve_as: Option<S>,
+    ) {
+        self.uri_overrides
+            .push((pattern, resolve_as.map(|s| s.into())));
+    }
+
+    /// Registers a URI resolution function with this configuration. If
+    /// the given function fails, any previously registered function will be
+    /// used as a fallback.
+    pub fn add_uri_resolver<F, D, E>(&mut self, resolver: F)
+    where
+        F: Fn(&str) -> Result<D, E> + Send + 'static,
+        D: AsRef<[u8]> + 'static,
+        E: std::error::Error + 'static,
+    {
+        let previous = self.uri_resolver.take();
+        self.uri_resolver = Some(Box::new(move |uri| match resolver(uri) {
+            Ok(d) => Ok(Box::new(d)),
+            Err(e) => match &previous {
+                Some(f) => f.as_ref()(uri),
+                None => Err(Box::new(e)),
+            },
+        }));
+    }
+
+    /// Registers a URI resolver based on libcurl. If libcurl fails, any
+    /// `uri_resolver` registered previously will be used as a fallback.
+    #[cfg(feature = "curl")]
+    pub fn add_curl_uri_resolver(&mut self) {
+        self.add_uri_resolver(resolve_with_curl)
+    }
+}
diff --git a/rs/src/input/mod.rs b/rs/src/input/mod.rs
new file mode 100644
index 00000000..b7ba8808
--- /dev/null
+++ b/rs/src/input/mod.rs
@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Input representation module.
+//!
+//! This module provides the data structures for representing the input of the
+//! validator.
+
+pub mod config;
+pub mod proto;
+pub mod traits;
+pub mod yaml;
diff --git a/rs/src/input/proto.rs b/rs/src/input/proto.rs
new file mode 100644
index 00000000..27c4a62f
--- /dev/null
+++ b/rs/src/input/proto.rs
@@ -0,0 +1,239 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module for representing Substrait protobuf input.
+//!
+//! The structures here are generated using [`prost`], but have a bunch of
+//! extra traits from [`traits`](crate::input::traits) associated with them,
+//! for which the implementations are generated using
+//! [`substrait_validator_derive`]. The purpose of these traits is to add basic
+//! introspection capabilities to the prost structures. One of the use cases
+//! for this is to let the parsing code automatically detect when the
+//! validation code ignored a subtree while validating, which implies that the
+//! validator hasn't checked everything and thus should not warrant that the
+//! received plan is valid.
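+//!
+//! As a sketch of what this enables (mirroring the unit tests in
+//! [`traits`](crate::input::traits)):
+//!
+//! ```ignore
+//! use substrait_validator::input::traits::ProtoMessage;
+//! assert_eq!(substrait::Plan::proto_message_type(), "substrait.Plan");
+//! ```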
+ +use crate::input::traits; +use crate::output::primitive_data; + +use heck::ToUpperCamelCase; + +#[allow(clippy::large_enum_variant)] +pub mod substrait { + include!(concat!(env!("OUT_DIR"), "/substrait.rs")); + pub mod extensions { + include!(concat!(env!("OUT_DIR"), "/substrait.extensions.rs")); + } + pub mod validator { + include!(concat!(env!("OUT_DIR"), "/substrait.validator.rs")); + } +} + +/// Converts a Rust module path and name (the latter already processed by +/// cook_ident()) to a protobuf type path. +pub fn cook_path(module_path: &str, type_name: &str) -> String { + let mut iter = module_path + .split("::") + .skip(module_path!().split("::").count()) + .map(cook_ident) + .chain(::std::iter::once(type_name)) + .peekable(); + let mut items = vec![]; + if matches!(iter.peek(), Some(&"substrait")) { + items.push(iter.next().unwrap().to_string()); + if matches!(iter.peek(), Some(&"extensions") | Some(&"validator")) { + items.push(iter.next().unwrap().to_string()); + } + } + items.extend(iter.map(|x| x.to_upper_camel_case())); + ::itertools::Itertools::intersperse(items.iter().map(|x| x.as_ref()), ".").collect() +} + +/// Converts a Rust identifier string generated via stringify!() to the +/// original identifier by "cooking" raw identifiers. +pub fn cook_ident(ident: &str) -> &str { + if let Some((_, keyword)) = ident.split_once('#') { + keyword + } else { + ident + } +} + +impl traits::ProtoPrimitive for bool { + fn proto_primitive_type() -> &'static str { + "bool" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Bool(false) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Bool(*self) + } + + fn proto_primitive_is_default(&self) -> bool { + !*self + } +} + +impl traits::ProtoPrimitive for u32 { + fn proto_primitive_type() -> &'static str { + "uint32" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Unsigned(0) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Unsigned((*self).into()) + } + + fn proto_primitive_is_default(&self) -> bool { + *self == 0 + } +} + +impl traits::ProtoPrimitive for u64 { + fn proto_primitive_type() -> &'static str { + "uint64" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Unsigned(0) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Unsigned(*self) + } + + fn proto_primitive_is_default(&self) -> bool { + *self == 0 + } +} + +impl traits::ProtoPrimitive for i32 { + fn proto_primitive_type() -> &'static str { + "int32" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Signed(0) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Signed((*self).into()) + } + + fn proto_primitive_is_default(&self) -> bool { + *self == 0 + } +} + +impl traits::ProtoPrimitive for i64 { + fn proto_primitive_type() -> &'static str { + "int64" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Signed(0) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Signed(*self) + } + + fn proto_primitive_is_default(&self) -> bool { + *self == 0 + } +} + +impl traits::ProtoPrimitive for f32 { + fn proto_primitive_type() -> &'static str { 
+ "float" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Float(0.0) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Float((*self).into()) + } + + fn proto_primitive_is_default(&self) -> bool { + *self == 0.0 + } +} + +impl traits::ProtoPrimitive for f64 { + fn proto_primitive_type() -> &'static str { + "double" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Float(0.0) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Float(*self) + } + + fn proto_primitive_is_default(&self) -> bool { + *self == 0.0 + } +} + +impl traits::ProtoPrimitive for String { + fn proto_primitive_type() -> &'static str { + "string" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::String(String::new()) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::String(self.clone()) + } + + fn proto_primitive_is_default(&self) -> bool { + self.is_empty() + } +} + +impl traits::ProtoPrimitive for Vec { + fn proto_primitive_type() -> &'static str { + "bytes" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Bytes(vec![]) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Bytes(self.clone()) + } + + fn proto_primitive_is_default(&self) -> bool { + self.is_empty() + } +} + +impl traits::ProtoPrimitive for prost_types::Any { + fn proto_primitive_type() -> &'static str { + "any" + } + + fn proto_primitive_default() -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Any(prost_types::Any::default()) + } + + fn proto_primitive_data(&self) -> primitive_data::PrimitiveData { + primitive_data::PrimitiveData::Any(self.clone()) + } + + fn proto_primitive_is_default(&self) -> bool { + self.type_url.is_empty() + } +} diff --git a/rs/src/input/traits.rs b/rs/src/input/traits.rs new file mode 100644 index 00000000..d3a02946 --- /dev/null +++ b/rs/src/input/traits.rs @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing introspection traits for [`prost`]-generated protobuf +//! types. + +use crate::output::primitive_data; +use crate::output::tree; +use crate::parse::context; + +/// Trait for all Rust types that represent input tree node types. +pub trait InputNode { + /// Creates an empty output node for a protobuf datum of this type. + /// + /// For primitive types, this fills the value with protobuf's default. + fn type_to_node() -> tree::Node; + + /// Creates an empty output node for a protobuf datum with this value. + fn data_to_node(&self) -> tree::Node; + + /// Returns the name of the selected variant of a oneof field, if this + /// is a rust enum used to represent a oneof field. + fn oneof_variant(&self) -> Option<&'static str>; + + /// Complete the subtrees of this datum in output that have not already + /// been parsed using UnknownField nodes. Returns whether any such nodes + /// were added. + fn parse_unknown(&self, context: &mut context::Context<'_>) -> bool; +} + +/// Trait for all Rust types that represent protobuf messages. These are +/// always structs for which all fields implement InputNode. +pub trait ProtoMessage: InputNode { + /// Returns the protobuf type name for messages of this type. 
+    fn proto_message_type() -> &'static str;
+}
+
+/// Trait for all Rust types that represent protobuf's oneof abstraction.
+/// In the world of protobuf, these aren't really a thing of their own, but
+/// in Rust, they are defined as enums, each variant containing a one-tuple
+/// of some type implementing InputNode.
+pub trait ProtoOneOf: InputNode {
+    /// Returns the name of the selected variant of a oneof field.
+    fn proto_oneof_variant(&self) -> &'static str;
+}
+
+/// Trait for Rust types that map to the protobuf primitive types.
+pub trait ProtoPrimitive: InputNode {
+    /// Returns the protobuf type name for primitives of this type.
+    fn proto_primitive_type() -> &'static str;
+
+    /// Returns the protobuf-specified default value for this primitive
+    /// data type.
+    fn proto_primitive_default() -> primitive_data::PrimitiveData;
+
+    /// Returns the actual value for this primitive data type as a
+    /// ProtoPrimitiveData variant.
+    fn proto_primitive_data(&self) -> primitive_data::PrimitiveData;
+
+    /// Returns whether this is the default value of the primitive.
+    fn proto_primitive_is_default(&self) -> bool;
+}
+
+/// Trait for all Rust types that represent protobuf enums. These are
+/// always represented as a Rust enum with no contained values for any of
+/// the variants.
+pub trait ProtoEnum: ProtoPrimitive {
+    /// Returns the protobuf type name for enums of this type.
+    fn proto_enum_type() -> &'static str;
+
+    /// Returns the name of the default variant of an enum.
+    fn proto_enum_default_variant() -> &'static str;
+
+    /// Returns the name of the selected variant of an enum.
+    fn proto_enum_variant(&self) -> &'static str;
+
+    /// Returns the enumeration entry corresponding to the given integer
+    /// value, if any.
+    fn proto_enum_from_i32(x: i32) -> Option<Self>
+    where
+        Self: Sized;
+}
+
+/// Blanket implementation to make all protobuf enums behave like
+/// primitives as well.
+impl<T: ProtoEnum> ProtoPrimitive for T {
+    fn proto_primitive_type() -> &'static str {
+        T::proto_enum_type()
+    }
+
+    fn proto_primitive_default() -> primitive_data::PrimitiveData {
+        primitive_data::PrimitiveData::Enum(T::proto_enum_default_variant())
+    }
+
+    fn proto_primitive_data(&self) -> primitive_data::PrimitiveData {
+        primitive_data::PrimitiveData::Enum(self.proto_enum_variant())
+    }
+
+    fn proto_primitive_is_default(&self) -> bool {
+        self.proto_enum_variant() == T::proto_enum_default_variant()
+    }
+}
+
+/// Blanket implementation to make all protobuf primitives behave like
+/// generic protobuf datums.
+///
+/// Note: if Rust would allow it, we could define blanket implementations
+/// for ProtoMessage and ProtoOneOf as well, since they're always the same.
+/// Unfortunately, we can only define a single blanket implementation, so
+/// we opt for the one that isn't already generated via derive macros.
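+///
+/// For instance, also adding `impl<T: ProtoMessage> InputNode for T` here
+/// would be rejected under Rust's coherence rules, because the compiler
+/// cannot rule out a type implementing both ProtoPrimitive and ProtoMessage.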
+impl<T: ProtoPrimitive> InputNode for T {
+    fn type_to_node() -> tree::Node {
+        tree::NodeType::ProtoPrimitive(T::proto_primitive_type(), T::proto_primitive_default())
+            .into()
+    }
+
+    fn data_to_node(&self) -> tree::Node {
+        tree::NodeType::ProtoPrimitive(T::proto_primitive_type(), self.proto_primitive_data())
+            .into()
+    }
+
+    fn oneof_variant(&self) -> Option<&'static str> {
+        None
+    }
+
+    fn parse_unknown(&self, _context: &mut context::Context<'_>) -> bool {
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::input::proto::substrait;
+    use crate::output::primitive_data;
+    use crate::output::tree;
+
+    #[test]
+    fn message() {
+        assert_eq!(substrait::Plan::proto_message_type(), "substrait.Plan");
+        assert_eq!(
+            substrait::Plan::type_to_node(),
+            tree::Node {
+                class: tree::Class::Misc,
+                brief: None,
+                summary: None,
+                node_type: tree::NodeType::ProtoMessage("substrait.Plan"),
+                data_type: None,
+                data: vec![],
+            }
+        );
+
+        let msg = substrait::Plan::default();
+        assert_eq!(
+            msg.data_to_node(),
+            tree::Node {
+                class: tree::Class::Misc,
+                brief: None,
+                summary: None,
+                node_type: tree::NodeType::ProtoMessage("substrait.Plan"),
+                data_type: None,
+                data: vec![],
+            }
+        );
+        assert_eq!(msg.oneof_variant(), None);
+    }
+
+    #[test]
+    fn oneof() {
+        assert_eq!(
+            substrait::plan_rel::RelType::type_to_node(),
+            tree::Node {
+                class: tree::Class::Misc,
+                brief: None,
+                summary: None,
+                node_type: tree::NodeType::ProtoMissingOneOf,
+                data_type: None,
+                data: vec![],
+            }
+        );
+
+        let oneof = substrait::plan_rel::RelType::Rel(substrait::Rel::default());
+        assert_eq!(oneof.proto_oneof_variant(), "rel");
+        assert_eq!(
+            oneof.data_to_node(),
+            tree::Node {
+                class: tree::Class::Misc,
+                brief: None,
+                summary: None,
+                node_type: tree::NodeType::ProtoMessage("substrait.Rel"),
+                data_type: None,
+                data: vec![],
+            }
+        );
+        assert_eq!(oneof.oneof_variant(), Some("rel"));
+    }
+
+    #[test]
+    fn enumeration() {
+        assert_eq!(
+            substrait::AggregationPhase::proto_enum_type(),
+            "substrait.AggregationPhase"
+        );
+        assert_eq!(
+            substrait::AggregationPhase::proto_enum_default_variant(),
+            "AGGREGATION_PHASE_UNSPECIFIED"
+        );
+        assert_eq!(
+            substrait::AggregationPhase::Unspecified.proto_enum_variant(),
+            "AGGREGATION_PHASE_UNSPECIFIED"
+        );
+
+        assert_eq!(
+            substrait::AggregationPhase::proto_primitive_type(),
+            "substrait.AggregationPhase"
+        );
+        assert_eq!(
+            substrait::AggregationPhase::proto_primitive_default(),
+            primitive_data::PrimitiveData::Enum("AGGREGATION_PHASE_UNSPECIFIED")
+        );
+        assert_eq!(
+            substrait::AggregationPhase::Unspecified.proto_primitive_data(),
+            primitive_data::PrimitiveData::Enum("AGGREGATION_PHASE_UNSPECIFIED")
+        );
+
+        assert_eq!(
+            substrait::AggregationPhase::type_to_node(),
+            tree::Node {
+                class: tree::Class::Misc,
+                brief: None,
+                summary: None,
+                node_type: tree::NodeType::ProtoPrimitive(
+                    "substrait.AggregationPhase",
+                    primitive_data::PrimitiveData::Enum("AGGREGATION_PHASE_UNSPECIFIED")
+                ),
+                data_type: None,
+                data: vec![],
+            }
+        );
+        assert_eq!(
+            substrait::AggregationPhase::Unspecified.data_to_node(),
+            tree::Node {
+                class: tree::Class::Misc,
+                brief: None,
+                summary: None,
+                node_type: tree::NodeType::ProtoPrimitive(
+                    "substrait.AggregationPhase",
+                    primitive_data::PrimitiveData::Enum("AGGREGATION_PHASE_UNSPECIFIED")
+                ),
+                data_type: None,
+                data: vec![],
+            }
+        );
+        assert_eq!(
+            substrait::AggregationPhase::Unspecified.oneof_variant(),
+            None
+        );
+    }
+
+    #[test]
+    fn primitive() {
+        assert_eq!(u32::proto_primitive_type(), "uint32");
+        assert_eq!(
+            u32::proto_primitive_default(),
+            primitive_data::PrimitiveData::Unsigned(0)
+        );
+        assert_eq!(
+            42u32.proto_primitive_data(),
+            primitive_data::PrimitiveData::Unsigned(42)
+        );
+
+        assert_eq!(
+            u32::type_to_node(),
+            tree::Node {
+                class: tree::Class::Misc,
+                brief: None,
+                summary: None,
+                node_type: tree::NodeType::ProtoPrimitive(
+                    "uint32",
+                    primitive_data::PrimitiveData::Unsigned(0)
+                ),
+                data_type: None,
+                data: vec![],
+            }
+        );
+        assert_eq!(
+            42u32.data_to_node(),
+            tree::Node {
+                class: tree::Class::Misc,
+                brief: None,
+                summary: None,
+                node_type: tree::NodeType::ProtoPrimitive(
+                    "uint32",
+                    primitive_data::PrimitiveData::Unsigned(42)
+                ),
+                data_type: None,
+                data: vec![],
+            }
+        );
+        assert_eq!(42u32.oneof_variant(), None);
+    }
+}
diff --git a/rs/src/input/yaml.rs b/rs/src/input/yaml.rs
new file mode 100644
index 00000000..fcda7957
--- /dev/null
+++ b/rs/src/input/yaml.rs
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module for representing YAML input.
+//!
+//! We (ab)use [`serde_json::value::Value`] for this; the primary reason being
+//! that a [JSON schema](https://json-schema.org/) is used for basic schema
+//! validation of the YAML files, and the [`jsonschema`] crate we use for that
+//! uses [`serde_json`]'s representation. [`yaml_to_json()`] may be used to
+//! convert the output from [`yaml_rust`] to this structure.
+
+use crate::output::diagnostic;
+use crate::output::path;
+use crate::output::primitive_data;
+use crate::output::tree;
+use crate::parse::traversal;
+
+use yaml_rust::yaml::Yaml;
+
+/// Typedef for the type used for arbitrary YAML values.
+pub type Value = serde_json::value::Value;
+
+/// Typedef for the type used for YAML arrays.
+pub type Array = Vec<Value>;
+
+/// Typedef for the type used for YAML maps.
+pub type Map = serde_json::map::Map<String, Value>;
+
+/// Converts a [`yaml_rust`] YAML structure into its equivalent JSON object
+/// model using [`serde_json`]'s types.
+pub fn yaml_to_json(y: Yaml, path: &path::Path) -> diagnostic::DiagResult<Value> {
+    match y {
+        Yaml::Real(ref s) => Ok(Value::Number(
+            serde_json::value::Number::from_f64(y.as_f64().ok_or_else(|| {
+                diag!(
+                    path.to_path_buf(),
+                    Error,
+                    YamlParseFailed,
+                    "failed to parse {s} as float"
+                )
+            })?)
+            .ok_or_else(|| {
+                diag!(
+                    path.to_path_buf(),
+                    Error,
+                    YamlParseFailed,
+                    "{s} float is not supported"
+                )
+            })?,
+        )),
+        Yaml::Integer(i) => Ok(Value::Number(i.into())),
+        Yaml::String(s) => Ok(Value::String(s)),
+        Yaml::Boolean(b) => Ok(Value::Bool(b)),
+        Yaml::Array(a) => Ok(Value::Array(
+            a.into_iter()
+                .enumerate()
+                .map(|(index, value)| yaml_to_json(value, &path.with_index(index)))
+                .collect::<diagnostic::DiagResult<Array>>()?,
+        )),
+        Yaml::Hash(m) => Ok(Value::Object(
+            m.into_iter()
+                .map(|(key, value)| {
+                    let key = key
+                        .as_str()
+                        .ok_or_else(|| {
+                            diag!(
+                                path.to_path_buf(),
+                                Error,
+                                YamlParseFailed,
+                                "non-string map keys are not supported"
+                            )
+                        })?
+                        .to_string();
+                    let path = path.with_field(&key);
+                    let value = yaml_to_json(value, &path)?;
+                    Ok((key, value))
+                })
+                .collect::<diagnostic::DiagResult<Map>>()?,
+        )),
+        Yaml::Alias(_) => Err(diag!(
+            path.to_path_buf(),
+            Error,
+            YamlParseFailed,
+            "YAML aliases are not supported"
+        )),
+        Yaml::Null => Ok(Value::Null),
+        Yaml::BadValue => panic!("encountered Yaml::BadValue"),
+    }
+}
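+
+// Sketch of the intended use of yaml_to_json(), given some starting
+// `root_path: path::Path` (illustrative only, not part of the API):
+//
+//     let mut docs = yaml_rust::YamlLoader::load_from_str("a: 1").unwrap();
+//     let json = yaml_to_json(docs.pop().unwrap(), &root_path)?;
+//     assert_eq!(json["a"], 1);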
+
+impl crate::input::traits::InputNode for Value {
+    fn type_to_node() -> tree::Node {
+        tree::NodeType::YamlMap.into()
+    }
+
+    fn data_to_node(&self) -> tree::Node {
+        match self {
+            Value::Null => tree::NodeType::YamlPrimitive(primitive_data::PrimitiveData::Null),
+            Value::Bool(b) => {
+                tree::NodeType::YamlPrimitive(primitive_data::PrimitiveData::Bool(*b))
+            }
+            Value::Number(n) => tree::NodeType::YamlPrimitive(
+                n.as_u64()
+                    .map(primitive_data::PrimitiveData::Unsigned)
+                    .or_else(|| n.as_i64().map(primitive_data::PrimitiveData::Signed))
+                    .or_else(|| n.as_f64().map(primitive_data::PrimitiveData::Float))
+                    .unwrap(),
+            ),
+            Value::String(s) => {
+                tree::NodeType::YamlPrimitive(primitive_data::PrimitiveData::String(s.clone()))
+            }
+            Value::Array(_) => tree::NodeType::YamlArray,
+            Value::Object(_) => tree::NodeType::YamlMap,
+        }
+        .into()
+    }
+
+    fn oneof_variant(&self) -> Option<&'static str> {
+        None
+    }
+
+    fn parse_unknown(&self, context: &mut crate::parse::context::Context<'_>) -> bool {
+        match self {
+            Value::Array(array) => {
+                let mut any = false;
+                for (index, _) in array.iter().enumerate() {
+                    if !context.field_parsed(index.to_string()) {
+                        traversal::push_yaml_element(array, context, index, true, |_, _| Ok(()));
+                        any = true;
+                    }
+                }
+                any
+            }
+            Value::Object(object) => {
+                let mut any = false;
+                let mut keys: Vec<_> = object.keys().collect();
+                keys.sort();
+                for field_name in keys {
+                    if !context.field_parsed(field_name) {
+                        traversal::push_yaml_field(self, context, field_name, true, |_, _| Ok(()))
+                            .unwrap();
+                        any = true;
+                    }
+                }
+                any
+            }
+            _ => false,
+        }
+    }
+}
diff --git a/rs/src/lib.rs b/rs/src/lib.rs
new file mode 100644
index 00000000..023f4359
--- /dev/null
+++ b/rs/src/lib.rs
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Crate for validating [Substrait](https://substrait.io/).
+//!
+//! The usage pattern is roughly as follows.
+//!
+//! 1) Build a [`Config`] structure to configure the validator. You can also
+//!    just use [`std::default::Default`] if you don't need to configure
+//!    anything, but you might want to at least call
+//!    [`Config::add_curl_uri_resolver()`] (if you're using the `curl`
+//!    feature).
+//! 2) Parse the incoming `substrait.Plan` message using [`parse()`]. This
+//!    creates a [ParseResult], containing a [tree](output::tree) structure
+//!    corresponding to the query plan that also contains diagnostics and
+//!    other annotations added by the validator.
+//! 3) You can traverse the tree yourself using [ParseResult::root], or you
+//!    can use one of the methods associated with [ParseResult] to obtain the
+//!    validation results you need.
+//!
+//! Note that only the binary protobuf serialization format is supported at the
+//! input; the JSON format is *not* supported. This is a limitation of `prost`,
+//! the crate that was used for protobuf deserialization. If you're looking for
+//! a library (or CLI) that supports more human-friendly input, check out the
+//! Python bindings.
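+//!
+//! A minimal end-to-end sketch (`plan_bytes` is assumed to hold a binary
+//! substrait.Plan message):
+//!
+//! ```ignore
+//! let config = substrait_validator::Config::new();
+//! let result = substrait_validator::parse(&plan_bytes[..], &config);
+//! let root = &result.root; // walk the annotated tree from here
+//! ```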
+
+#[macro_use]
+pub mod output;
+
+#[macro_use]
+mod parse;
+
+pub mod export;
+pub mod input;
+
+mod string_util;
+
+use strum::IntoEnumIterator;
+
+// Aliases for common types used on the crate interface.
+pub use input::config::glob::Pattern;
+pub use input::config::Config;
+pub use output::diagnostic::Classification;
+pub use output::diagnostic::Diagnostic;
+pub use output::diagnostic::Level;
+pub use output::parse_result::ParseResult;
+pub use output::parse_result::Validity;
+
+/// Validates the given substrait.Plan message and returns the parse tree.
+pub fn parse<B: prost::bytes::Buf>(buffer: B, config: &Config) -> ParseResult {
+    parse::parse(buffer, config)
+}
+
+/// Returns an iterator that yields all known diagnostic classes.
+pub fn iter_diagnostics() -> impl Iterator<Item = Classification> {
+    Classification::iter()
+}
diff --git a/rs/src/output/comment.rs b/rs/src/output/comment.rs
new file mode 100644
index 00000000..45542ad0
--- /dev/null
+++ b/rs/src/output/comment.rs
@@ -0,0 +1,341 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module for comments.
+//!
+//! [`Comment`]s can be added to nodes between the child edges to attach
+//! additional miscellaneous information that doesn't fit in any of the more
+//! structured types, intended purely to be formatted for and interpreted by
+//! humans.
+
+use crate::output::path;
+
+/// Representation of a comment message intended only for human consumption.
+/// Includes basic formatting information.
+#[derive(Clone, Debug, PartialEq, Default)]
+pub struct Comment {
+    /// Formatting elements and spans that make up the comment.
+    elements: Vec<Element>,
+}
+
+impl Comment {
+    /// Creates an empty comment.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Adds a piece of plain text to the comment.
+    pub fn plain<S: ToString>(mut self, text: S) -> Self {
+        self.push(Element::Span(text.to_string().into()));
+        self
+    }
+
+    /// Adds a piece of text to the comment that links to the given path.
+    pub fn link<S: ToString>(mut self, text: S, path: path::PathBuf) -> Self {
+        self.push(Element::Span(Span {
+            text: text.to_string(),
+            link: Some(Link::Path(path)),
+        }));
+        self
+    }
+
+    /// Adds a piece of text to the comment that links to the given URL.
+    pub fn url<S: ToString, U: ToString>(mut self, text: S, url: U) -> Self {
+        self.push(Element::Span(Span {
+            text: text.to_string(),
+            link: Some(Link::Url(url.to_string())),
+        }));
+        self
+    }
+
+    /// Adds a newline/paragraph break.
+    pub fn nl(mut self) -> Self {
+        self.push(Element::NewLine);
+        self
+    }
+
+    /// Opens a list.
+    pub fn lo(mut self) -> Self {
+        self.push(Element::ListOpen);
+        self
+    }
+
+    /// Advances to the next list item.
+    pub fn li(mut self) -> Self {
+        self.push(Element::ListNext);
+        self
+    }
+
+    /// Closes the current list.
+    pub fn lc(mut self) -> Self {
+        self.push(Element::ListClose);
+        self
+    }
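+
+    // Illustrative use of the builder methods above (an example, not part of
+    // the API surface):
+    //
+    //     let c = Comment::new()
+    //         .plain("options:")
+    //         .lo()
+    //         .plain("first")
+    //         .li()
+    //         .plain("second")
+    //         .lc();
+    //
+    // The Display implementation below renders list items with "- " markers.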
+
+    /// Pushes an element into this comment.
+    pub fn push(&mut self, element: Element) {
+        // Some pairs of element types should never follow each other, because
+        // one implies the other.
+        match self.elements.pop() {
+            None => self.elements.push(element),
+            Some(Element::Span(s1)) => {
+                if let Element::Span(s2) = element {
+                    let (s1, maybe_s2) = merge_spans(s1, s2);
+                    self.elements.push(Element::Span(s1));
+                    if let Some(s2) = maybe_s2 {
+                        self.elements.push(Element::Span(s2));
+                    }
+                } else {
+                    self.elements.push(Element::Span(s1));
+                    self.elements.push(element);
+                }
+            }
+            Some(Element::NewLine) => {
+                if matches!(element, Element::Span(_)) {
+                    self.elements.push(Element::NewLine);
+                }
+                self.elements.push(element);
+            }
+            Some(Element::ListOpen) => {
+                self.elements.push(Element::ListOpen);
+                if !matches!(element, Element::ListNext) {
+                    self.elements.push(element);
+                }
+            }
+            Some(Element::ListNext) => {
+                self.elements.push(Element::ListNext);
+                if !matches!(element, Element::ListNext) {
+                    self.elements.push(element);
+                }
+            }
+            Some(Element::ListClose) => {
+                self.elements.push(Element::ListClose);
+                if !matches!(element, Element::NewLine) {
+                    self.elements.push(element);
+                }
+            }
+        }
+    }
+
+    /// Pushes a whole other comment's worth of elements into this comment.
+    pub fn extend(&mut self, other: Comment) {
+        let mut it = other.elements.into_iter();
+
+        // The first element of other may need to be merged with its new
+        // predecessor.
+        if let Some(element) = it.next() {
+            self.push(element);
+        }
+
+        // The rest of the elements would already have been merged, so we can
+        // just copy them over.
+        self.elements.extend(it);
+    }
+
+    /// Returns the slice of elements that comprise the comment.
+    ///
+    /// This list is "minimal:"
+    /// - there are no consecutive newlines, list item tags, or spans with
+    ///   equal formatting (they are merged together);
+    /// - there are no empty lists, and there is never a list item immediately
+    ///   following a list open tag (as this is redundant).
+    pub fn elements(&self) -> &[Element] {
+        &self.elements
+    }
+}
+
+impl std::fmt::Display for Comment {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut indent = 0;
+        for element in self.elements.iter() {
+            match element {
+                Element::Span(span) => span.fmt(f),
+                Element::NewLine => write!(f, "\n\n{: >1$}", "", indent),
+                Element::ListOpen => {
+                    indent += 3;
+                    write!(f, "\n\n{: >1$}", "- ", indent)
+                }
+                Element::ListNext => {
+                    write!(f, "\n\n{: >1$}", "- ", indent)
+                }
+                Element::ListClose => {
+                    indent -= 3;
+                    write!(f, "\n\n{: >1$}", "", indent)
+                }
+            }?;
+        }
+        Ok(())
+    }
+}
+
+impl From<String> for Comment {
+    fn from(text: String) -> Self {
+        Self {
+            elements: vec![Element::Span(text.into())],
+        }
+    }
+}
+
+/// A comment element.
+#[derive(Clone, Debug, PartialEq)]
+pub enum Element {
+    /// A span of text. Should not include newlines.
+    Span(Span),
+
+    /// A newline/paragraph break.
+    NewLine,
+
+    /// Starts a new list. Subsequent spans form the text for the first item.
+    ListOpen,
+
+    /// Advances to the next list item.
+    ListNext,
+
+    /// Closes a list.
+    ListClose,
+}
+
+/// Like Comment, but single-line.
+#[derive(Clone, Debug, PartialEq, Default)]
+pub struct Brief {
+    /// Spans that make up the comment. These are simply concatenated, but
+    /// spans may contain optional formatting information.
+    spans: Vec<Span>,
+}
+
+impl Brief {
+    /// Creates an empty comment.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Adds a piece of plain text to the comment.
+    pub fn plain<S: ToString>(mut self, text: S) -> Self {
+        self.push(text.to_string().into());
+        self
+    }
+
+    /// Adds a piece of text to the comment that links to the given path.
+    pub fn link<S: ToString>(mut self, text: S, path: path::PathBuf) -> Self {
+        self.push(Span {
+            text: text.to_string(),
+            link: Some(Link::Path(path)),
+        });
+        self
+    }
+
+    /// Adds a piece of text to the comment that links to the given URL.
+    pub fn url<S: ToString, U: ToString>(mut self, text: S, url: U) -> Self {
+        self.push(Span {
+            text: text.to_string(),
+            link: Some(Link::Url(url.to_string())),
+        });
+        self
+    }
+
+    /// Pushes a span into this brief.
+    pub fn push(&mut self, span: Span) {
+        if let Some(s1) = self.spans.pop() {
+            let s2 = span;
+            let (s1, maybe_s2) = merge_spans(s1, s2);
+            self.spans.push(s1);
+            if let Some(s2) = maybe_s2 {
+                self.spans.push(s2);
+            }
+        } else {
+            self.spans.push(span);
+        }
+    }
+
+    /// Pushes a whole other brief's worth of elements into this brief.
+    pub fn extend(&mut self, other: Brief) {
+        let mut it = other.spans.into_iter();
+
+        // The first span of other may need to be merged with its new
+        // predecessor.
+        if let Some(element) = it.next() {
+            self.push(element);
+        }
+
+        // The rest of the spans would already have been merged, so we can
+        // just copy them over.
+        self.spans.extend(it);
+    }
+
+    /// Returns the slice of spans that comprise the brief.
+    pub fn spans(&self) -> &[Span] {
+        &self.spans
+    }
+}
+
+impl std::fmt::Display for Brief {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        for span in self.spans.iter() {
+            span.fmt(f)?;
+        }
+        Ok(())
+    }
+}
+
+impl From<String> for Brief {
+    fn from(text: String) -> Self {
+        Self {
+            spans: vec![text.into()],
+        }
+    }
+}
+
+impl From<Brief> for Comment {
+    fn from(brief: Brief) -> Self {
+        Self {
+            elements: brief.spans.into_iter().map(Element::Span).collect(),
+        }
+    }
+}
+
+/// A span of text within a comment.
+#[derive(Clone, Debug, PartialEq)]
+pub struct Span {
+    /// The span of text.
+    pub text: String,
+
+    /// Whether this span of text should link to something.
+    pub link: Option<Link>,
+}
+
+impl std::fmt::Display for Span {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.text)
+    }
+}
+
+impl From<String> for Span {
+    fn from(text: String) -> Self {
+        Span { text, link: None }
+    }
+}
+
+/// Merges two spans together, if possible. A space is inserted between the
+/// spans if there isn't one already.
+fn merge_spans(mut a: Span, b: Span) -> (Span, Option<Span>) {
+    if b.text.is_empty() {
+        return (a, None);
+    }
+    if !a.text.ends_with(' ') && !b.text.starts_with(' ') {
+        a.text.push(' ');
+    }
+    if a.link == b.link {
+        a.text += &b.text;
+        return (a, None);
+    }
+    (a, Some(b))
+}
+
+/// A link to something.
+#[derive(Clone, Debug, PartialEq)]
+pub enum Link {
+    /// Link to another node in the tree, via an absolute node path.
+    Path(path::PathBuf),
+
+    /// Link to some external URL.
+    Url(String),
+}
diff --git a/rs/src/output/data_type.rs b/rs/src/output/data_type.rs
new file mode 100644
index 00000000..e63815ed
--- /dev/null
+++ b/rs/src/output/data_type.rs
@@ -0,0 +1,864 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module for dealing with Substrait's type system.
+//!
+//! See [`DataType`].
+
+use crate::output::diagnostic;
+use crate::output::extension;
+use crate::string_util;
+use crate::string_util::Describe;
+use std::collections::HashSet;
+use std::fmt::Write;
+use std::sync::Arc;
+use strum_macros::{Display, EnumString};
+
+/// Typedef for type variations.
+pub type Variation = Option<Arc<extension::Reference<extension::TypeVariation>>>;
+
+/// A Substrait data type. Includes facilities for storing unresolved or
+/// partially-resolved types.
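+///
+/// For example (sketch), a nullable `LIST<i32>` can be built with the
+/// constructors defined below:
+///
+/// ```ignore
+/// let element = DataType::new_integer(false);
+/// let list = DataType::new_list(element, true);
+/// assert!(list.is_list());
+/// ```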
+#[derive(Clone, Debug, PartialEq)]
+pub struct DataType {
+    /// Type class (simple, compound, or user-defined).
+    class: Class,
+
+    /// Nullability.
+    nullable: bool,
+
+    /// Type variation, if any.
+    variation: Variation,
+
+    /// Type parameters for non-simple types.
+    parameters: Vec<Parameter>,
+}
+
+impl Describe for DataType {
+    fn describe(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        limit: string_util::Limit,
+    ) -> std::fmt::Result {
+        let mut name = String::new();
+        write!(&mut name, "{}", self.class)?;
+        if self.nullable {
+            write!(&mut name, "?")?;
+        }
+        if let Some(variation) = &self.variation {
+            write!(&mut name, "[{variation}]")?;
+        }
+        write!(f, "{}", name)?;
+        let (_, limit) = limit.split(name.len());
+        if self.class.has_parameters() {
+            write!(f, "<")?;
+            string_util::describe_sequence(
+                f,
+                &self.parameters,
+                limit,
+                20,
+                |f, param, _, limit| param.describe(f, limit),
+            )?;
+            write!(f, ">")?;
+        }
+        Ok(())
+    }
+}
+
+impl std::fmt::Display for DataType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.display().fmt(f)
+    }
+}
+
+impl DataType {
+    /// Creates a new type.
+    pub fn new(
+        class: Class,
+        nullable: bool,
+        variation: Variation,
+        parameters: Vec<Parameter>,
+    ) -> diagnostic::Result<Arc<DataType>> {
+        // Check whether class and parameters work together.
+        class.check_parameters(&parameters)?;
+
+        // Check whether the specified type variation is applicable to this
+        // type.
+        if let Some(variation) = &variation {
+            if let Some(definition) = &variation.definition {
+                let base = definition.get_base_class();
+                if !base.weak_equals(&class) {
+                    return Err(cause!(
+                        TypeMismatchedVariation,
+                        "variation {variation} is derived from {base}, not {class}"
+                    ));
+                }
+            }
+        }
+
+        Ok(Arc::new(DataType {
+            class,
+            nullable,
+            variation,
+            parameters,
+        }))
+    }
+
+    /// Creates a new unresolved type.
+    pub fn new_unresolved() -> Arc<DataType> {
+        Arc::new(DataType {
+            class: Class::Unresolved,
+            nullable: false,
+            variation: None,
+            parameters: vec![],
+        })
+    }
+
+    /// Creates a new struct type.
+    pub fn new_struct<T: IntoIterator<Item = Arc<DataType>>>(
+        fields: T,
+        nullable: bool,
+    ) -> Arc<DataType> {
+        Arc::new(DataType {
+            class: Class::Compound(Compound::Struct),
+            nullable,
+            variation: None,
+            parameters: fields.into_iter().map(Parameter::Type).collect(),
+        })
+    }
+
+    /// Creates a new list type.
+    pub fn new_list(element: Arc<DataType>, nullable: bool) -> Arc<DataType> {
+        Arc::new(DataType {
+            class: Class::Compound(Compound::List),
+            nullable,
+            variation: None,
+            parameters: vec![Parameter::Type(element)],
+        })
+    }
+
+    /// Creates a new map type.
+    pub fn new_map(key: Arc<DataType>, value: Arc<DataType>, nullable: bool) -> Arc<DataType> {
+        Arc::new(DataType {
+            class: Class::Compound(Compound::Map),
+            nullable,
+            variation: None,
+            parameters: vec![Parameter::Type(key), Parameter::Type(value)],
+        })
+    }
+
+    /// Creates the type of a predicate, i.e. a boolean.
+    pub fn new_predicate(nullable: bool) -> Arc<DataType> {
+        Arc::new(DataType {
+            class: Class::Simple(Simple::Boolean),
+            nullable,
+            variation: None,
+            parameters: vec![],
+        })
+    }
+
+    /// Creates the type of a (default) integer, i.e. i32.
+    pub fn new_integer(nullable: bool) -> Arc<DataType> {
+        Arc::new(DataType {
+            class: Class::Simple(Simple::I32),
+            nullable,
+            variation: None,
+            parameters: vec![],
+        })
+    }
+
+    /// Returns a nullable variant of this type.
+    pub fn make_nullable(&self) -> Arc<DataType> {
+        Arc::new(DataType {
+            class: self.class.clone(),
+            nullable: true,
+            variation: self.variation.clone(),
+            parameters: self.parameters.clone(),
+        })
+    }
+
+    /// Returns the type class.
+    pub fn class(&self) -> &Class {
+        &self.class
+    }
+
+    /// Returns whether the type is nullable.
+    pub fn nullable(&self) -> bool {
+        self.nullable
+    }
+
+    /// Returns the type variation.
+    pub fn variation(&self) -> &Variation {
+        &self.variation
+    }
+
+    /// Returns the type parameters.
+    pub fn parameters(&self) -> &Vec<Parameter> {
+        &self.parameters
+    }
+
+    /// Returns the value of the given integer parameter.
+    pub fn int_parameter(&self, index: usize) -> Option<u64> {
+        if let Some(Parameter::Unsigned(value)) = self.parameters.get(index) {
+            Some(*value)
+        } else {
+            None
+        }
+    }
+
+    /// Returns the value of the given type parameter.
+    pub fn type_parameter(&self, index: usize) -> Option<Arc<DataType>> {
+        match self.parameters.get(index) {
+            Some(Parameter::Type(t)) => Some(t.clone()),
+            Some(Parameter::NamedType(_, t)) => Some(t.clone()),
+            _ => None,
+        }
+    }
+
+    /// Returns whether this is an unresolved type.
+    pub fn is_unresolved(&self) -> bool {
+        matches!(self.class, Class::Unresolved)
+    }
+
+    /// Returns whether any part of this type tree is an unresolved type.
+    pub fn is_unresolved_deep(&self) -> bool {
+        self.is_unresolved()
+            || self.parameters.iter().any(|p| match p {
+                Parameter::Type(t) => t.is_unresolved_deep(),
+                Parameter::NamedType(_, t) => t.is_unresolved_deep(),
+                _ => false,
+            })
+    }
+
+    /// Returns whether this is a STRUCT or NSTRUCT type.
+    pub fn is_struct(&self) -> bool {
+        matches!(
+            self.class,
+            Class::Compound(Compound::Struct) | Class::Compound(Compound::NamedStruct)
+        )
+    }
+
+    /// Returns Some(Vec<Arc<DataType>>) when this is a STRUCT or NSTRUCT
+    /// type, where the vector contains the field types. Returns None
+    /// otherwise.
+    pub fn unwrap_struct(&self) -> Option<Vec<Arc<DataType>>> {
+        if self.is_struct() {
+            Some(
+                self.parameters
+                    .iter()
+                    .map(|x| x.get_type().cloned().unwrap_or_default())
+                    .collect(),
+            )
+        } else {
+            None
+        }
+    }
+
+    /// Returns Some(T) when this is a STRUCT or NSTRUCT type with only a
+    /// single element of type T, or None otherwise.
+    pub fn unwrap_singular_struct(&self) -> Option<Arc<DataType>> {
+        if self.is_struct() && self.parameters.len() == 1 {
+            self.type_parameter(0)
+        } else {
+            None
+        }
+    }
+
+    /// Returns whether this is a LIST type.
+    pub fn is_list(&self) -> bool {
+        matches!(self.class, Class::Compound(Compound::List))
+    }
+
+    /// Returns Some(T) when this is a LIST type with element type T, or None
+    /// otherwise.
+    pub fn unwrap_list(&self) -> Option<Arc<DataType>> {
+        if self.is_list() {
+            self.type_parameter(0)
+        } else {
+            None
+        }
+    }
+
+    /// Returns whether this is a MAP type.
+    pub fn is_map(&self) -> bool {
+        matches!(self.class, Class::Compound(Compound::Map))
+    }
+
+    /// Returns Some(T) when this is a MAP type with value type T, or None
+    /// otherwise.
+    pub fn unwrap_map(&self) -> Option<Arc<DataType>> {
+        if self.is_map() {
+            self.type_parameter(1)
+        } else {
+            None
+        }
+    }
+
+    /// Returns Some(T) when this is a MAP type with key type T, or None
+    /// otherwise.
+    pub fn unwrap_map_key(&self) -> Option<Arc<DataType>> {
+        if self.is_map() {
+            self.type_parameter(0)
+        } else {
+            None
+        }
+    }
+
+    /// Returns whether this is the base type for this type, i.e. it does
+    /// not have a variation.
+    pub fn is_base_type(&self) -> bool {
+        self.variation.is_none()
+    }
+
+    /// Returns the type of the nth field of this struct. Returns None if
+    /// out of range or if this is known to not be a struct.
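+    ///
+    /// For a `STRUCT<i32, string>`, for example, `index_struct(1)` yields
+    /// the string type and `index_struct(2)` yields None.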
+    pub fn index_struct(&self, index: usize) -> Option<Arc<DataType>> {
+        if self.is_unresolved() {
+            Some(DataType::new_unresolved())
+        } else if self.is_struct() {
+            match self.parameters.get(index) {
+                Some(Parameter::Type(t)) => Some(t.clone()),
+                Some(Parameter::NamedType(_, t)) => Some(t.clone()),
+                _ => None,
+            }
+        } else {
+            None
+        }
+    }
+
+    /// Internal helper for split_field_names() and strip_field_names().
+    fn split_field_names_internal<F: FnMut(String)>(&self, namer: &mut F) -> Arc<DataType> {
+        let is_struct = self.is_struct();
+        let parameters = self
+            .parameters
+            .iter()
+            .cloned()
+            .enumerate()
+            .map(|(i, p)| {
+                let p = if is_struct {
+                    let (p, name) = p.split_name();
+                    namer(name.unwrap_or_else(|| i.to_string()));
+                    p
+                } else {
+                    p
+                };
+                p.map_type(|t| t.split_field_names_internal(namer))
+            })
+            .collect();
+        let class = if self.class == Class::Compound(Compound::NamedStruct) {
+            Class::Compound(Compound::Struct)
+        } else {
+            self.class.clone()
+        };
+        Arc::new(DataType {
+            class,
+            nullable: self.nullable,
+            variation: self.variation.clone(),
+            parameters,
+        })
+    }
+
+    /// Converts all NSTRUCT types in the tree to STRUCT, and returns the
+    /// flattened list of field names encountered. The fields of STRUCT types
+    /// are also returned, to ensure that the returned Vec is applicable to
+    /// apply_field_names(); their names are simply their zero-based index
+    /// converted to a string.
+    pub fn split_field_names(&self) -> (Arc<DataType>, Vec<String>) {
+        let mut names = vec![];
+        let data_type = self.split_field_names_internal(&mut |s| names.push(s));
+        (data_type, names)
+    }
+
+    /// Like split_field_names(), but drops the name strings.
+    pub fn strip_field_names(&self) -> Arc<DataType> {
+        self.split_field_names_internal(&mut |_| ())
+    }
+
+    /// Internal helper function for apply_field_names().
+    fn apply_field_names_internal<F: FnMut() -> diagnostic::Result<String>>(
+        &self,
+        mut namer: &mut F,
+    ) -> diagnostic::Result<Arc<DataType>> {
+        if self.is_struct() {
+            let parameters: Result<Vec<_>, _> = self
+                .parameters
+                .iter()
+                .cloned()
+                .map(|p| {
+                    p.with_name(&mut namer)?
+                        .map_type_result(|t| t.apply_field_names_internal(namer))
+                })
+                .collect();
+
+            // The data type may be invalid after renaming, so we need to
+            // call new() to check validity.
+            DataType::new(
+                Class::Compound(Compound::NamedStruct),
+                self.nullable,
+                self.variation.clone(),
+                parameters?,
+            )
+        } else {
+            let parameters: Result<Vec<_>, _> = self
+                .parameters
+                .iter()
+                .cloned()
+                .map(|p| p.map_type_result(|t| t.apply_field_names_internal(namer)))
+                .collect();
+
+            // Data types generated this way can never become invalid, so we
+            // can construct directly.
+            Ok(Arc::new(DataType {
+                class: self.class.clone(),
+                nullable: self.nullable,
+                variation: self.variation.clone(),
+                parameters: parameters?,
+            }))
+        }
+    }
+
+    /// Applies names to STRUCTs, or renames the names in NSTRUCTs, based on a
+    /// flattened vector of names.
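+    ///
+    /// For example, applying the names `["a", "b"]` to `STRUCT<i32, string>`
+    /// yields `NSTRUCT<a: i32, b: string>`; surplus or missing names are
+    /// reported as TypeMismatchedFieldNameAssociations errors.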
+    pub fn apply_field_names<S: ToString>(
+        &self,
+        names: &[S],
+    ) -> diagnostic::Result<Arc<DataType>> {
+        let mut names = names.iter();
+        let mut num_too_few = 0;
+        let mut namer = || {
+            Ok(names.next().map(|s| s.to_string()).unwrap_or_else(|| {
+                num_too_few += 1;
+                format!("unnamed{num_too_few}")
+            }))
+        };
+        let new_type = self.apply_field_names_internal(&mut namer)?;
+        let remainder = names.count();
+        if self.is_unresolved_deep() {
+            Ok(new_type)
+        } else if remainder > 0 {
+            Err(cause!(
+                TypeMismatchedFieldNameAssociations,
+                "received {remainder} too many field name(s)"
+            ))
+        } else if num_too_few > 0 {
+            Err(cause!(
+                TypeMismatchedFieldNameAssociations,
+                "received {num_too_few} too few field name(s)"
+            ))
+        } else {
+            Ok(new_type)
+        }
+    }
+}
+
+impl Default for DataType {
+    fn default() -> Self {
+        DataType {
+            class: Class::Unresolved,
+            nullable: false,
+            variation: None,
+            parameters: vec![],
+        }
+    }
+}
+
+/// Trait for checking the type parameters for a base type.
+pub trait ParameterInfo {
+    /// Checks whether the given parameter set is valid for this base type.
+    fn check_parameters(&self, params: &[Parameter]) -> diagnostic::Result<()>;
+
+    /// Returns the logical name of the given parameter.
+    fn parameter_name(&self, index: usize) -> Option<String>;
+
+    /// Whether this type supports parameters. This is used to determine
+    /// whether to print <> when the parameter list is empty. This is used to
+    /// distinguish a concrete empty struct from a struct with unspecified
+    /// fields.
+    fn has_parameters(&self) -> bool;
+}
+
+/// Type class.
+#[derive(Clone, Debug, PartialEq)]
+pub enum Class {
+    /// Well-known simple type.
+    Simple(Simple),
+
+    /// Well-known compound type.
+    Compound(Compound),
+
+    /// User-defined type.
+    UserDefined(Arc<extension::Reference<extension::DataType>>),
+
+    /// Unresolved type. Used for error recovery.
+    Unresolved,
+}
+
+impl std::fmt::Display for Class {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Class::Simple(simple) => write!(f, "{simple}"),
+            Class::Compound(compound) => write!(f, "{compound}"),
+            Class::UserDefined(user_defined) => write!(f, "{user_defined}"),
+            Class::Unresolved => write!(f, "!"),
+        }
+    }
+}
+
+impl ParameterInfo for Class {
+    fn check_parameters(&self, params: &[Parameter]) -> diagnostic::Result<()> {
+        match self {
+            Class::Simple(_) => {
+                if params.is_empty() {
+                    Ok(())
+                } else {
+                    Err(cause!(
+                        TypeMismatchedParameters,
+                        "simple types cannot be parameterized"
+                    ))
+                }
+            }
+            Class::Compound(compound) => compound.check_parameters(params),
+            Class::UserDefined(_) => {
+                if params.is_empty() {
+                    Ok(())
+                } else {
+                    Err(cause!(
+                        TypeMismatchedParameters,
+                        "user-defined types cannot currently be parameterized"
+                    ))
+                }
+            }
+            Class::Unresolved => Ok(()),
+        }
+    }
+
+    fn parameter_name(&self, index: usize) -> Option<String> {
+        if let Class::Compound(compound) = self {
+            compound.parameter_name(index)
+        } else {
+            None
+        }
+    }
+
+    fn has_parameters(&self) -> bool {
+        if let Class::Compound(compound) = self {
+            compound.has_parameters()
+        } else {
+            false
+        }
+    }
+}
+
+impl Class {
+    /// Checks whether two classes are equal, also returning true if either or
+    /// both are unresolved.
+    pub fn weak_equals(&self, rhs: &Class) -> bool {
+        match (self, rhs) {
+            (_, Class::Unresolved) | (Class::Unresolved, _) => true,
+            (a, b) => a == b,
+        }
+    }
+}
+
+/// Enumeration of simple types defined by Substrait.
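+///
+/// The case-insensitive, snake_case `EnumString` derive below also allows
+/// parsing, e.g. `"fp32".parse::<Simple>()` yields `Simple::Fp32`.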
+#[derive(Clone, Debug, PartialEq, Display, EnumString)] +#[strum(ascii_case_insensitive, serialize_all = "snake_case")] +pub enum Simple { + Boolean, + I8, + I16, + I32, + I64, + Fp32, + Fp64, + String, + Binary, + Timestamp, + TimestampTz, + Date, + Time, + IntervalYear, + IntervalDay, + Uuid, +} + +/// Enumeration of compound types defined by Substrait. +#[derive(Clone, Debug, PartialEq, Display, EnumString)] +#[strum(ascii_case_insensitive, serialize_all = "UPPERCASE")] +pub enum Compound { + FixedChar, + VarChar, + FixedBinary, + Decimal, + Struct, + #[strum(serialize = "NSTRUCT")] + NamedStruct, + List, + Map, +} + +impl ParameterInfo for Compound { + fn check_parameters(&self, params: &[Parameter]) -> diagnostic::Result<()> { + match self { + Compound::FixedChar | Compound::VarChar | Compound::FixedBinary => { + if params.len() != 1 { + return Err(cause!( + TypeMismatchedParameters, + "{self} expects a single parameter (length)" + )); + } + if let Parameter::Unsigned(length) = params[0] { + // Note: 2147483647 = 2^31-1 = maximum value for signed + // 32-bit integer. However, the significance of the number + // is just that the Substrait specification says this is + // the limit. + const MIN_LENGTH: u64 = 1; + const MAX_LENGTH: u64 = 2147483647; + if !(MIN_LENGTH..=MAX_LENGTH).contains(&length) { + return Err(cause!( + TypeMismatchedParameters, + "{self} length {length} is out of range {MIN_LENGTH}..{MAX_LENGTH}" + )); + } + } else { + return Err(cause!( + TypeMismatchedParameters, + "{self} length parameter must be a positive integer" + )); + } + } + Compound::Decimal => { + if params.len() != 2 { + return Err(cause!( + TypeMismatchedParameters, + "{self} expects two parameters (precision and scale)" + )); + } + if let Parameter::Unsigned(precision) = params[0] { + const MIN_PRECISION: u64 = 1; + const MAX_PRECISION: u64 = 38; + if !(MIN_PRECISION..=MAX_PRECISION).contains(&precision) { + return Err(cause!( + TypeMismatchedParameters, + "{self} precision {precision} is out of range {MIN_PRECISION}..{MAX_PRECISION}" + )); + } + if let Parameter::Unsigned(scale) = params[1] { + if scale > precision { + return Err(cause!( + TypeMismatchedParameters, + "{self} scale {scale} is out of range 0..{precision}" + )); + } + } else { + return Err(cause!( + TypeMismatchedParameters, + "{self} scale parameter must be a positive integer" + )); + } + } else { + return Err(cause!( + TypeMismatchedParameters, + "{self} precision parameter must be a positive integer" + )); + } + } + Compound::Struct => { + for param in params.iter() { + if !matches!(param, Parameter::Type(_)) { + return Err(cause!( + TypeMismatchedParameters, + "{self} parameters must be types" + )); + } + } + } + Compound::NamedStruct => { + let mut names = HashSet::with_capacity(params.len()); + for param in params.iter() { + if let Parameter::NamedType(name, _) = ¶m { + if !names.insert(name) { + return Err(cause!( + TypeMismatchedParameters, + "duplicate field name in {self}: {name}" + )); + } + } else { + return Err(cause!( + TypeMismatchedParameters, + "{self} parameters must be name-types pairs" + )); + } + } + } + Compound::List => { + if params.len() != 1 { + return Err(cause!( + TypeMismatchedParameters, + "{self} expects a single parameter (element type)" + )); + } + if !matches!(params[0], Parameter::Type(_)) { + return Err(cause!( + TypeMismatchedParameters, + "{self} element type parameter must be a type" + )); + } + } + Compound::Map => { + if params.len() != 2 { + return Err(cause!( + TypeMismatchedParameters, + 
"{self} expects two parameters (key type and value type)" + )); + } + if !matches!(params[0], Parameter::Type(_)) { + return Err(cause!( + TypeMismatchedParameters, + "{self} key type parameter must be a type" + )); + } + if !matches!(params[1], Parameter::Type(_)) { + return Err(cause!( + TypeMismatchedParameters, + "{self} value type parameter must be a type" + )); + } + } + } + Ok(()) + } + + fn parameter_name(&self, index: usize) -> Option { + match (self, index) { + (Compound::FixedChar, 0) => Some(String::from("length")), + (Compound::VarChar, 0) => Some(String::from("length")), + (Compound::FixedBinary, 0) => Some(String::from("length")), + (Compound::Decimal, 0) => Some(String::from("precision")), + (Compound::Decimal, 1) => Some(String::from("scale")), + (Compound::Struct, i) => Some(format!("{}", i)), + (Compound::NamedStruct, i) => Some(format!("{}", i)), + (Compound::List, 0) => Some(String::from("element")), + (Compound::Map, 0) => Some(String::from("key")), + (Compound::Map, 1) => Some(String::from("value")), + (_, _) => None, + } + } + + fn has_parameters(&self) -> bool { + true + } +} + +/// Parameter for parameterized types. +#[derive(Clone, Debug, PartialEq)] +pub enum Parameter { + /// Type parameter (list element type, struct element types, etc). + Type(Arc), + + /// Named type parameter (named struct/schema pseudotype elements). + NamedType(String, Arc), + + /// Integral type parameter (varchar length, etc.). + Unsigned(u64), +} + +impl Describe for Parameter { + fn describe( + &self, + f: &mut std::fmt::Formatter<'_>, + limit: string_util::Limit, + ) -> std::fmt::Result { + match self { + Parameter::Type(data_type) => data_type.describe(f, limit), + Parameter::NamedType(name, data_type) => { + let (name_limit, type_limit) = limit.split(name.len()); + string_util::describe_identifier(f, name, name_limit)?; + write!(f, ": ")?; + data_type.describe(f, type_limit) + } + Parameter::Unsigned(value) => write!(f, "{value}"), + } + } +} + +impl std::fmt::Display for Parameter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.display().fmt(f) + } +} + +impl Parameter { + /// Splits the name annotation off from a named type parameter. + pub fn split_name(self) -> (Parameter, Option) { + match self { + Parameter::NamedType(n, t) => (Parameter::Type(t), Some(n)), + p => (p, None), + } + } + + /// Returns the name of a named type parameter. + pub fn get_name(&self) -> Option<&str> { + match self { + Parameter::NamedType(n, _) => Some(n), + _ => None, + } + } + + /// Returns the type of a type parameter. + pub fn get_type(&self) -> Option<&Arc> { + match self { + Parameter::Type(t) => Some(t), + Parameter::NamedType(_, t) => Some(t), + _ => None, + } + } + + /// Annotates the parameter with a name, if applicable. If the parameter + /// was already named, the name is replaced. The function is only called + /// for Types and NamedTypes. None is returned only if the function was + /// called and returned None. + pub fn with_name Result>(self, f: F) -> Result { + Ok(match self { + Parameter::Type(t) => Parameter::NamedType(f()?, t), + Parameter::NamedType(_, t) => Parameter::NamedType(f()?, t), + p => p, + }) + } + + /// Modifies the contained type using the given function, if applicable. If + /// this is not a type parameter, the function is not called. 
+ pub fn map_type_result) -> Result, E>>( + self, + f: F, + ) -> Result { + Ok(match self { + Parameter::Type(t) => Parameter::Type(f(t)?), + Parameter::NamedType(n, t) => Parameter::NamedType(n, f(t)?), + p => p, + }) + } + + /// Modifies the contained type using the given function, if applicable. If + /// this is not a type parameter, the function is not called. + pub fn map_type) -> Arc>(self, f: F) -> Parameter { + match self { + Parameter::Type(t) => Parameter::Type(f(t)), + Parameter::NamedType(n, t) => Parameter::NamedType(n, f(t)), + p => p, + } + } +} + +impl From for Parameter { + fn from(t: DataType) -> Self { + Parameter::Type(Arc::new(t)) + } +} + +impl From> for Parameter { + fn from(t: Arc) -> Self { + Parameter::Type(t) + } +} + +impl From for Parameter { + fn from(x: u64) -> Self { + Parameter::Unsigned(x) + } +} diff --git a/rs/src/output/diagnostic.rs b/rs/src/output/diagnostic.rs new file mode 100644 index 00000000..5f8614bf --- /dev/null +++ b/rs/src/output/diagnostic.rs @@ -0,0 +1,594 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for diagnostic message types. +//! +//! Since diagnostic messages are rather important for a validator (after all, +//! getting a diagnostic message is hardly an exceptional case), they have +//! quite a bit of metadata attached to them. Ultimately, the diagnostic +//! messages attached to the tree ([`Diagnostic`]) have the following +//! parameters: +//! +//! - cause.message: an enumeration of various types of error messages, in +//! the usual Rust way. Messages generated by this crate are usually +//! untyped (they just use String), but error information from other +//! crates is retained as much as possible. +//! - cause.classification: an enumeration of various bits of the validation +//! process where diagnostics might occur. Each [`Classification`] enum +//! variant can be converted to a unique number, known as the diagnostic +//! code, which the user of the crate may use to easily programmatically +//! determine what caused a diagnostic in a language-agnostic way. The user +//! may also configure the validator in advance to promote or reduce the +//! severity of diagnostics, indexed by their code. The codes are +//! furthermore organized into groups, with up to 999 classes per group: the +//! thousands digit and up is the group identifier, and the less-significant +//! digits form the sub-code. Sub-code 0 is reserved to refer to the group +//! as a whole. +//! - original_level: the error [`Level`] that the validation code assigned to +//! the message. This can be `Error`, `Warning`, or `Info`, which correspond +//! directly to "this is definitely wrong," "this may or may not be wrong," +//! and "this conforms to the Substrait specification, but it's worth noting +//! anyway" respectively. +//! - adjusted_level: the error [`Level`] after configuration-based adjustment. +//! This level is what's used by the high-level APIs to determine the +//! validity of a plan. Thus, a user can choose to ignore a particular error +//! if their consumer implementation can deal with it anyway, or they can +//! assert whether a particular type of warning is actually an error or not. +//! - path: a path into the substrait.Plan message. This is *usually* just a +//! copy of the path to the node that was being validated when the +//! diagnostic was created, but in some cases diagnostics may be placed in a +//! parent node (for instance to refer to a node that should exist but +//! 
doesn't), or refer to a different location altogether (for instance to +//! point the user to the previous definition in a note following a +//! duplicate definition error). + +use crate::output::path; +use num_traits::cast::FromPrimitive; +use std::sync::Arc; +use strum::EnumProperty; + +/// Owned variant of jsonschema::error::ValidationError<'a>. Instead of a +/// reference to the YAML tree node that caused the error, this just contains +/// the formatted error message. The validation error kind and paths are +/// however retained. +#[derive(Debug, thiserror::Error)] +pub struct JsonSchemaValidationError { + pub message: String, + pub kind: jsonschema::error::ValidationErrorKind, + pub instance_path: jsonschema::paths::JSONPointer, + pub schema_path: jsonschema::paths::JSONPointer, +} + +impl std::fmt::Display for JsonSchemaValidationError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.message.fmt(f) + } +} + +impl From> for JsonSchemaValidationError { + fn from(v: jsonschema::error::ValidationError) -> Self { + JsonSchemaValidationError { + message: v.to_string(), + kind: v.kind, + instance_path: v.instance_path, + schema_path: v.schema_path, + } + } +} + +/// Enumeration for error message data we might encounter. +#[derive(Debug, thiserror::Error)] +pub enum Message { + #[error("{0}")] + Untyped(String), + + #[error("{0}")] + ProstDecodeError(#[from] prost::DecodeError), + + #[error("{0}")] + IoError(#[from] std::io::Error), + + #[error("{0}")] + UtfError(#[from] std::str::Utf8Error), + + #[error("{0}")] + YamlScanError(#[from] yaml_rust::ScanError), + + #[error("{0}")] + JsonSchemaValidationError(#[from] JsonSchemaValidationError), + + #[error("{0}")] + UriError(#[from] uriparse::URIReferenceError), + + #[error("{0}")] + GlobError(#[from] glob::PatternError), +} + +impl From<&str> for Message { + fn from(s: &str) -> Self { + Message::Untyped(s.to_string()) + } +} + +impl From for Message { + fn from(s: String) -> Self { + Message::Untyped(s) + } +} + +impl From> for Message { + fn from(v: jsonschema::error::ValidationError<'_>) -> Self { + JsonSchemaValidationError::from(v).into() + } +} + +/// Enumeration for the particular types of diagnostics we might encounter. +/// +/// Numbers must be assigned as follows: +/// - the group identifier is represented by the thousands digit and up; +/// - the first classification for each group (i.e. divisible by 1000) is +/// reserved for diagnostics that have no more specific information +/// attached to them: their description must be hidden and related to +/// the group name; +/// - group 0 is a sort of null group, where no group information is known; +/// - all enum variant names for classifications belonging to a group (except +/// the null group) must be prefixed by the group name; +/// - for backward/forward-compatibility, numbers should not be reassigned. +/// +/// The Description and HiddenDescription enum properties define a description +/// of the class. When Description is used, the description is prefixed before +/// the error message; when HiddenDescription is used, the message is not +/// prefixed, and should thus be sufficiently specific to not need it. The +/// latter is useful to reduce the amount of redundant information in a +/// message. +#[derive( + Clone, + Copy, + Debug, + PartialEq, + Eq, + Hash, + strum_macros::EnumIter, + strum_macros::EnumProperty, + num_derive::FromPrimitive, +)] +pub enum Classification { + // Unclassified diagnostics (group 0). 
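+    // As a worked example of the numbering scheme described above: code 2004
+    // has thousands digit 2, so it belongs to group 2 (Yaml = 2000), with
+    // sub-code 4 within that group.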
+ #[strum(props(HiddenDescription = "unclassified diagnostic"))] + Unclassified = 0, + + #[strum(props(Description = "not yet implemented"))] + NotYetImplemented = 1, + + #[strum(props(Description = "illegal value"))] + IllegalValue = 2, + + #[strum(props(Description = "illegal value in hint"))] + IllegalValueInHint = 3, + + #[strum(props(Description = "illegal URI"))] + IllegalUri = 4, + + #[strum(props(Description = "illegal glob"))] + IllegalGlob = 5, + + // Protobuf-related diagnostics (group 1). + #[strum(props(HiddenDescription = "protobuf-related diagnostic"))] + Proto = 1000, + + #[strum(props(HiddenDescription = "protobuf parsing failed"))] + ProtoParseFailed = 1001, + + #[strum(props(Description = "missing required protobuf field"))] + ProtoMissingField = 1002, + + #[strum(props(Description = "encountered a protobuf \"any\""))] + ProtoAny = 1004, + + #[strum(props(Description = "missing protobuf \"any\" declaration"))] + ProtoMissingAnyDeclaration = 1006, + + // YAML-reated diagnostics (group 2). + #[strum(props(HiddenDescription = "YAML-related diagnostic"))] + Yaml = 2000, + + #[strum(props(Description = "did not attempt to resolve YAML"))] + YamlResolutionDisabled = 2001, + + #[strum(props(Description = "failed to resolve YAML"))] + YamlResolutionFailed = 2002, + + #[strum(props(Description = "failed to parse YAML"))] + YamlParseFailed = 2003, + + #[strum(props(Description = "YAML does not conform to schema"))] + YamlSchemaValidationFailed = 2004, + + #[strum(props(Description = "missing required YAML key"))] + YamlMissingKey = 2005, + + #[strum(props(Description = "missing required YAML array element"))] + YamlMissingElement = 2007, + + #[strum(props(Description = "invalid YAML value type"))] + YamlInvalidType = 2008, + + // Link resolution diagnostics (group 3). + #[strum(props(HiddenDescription = "link resolution diagnostic"))] + Link = 3000, + + #[strum(props(Description = "failed to resolve anchor"))] + LinkMissingAnchor = 3001, + + #[strum(props(Description = "failed to resolve function name"))] + LinkMissingFunctionName = 3002, + + #[strum(props(Description = "failed to resolve type name"))] + LinkMissingTypeName = 3003, + + #[strum(props(Description = "failed to resolve type variation name"))] + LinkMissingTypeVariationName = 3004, + + // Type-related diagnostics (group 4). + #[strum(props(HiddenDescription = "type-related diagnostics"))] + Type = 4000, + + #[strum(props(Description = "unknown type"))] + TypeUnknown = 4001, + + #[strum(props(Description = "mismatched type parameters"))] + TypeMismatchedParameters = 4002, + + #[strum(props(Description = "mismatched field name associations"))] + TypeMismatchedFieldNameAssociations = 4003, + + #[strum(props(Description = "invalid swizzle operation"))] + TypeInvalidSwizzle = 4004, + + #[strum(props(Description = "mismatched types"))] + TypeMismatch = 4005, + + #[strum(props(Description = "struct type is required"))] + TypeStructRequired = 4006, + + #[strum(props(Description = "mismatched type variation"))] + TypeMismatchedVariation = 4007, + + #[strum(props(Description = "mismatched nullability"))] + TypeMismatchedNullability = 4008, + + // Relation-related diagnostics (group 5). 
+ #[strum(props(HiddenDescription = "relation-related diagnostics"))] + Relation = 5000, + + #[strum(props(Description = "missing root relation"))] + RelationRootMissing = 5001, + + #[strum(props(Description = "missing relation"))] + RelationMissing = 5002, + + #[strum(props(Description = "invalid relation"))] + RelationInvalid = 5003, + + // Expression-related diagnostics (group 6). + #[strum(props(HiddenDescription = "expression-related diagnostics"))] + Expression = 6000, + + #[strum(props(Description = "field reference into non-existent stream"))] + ExpressionFieldRefMissingStream = 6001, + + #[strum(props(Description = "illegal literal value"))] + ExpressionIllegalLiteralValue = 6002, + + #[strum(props(Description = "function definition unavailable"))] + ExpressionFunctionDefinitionUnavailable = 6003, + + #[strum(props(Description = "illegal subquery"))] + ExpressionIllegalSubquery = 6004, + + // Redundant declarations (group 7). + #[strum(props( + HiddenDescription = "diagnostics for pointing out parts of the plan that can be removed without changing its semantics" + ))] + Redundant = 7000, + + #[strum(props(Description = "redundant protobuf \"any\" declaration"))] + RedundantProtoAnyDeclaration = 7001, + + #[strum(props(Description = "redundant extension URI definition"))] + RedundantExtensionDefition = 7002, + + #[strum(props(Description = "redundant function declaration"))] + RedundantFunctionDeclaration = 7003, + + #[strum(props(Description = "redundant type declaration"))] + RedundantTypeDeclaration = 7004, + + #[strum(props(Description = "redundant type variation declaration"))] + RedundantTypeVariationDeclaration = 7005, + + #[strum(props(Description = "redundant list slice"))] + RedundantListSlice = 7006, + + #[strum(props(Description = "redundant field"))] + RedundantField = 7007, +} + +impl Default for Classification { + fn default() -> Self { + Classification::Unclassified + } +} + +impl Classification { + /// Returns the complete code for this classification. + pub fn code(&self) -> u32 { + *self as u32 + } + + /// Returns the name of the classiciation. + pub fn name(&self) -> String { + format!("{:?}", self) + } + + /// Returns the group code for this classification. + pub fn group_code(&self) -> u32 { + (*self as u32) / 1000 + } + + /// Returns the group variant for this classification. + pub fn group(&self) -> Classification { + Self::from_group(self.group_code()) + .unwrap_or_else(|| panic!("missing group for {:?}", self)) + } + + /// Returns the code for this classification within its group. + pub fn sub_code(&self) -> u32 { + (*self as u32) % 1000 + } + + /// Returns the description of this classification. + pub fn description(&self) -> &str { + self.get_str("Description") + .or_else(|| self.get_str("HiddenDescription")) + .unwrap_or_else(|| { + panic!( + "missing Description or HiddenDescription property for {:?}", + self + ) + }) + } + + /// Returns the classification associated with the given code, if any. + pub fn from_code(code: u32) -> Option { + Self::from_u32(code) + } + + /// Returns the group classification associated with the given code, if + /// any. + pub fn group_from_code(code: u32) -> Option { + Self::from_group(code / 1000) + } + + /// Returns the group classification associated with the given group. + pub fn from_group(group: u32) -> Option { + Self::from_u32(group * 1000) + } + + /// Returns the "parent" code for the given code. For non-group codes, this + /// is the code of their group (code rounded down to thousands). 
For group + /// codes, this is 0. + pub fn parent(code: u32) -> u32 { + if code % 1000 != 0 { + (code / 1000) * 1000 + } else { + 0 + } + } + + /// Formats a Message with this classification. + pub fn format_message( + &self, + message: &Message, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + if let Some(description) = self.get_str("Description") { + write!(f, "{description}: ")?; + } + write!(f, "{message} (code {:04})", self.code()) + } +} + +impl From for u32 { + /// Converts a Classification into its error code. + fn from(classification: Classification) -> Self { + classification.code() + } +} + +/// Description of the cause of a diagnostic. +#[derive(Clone, Debug, thiserror::Error)] +pub struct Cause { + /// The error message. Within this crate we don't bother typing these + /// beyond the Classification enum, but we do retain typing information for + /// messages from other crates. + pub message: Arc, + + /// Classification of this cause. This attaches an error code and generic + /// message for said code to the diagnostic message. The user can use these + /// codes to for instance always promote a particular type of diagnostic to + /// an error (like gcc -Werror). + pub classification: Classification, +} + +impl PartialEq for Cause { + fn eq(&self, other: &Self) -> bool { + self.message.to_string() == other.message.to_string() + && self.classification == other.classification + } +} + +impl std::fmt::Display for Cause { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.classification.format_message(&self.message, f) + } +} + +/// Convenience/shorthand macro for creating error diagnostics. Use this +/// variant when you have something that can be cast into a Message via into(), +/// like a pre-formatted string or a compatible Error type from a dependency. +macro_rules! ecause { + ($class:ident, $message:expr) => { + crate::output::diagnostic::Cause { + message: std::sync::Arc::new($message.into()), + classification: crate::output::diagnostic::Classification::$class, + } + }; +} + +/// Convenience/shorthand macro for creating error diagnostics. Use this +/// variant when you want to format a string. The argument list beyond the +/// diagnostic class identifier is passed straight to [`format!`]. +macro_rules! cause { + ($class:ident, $($args:expr),*) => { + ecause!($class, format!($($args),*)) + }; +} + +/// Result type for diagnostic causes. +pub type Result = std::result::Result; + +/// Error level for a diagnostic message. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Level { + /// Level used for diagnostics that don't point out anything wrong with + /// the plan, and merely provide additional information. + Info, + + /// Level used for diagnostics that may or may not indicate that there + /// is something wrong with the plan, i.e. the plan *could* be valid, + /// but the validator isn't sure. + Warning, + + /// Level used for diagnostics that indicate that there is definitely + /// something wrong with the plan. + Error, +} + +/// A diagnostic message, without configuration-based level override. +#[derive(Clone, Debug, PartialEq, thiserror::Error)] +pub struct RawDiagnostic { + /// The cause of the diagnostic. + pub cause: Cause, + + /// The severity of the diagnostic. + pub level: Level, + + /// The path within the protobuf message where the diagnostic occurred. 
+ pub path: path::PathBuf, +} + +impl std::fmt::Display for RawDiagnostic { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.level)?; + if !f.alternate() { + write!(f, " at {}", self.path)?; + } + write!(f, ": {}", self.cause) + } +} + +/// A diagnostic message, including configuration-based level override. +#[derive(Clone, Debug, PartialEq, thiserror::Error)] +pub struct Diagnostic { + /// The cause of the diagnostic. + pub cause: Cause, + + /// The original severity of the diagnostic. + pub original_level: Level, + + /// The severity of the diagnostic after application of configuration. + pub adjusted_level: Level, + + /// The path within the protobuf message where the diagnostic occurred. + pub path: path::PathBuf, +} + +impl std::fmt::Display for Diagnostic { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self.adjusted_level)?; + match self.original_level.cmp(&self.adjusted_level) { + std::cmp::Ordering::Less => write!(f, " (upgraded from {:?})", self.original_level)?, + std::cmp::Ordering::Equal => {} + std::cmp::Ordering::Greater => { + write!(f, " (downgraded from {:?})", self.original_level)? + } + } + if !f.alternate() { + write!(f, " at {}", self.path)?; + } + write!(f, ": {}", self.cause) + } +} + +impl RawDiagnostic { + /// Converts to an AdjustedDiagnostic by adding an adjusted level. + pub fn adjust_level(self, adjusted_level: Level) -> Diagnostic { + Diagnostic { + cause: self.cause, + original_level: self.level, + adjusted_level, + path: self.path, + } + } +} + +/// Convenience/shorthand macro for creating error diagnostics. +macro_rules! diag { + ($path:expr, $level:ident, $class:ident, $($args:expr),*) => { + diag!($path, $level, cause!($class, $($args),*)) + }; + ($path:expr, $level:ident, $cause:expr) => { + crate::output::diagnostic::RawDiagnostic { + cause: $cause, + level: crate::output::diagnostic::Level::$level, + path: $path + } + }; +} +/*macro_rules! ediag { + ($path:expr, $level:ident, $class:ident, $err:expr) => { + diag!($path, $level, ecause!($class, $err)) + }; +}*/ + +/// Result type for complete diagnostics, including path. +pub type DiagResult = std::result::Result; + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashSet; + use strum::IntoEnumIterator; + + #[test] + fn test_diagnostic_classifications() { + // Check validity of the classifications definitions. + let mut descriptions = HashSet::new(); + for class in Classification::iter() { + let group = class.group(); + if group != Classification::Unclassified { + assert!( + class.name().starts_with(&group.name()), + "incorrect group prefix for {:?}, should start with {:?}", + class, + group + ); + } + assert!( + descriptions.insert(class.description().to_string()), + "duplicate description for {:?}", + class + ); + } + } +} diff --git a/rs/src/output/extension.rs b/rs/src/output/extension.rs new file mode 100644 index 00000000..abc9689e --- /dev/null +++ b/rs/src/output/extension.rs @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for dealing with YAML-based Substrait extensions. + +use crate::output::data_type; +use crate::output::path; +use crate::output::tree; +use crate::string_util; +use std::collections::HashMap; +use std::sync::Arc; + +/// Represents a named reference to something. +#[derive(Clone, Debug, Default)] +pub struct NamedReference { + /// The name of the type, type variation, or function. 
+    name: Option<String>,
+
+    /// The path to the node that defined the anchor for this extension, if
+    /// any.
+    anchor_path: Option<path::PathBuf>,
+}
+
+impl PartialEq for NamedReference {
+    /// Named references are equal if both references have a known name and
+    /// those names are the same.
+    fn eq(&self, other: &Self) -> bool {
+        self.name.is_some() && other.name.is_some() && self.name == other.name
+    }
+}
+
+impl Eq for NamedReference {}
+
+impl std::fmt::Display for NamedReference {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if let Some(name) = &self.name {
+            write!(f, "{}", string_util::as_ident_or_string(name))
+        } else {
+            write!(f, "?")
+        }
+    }
+}
+
+impl NamedReference {
+    /// Create a new anchor-based reference.
+    pub fn new<S: ToString>(
+        name: Option<S>,
+        anchor_path: Option<path::PathBuf>,
+    ) -> Arc<NamedReference> {
+        Arc::new(NamedReference {
+            name: name.map(|x| x.to_string()),
+            anchor_path,
+        })
+    }
+
+    /// Create a new named reference.
+    pub fn new_by_name<S: ToString>(name: S) -> Arc<NamedReference> {
+        Arc::new(NamedReference {
+            name: Some(name.to_string()),
+            anchor_path: None,
+        })
+    }
+
+    /// Create a new unknown reference.
+    pub fn new_unknown() -> Arc<NamedReference> {
+        Arc::default()
+    }
+
+    /// Returns the name, if known.
+    pub fn name(&self) -> Option<&str> {
+        self.name.as_ref().map(|s| &s[..])
+    }
+
+    /// Returns the path to the anchor, if known.
+    pub fn anchor_path(&self) -> Option<&path::PathBuf> {
+        self.anchor_path.as_ref()
+    }
+}
+
+/// Named/namespaced reference to a particular extension definition.
+#[derive(Clone, Debug, Default)]
+pub struct Reference<T> {
+    /// The name of the type, type variation, or function.
+    pub name: Arc<NamedReference>,
+
+    /// The URI of the YAML file that defined this extension.
+    pub uri: Arc<NamedReference>,
+
+    /// Extension definition information, specific to this type of extension,
+    /// if we managed to resolve the reference.
+    pub definition: Option<Arc<T>>,
+}
+
+impl<T> PartialEq for Reference<T> {
+    /// References are equal if they refer to the same thing, regardless of
+    /// how they refer to it. If we're not sure because either reference is
+    /// (partially) unresolved, return false pessimistically.
+    fn eq(&self, other: &Self) -> bool {
+        self.name == other.name && self.uri == other.uri
+    }
+}
+
+impl<T> Eq for Reference<T> {}
+
+impl<T> std::fmt::Display for Reference<T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}::{}", self.uri, self.name)
+    }
+}
+
+/// User-defined base data type.
+#[derive(Clone, Debug, PartialEq, Default)]
+pub struct DataType {
+    /// The underlying structure of the type.
+    pub structure: Vec<(String, data_type::Simple)>,
+}
+
+/// The base type of a type variation.
+#[derive(Clone, Debug, PartialEq)]
+pub enum TypeVariationBase {
+    /// The type variation is immediately based in a physical type.
+    Physical(data_type::Class),
+
+    /// The type variation is based in another logical type variation.
+    Logical(Arc<TypeVariation>),
+
+    /// The base type is unknown.
+    Unresolved,
+}
+
+impl Default for TypeVariationBase {
+    fn default() -> Self {
+        TypeVariationBase::Unresolved
+    }
+}
+
+/// Type variation extension.
+#[derive(Clone, Debug, PartialEq, Default)]
+pub struct TypeVariation {
+    /// The base type for this variation.
+    pub base: TypeVariationBase,
+
+    /// Function behavior for this variation.
+    pub function_behavior: FunctionBehavior,
+}
+
+impl TypeVariation {
+    /// Return the base class for this type variation, if known.
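+    ///
+    /// For example (a sketch; both variations below resolve to the i32
+    /// class, the second via one step of recursion):
+    ///
+    /// ```ignore
+    /// let physical = TypeVariation {
+    ///     base: TypeVariationBase::Physical(data_type::Class::Simple(data_type::Simple::I32)),
+    ///     ..Default::default()
+    /// };
+    /// let logical = TypeVariation {
+    ///     base: TypeVariationBase::Logical(Arc::new(physical)),
+    ///     ..Default::default()
+    /// };
+    /// assert_eq!(
+    ///     logical.get_base_class(),
+    ///     data_type::Class::Simple(data_type::Simple::I32)
+    /// );
+    /// ```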
+ pub fn get_base_class(&self) -> data_type::Class { + match &self.base { + TypeVariationBase::Physical(x) => x.clone(), + TypeVariationBase::Logical(x) => x.get_base_class(), + TypeVariationBase::Unresolved => data_type::Class::Unresolved, + } + } +} + +/// Type variation function behavior. +#[derive(Clone, Debug, PartialEq)] +pub enum FunctionBehavior { + Inherits, + Separate, +} + +impl Default for FunctionBehavior { + fn default() -> Self { + FunctionBehavior::Inherits + } +} + +/// Function extension. +#[derive(Clone, Debug, PartialEq, Default)] +pub struct Function { + // TODO: need much more information here to do type checking. +} + +/// Information about a YAML extension, which may or may not be resolved. +#[derive(Clone, Debug, PartialEq)] +pub enum YamlInfo { + Unresolved(Arc), + Resolved(Arc), +} + +impl YamlInfo { + pub fn data(&self) -> Option<&YamlData> { + match self { + YamlInfo::Unresolved(_) => None, + YamlInfo::Resolved(x) => Some(x), + } + } + + pub fn uri(&self) -> &Arc { + match self { + YamlInfo::Unresolved(x) => x, + YamlInfo::Resolved(x) => &x.uri, + } + } +} + +impl Default for YamlInfo { + fn default() -> Self { + YamlInfo::Unresolved(Arc::default()) + } +} + +impl std::fmt::Display for YamlInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.uri()) + } +} + +/// Data for a resolved YAML file. +#[derive(Clone, Debug, PartialEq)] +pub struct YamlData { + /// URI for the YAML file. + pub uri: Arc, + + /// Reference to the parsed YAML data, if any. + pub data: tree::NodeReference, + + /// Functions defined in this YAML file. Names are stored in lower case + /// (Substrait's name resolution is case-insensitive). + pub functions: HashMap>, + + /// Types defined in this YAML file. Names are stored in lower case + /// (Substrait's name resolution is case-insensitive). + pub types: HashMap>, + + /// Type variations defined in this YAML file. Names are stored in lower + /// case (Substrait's name resolution is case-insensitive). + pub type_variations: HashMap>, +} + +impl YamlData { + /// Constructs an empty YamlData object with an invalid reference to the + /// data node. Everything still needs to be populated for this to become + /// valid. + pub fn new(uri: Arc) -> YamlData { + YamlData { + uri, + data: tree::NodeReference { + path: path::Path::Root("").to_path_buf(), + node: Arc::new(tree::NodeType::YamlMap.into()), + }, + functions: HashMap::default(), + types: HashMap::default(), + type_variations: HashMap::default(), + } + } + + /// Helper function for the various resolvers. + fn local_reference( + &self, + name: S, + definition: Option>, + ) -> Arc> { + Arc::new(Reference { + name: NamedReference::new_by_name(name), + uri: self.uri.clone(), + definition, + }) + } + + /// Resolves a function defined in this YAML data block by name. Returns an + /// unresolved reference if it does not exist. + pub fn resolve_function(&self, name: S) -> Arc> { + let name = name.to_string(); + let maybe_def = self.functions.get(&name).cloned(); + self.local_reference(name, maybe_def) + } + + /// Resolves a type defined in this YAML data block by name. Returns an + /// unresolved reference if it does not exist. + pub fn resolve_type(&self, name: S) -> Arc> { + let name = name.to_string(); + let maybe_def = self.types.get(&name).cloned(); + self.local_reference(name, maybe_def) + } + + /// Resolves a type variation defined in this YAML data block by name. + /// Returns an unresolved reference if it does not exist. 
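+    ///
+    /// A sketch of the lookup (names are stored lower-case, so callers are
+    /// expected to normalize case first; `"my_var"` is hypothetical):
+    ///
+    /// ```ignore
+    /// let reference = yaml_data.resolve_type_variation("my_var");
+    /// if reference.definition.is_none() {
+    ///     // Not declared in this YAML file; the reference stays unresolved.
+    /// }
+    /// ```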
+ pub fn resolve_type_variation(&self, name: S) -> Arc> { + let name = name.to_string(); + let maybe_def = self.type_variations.get(&name).cloned(); + self.local_reference(name, maybe_def) + } +} diff --git a/rs/src/output/mod.rs b/rs/src/output/mod.rs new file mode 100644 index 00000000..3cfb23df --- /dev/null +++ b/rs/src/output/mod.rs @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Output representation module. +//! +//! This module provides the data structures for representing the output of the +//! validator. + +#[macro_use] +pub mod diagnostic; + +pub mod comment; +pub mod data_type; +pub mod extension; +pub mod parse_result; +pub mod path; +pub mod primitive_data; +pub mod tree; diff --git a/rs/src/output/parse_result.rs b/rs/src/output/parse_result.rs new file mode 100644 index 00000000..29130846 --- /dev/null +++ b/rs/src/output/parse_result.rs @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for the toplevel type representing a parse/validation result. + +use crate::export; +use crate::output::diagnostic; +use crate::output::tree; + +/// Validity of a plan. +/// +/// Note that there is a one-to-one correspondence with Level. The only +/// difference between Level and Validity is that the variant names for Level +/// are more sensible in the context of a diagnostic, while the names for +/// Validity are more sensible when talking about a validation result as a +/// whole. +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Validity { + /// The plan is valid. + Valid, + + /// The plan may or may not be valid; the validator was not able to prove + /// or disprove validity. + MaybeValid, + + /// The plan is invalid. + Invalid, +} + +impl From for Validity { + fn from(level: diagnostic::Level) -> Self { + match level { + diagnostic::Level::Info => Validity::Valid, + diagnostic::Level::Warning => Validity::MaybeValid, + diagnostic::Level::Error => Validity::Invalid, + } + } +} + +impl From for diagnostic::Level { + fn from(validity: Validity) -> Self { + match validity { + Validity::Valid => diagnostic::Level::Info, + Validity::MaybeValid => diagnostic::Level::Warning, + Validity::Invalid => diagnostic::Level::Error, + } + } +} + +/// Representation of a parse/validation result. +pub struct ParseResult { + /// The root node of the tree. + pub root: tree::Node, +} + +impl ParseResult { + /// Iterates over all diagnostic messages in the tree. + pub fn iter_diagnostics(&self) -> impl Iterator + '_ { + self.root.iter_diagnostics() + } + + /// Returns the first diagnostic of the highest severity level in the tree. + pub fn get_diagnostic(&self) -> Option<&diagnostic::Diagnostic> { + self.root.get_diagnostic() + } + + /// Returns whether the plan represented by the given parse tree is valid. + pub fn check(&self) -> Validity { + if let Some(diag) = self.get_diagnostic() { + diag.adjusted_level.into() + } else { + Validity::Valid + } + } + + /// Exports a parse tree to a file or other output device using the specified + /// data format. + pub fn export( + &self, + out: &mut T, + format: export::Format, + ) -> std::io::Result<()> { + export::export(out, format, "plan", self) + } +} diff --git a/rs/src/output/path.rs b/rs/src/output/path.rs new file mode 100644 index 00000000..731eb095 --- /dev/null +++ b/rs/src/output/path.rs @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for handling tree paths. +//! +//! The [`PathElement`], [`Path`], and [`PathBuf`] types are used to uniquely +//! 
refer to any node in a Substrait plan (or, more accurately, any +//! combination of protobuf and YAML data). [`Path`], and [`PathBuf`] work +//! roughly the same as [`std::path::Path`], and [`std::path::PathBuf`], but +//! for protobuf/YAML tree paths rather than filesystem paths. + +use crate::string_util; + +/// Element of a path to some field of a protobuf message and/or YAML file. +#[derive(Clone, Debug, PartialEq)] +pub enum PathElement { + /// Refers to an optional protobuf field with the given name within the + /// message, or a YAML map entry with the given key. + Field(String), + + /// Refers to one of the elements of a repeated field with the given + /// name within the message referred to by the parent path. + Repeated(String, usize), + + /// Refers to the selected variant of a OneOf field with the given name + /// within the message referred to by the parent path. The first str is + /// the field name, the second is the variant name. + Variant(String, String), + + /// Refers to an indexed element within a YAML array. + Index(usize), +} + +impl std::fmt::Display for PathElement { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if !f.alternate() { + match self { + PathElement::Index(_) => {} + _ => write!(f, ".")?, + } + } + match self { + PathElement::Field(field) => write!(f, "{}", string_util::as_ident_or_string(field)), + PathElement::Repeated(field, index) => { + write!(f, "{}[{index}]", string_util::as_ident_or_string(field)) + } + PathElement::Variant(field, variant) => write!( + f, + "{}<{}>", + string_util::as_ident_or_string(field), + string_util::as_ident_or_string(variant) + ), + PathElement::Index(index) => write!(f, "[{index}]"), + } + } +} + +impl PathElement { + /// Same as to_string(), but doesn't include the dot prefix for the + /// variants that would normally have one. + pub fn to_string_without_dot(&self) -> String { + format!("{:#}", self) + } +} + +/// Refers to a location within a protobuf message. +#[derive(Clone, Debug, PartialEq)] +pub struct PathBuf { + pub root: &'static str, + pub elements: Vec, +} + +impl std::fmt::Display for PathBuf { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.root)?; + for element in self.elements.iter() { + write!(f, "{element}")?; + } + Ok(()) + } +} + +/// Used to track a location within a protobuf message. The owned version +/// is PathBuf. +#[derive(Clone, Debug, PartialEq)] +pub enum Path<'a> { + /// Refers to the root message. + Root(&'static str), + + /// Refers to an optional field with the given name within the message + /// referred to by the given parent path. + Select(&'a Path<'a>, PathElement), +} + +impl Default for Path<'_> { + fn default() -> Self { + Path::Root("") + } +} + +impl Path<'_> { + /// Returns a new Path that references an optional field with the + /// given name within the protobuf message referred to by the current + /// path, or likewise for the key within a YAML map. + pub fn with(&self, element: PathElement) -> Path { + Path::Select(self, element) + } + + /// Returns a new Path that references an optional field with the + /// given name within the protobuf message referred to by the current + /// path, or likewise for the key within a YAML map. + pub fn with_field>(&self, name: S) -> Path { + self.with(PathElement::Field(name.into())) + } + + /// Returns a new Path that references an element of a repeated field + /// with the given name within the message referred to by the current + /// path. 
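+    ///
+    /// For example (mirroring the unit tests below; each step must be bound
+    /// to a variable, because a `Path` borrows its parent):
+    ///
+    /// ```ignore
+    /// let plan = Path::Root("plan");
+    /// let relations = plan.with_field("relations");
+    /// let rel = relations.with_repeated("rel", 0);
+    /// assert_eq!(rel.to_string(), "plan.relations.rel[0]");
+    /// ```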
+ pub fn with_repeated>(&self, name: S, index: usize) -> Path { + self.with(PathElement::Repeated(name.into(), index)) + } + + /// Returns a new Path that references a particular variant of a + /// OneOf field with the given name within the message referred to + /// by the current path. + pub fn with_variant, V: Into>(&self, name: S, variant: V) -> Path { + self.with(PathElement::Variant(name.into(), variant.into())) + } + + /// Returns a new Path that references a YAML array element. + pub fn with_index(&self, index: usize) -> Path { + self.with(PathElement::Index(index)) + } +} + +impl std::fmt::Display for Path<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Path::Root(name) => write!(f, "{name}"), + Path::Select(parent, element) => write!(f, "{parent}{element}"), + } + } +} + +impl Path<'_> { + pub fn end_to_string(&self) -> String { + match self { + Path::Root(name) => name.to_string(), + Path::Select(_, element) => element.to_string(), + } + } + + /// Creates an owned version of this Path. + pub fn to_path_buf(&self) -> PathBuf { + match self { + Path::Root(name) => PathBuf { + root: name, + elements: vec![], + }, + Path::Select(parent, element) => { + let mut parent = parent.to_path_buf(); + parent.elements.push(element.clone()); + parent + } + } + } +} + +impl From> for PathBuf { + fn from(path: Path<'_>) -> Self { + path.to_path_buf() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn paths() { + let a = Path::Root("a"); + let b = a.with_field("b"); + let c = b.with_repeated("c", 42); + let d = c.with_variant("d", "e"); + let e = d.with_index(33); + let buf: PathBuf = e.to_path_buf(); + assert_eq!(e.to_string(), "a.b.c[42].d[33]"); + assert_eq!(buf.to_string(), "a.b.c[42].d[33]"); + } + + #[test] + fn non_ident_paths() { + let a = Path::Root("a"); + let b = a.with_field("4"); + let c = b.with_repeated("8", 15); + let d = c.with_variant("16", "23"); + let e = d.with_index(42); + let buf: PathBuf = e.to_path_buf(); + assert_eq!(e.to_string(), "a.\"4\".\"8\"[15].\"16\"<\"23\">[42]"); + assert_eq!(buf.to_string(), "a.\"4\".\"8\"[15].\"16\"<\"23\">[42]"); + } +} diff --git a/rs/src/output/primitive_data.rs b/rs/src/output/primitive_data.rs new file mode 100644 index 00000000..ea72e917 --- /dev/null +++ b/rs/src/output/primitive_data.rs @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for primitive data elements. +//! +//! The [`PrimitiveData`] enum is used to represent primitive data in the +//! input, for use in the leaf nodes of the tree. + +/// Enumeration for representing any type of primitive data that can be stored +/// in YAML or protobuf. +#[derive(Clone, Debug, PartialEq)] +pub enum PrimitiveData { + /// Used for nulls (YAML only). + Null, + + /// Used for booleans. + Bool(bool), + + /// Used for unsigned integers. + Unsigned(u64), + + /// Used for signed integers. + Signed(i64), + + /// Used for floating-point values. + Float(f64), + + /// Used for UTF-8 strings. + String(String), + + /// Used for bytestrings. + Bytes(Vec), + + /// Used for enumerations (protobuf only). + Enum(&'static str), + + /// Used for Any messages (protobuf only). 
+ Any(prost_types::Any), +} + +fn hexdump(f: &mut std::fmt::Formatter<'_>, x: &[u8]) -> std::fmt::Result { + for (i, b) in x.iter().enumerate() { + if i > 0 { + write!(f, " ")?; + } + write!(f, "{:02X}", b)?; + } + Ok(()) +} + +impl std::fmt::Display for PrimitiveData { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + PrimitiveData::Null => write!(f, "null"), + PrimitiveData::Bool(true) => write!(f, "true"), + PrimitiveData::Bool(false) => write!(f, "false"), + PrimitiveData::Unsigned(x) => write!(f, "{x}"), + PrimitiveData::Signed(x) => write!(f, "{x}"), + PrimitiveData::Float(x) => write!(f, "{x}"), + PrimitiveData::String(x) => write!(f, "{x:?}"), + PrimitiveData::Bytes(x) => hexdump(f, x), + PrimitiveData::Enum(x) => write!(f, "{x}"), + PrimitiveData::Any(x) => { + write!(f, "{}(", x.type_url)?; + hexdump(f, &x.value)?; + write!(f, ")") + } + } + } +} diff --git a/rs/src/output/tree.rs b/rs/src/output/tree.rs new file mode 100644 index 00000000..c3d10252 --- /dev/null +++ b/rs/src/output/tree.rs @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for the output tree structure. +//! +//! This module provides the types for the tree structure that constitutes +//! the output of the validator. The nodes in the tree are intended to +//! correspond exactly to the protobuf messages, primitives, and YAML values +//! (the latter actually using the JSON object model) that constitute the +//! incoming plan. Likewise, the structure of the tree is the same as the +//! input. However, unlike the input: +//! +//! - All nodes and the relations between them are encapsulated in generic +//! types, independent from the corresponding messages/values in the +//! original tree. This allows the tree to be traversed by generic code +//! with no understanding of Substrait. +//! - Additional information can be attached to the nodes, edges, and +//! between the edges, such as diagnostic messages and data type +//! information. +//! +//! The node type for the output trees is [`Node`]. This structure contains +//! a single [`NodeType`] enum variant and zero or more [`NodeData`] enum +//! variants in an ordered sequence to form the tree structure; [`NodeType`] +//! includes information about the node itself, while the [`NodeData`] +//! elements represent edges to other nodes ([`Child`]) or contextual +//! information. A subtree might look something like this: +//! +//! ```text +//! Node ---> ProtoMessage } Parent node +//! | +//! .--------------'--------------. +//! | | | | +//! v v v v +//! Child Diagnostic Comment Child } Edges +//! | | +//! v v +//! Node ---> ProtoPrimitive Node ---> ProtoMessage } Child nodes +//! | | +//! '-> PrimitiveData : +//! ``` +//! +//! Note that the [`Child`] struct includes information about how the child +//! node relates to its parent (which field, array element, etc) via +//! [`PathElement`](path::PathElement), such that the original tree structure +//! could in theory be completely reconstructed. +//! +//! Nevertheless, the conversion from protobuf/YAML to this tree structure is +//! only intended to be a one-way street; indeed, the output tree is not +//! intended to ever be treated as some executable query plan by a computer at +//! all. It serves only as an intermediate format for documentation, debug, +//! and/or validation output. The [export](mod@crate::export) module deals with +//! breaking this internal representation down further, into (file) formats +//! that are not specific to the Substrait validator. 
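+//!
+//! As a sketch, downstream code is expected to walk the tree generically;
+//! collecting all diagnostics, for example, reduces to:
+//!
+//! ```ignore
+//! for diag in root.iter_diagnostics() {
+//!     println!("{diag}");
+//! }
+//! ```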
+ +use crate::output::comment; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::output::extension; +use crate::output::path; +use crate::output::primitive_data; +use std::collections::VecDeque; +use std::sync::Arc; + +/// Node for a semi-structured documentation-like tree representation of a +/// parsed Substrait plan. The intention is for this to be serialized into +/// some human-readable format. +/// +/// Note: although it should be possible to reconstruct the entire plan from +/// the information contained in the tree, the tree is only intended to be +/// converted to structured human-readable documentation for the plan. It is +/// expressly NOT intended to be read as a form of AST by a downstream +/// process, and therefore isn't nearly as strictly-typed as you would +/// otherwise want it to be. Protobuf itself is already a reasonable format +/// for this! +#[derive(Clone, Debug, PartialEq)] +pub struct Node { + /// The type of a node in terms of plan semantics. + pub class: Class, + + /// An optional brief description of the node. This can be regarded as + /// a comment placed at the start of the data vector, but it is usually + /// only set at the end of the parse function. + pub brief: Option, + + /// An optional comment summarizing what this node does. This can be + /// regarded as a comment placed at the start of the data vector (just + /// after brief, if brief is also defined), but it is usually only set + /// at the end of the parse function. + pub summary: Option, + + /// The type of node in terms of what it represents in the original + /// data structure. + pub node_type: NodeType, + + /// The type of data returned by this node, if any. Depending on the + /// message and context, this may represent a table schema or scalar + /// data. + pub data_type: Option>, + + /// The information gathered about the message. + /// + /// This normally includes all child nodes for this message, possibly + /// interspersed with diagnostics, type information, and unstructured + /// comment nodes to provide context, all ordered in a reasonable way. + /// Note however that this information is intended to be understood by + /// a human, not by the validator itself (aside from serialization to a + /// human-readable notation). + pub data: Vec, +} + +impl From for Node { + fn from(node_type: NodeType) -> Self { + Node { + class: Class::Misc, + brief: None, + summary: None, + node_type, + data_type: None, + data: vec![], + } + } +} + +impl Node { + /// Returns an iterator that iterates over all nodes depth-first. + pub fn iter_flattened_nodes(&self) -> FlattenedNodeIter { + FlattenedNodeIter { + remaining: VecDeque::from(vec![self]), + } + } + + /// Returns an iterator that iterates over all NodeData objects in the + /// order in which they were defined. + pub fn iter_flattened_node_data(&self) -> FlattenedNodeDataIter { + FlattenedNodeDataIter { + remaining: self.data.iter().rev().collect(), + } + } + + /// Iterates over all diagnostics in the tree. + pub fn iter_diagnostics(&self) -> impl Iterator + '_ { + self.iter_flattened_node_data().filter_map(|x| match x { + NodeData::Diagnostic(d) => Some(d), + _ => None, + }) + } + + /// Returns the first diagnostic of the highest severity level in the tree. + pub fn get_diagnostic(&self) -> Option<&diagnostic::Diagnostic> { + let mut result: Option<&diagnostic::Diagnostic> = None; + for diag in self.iter_diagnostics() { + // We can return immediately for error diagnostics, since this is the + // highest level. 
+ if diag.adjusted_level == diagnostic::Level::Error { + return Some(diag); + } + + // For other levels, update only if the incoming diagnostic is of a + // higher level/severity than the current one. + if let Some(cur) = result.as_mut() { + if diag.adjusted_level > (*cur).adjusted_level { + *cur = diag; + } + } else { + result = Some(diag); + } + } + result + } + + /// Returns a reference to the data type that this node returns at runtime + /// or (for type nodes) represents. If no type information is attached, a + /// reference to a default-generated unresolved type is returned. + pub fn data_type(&self) -> Arc { + self.data_type.clone().unwrap_or_default() + } +} + +/// The original data type that the node represents, to (in theory) allow the +/// original structure of the plan to be recovered from the documentation tree. +#[derive(Clone, Debug, PartialEq)] +pub enum NodeType { + /// The associated node represents a protobuf message of the given type + /// (full protobuf path). The contents of the message are described using + /// Field, RepeatedField, and OneOfField. + ProtoMessage(&'static str), + + /// The associated node represents a protobuf primitive value of the given + /// type and with the given data. + ProtoPrimitive(&'static str, primitive_data::PrimitiveData), + + /// The associated node represents an unpopulated oneof field. This is used + /// for an error recovery node when a required oneof field is not + /// populated. + ProtoMissingOneOf, + + /// Used for anchor/reference-based references to other nodes. + NodeReference(u64, NodeReference), + + /// Used for resolved YAML URIs, in order to include the parse result and + /// documentation for the referenced YAML (if available), in addition to + /// the URI itself. + YamlReference(Arc), + + /// The associated node represents a YAML map. The contents of the map are + /// described using Field and UnknownField. + YamlMap, + + /// The associated node represents a YAML array. The contents of the array + /// are described using ArrayElement datums. + YamlArray, + + /// The associated node represents a YAML primitive. + YamlPrimitive(primitive_data::PrimitiveData), +} + +/// Semantical information about a node. +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Class { + /// Used for nodes for which no better classification exists. + Misc, + + /// Used for nodes that define a type. The data_type field signifies this + /// data type. + Type, + + /// Used for nodes that represent scalar expressions or literals. The + /// data_type field signifies the type of the value returned by the + /// expression. + Expression, + + /// Used for nodes that represent relations. The data_type field signifies + /// the schema for the data returned by the relation. + Relation, +} + +/// Information nodes for a parsed protobuf message. +#[derive(Clone, Debug, PartialEq)] +pub enum NodeData { + /// A reference to a child node in the tree. + Child(Child), + + /// Indicates that parsing/validating this message resulted in some + /// diagnostic message being emitted. The secondary error level is the + /// modified level via + Diagnostic(diagnostic::Diagnostic), + + /// Provides (intermediate) type information for this node. Depending on + /// the message, this may be a struct or named struct representing a + /// schema, or it may represent the type of some scalar expression. + /// Multiple TypeInfo nodes may be present, in particular for relations + /// that perform multiple operations in one go (for example read, project, + /// emit). 
The TypeInfo and operation description *Field nodes are then + /// ordered by data flow. In particular, the last TypeInfo node always + /// represents the type of the final result of a node. + DataType(Arc), + + /// Used for adding unstructured additional information to a message, + /// wherever this may aid human understanding of a message. + Comment(comment::Comment), +} + +/// Reference to a child node in the tree. +#[derive(Clone, Debug, PartialEq)] +pub struct Child { + /// Path element identifying the relation of this child node to its parent. + pub path_element: path::PathElement, + + /// The child node. + pub node: Arc, + + /// Whether the validator recognized/expected the field or element that + /// this child represents. Fields/elements may be unrecognized simply + /// because validation is not implemented for them yet. In any case, this + /// flag indicates that the subtree represented by this node could not be + /// validated. + pub recognized: bool, +} + +/// A reference to a node elsewhere in the tree. +#[derive(Clone, Debug, PartialEq)] +pub struct NodeReference { + /// Absolute path to the node. + pub path: path::PathBuf, + + /// Link to the node. + pub node: Arc, +} + +pub struct FlattenedNodeIter<'a> { + remaining: VecDeque<&'a Node>, +} +impl<'a> Iterator for FlattenedNodeIter<'a> { + type Item = &'a Node; + + fn next(&mut self) -> Option { + let maybe_node = self.remaining.pop_back(); + if let Some(node) = maybe_node { + self.remaining + .extend(node.data.iter().rev().filter_map(|x| -> Option<&Node> { + if let NodeData::Child(child) = x { + Some(&child.node) + } else { + None + } + })); + } + maybe_node + } +} + +pub struct FlattenedNodeDataIter<'a> { + remaining: VecDeque<&'a NodeData>, +} + +impl<'a> Iterator for FlattenedNodeDataIter<'a> { + type Item = &'a NodeData; + + fn next(&mut self) -> Option { + let maybe_node_data = self.remaining.pop_back(); + if let Some(NodeData::Child(child)) = maybe_node_data { + self.remaining.extend(child.node.data.iter().rev()) + } + maybe_node_data + } +} diff --git a/rs/src/parse/context.rs b/rs/src/parse/context.rs new file mode 100644 index 00000000..c611e4fc --- /dev/null +++ b/rs/src/parse/context.rs @@ -0,0 +1,570 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing the types containing contextual information for parse +//! functions. +//! +//! Refer to the documentation for [`parse`](mod@crate::parse) for more +//! information. + +use crate::input::config; +use crate::output::comment; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::output::extension; +use crate::output::path; +use crate::output::tree; +use std::collections::HashMap; +use std::collections::HashSet; +use std::fmt::Debug; +use std::hash::Hash; +use std::sync::Arc; + +/// Parse/validation context and output node, passed to parser functions along +/// with a reference to the to-be-parsed input node. +pub struct Context<'a> { + /// The node in the documentation tree that should reflect the input node. + /// The structure of the documentation tree will be the same as the input + /// tree, but represented in a more generic way, and with annotations like + /// comments and diagnostics attached to each node. The output tree is not + /// intended to be read back by the validator. + output: &'a mut tree::Node, + + /// State object. This is tracked between nodes as they are traversed, and + /// is always mutable for the node currently being validated. 
+ state: &'a mut State, + + /// "Breadcrumbs" with information about the ancestors of the current node. + /// Essentially a stack structure, where only the top of the stack is + /// mutable. + breadcrumb: Breadcrumb<'a>, + + /// Configuration structure, created before validation starts and immutable + /// afterwards. + pub config: &'a config::Config, +} + +impl<'a> Context<'a> { + /// Creates a root parse context. + /// + /// root_name is the prefix used for all paths, normally just "plan" (if + /// different tree parsers are ever created, this can be used to + /// disambiguate between tree types). output is the root node that the + /// children will be added to as parsing progresses. state is the state + /// object used for tracking parser state. config is the configuration for + /// the parser. + pub fn new( + root_name: &'static str, + output: &'a mut tree::Node, + state: &'a mut State, + config: &'a config::Config, + ) -> Self { + Self { + output, + state, + breadcrumb: Breadcrumb::new(root_name), + config, + } + } + + /// Creates a parse context for a child of the node corresponding to this + /// context. output is its node. path_element specifies its relation to + /// the node corresponding to the current context. + pub fn child<'b>( + &'b mut self, + output: &'b mut tree::Node, + path_element: path::PathElement, + ) -> Context<'b> { + Context { + output, + state: self.state, + breadcrumb: self.breadcrumb.next(path_element), + config: self.config, + } + } + + /// Returns the node type of the associated node. + pub fn node_type(&self) -> &tree::NodeType { + &self.output.node_type + } + + /// Replaces the node type of the associated node. + /// + /// This should only be needed to upgrade primitive nodes to more specific + /// types, for instance references or resolved URIs. + pub fn replace_node_type(&mut self, node_type: tree::NodeType) -> tree::NodeType { + std::mem::replace(&mut self.output.node_type, node_type) + } + + /// Returns the data type currently associated with the current node. If no + /// data type was associated yet, this silently returns a reference to an + /// unresolved type object. + pub fn data_type(&self) -> Arc { + self.output.data_type.clone().unwrap_or_default() + } + + /// Sets the semantic description of the current node. + pub fn set_description>( + &mut self, + class: tree::Class, + brief: Option, + ) { + self.output.class = class; + self.output.brief = brief.map(|c| c.into()); + } + + /// Appends to the summary of this node. + pub fn push_summary>(&mut self, comment: C) { + if let Some(summary) = self.output.summary.as_mut() { + summary.extend(comment.into()) + } else { + self.output.summary = Some(comment.into()) + } + } + + /// Pushes data into the current node. + /// + /// This is primarily intended for use by the traversal macros and the more + /// specific functions defined here, like set_data_type(). + pub fn push(&mut self, node_data: tree::NodeData) { + self.output.data.push(node_data); + } + + /// Pushes a diagnostic into the node. This also evaluates its adjusted + /// error level. + pub fn push_diagnostic(&mut self, diag: diagnostic::RawDiagnostic) { + // Get the configured level limits for this diagnostic. First try the + // classification of the diagnostic itself, then its group, and then + // finally Unclassified. If no entries exist, simply yield + // (Info, Error), which is no-op. 
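+        // For example, an override of (Error, Error) promotes all
+        // diagnostics of that class to Error (a -Werror-style promotion),
+        // while (Info, Info) demotes them all to Info.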
+ let (min, max) = self + .config + .diagnostic_level_overrides + .get(&diag.cause.classification) + .or_else(|| { + self.config + .diagnostic_level_overrides + .get(&diag.cause.classification.group()) + }) + .or_else(|| { + self.config + .diagnostic_level_overrides + .get(&diagnostic::Classification::Unclassified) + }) + .unwrap_or(&(diagnostic::Level::Info, diagnostic::Level::Error)); + + // Adjust the level. + let adjusted_level = if diag.level < *min { + *min + } else if diag.level > *max { + *max + } else { + diag.level + }; + let adjusted = diag.adjust_level(adjusted_level); + + // Actually push the data item. + self.output.data.push(tree::NodeData::Diagnostic(adjusted)); + } + + /// Pushes a comment into the node. + pub fn push_comment>(&mut self, comment: C) { + self.push(tree::NodeData::Comment(comment.into())) + } + + /// Sets the data type "returned" by this node. Specifically: + /// + /// - for type nodes, this should be used to specify the type; + /// - for expression nodes, this should be used to specify the type of the + /// data returned by the expression; + /// + /// Can be called multiple times; only the data type specified for the + /// final call attached to the node's "return type", but each time a + /// NodeData::DataType is pushed into the node data as well. + pub fn set_data_type(&mut self, data_type: Arc) { + if !data_type.is_unresolved() { + self.push(tree::NodeData::DataType(data_type.clone())); + } + self.output.data_type = Some(data_type); + } + + /// Updates the current schema. This also pushes the data type to the + /// current node. Relation parsers *must* use this after traversing their + /// inputs, but before they start to parse any expressions based on that + /// schema; after all, the schema defines how (column) references behave. + /// If the schema isn't known, it may be set to an unresolved type. + pub fn set_schema(&mut self, schema: Arc) { + *self + .state + .schema_stack + .last_mut() + .expect("no schema present on schema stack") = Some(schema.clone()); + self.set_data_type(schema); + } + + /// Clears the current schema, requiring schema!() to be called before + /// expressions can be parsed again. + pub fn clear_schema(&mut self) { + *self + .state + .schema_stack + .last_mut() + .expect("no schema present on schema stack") = None; + } + + /// Returns the current schema. depth specifies for which subquery the + /// schema should be selected; depth 0 is the current query, depth 1 would + /// be its parent query, 2 would be its grandparent, etc. Returns Err when + /// the referenced schema semantically doesn't exist; returns Ok(unresolved + /// type) when it does but the actual type isn't known. + pub fn schema(&self, depth: usize) -> diagnostic::Result> { + let len = self.state.schema_stack.len(); + if depth >= len { + Err(cause!( + ExpressionFieldRefMissingStream, + "indexing query beyond current query depth ({len})" + )) + } else if let Some(Some(schema)) = self.state.schema_stack.get(len - depth - 1) { + Ok(schema.clone()) + } else { + Err(cause!( + ExpressionFieldRefMissingStream, + "query data stream has not yet been instantiated" + )) + } + } + + /// Pushes an empty slot for the schema of the relation tree onto the + /// schema stack, allowing schema!() to be used. This must be used when + /// traversing into the root of a relation tree; i.e., the root must be + /// parsed within the context of the provided function. 
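To make the schema-stack contract described above concrete, here is a minimal self-contained sketch (not the validator's actual code; `Schema` is a hypothetical stand-in for `Arc<data_type::DataType>`) of the push/fill/pop discipline that `enter_relation_root`, `set_schema`, and `schema` implement together:

```rust
type Schema = String;

struct State {
    schema_stack: Vec<Option<Schema>>,
}

impl State {
    // Mirrors enter_relation_root: push an empty slot, run the closure,
    // pop the slot again, and hand the closure's result back.
    fn enter_relation_root<R>(&mut self, f: impl FnOnce(&mut Self) -> R) -> R {
        self.schema_stack.push(None);
        let result = f(self);
        self.schema_stack.pop().expect("stack imbalance");
        result
    }

    // Mirrors schema(depth): depth 0 is the innermost query, depth 1 its
    // parent, and so on; None means no input relation was parsed yet.
    fn schema(&self, depth: usize) -> Result<&Schema, &'static str> {
        let len = self.schema_stack.len();
        if depth >= len {
            return Err("indexing query beyond current query depth");
        }
        self.schema_stack[len - depth - 1]
            .as_ref()
            .ok_or("query data stream has not yet been instantiated")
    }
}

fn main() {
    let mut state = State { schema_stack: vec![] };
    state.enter_relation_root(|state| {
        assert!(state.schema(0).is_err()); // slot exists but is not filled yet
        *state.schema_stack.last_mut().unwrap() = Some("outer".into());
        state.enter_relation_root(|state| {
            *state.schema_stack.last_mut().unwrap() = Some("inner".into());
            assert_eq!(state.schema(0).unwrap(), "inner"); // current query
            assert_eq!(state.schema(1).unwrap(), "outer"); // correlated outer query
        });
        assert_eq!(state.schema(0).unwrap(), "outer"); // inner slot popped again
    });
}
```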
+ pub fn enter_relation_root<R, F: FnOnce(&mut Self) -> R>(&mut self, f: F) -> R { + // Push a schema slot onto the stack for the relation tree to fill + // in. + self.state.schema_stack.push(None); + + // Ensure that return statements can't break out of the context + // early by wrapping the block in a closure first. + let result = f(self); + + // Pop the schema again. + self.state + .schema_stack + .pop() + .expect("no schema present on schema stack"); + + result + } + + /// Returns all data that has thus far been pushed into the current node. + pub fn node_data(&self) -> &[tree::NodeData] { + &self.output.data + } + + /// Returns the resolver for URI anchors and references. + pub fn extension_uris(&mut self) -> &mut Resolver<u32, Arc<extension::YamlInfo>> { + &mut self.state.extension_uris + } + + /// Registers an extension URI definition. Shorthand for + /// extension_uris().define(), using the current path as the registration + /// path. + pub fn define_extension_uri( + &mut self, + anchor: u32, + uri: Arc<extension::YamlInfo>, + ) -> Result<(), (Arc<extension::YamlInfo>, path::PathBuf)> { + self.state + .extension_uris + .define(anchor, uri, self.breadcrumb.path.to_path_buf()) + } + + /// Returns the resolver for function anchors and references. + pub fn fns(&mut self) -> &mut Resolver<u32, Arc<extension::Reference<extension::Function>>> { + &mut self.state.functions + } + + /// Registers a function definition. Shorthand for fns().define(), using + /// the current path as the registration path. + pub fn define_fn( + &mut self, + anchor: u32, + uri: Arc<extension::Reference<extension::Function>>, + ) -> Result< + (), + ( + Arc<extension::Reference<extension::Function>>, + path::PathBuf, + ), + > { + self.state + .functions + .define(anchor, uri, self.breadcrumb.path.to_path_buf()) + } + + /// Returns the resolver for type anchors and references. + pub fn types(&mut self) -> &mut Resolver<u32, Arc<extension::Reference<extension::DataType>>> { + &mut self.state.types + } + + /// Registers a type definition. Shorthand for types().define(), using the + /// current path as the registration path. + pub fn define_type( + &mut self, + anchor: u32, + uri: Arc<extension::Reference<extension::DataType>>, + ) -> Result< + (), + ( + Arc<extension::Reference<extension::DataType>>, + path::PathBuf, + ), + > { + self.state + .types + .define(anchor, uri, self.breadcrumb.path.to_path_buf()) + } + + /// Returns the resolver for type variation anchors and references. + pub fn tvars( + &mut self, + ) -> &mut Resolver<u32, Arc<extension::Reference<extension::TypeVariation>>> { + &mut self.state.type_variations + } + + /// Registers a type variation definition. Shorthand for tvars().define(), + /// using the current path as the registration path. + pub fn define_tvar( + &mut self, + anchor: u32, + uri: Arc<extension::Reference<extension::TypeVariation>>, + ) -> Result< + (), + ( + Arc<extension::Reference<extension::TypeVariation>>, + path::PathBuf, + ), + > { + self.state + .type_variations + .define(anchor, uri, self.breadcrumb.path.to_path_buf()) + } + + /// Returns the resolver for protobuf Any types present in the + /// `expected_type_urls` manifest. + pub fn proto_any_types(&mut self) -> &mut Resolver<String, ()> { + &mut self.state.proto_any_types + } + + /// Defines a protobuf Any type URL, allowing it for use within the plan. + /// If the type was already declared, this returns the path that defined + /// it in the form of an Err result. + pub fn define_proto_any_type<S: ToString>(&mut self, url: S) -> Result<(), path::PathBuf> { + self.state + .proto_any_types + .define(url.to_string(), (), self.breadcrumb.path.to_path_buf()) + .map_err(|(_, p)| p) + } + + /// Resolves a protobuf "any" message. The first return value specifies + /// whether usage of the type was explicitly allowed in the validator + /// configuration. The second return value specifies the path to the + /// manifest entry for the type, if it was defined. If the type URL does + /// not exist in the manifest, a suitable error is generated automatically.
+ pub fn resolve_proto_any(&mut self, x: &prost_types::Any) -> (bool, Option<path::PathBuf>) { + let path = self + .state + .proto_any_types + .resolve(&x.type_url) + .map(|(_, path)| path.clone()); + if path.is_none() { + diagnostic!(self, Error, ProtoMissingAnyDeclaration, "{}", x.type_url); + } + let allowed = self + .config + .allowed_proto_any_urls + .iter() + .any(|p| p.matches(&x.type_url)); + (allowed, path) + } + + /// Returns a mutable reference to the Option that possibly contains the + /// YAML data object under construction. + pub fn yaml_data_opt(&mut self) -> &mut Option<extension::YamlData> { + &mut self.state.yaml_data + } + + /// Returns a mutable reference to the YAML data object under construction. + /// Panics if we're not currently constructing YAML data. + pub fn yaml_data(&mut self) -> &mut extension::YamlData { + self.state.yaml_data.as_mut().unwrap() + } + + /// Returns the path leading up to the current node. + pub fn path(&self) -> &path::Path<'a> { + &self.breadcrumb.path + } + + /// Returns the path leading up to the current node. + pub fn path_buf(&self) -> path::PathBuf { + self.breadcrumb.path.to_path_buf() + } + + /// Returns the path leading up to the parent node, if any. + pub fn parent_path_buf(&self) -> Option<path::PathBuf> { + self.breadcrumb.parent.map(|x| x.path.to_path_buf()) + } + + /// Indicates that the field with the given name has been parsed. See also + /// field_parsed(). + pub fn set_field_parsed<S: ToString>(&mut self, field: S) -> bool { + self.breadcrumb.fields_parsed.insert(field.to_string()) + } + + /// Returns whether the field with the given name has been parsed yet. + /// + /// This is primarily intended for use by the traversal macros. They use it + /// to ensure that: + /// + /// - a field is only parsed once; + /// - fields not parsed by the parse function are parsed using a generic + /// method, along with emission of a warning message. + pub fn field_parsed<S: AsRef<str>>(&mut self, field: S) -> bool { + self.breadcrumb.fields_parsed.contains(field.as_ref()) + } +} + +#[derive(Clone, Debug, Default)] +pub struct Resolver<K, V> +where + K: Clone + Debug + Default + Eq + Hash, + V: Clone + Debug + Default, +{ + /// Map of keys that have been registered thus far to their value and to + /// the path from which they were registered. + map: HashMap<K, (V, path::PathBuf)>, + + /// The set of keys for which resolve() was called at least once. Used to + /// detect unused keys. + used: HashSet<K>, +} + +impl<K, V> Resolver<K, V> +where + K: Clone + Debug + Default + Eq + Hash, + V: Clone + Debug + Default, +{ + /// Creates a new resolver. + pub fn new() -> Self { + Self::default() + } + + /// Defines a key-value-path triplet. If a key was previously defined, its + /// entry is overridden, and the previous value-path pair is returned + /// in the form of an Err result. + pub fn define( + &mut self, + key: K, + value: V, + path: path::PathBuf, + ) -> Result<(), (V, path::PathBuf)> { + if let Some(previous) = self.map.insert(key, (value, path)) { + Err(previous) + } else { + Ok(()) + } + } + + /// Resolves the given key to its value-path pair. If no value was + /// registered for the given key, None is returned. If this was the first + /// use of this key (regardless of whether or not a value was registered + /// for it yet), it is recorded in the set of used keys.
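The Resolver contract is easiest to see in a toy version. The sketch below (simplified types, registration paths omitted; not the validator's actual code) shows the define/resolve/iter_unused flow, including the duplicate-definition and unused-definition behavior:

```rust
use std::collections::{HashMap, HashSet};

// Simplified stand-in for the validator's Resolver<K, V>: tracks
// definitions plus which keys were ever looked up.
struct MiniResolver {
    map: HashMap<u32, &'static str>,
    used: HashSet<u32>,
}

impl MiniResolver {
    fn define(&mut self, key: u32, value: &'static str) -> Result<(), &'static str> {
        match self.map.insert(key, value) {
            Some(previous) => Err(previous), // duplicate anchor: report the old value
            None => Ok(()),
        }
    }

    fn resolve(&mut self, key: u32) -> Option<&'static str> {
        self.used.insert(key); // mark as used even if undefined
        self.map.get(&key).copied()
    }

    fn iter_unused(&self) -> impl Iterator<Item = u32> + '_ {
        self.map.keys().copied().filter(|k| !self.used.contains(k))
    }
}

fn main() {
    let mut r = MiniResolver { map: HashMap::new(), used: HashSet::new() };
    r.define(1, "extension_uri").unwrap();
    r.define(2, "never_referenced").unwrap();
    // Redefining anchor 1 overrides the entry and reports the previous value.
    assert_eq!(r.define(1, "duplicate"), Err("extension_uri"));
    assert_eq!(r.resolve(1), Some("duplicate"));
    // Anchor 2 was defined but never resolved, so it shows up as unused.
    assert_eq!(r.iter_unused().collect::<Vec<_>>(), vec![2]);
}
```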
+ pub fn resolve(&mut self, key: &K) -> Option<&(V, path::PathBuf)> { + self.used.insert(key.clone()); + self.map.get(key) + } + + /// Iterates over all key-value-path triplets corresponding to definitions + /// that were never used, i.e. for which resolve() was never called. + pub fn iter_unused(&self) -> impl Iterator<Item = (K, V, path::PathBuf)> + '_ { + self.map.iter().filter_map(|(k, (v, p))| { + if self.used.contains(k) { + None + } else { + Some((k.clone(), v.clone(), p.clone())) + } + }) + } +} + +/// Global state information tracked by the validation logic. +#[derive(Default)] +pub struct State { + /// URI anchor resolver. + pub extension_uris: Resolver<u32, Arc<extension::YamlInfo>>, + + /// YAML-defined function anchor resolver. + pub functions: Resolver<u32, Arc<extension::Reference<extension::Function>>>, + + /// YAML-defined data type anchor resolver. + pub types: Resolver<u32, Arc<extension::Reference<extension::DataType>>>, + + /// YAML-defined type variation anchor resolver. + pub type_variations: Resolver<u32, Arc<extension::Reference<extension::TypeVariation>>>, + + /// Protobuf Any type URL resolver. + pub proto_any_types: Resolver<String, ()>, + + /// Schema stack. This is what the validator for FieldRefs uses to + /// determine the return type of the FieldRef. The back of the vector + /// represents the innermost query, while entries further to the front + /// of the vector are used to break out of correlated subqueries. + /// None is used only for the top of the stack, and only when we're inside + /// a relation tree, but no schema is known yet (in terms of dataflow, + /// we're still in the time before the input relation has created a + /// stream). + pub schema_stack: Vec<Option<Arc<data_type::DataType>>>, + + /// The YAML data object under construction, if any. + pub yaml_data: Option<extension::YamlData>, +} + +/// Breadcrumbs structure. Each breadcrumb is associated with a node, and +/// immutably links to the breadcrumb for its parent node (except for the +/// root). Used for two things: tracking the path leading up to the current +/// node from the root, and keeping track of mutable state information that +/// belongs to a specific node. +pub struct Breadcrumb<'a> { + /// Breadcrumb for the parent node, unless this is the root. + pub parent: Option<&'a Breadcrumb<'a>>, + + /// The path leading up to the node associated with this breadcrumb. Used + /// primarily for attaching information to diagnostic messages. + pub path: path::Path<'a>, + + /// The set of field names of the associated node that we've already + /// parsed. This is used to automatically search through message subtrees + /// that the validator doesn't yet implement: after all normal validation + /// for a node is done, the generic tree-walking logic checks whether there + /// are fields with non-default data associated with them of which the + /// field name hasn't been added to this set yet. It's also used to assert + /// that the same subtree isn't traversed twice. + pub fields_parsed: HashSet<String>, +} + +impl Breadcrumb<'_> { + /// Creates a breadcrumb for the root node. + pub fn new(root_name: &'static str) -> Self { + Self { + parent: None, + path: path::Path::Root(root_name), + fields_parsed: HashSet::new(), + } + } + + /// Creates the next breadcrumb. + pub fn next(&self, element: path::PathElement) -> Breadcrumb { + Breadcrumb { + parent: Some(self), + path: self.path.with(element), + fields_parsed: HashSet::new(), + } + } +} diff --git a/rs/src/parse/expressions/conditionals.rs b/rs/src/parse/expressions/conditionals.rs new file mode 100644 index 00000000..8b298fc7 --- /dev/null +++ b/rs/src/parse/expressions/conditionals.rs @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for parsing/validating conditional expression types.
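The conditional parsers in this module all fold the types of their branches into a running return type via `types::promote_and_assert_equal`. A standalone sketch of that folding, under the assumption (consistent with how unresolved types are used throughout these files) that an unresolved type unifies with anything; the hypothetical `Ty` enum stands in for `Arc<data_type::DataType>`, and real promotion details such as nullability are elided:

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
enum Ty {
    Unresolved,
    Boolean,
    I64,
}

// Fold one branch type into the running return type.
fn promote_and_assert_equal(branch: Ty, running: Ty) -> Ty {
    match (branch, running) {
        (t, Ty::Unresolved) => t, // first resolved branch wins
        (Ty::Unresolved, t) => t, // unknown branch: keep the running type
        (a, b) if a == b => a,
        (a, _) => {
            eprintln!("branches must yield the same type");
            a // keep going with the latest type so parsing can continue
        }
    }
}

fn main() {
    // One branch could not be resolved; the others agree on i64.
    let branches = [Ty::I64, Ty::Unresolved, Ty::I64];
    let mut return_type = Ty::Unresolved;
    for b in branches {
        return_type = promote_and_assert_equal(b, return_type);
    }
    assert_eq!(return_type, Ty::I64);
}
```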
+ +use crate::input::proto::substrait; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::parse::context; +use crate::parse::expressions; +use crate::parse::expressions::literals; +use crate::parse::types; +use std::sync::Arc; + +// FIXME: what promotions are allowed and when? I saw Isthmus output an +// if/else with branches differing in nullability, and that makes sense to me +// as something to support. But on the other hand, explicit type casts for +// everything might be nicer for a machine format. Either way, I'm not sure +// the specification has anything to say about this? + +/// Parse an if-then expression. Returns a description of said expression. +pub fn parse_if_then( + x: &substrait::expression::IfThen, + y: &mut context::Context, +) -> diagnostic::Result { + let mut return_type: Arc = Arc::default(); + let mut args = vec![]; + + // Handle branches. + proto_required_repeated_field!(x, y, ifs, |x, y| { + // Parse fields. + let (n, e) = proto_required_field!(x, y, r#if, expressions::parse_predicate); + let condition = e.unwrap_or_default(); + let condition_type = n.data_type(); + let (n, e) = proto_required_field!(x, y, then, expressions::parse_expression); + let value = e.unwrap_or_default(); + let value_type = n.data_type(); + + // Check that the type is the same for each branch. + return_type = types::promote_and_assert_equal( + y, + &value_type, + &return_type, + "branches must yield the same type", + ); + + // Nulls in the condition are propagated to the output. + // FIXME: I guess? + if !condition_type.is_unresolved() && condition_type.nullable() { + return_type = return_type.make_nullable(); + } + + // Describe this branch. + describe!(y, Misc, "If {} yield {}", &condition, &value); + + // Save to the "arguments" of the function we'll use to describe this + // expression. + args.push(condition); + args.push(value); + + Ok(()) + }); + + // Handle else branch. + if x.r#else.is_some() { + // Parse field. + let (n, e) = proto_boxed_required_field!(x, y, r#else, expressions::parse_expression); + let value = e.unwrap_or_default(); + + // Check that the type is the same for each branch. + return_type = types::promote_and_assert_equal( + y, + &n.data_type(), + &return_type, + "branches must yield the same type", + ); + + // Save to the "arguments" of the function we'll use to describe this + // expression. + args.push(value); + } else { + // Allow missing else, making the type nullable. + comment!(y, "Otherwise, yield null."); + return_type = return_type.make_nullable(); + + // Yield null for the else clause. + args.push(expressions::Expression::new_null(return_type.clone())); + } + + // Describe node. + y.set_data_type(return_type); + summary!( + y, + "Selects the value corresponding to the first condition that yields \ + true. If none of the conditions yield true, return {}.", + args.last().unwrap() + ); + let expression = expressions::Expression::Function(String::from("if_then"), args); + describe!(y, Expression, "{}", expression); + Ok(expression) +} + +/// Parse a switch expression. Returns a description of said expression. +pub fn parse_switch( + x: &substrait::expression::SwitchExpression, + y: &mut context::Context, +) -> diagnostic::Result { + let mut return_type: Arc = Arc::default(); + let mut args = vec![]; + + // Parse value to match. + let (n, e) = proto_boxed_required_field!(x, y, r#match, expressions::parse_expression); + let mut match_type = n.data_type(); + args.push(e.unwrap_or_default()); + + // Handle branches. 
+ proto_required_repeated_field!(x, y, ifs, |x, y| { + // Parse match field. + let (n, e) = proto_required_field!(x, y, r#if, literals::parse_literal); + let match_value = e.unwrap_or_default(); + + // Check that the type is the same for each branch. + match_type = types::promote_and_assert_equal( + y, + &n.data_type(), + &match_type, + "literal type must match switch expression", + ); + + // Parse value field. + let (n, e) = proto_required_field!(x, y, then, expressions::parse_expression); + let value = e.unwrap_or_default(); + + // Check that the type is the same for each branch. + return_type = types::promote_and_assert_equal( + y, + &n.data_type(), + &return_type, + "branches must yield the same type", + ); + + // Describe this branch. + describe!(y, Misc, "If match == {} yield {}", &match_value, &value); + + // Save to the "arguments" of the function we'll use to describe this + // expression. + args.push(match_value.into()); + args.push(value); + + Ok(()) + }); + + // Handle else branch. + if x.r#else.is_some() { + // Parse field. + let (n, e) = proto_boxed_required_field!(x, y, r#else, expressions::parse_expression); + let value = e.unwrap_or_default(); + + // Check that the type is the same for each branch. + return_type = types::promote_and_assert_equal( + y, + &n.data_type(), + &return_type, + "branches must yield the same type", + ); + + // Save to the "arguments" of the function we'll use to describe this + // expression. + args.push(value); + } else { + // Allow missing else, making the type nullable. + comment!(y, "Otherwise, yield null."); + return_type = return_type.make_nullable(); + + // Yield null for the else clause. + args.push(expressions::Expression::new_null(return_type.clone())); + } + + // Describe node. + y.set_data_type(return_type); + summary!( + y, + "Selects the value corresponding to the switch case that matches {}. \ + If none of the cases match, return {}.", + args.first().unwrap(), + args.last().unwrap() + ); + let expression = expressions::Expression::Function(String::from("switch"), args); + describe!(y, Expression, "{}", expression); + Ok(expression) +} + +/// Parse a "singular or list", i.e. something of the form +/// `x in (a, ..., c)`. +pub fn parse_singular_or_list( + x: &substrait::expression::SingularOrList, + y: &mut context::Context, +) -> diagnostic::Result { + let mut args = vec![]; + + // Parse value to match. + let (n, e) = proto_boxed_required_field!(x, y, value, expressions::parse_expression); + let match_type = n.data_type(); + args.push(e.unwrap_or_default()); + + // Handle allowed values. + proto_required_repeated_field!(x, y, options, |x, y| { + let expression = expressions::parse_expression(x, y)?; + let value_type = y.data_type(); + args.push(expression); + + // Check that the type is the same as the value. + types::assert_equal( + y, + &value_type, + &match_type, + "option type must match value type", + ); + + Ok(()) + }); + + // Describe node. + y.set_data_type(data_type::DataType::new_predicate(false)); + summary!( + y, + "Returns true if and only if {} is equal to any of the options.", + args.first().unwrap() + ); + let expression = expressions::Expression::Function(String::from("match"), args); + describe!(y, Expression, "{}", expression); + Ok(expression) +} + +/// Parse a "multi or list", i.e. something of the form +/// `(x, .., z) in ((ax, .., az), .., (cx, .., cz))`. 
+pub fn parse_multi_or_list( + x: &substrait::expression::MultiOrList, + y: &mut context::Context, +) -> diagnostic::Result { + // FIXME: why is there not just an expression that forms a struct from a + // number of expressions? Then this could go away. Alternatively, why does + // SingularOrList also exist, when it's just the special case of this + // expression for one-tuples? And why is it named this confusingly? + // (a in b, contains(a, b), matches(a, b) etc. would all make more sense + // to me... at least add a comment in the protobuf descriptions) + + let mut args = vec![]; + + // Parse value to match. + let (ns, es) = proto_required_repeated_field!(x, y, value, expressions::parse_expression); + let match_types = ns.iter().map(|x| x.data_type()).collect::>(); + args.push(expressions::Expression::Tuple( + es.into_iter().map(|x| x.unwrap_or_default()).collect(), + )); + + // Handle allowed values. + proto_required_repeated_field!(x, y, options, |x, y| { + let (ns, es) = proto_required_repeated_field!(x, y, fields, expressions::parse_expression); + let value_types = ns.iter().map(|x| x.data_type()).collect::>(); + args.push(expressions::Expression::Tuple( + es.into_iter().map(|x| x.unwrap_or_default()).collect(), + )); + + // Check that the type is the same as the value. + if match_types.len() != value_types.len() { + diagnostic!( + y, + Error, + TypeMismatch, + "option types must match value types: numbers of fields differ" + ) + } + for (index, (value_type, match_type)) in + value_types.iter().zip(match_types.iter()).enumerate() + { + types::assert_equal( + y, + value_type, + match_type, + format!("option type must match value type for field {index}"), + ); + } + + Ok(()) + }); + + // Describe node. + y.set_data_type(data_type::DataType::new_predicate(false)); + summary!( + y, + "Returns true if and only if {} is equal to any of the options.", + args.first().unwrap() + ); + let expression = expressions::Expression::Function(String::from("match"), args); + describe!(y, Expression, "{}", expression); + Ok(expression) +} diff --git a/rs/src/parse/expressions/functions.rs b/rs/src/parse/expressions/functions.rs new file mode 100644 index 00000000..a37ec938 --- /dev/null +++ b/rs/src/parse/expressions/functions.rs @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for parsing/validating function calls. + +use crate::input::proto::substrait; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::output::extension; +use crate::output::tree; +use crate::parse::context; +use crate::parse::expressions; +use crate::parse::extensions; +use crate::parse::sorts; +use crate::parse::types; +use std::sync::Arc; + +/// Matches a function call with its YAML definition, yielding its return type. +/// Yields an unresolved type if resolution fails. +pub fn check_function( + y: &mut context::Context, + _function: &extension::Function, + _options: &[Option], + _arg_types: &[Arc], +) -> Arc { + // TODO: check consistency of: + // - _function (function definition information from the YAML file); + // - _options: number of options passed to the function, and validity of + // their values; + // - _arg_types: whether an overload exists for this set of argument + // types; + diagnostic!( + y, + Warning, + NotYetImplemented, + "matching function calls with their definitions" + ); + Arc::default() +} + +/// Parsing logic common to scalar and window functions. 
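`parse_function` below unpacks a mixed argument list into leading enum options and regular arguments. A minimal sketch of that splitting rule, simplified to a hard error where the real code records a diagnostic and keeps parsing (the `Arg` enum is a hypothetical stand-in for the parsed argument expressions):

```rust
enum Arg {
    EnumOption(Option<String>), // function option enum variant
    Regular(&'static str),      // ordinary expression argument
}

// Options must all precede the first regular argument.
fn split(args: Vec<Arg>) -> Result<(Vec<Option<String>>, Vec<&'static str>), &'static str> {
    let (mut options, mut regular) = (vec![], vec![]);
    for arg in args {
        match arg {
            Arg::EnumOption(v) => {
                if !regular.is_empty() {
                    return Err("function option argument specified after first regular argument");
                }
                options.push(v);
            }
            Arg::Regular(r) => regular.push(r),
        }
    }
    Ok((options, regular))
}

fn main() {
    let ok = split(vec![
        Arg::EnumOption(Some("OVERFLOW_SATURATE".into())),
        Arg::Regular("a"),
        Arg::Regular("b"),
    ]);
    assert!(ok.is_ok());

    // An option after a regular argument is rejected.
    let bad = split(vec![Arg::Regular("a"), Arg::EnumOption(None)]);
    assert!(bad.is_err());
}
```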
+fn parse_function( + y: &mut context::Context, + function: Option>>, + arguments: (Vec>, Vec>), + return_type: Arc, +) -> (Arc, expressions::Expression) { + // Determine the name of the function. + let name = function + .as_ref() + .map(|x| x.name.to_string()) + .unwrap_or_else(|| String::from("?")); + + // Unpack the arguments into the function's enum options and regular + // arguments. + let mut opt_values = vec![]; + let mut opt_exprs = vec![]; + let mut arg_types = vec![]; + let mut arg_exprs = vec![]; + for (node, expr) in arguments + .0 + .into_iter() + .zip(arguments.1.into_iter().map(|x| x.unwrap_or_default())) + { + if let expressions::Expression::EnumVariant(x) = &expr { + if opt_exprs.is_empty() && !arg_exprs.is_empty() { + diagnostic!( + y, + Error, + IllegalValue, + "function option argument specified after first regular argument" + ); + } + opt_values.push(x.clone()); + opt_exprs.push(expr); + } else { + arg_types.push(node.data_type()); + arg_exprs.push(expr); + } + } + opt_exprs.extend(arg_exprs.into_iter()); + let expression = expressions::Expression::Function(name, opt_exprs); + let opt_values = opt_values; + let arg_types = arg_types; + + // If the function was resolved, check whether it's valid. + let return_type = if let Some(reference) = function { + if let Some(function) = &reference.definition { + let derived = check_function(y, function, &opt_values, &arg_types); + types::assert_equal( + y, + &return_type, + &derived, + "specified return type must match derived", + ) + } else { + diagnostic!( + y, + Warning, + ExpressionFunctionDefinitionUnavailable, + "cannot check validity of call" + ); + return_type + } + } else { + return_type + }; + + (return_type, expression) +} + +/// Parse a scalar function. Returns a description of the function call +/// expression. +pub fn parse_scalar_function( + x: &substrait::expression::ScalarFunction, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse function information. + let function = proto_primitive_field!( + x, + y, + function_reference, + extensions::simple::parse_function_reference + ) + .1; + let arguments = proto_repeated_field!(x, y, args, expressions::parse_function_argument); + let return_type = proto_required_field!(x, y, output_type, types::parse_type) + .0 + .data_type(); + + // Check function information. + let (return_type, expression) = parse_function(y, function, arguments, return_type); + + // Describe node. + y.set_data_type(return_type); + describe!(y, Expression, "{}", expression); + summary!(y, "Scalar function call: {:#}", expression); + Ok(expression) +} + +/// Parse a window function bound. +fn parse_bound( + _x: &substrait::expression::window_function::Bound, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // TODO: check window function bound. + // FIXME: I have no idea what these bounds signify. The spec doesn't + // seem to specify. + diagnostic!( + y, + Warning, + NotYetImplemented, + "validation of window function bounds" + ); + Ok(()) +} + +/// Parse a window function. Returns a description of the function call +/// expression. +pub fn parse_window_function( + x: &substrait::expression::WindowFunction, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse function information. 
+ let function = proto_primitive_field!( + x, + y, + function_reference, + extensions::simple::parse_function_reference + ) + .1; + let arguments = proto_repeated_field!(x, y, args, expressions::parse_function_argument); + let return_type = proto_required_field!(x, y, output_type, types::parse_type) + .0 + .data_type(); + + // Check function information. + let (return_type, expression) = parse_function(y, function, arguments, return_type); + + // Parse modifiers. + proto_repeated_field!(x, y, partitions, expressions::parse_expression); + proto_repeated_field!(x, y, sorts, sorts::parse_sort_field); + proto_field!(x, y, upper_bound, parse_bound); + proto_field!(x, y, lower_bound, parse_bound); + proto_enum_field!(x, y, phase, substrait::AggregationPhase); + + // TODO: check window function configuration. + // FIXME: I have no idea what these partitions signify. The spec doesn't + // seem to specify. + if !x.partitions.is_empty() { + diagnostic!( + y, + Warning, + NotYetImplemented, + "validation of partitions field" + ); + } + + // Describe node. + y.set_data_type(return_type); + describe!(y, Expression, "{}", expression); + summary!(y, "Window function call: {:#}", expression); + Ok(expression) +} + +/// Parse an aggregate function. Returns a description of the function call +/// expression. +pub fn parse_aggregate_function( + x: &substrait::AggregateFunction, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse function information. + let function = proto_primitive_field!( + x, + y, + function_reference, + extensions::simple::parse_function_reference + ) + .1; + let arguments = proto_repeated_field!(x, y, args, expressions::parse_function_argument); + let return_type = proto_required_field!(x, y, output_type, types::parse_type) + .0 + .data_type(); + + // Check function information. + let (return_type, expression) = parse_function(y, function, arguments, return_type); + + // Parse modifiers. + proto_repeated_field!(x, y, sorts, sorts::parse_sort_field); + proto_enum_field!(x, y, phase, substrait::AggregationPhase); + proto_enum_field!( + x, + y, + invocation, + substrait::aggregate_function::AggregationInvocation + ); + + // Describe node. + y.set_data_type(return_type); + describe!(y, Expression, "{}", expression); + summary!(y, "Aggregate function call: {:#}", expression); + Ok(expression) +} diff --git a/rs/src/parse/expressions/literals.rs b/rs/src/parse/expressions/literals.rs new file mode 100644 index 00000000..4b29a234 --- /dev/null +++ b/rs/src/parse/expressions/literals.rs @@ -0,0 +1,946 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for parsing/validating literals. + +use crate::input::proto::substrait; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::parse::context; +use crate::parse::types; +use crate::string_util; +use crate::string_util::Describe; +use std::sync::Arc; + +/// The value of a literal, not including type information. +#[derive(Clone)] +enum LiteralValue { + /// May be used for any nullable type. + Null, + + /// May be used only for booleans. + Boolean(bool), + + /// May be used only for I8, I16, I32, I64, Timestamp, TimestampTz, Date, and Time. + Integer(i64), + + /// May be used only for Fp32 and Fp64. + Float(f64), + + /// May be used only for decimals and UUIDs. + Data16(i128), + + /// May be used only for strings, FixedChars, and VarChars. + String(String), + + /// May be used only for binary and FixedBinary. + Binary(Vec), + + /// May be used only for IntervalYearToMonth and IntervalDayToSecond. 
+ Interval(i32, i32), + + /// May be used only for structs and lists. + Items(Vec), + + /// May be used only for maps. + Pairs(Vec<(Literal, Literal)>), +} + +impl Default for LiteralValue { + fn default() -> Self { + LiteralValue::Null + } +} + +/// A complete literal, including type information. +#[derive(Clone, Default)] +pub struct Literal { + /// The value of the literal. + value: LiteralValue, + + /// The data type of the literal. LiteralValue must be a valid instance of + /// this. + data_type: Arc, +} + +/// Converts a value in microseconds since the epoch to a chrono::NaiveDateTime. +fn to_date_time(micros: i64) -> diagnostic::Result { + let secs = micros.div_euclid(1_000_000); + let nsecs = ((micros.rem_euclid(1_000_000)) * 1000) as u32; + chrono::NaiveDateTime::from_timestamp_opt(secs, nsecs).ok_or(ecause!( + ExpressionIllegalLiteralValue, + "timestamp out of range" + )) +} + +/// Converts a value in microseconds since the epoch to a string. +fn to_date_time_str(micros: i64, fmt: &str) -> String { + to_date_time(micros) + .map(|x| x.format(fmt).to_string()) + .unwrap_or_else(|_| String::from("?")) +} + +impl Literal { + /// Shorthand for a new null literal. + pub fn new_null(data_type: Arc) -> Literal { + Literal { + value: LiteralValue::Null, + data_type, + } + } + + /// Shorthand for a new simple literal. + fn new_simple( + value: LiteralValue, + simple: data_type::Simple, + nullable: bool, + ) -> diagnostic::Result { + Ok(Literal { + value, + data_type: data_type::DataType::new( + data_type::Class::Simple(simple), + nullable, + None, + vec![], + )?, + }) + } + + /// Shorthand for a new compound literal. + fn new_compound>( + value: LiteralValue, + compound: data_type::Compound, + nullable: bool, + args: Vec, + ) -> diagnostic::Result { + Ok(Literal { + value, + data_type: data_type::DataType::new( + data_type::Class::Compound(compound), + nullable, + None, + args.into_iter().map(|x| x.into()).collect(), + )?, + }) + } + + /// Returns the data type of this literal. + pub fn data_type(&self) -> &Arc { + &self.data_type + } +} + +impl Describe for Literal { + /// Represents the value of this literal with some size limit. The size + /// limit very roughly corresponds to a number of characters, but this is + /// purely a heuristic thing. 
+ fn describe( + &self, + f: &mut std::fmt::Formatter<'_>, + limit: string_util::Limit, + ) -> std::fmt::Result { + match &self.value { + LiteralValue::Null => { + if self.data_type.is_unresolved() { + write!(f, "!") + } else { + write!(f, "null") + } + } + LiteralValue::Boolean(true) => write!(f, "true"), + LiteralValue::Boolean(false) => write!(f, "false"), + LiteralValue::Integer(i) => match self.data_type.class() { + data_type::Class::Simple(data_type::Simple::I8) => write!(f, "{i}i8"), + data_type::Class::Simple(data_type::Simple::I16) => write!(f, "{i}i16"), + data_type::Class::Simple(data_type::Simple::I32) => write!(f, "{i}i32"), + data_type::Class::Simple(data_type::Simple::I64) => write!(f, "{i}i64"), + data_type::Class::Simple(data_type::Simple::Timestamp) => { + write!(f, "{}", to_date_time_str(*i, "%Y-%m-%d %H:%M:%S%.6f")) + } + data_type::Class::Simple(data_type::Simple::TimestampTz) => { + write!(f, "{} UTC", to_date_time_str(*i, "%Y-%m-%d %H:%M:%S%.6f")) + } + data_type::Class::Simple(data_type::Simple::Date) => { + write!( + f, + "{}", + to_date_time_str(i.saturating_mul(24 * 60 * 60 * 1_000_000), "%Y-%m-%d") + ) + } + data_type::Class::Simple(data_type::Simple::Time) => { + write!(f, "{}", to_date_time_str(*i, "%H:%M:%S%.6f")) + } + _ => write!(f, "{i}"), + }, + LiteralValue::Float(v) => { + let max = std::cmp::min(std::cmp::max(3, limit.chars()), 10); + write!(f, "{:3.1$}", float_pretty_print::PrettyPrintFloat(*v), max) + } + LiteralValue::Data16(d) => match self.data_type.class() { + data_type::Class::Compound(data_type::Compound::Decimal) => { + if let Some(scale) = self.data_type.int_parameter(1) { + if d < &0 { + write!(f, "-")?; + } + let d = d.abs() as u128; + let s = 10u128.pow(scale as u32); + if self + .data_type + .int_parameter(0) + .map(|precision| scale < precision) + .unwrap_or(true) + { + write!(f, "{0}", d.div_euclid(s))?; + } + write!(f, ".")?; + if scale > 0 { + write!(f, "{0:01$}", d.rem_euclid(s), scale as usize)?; + } + Ok(()) + } else { + string_util::describe_binary(f, &d.to_le_bytes(), limit) + } + } + data_type::Class::Simple(data_type::Simple::Uuid) => { + let b = d.to_ne_bytes(); + write!( + f, + "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}", + b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15] + ) + } + _ => string_util::describe_binary(f, &d.to_le_bytes(), limit), + }, + LiteralValue::String(s) => string_util::describe_string(f, s, limit), + LiteralValue::Binary(b) => string_util::describe_binary(f, b, limit), + LiteralValue::Interval(a, b) => match self.data_type.class() { + data_type::Class::Simple(data_type::Simple::IntervalYear) => { + write!(f, "{a}y{b:+}m") + } + data_type::Class::Simple(data_type::Simple::IntervalDay) => write!(f, "{a}d{b:+}s"), + _ => write!(f, "({a}, {b})"), + }, + LiteralValue::Items(x) => match self.data_type.class() { + data_type::Class::Compound(data_type::Compound::Struct) => { + write!(f, "(")?; + string_util::describe_sequence(f, x, limit, 20, |f, value, index, limit| { + write!(f, ".{index}: ")?; + value.describe(f, limit) + })?; + write!(f, ")") + } + data_type::Class::Compound(data_type::Compound::NamedStruct) => { + write!(f, "(")?; + string_util::describe_sequence(f, x, limit, 20, |f, value, index, limit| { + if let Some(name) = self + .data_type + .parameters() + .get(index) + .and_then(|x| x.get_name()) + { + write!(f, ".{}: ", string_util::as_ident_or_string(name))?; + } else { + write!(f, 
".{index}: ")?; + } + value.describe(f, limit) + })?; + write!(f, ")") + } + data_type::Class::Compound(data_type::Compound::List) => { + write!(f, "[")?; + string_util::describe_sequence(f, x, limit, 20, |f, value, _, limit| { + value.describe(f, limit) + })?; + write!(f, "]") + } + _ => { + write!(f, "(")?; + string_util::describe_sequence(f, x, limit, 20, |f, value, _, limit| { + value.describe(f, limit) + })?; + write!(f, ")") + } + }, + LiteralValue::Pairs(x) => match self.data_type.class() { + data_type::Class::Compound(data_type::Compound::Map) => { + write!(f, "{{")?; + string_util::describe_sequence( + f, + x, + limit, + 40, + |f, (key, value), _, limit| { + let (key_limit, value_limit) = limit.split(20); + key.describe(f, key_limit)?; + write!(f, ": ")?; + value.describe(f, value_limit) + }, + )?; + write!(f, "}}") + } + _ => { + write!(f, "(")?; + string_util::describe_sequence( + f, + x, + limit, + 40, + |f, (key, value), _, limit| { + write!(f, "(")?; + let (key_limit, value_limit) = limit.split(20); + key.describe(f, key_limit)?; + write!(f, ": ")?; + value.describe(f, value_limit)?; + write!(f, ")") + }, + )?; + write!(f, ")") + } + }, + } + } +} + +impl std::fmt::Display for Literal { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.display().fmt(f) + } +} + +/// Parses a boolean literal. +fn parse_boolean( + x: &bool, + _y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + Literal::new_simple( + LiteralValue::Boolean(*x), + data_type::Simple::Boolean, + nullable, + ) +} + +/// Parses an i8 literal. +fn parse_i8(x: &i32, _y: &mut context::Context, nullable: bool) -> diagnostic::Result { + let x = i8::try_from(*x) + .map_err(|_| cause!(ExpressionIllegalLiteralValue, "i8 value out of range"))?; + Literal::new_simple( + LiteralValue::Integer(x as i64), + data_type::Simple::I8, + nullable, + ) +} + +/// Parses an i16 literal. +fn parse_i16(x: &i32, _y: &mut context::Context, nullable: bool) -> diagnostic::Result { + let x = i16::try_from(*x) + .map_err(|_| cause!(ExpressionIllegalLiteralValue, "i16 value out of range"))?; + Literal::new_simple( + LiteralValue::Integer(x as i64), + data_type::Simple::I16, + nullable, + ) +} + +/// Parses an i32 literal. +fn parse_i32(x: &i32, _y: &mut context::Context, nullable: bool) -> diagnostic::Result { + Literal::new_simple( + LiteralValue::Integer(*x as i64), + data_type::Simple::I32, + nullable, + ) +} + +/// Parses an i64 literal. +fn parse_i64(x: &i64, _y: &mut context::Context, nullable: bool) -> diagnostic::Result { + Literal::new_simple(LiteralValue::Integer(*x), data_type::Simple::I64, nullable) +} + +/// Parses an fp32 literal. +fn parse_fp32(x: &f32, _y: &mut context::Context, nullable: bool) -> diagnostic::Result { + Literal::new_simple( + LiteralValue::Float(*x as f64), + data_type::Simple::Fp32, + nullable, + ) +} + +/// Parses an fp64 literal. +fn parse_fp64(x: &f64, _y: &mut context::Context, nullable: bool) -> diagnostic::Result { + Literal::new_simple(LiteralValue::Float(*x), data_type::Simple::Fp64, nullable) +} + +/// Parses a string literal. +fn parse_string(x: &str, _y: &mut context::Context, nullable: bool) -> diagnostic::Result { + Literal::new_simple( + LiteralValue::String(x.to_string()), + data_type::Simple::String, + nullable, + ) +} + +/// Parses a binary literal. 
+fn parse_binary( + x: &[u8], + _y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + Literal::new_simple( + LiteralValue::Binary(x.to_owned()), + data_type::Simple::Binary, + nullable, + ) +} + +/// Parses a timestamp literal. +fn parse_timestamp( + x: &i64, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + let dt = to_date_time(*x)?; + if dt < chrono::NaiveDate::from_ymd(1000, 1, 1).and_hms(0, 0, 0) + || dt >= chrono::NaiveDate::from_ymd(10000, 1, 1).and_hms(0, 0, 0) + { + diagnostic!( + y, + Error, + ExpressionIllegalLiteralValue, + "timestamp out of range 1000-01-01 to 9999-12-31" + ); + } + Literal::new_simple( + LiteralValue::Integer(*x), + data_type::Simple::Timestamp, + nullable, + ) +} + +/// Parses a UTC timestamp literal. +fn parse_timestamp_tz( + x: &i64, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + let dt = to_date_time(*x)?; + if dt < chrono::NaiveDate::from_ymd(1000, 1, 1).and_hms(0, 0, 0) + || dt >= chrono::NaiveDate::from_ymd(10000, 1, 1).and_hms(0, 0, 0) + { + diagnostic!( + y, + Error, + ExpressionIllegalLiteralValue, + "timestamp out of range 1000-01-01 UTC to 9999-12-31 UTC" + ); + } + Literal::new_simple( + LiteralValue::Integer(*x), + data_type::Simple::TimestampTz, + nullable, + ) +} + +/// Parses a date literal. +fn parse_date(x: &i32, y: &mut context::Context, nullable: bool) -> diagnostic::Result { + let dt = to_date_time((*x as i64).saturating_mul(24 * 60 * 60 * 1_000_000))?; + if dt < chrono::NaiveDate::from_ymd(1000, 1, 1).and_hms(0, 0, 0) + || dt >= chrono::NaiveDate::from_ymd(10000, 1, 1).and_hms(0, 0, 0) + { + diagnostic!( + y, + Error, + ExpressionIllegalLiteralValue, + "date out of range 1000-01-01 UTC to 9999-12-31 UTC" + ); + } + Literal::new_simple( + LiteralValue::Integer(*x as i64), + data_type::Simple::Date, + nullable, + ) +} + +/// Parses a time literal. +fn parse_time(x: &i64, y: &mut context::Context, nullable: bool) -> diagnostic::Result { + if *x < 0 || *x >= 24 * 60 * 60 * 1_000_000 { + diagnostic!( + y, + Error, + ExpressionIllegalLiteralValue, + "time of day out of range 00:00:00.000000 to 23:59:59.999999" + ); + } + Literal::new_simple(LiteralValue::Integer(*x), data_type::Simple::Time, nullable) +} + +/// Parses a year to month interval literal. +fn parse_interval_year_to_month( + x: &substrait::expression::literal::IntervalYearToMonth, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + // FIXME: see FIXME for associated type. + proto_primitive_field!(x, y, years, |x, _| { + if *x < -10000 || *x > 10000 { + Err(cause!( + ExpressionIllegalLiteralValue, + "year count out of range -10000 to 10000" + )) + } else { + Ok(()) + } + }); + proto_primitive_field!(x, y, months, |x, _| { + if *x < -120000 || *x > 120000 { + Err(cause!( + ExpressionIllegalLiteralValue, + "month count out of range -120000 to 120000" + )) + } else { + Ok(()) + } + }); + let months = x.months.saturating_add(x.years.saturating_mul(12)); + if months < -120000 || months > 120000 { + diagnostic!( + y, + Error, + ExpressionIllegalLiteralValue, + "combined interval out of range -10000 to 10000 years" + ); + } + Literal::new_simple( + LiteralValue::Interval(x.years, x.months), + data_type::Simple::IntervalYear, + nullable, + ) +} + +/// Parses a day to second interval literal. 
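The year-to-month range check above deliberately uses saturating arithmetic; a small demonstration of why:

```rust
fn main() {
    // A hostile plan could carry years = i32::MAX. With plain `*` and `+`,
    // years * 12 would panic in debug builds or silently wrap in release
    // builds, potentially wrapping back into the "valid" range. Saturation
    // clamps instead, so the out-of-range check still fires.
    let years: i32 = i32::MAX;
    let months: i32 = 1;
    let combined = months.saturating_add(years.saturating_mul(12));
    assert_eq!(combined, i32::MAX);
    assert!(combined > 120000); // the diagnostic above would trigger
}
```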
+fn parse_interval_day_to_second( + x: &substrait::expression::literal::IntervalDayToSecond, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + // FIXME: see FIXME for associated type. + proto_primitive_field!(x, y, days, |x, _| { + if *x < -3650000 || *x > 3650000 { + Err(cause!( + ExpressionIllegalLiteralValue, + "day count out of range -3_650_000 to 3_650_000" + )) + } else { + Ok(()) + } + }); + + // FIXME: according to the docs, day to second supports microsecond + // precision. The literal doesn't. The i32 seconds also doesn't + // support the full specified range (but that range is weird + // anyway). + proto_primitive_field!(x, y, seconds); + Literal::new_simple( + LiteralValue::Interval(x.days, x.seconds), + data_type::Simple::IntervalDay, + nullable, + ) +} + +/// Parses a UUID literal. +fn parse_uuid(x: &[u8], _y: &mut context::Context, nullable: bool) -> diagnostic::Result { + if let Ok(x) = x.try_into() { + Literal::new_simple( + LiteralValue::Data16(i128::from_ne_bytes(x)), + data_type::Simple::Uuid, + nullable, + ) + } else { + Err(cause!( + ExpressionIllegalLiteralValue, + "uuid literals must be 16 bytes in length, got {}", + x.len() + )) + } +} + +/// Parses a fixed-length string literal. +fn parse_fixed_char( + x: &str, + _y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + Literal::new_compound( + LiteralValue::String(x.to_string()), + data_type::Compound::FixedChar, + nullable, + vec![x.len() as u64], + ) +} + +/// Parses a variable-length string literal. +fn parse_var_char( + x: &substrait::expression::literal::VarChar, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + proto_primitive_field!(x, y, length); + let len = x.length as usize; + proto_primitive_field!(x, y, value, |x, _| { + if x.len() > len { + Err(cause!( + ExpressionIllegalLiteralValue, + "varchar literal value is longer than specified length" + )) + } else { + Ok(()) + } + }); + Literal::new_compound( + LiteralValue::String(x.value.clone()), + data_type::Compound::VarChar, + nullable, + vec![len as u64], + ) +} + +/// Parses a fixed-length binary literal. +fn parse_fixed_binary( + x: &[u8], + _y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + Literal::new_compound( + LiteralValue::Binary(x.to_owned()), + data_type::Compound::FixedBinary, + nullable, + vec![x.len() as u64], + ) +} + +/// Parses a decimal literal. 
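The decimal literal decoded below is carried as a 16-byte little-endian two's-complement integer paired with a precision and scale. A sketch of that encoding (example values hypothetical):

```rust
fn main() {
    // 12.34 as decimal(4, 2) is carried as the unscaled integer 1234,
    // serialized as 16 little-endian bytes.
    let unscaled: i128 = 1234;
    let bytes = unscaled.to_le_bytes();
    assert_eq!(bytes.len(), 16);

    // Decoding, as parse_decimal does below: rebuild the i128, then check
    // that it fits within the declared precision: |value| < 10^precision.
    let value = i128::from_le_bytes(bytes);
    let (precision, scale) = (4u32, 2u32);
    assert!(value.abs() < 10i128.pow(precision));

    // Rendering with the scale applied (positive values only, for brevity).
    let s = 10i128.pow(scale);
    assert_eq!(format!("{}.{:02$}", value / s, value % s, scale as usize), "12.34");
}
```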
+fn parse_decimal( + x: &substrait::expression::literal::Decimal, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result<Literal> { + proto_primitive_field!(x, y, precision, |x, _| { + if *x < 0 { + Err(cause!( + IllegalValue, + "negative type parameters are not supported" + )) + } else { + Ok(()) + } + }); + proto_primitive_field!(x, y, scale); + let val = proto_primitive_field!(x, y, value, |x, _| { + if let Ok(x) = (&x[..]).try_into() { + Ok(i128::from_le_bytes(x)) + } else { + Err(cause!( + ExpressionIllegalLiteralValue, + "decimal literals must be 16 bytes in length, got {}", + x.len() + )) + } + }) + .1; + let precision = u64::try_from(x.precision).unwrap_or_default(); + let scale = u64::try_from(x.scale).unwrap_or_default(); + + if let Some(val) = val { + let range = 10i128.saturating_pow(precision.try_into().unwrap_or_default()); + if val >= range || val <= -range { + Err(cause!( + ExpressionIllegalLiteralValue, + "decimal value is out of range for specified precision and scale" + )) + } else { + Literal::new_compound( + LiteralValue::Data16(val), + data_type::Compound::Decimal, + nullable, + vec![precision, scale], + ) + } + } else { + Ok(Literal::default()) + } +} + +/// Parses a struct literal. +fn parse_struct_int( + x: &substrait::expression::literal::Struct, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result<Literal> { + let (values, types): (Vec<_>, Vec<_>) = proto_repeated_field!(x, y, fields, parse_literal) + .1 + .into_iter() + .map(|x| { + let x = x.unwrap_or_default(); + let data_type = x.data_type.clone(); + (x, data_type) + }) + .unzip(); + Literal::new_compound( + LiteralValue::Items(values), + data_type::Compound::Struct, + nullable, + types, + ) +} + +/// Parses a struct literal. +pub fn parse_struct( + x: &substrait::expression::literal::Struct, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result<Literal> { + let literal = parse_struct_int(x, y, nullable)?; + y.set_data_type(literal.data_type().clone()); + Ok(literal) +} + +/// Parses a list literal. +fn parse_list( + x: &substrait::expression::literal::List, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result<Literal> { + let values: Vec<_> = proto_required_repeated_field!(x, y, values, parse_literal) + .1 + .into_iter() + .map(|x| x.unwrap_or_default()) + .collect(); + if values.is_empty() { + comment!( + y, + "At least one list element is required to derive type. Use EmptyList instead." + ); + } + let mut data_type = Arc::default(); + for (index, value) in values.iter().enumerate() { + data_type = types::assert_equal( + y, + value.data_type(), + &data_type, + format!("unexpected type for index {index}"), + ); + } + Literal::new_compound( + LiteralValue::Items(values), + data_type::Compound::List, + nullable, + vec![data_type], + ) +} + +/// Parses a map literal. +fn parse_map( + x: &substrait::expression::literal::Map, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result<Literal> { + let values: Vec<_> = proto_required_repeated_field!(x, y, key_values, |x, y| { + let key = proto_required_field!(x, y, key, parse_literal) + .1 + .unwrap_or_default(); + let value = proto_required_field!(x, y, value, parse_literal) + .1 + .unwrap_or_default(); + Ok((key, value)) + }) + .1 + .into_iter() + .map(|x| x.unwrap_or_default()) + .collect(); + if values.is_empty() { + comment!( + y, + "At least one key-value pair is required to derive types. Use EmptyMap instead."
+ ); + } + let mut key_type = Arc::default(); + let mut value_type = Arc::default(); + for (index, value) in values.iter().enumerate() { + key_type = types::assert_equal( + y, + value.0.data_type(), + &key_type, + format!("unexpected key type for index {index}"), + ); + value_type = types::assert_equal( + y, + value.1.data_type(), + &value_type, + format!("unexpected value type for index {index}"), + ); + } + Literal::new_compound( + LiteralValue::Pairs(values), + data_type::Compound::Map, + nullable, + vec![key_type, value_type], + ) +} + +/// Parses an empty list literal. +fn parse_empty_list( + x: &substrait::r#type::List, + y: &mut context::Context, + _nullable: bool, +) -> diagnostic::Result { + // FIXME: nullability is redundantly specified, and the type + // variation reference would be if it had gotten the same + // treatment as nullability. Why doesn't EmptyList just map to only + // the element data type? + types::parse_list(x, y)?; + Ok(Literal { + value: LiteralValue::Items(vec![]), + data_type: y.data_type(), + }) +} + +/// Parses an empty map literal. +fn parse_empty_map( + x: &substrait::r#type::Map, + y: &mut context::Context, + _nullable: bool, +) -> diagnostic::Result { + // FIXME: same note as for EmptyList. + types::parse_map(x, y)?; + Ok(Literal { + value: LiteralValue::Pairs(vec![]), + data_type: y.data_type(), + }) +} + +/// Parses a null literal. +fn parse_null( + x: &substrait::Type, + y: &mut context::Context, + _nullable: bool, +) -> diagnostic::Result { + // FIXME: same note as for EmptyList. + types::parse_type(x, y)?; + let data_type = y.data_type(); + if !data_type.nullable() && !data_type.is_unresolved() { + Err(cause!( + TypeMismatchedNullability, + "type of null literal must be nullable" + )) + } else { + Ok(Literal { + value: LiteralValue::Null, + data_type: y.data_type(), + }) + } +} + +/// Parse a literal value. Returns the parsed literal. 
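The rule `parse_null` enforces above is worth restating in isolation: a typed null is only valid when its type is nullable, with unresolved types exempted so that one failure does not cascade into more diagnostics. A minimal sketch (the hypothetical `Ty` struct stands in for the real data type):

```rust
struct Ty {
    nullable: bool,
    unresolved: bool,
}

fn check_null(t: &Ty) -> Result<(), &'static str> {
    if !t.nullable && !t.unresolved {
        Err("type of null literal must be nullable")
    } else {
        Ok(())
    }
}

fn main() {
    assert!(check_null(&Ty { nullable: true, unresolved: false }).is_ok());
    assert!(check_null(&Ty { nullable: false, unresolved: false }).is_err());
    // Unresolved types never trigger the diagnostic.
    assert!(check_null(&Ty { nullable: false, unresolved: true }).is_ok());
}
```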
+fn parse_literal_type( + x: &substrait::expression::literal::LiteralType, + y: &mut context::Context, + nullable: bool, +) -> diagnostic::Result { + use substrait::expression::literal::LiteralType; + match x { + LiteralType::Boolean(x) => parse_boolean(x, y, nullable), + LiteralType::I8(x) => parse_i8(x, y, nullable), + LiteralType::I16(x) => parse_i16(x, y, nullable), + LiteralType::I32(x) => parse_i32(x, y, nullable), + LiteralType::I64(x) => parse_i64(x, y, nullable), + LiteralType::Fp32(x) => parse_fp32(x, y, nullable), + LiteralType::Fp64(x) => parse_fp64(x, y, nullable), + LiteralType::String(x) => parse_string(x, y, nullable), + LiteralType::Binary(x) => parse_binary(x, y, nullable), + LiteralType::Timestamp(x) => parse_timestamp(x, y, nullable), + LiteralType::TimestampTz(x) => parse_timestamp_tz(x, y, nullable), + LiteralType::Date(x) => parse_date(x, y, nullable), + LiteralType::Time(x) => parse_time(x, y, nullable), + LiteralType::IntervalYearToMonth(x) => parse_interval_year_to_month(x, y, nullable), + LiteralType::IntervalDayToSecond(x) => parse_interval_day_to_second(x, y, nullable), + LiteralType::Uuid(x) => parse_uuid(x, y, nullable), + LiteralType::FixedChar(x) => parse_fixed_char(x, y, nullable), + LiteralType::VarChar(x) => parse_var_char(x, y, nullable), + LiteralType::FixedBinary(x) => parse_fixed_binary(x, y, nullable), + LiteralType::Decimal(x) => parse_decimal(x, y, nullable), + LiteralType::Struct(x) => parse_struct_int(x, y, nullable), + LiteralType::List(x) => parse_list(x, y, nullable), + LiteralType::Map(x) => parse_map(x, y, nullable), + LiteralType::EmptyList(x) => parse_empty_list(x, y, nullable), + LiteralType::EmptyMap(x) => parse_empty_map(x, y, nullable), + LiteralType::Null(x) => parse_null(x, y, nullable), + } +} + +/// Parse a literal value. Returns the parsed literal. +pub fn parse_literal( + x: &substrait::expression::Literal, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse type parameters that apply to all literals (except empty objects + // and null...). + if !matches!( + x.literal_type, + Some(substrait::expression::literal::LiteralType::EmptyList(_)) + | Some(substrait::expression::literal::LiteralType::EmptyMap(_)) + | Some(substrait::expression::literal::LiteralType::Null(_)) + ) { + // FIXME: why isn't the nullability enum used here? Especially + // considering nullability here actually should be unspecified when + // above match yields false, while it must be specified everywhere + // else. Better yet, change the semantics as described in the other + // fixmes such that it is always mandatory everywhere, and then use + // a boolean everywhere? If the point of the enum is to allow types + // to be "partially unresolved," then the type system is pretty + // fundamentally broken, since overload resolution depends on it. + proto_primitive_field!(x, y, nullable); + + // FIXME: why would literals not support type variations? Feels like + // there should be a type variation reference here. + } else { + // FIXME: this is all very ugly. Since all types can be made nullable + // anyway, why isn't the nullability field taken out of the type kind + // for types as well? Then the "empty" values can just refer to the + // type kind rather than the whole type message, and the problem would + // be solved. Likewise, I don't see why type variations should get + // special treatment in the sense that (currently) user-defined types + // can't also have variations. Why explicitly disallow that? 
+ proto_primitive_field!(x, y, nullable, |x, y| { + // Send diagnostic only when x is not set to its default value, + // since the default value is indistinguishable from unspecified. + if *x { + diagnostic!( + y, + Info, + RedundantField, + "this field is inoperative for empty lists, empty maps, and null." + ); + } else { + comment!( + y, + "This field is inoperative for empty lists, empty maps, and null." + ); + } + Ok(()) + }); + } + + // Parse the literal value. + let literal = proto_required_field!(x, y, literal_type, parse_literal_type, x.nullable) + .1 + .unwrap_or_default(); + + // Describe node. + y.set_data_type(literal.data_type().clone()); + describe!(y, Expression, "{}", literal); + summary!( + y, + "Literal of type {:#} with value {:#}", + literal.data_type(), + literal + ); + Ok(literal) +} diff --git a/rs/src/parse/expressions/misc.rs b/rs/src/parse/expressions/misc.rs new file mode 100644 index 00000000..0dcf9408 --- /dev/null +++ b/rs/src/parse/expressions/misc.rs @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for parsing/validating miscellaneous expression types. + +use crate::input::proto::substrait; +use crate::output::diagnostic; +use crate::parse::context; +use crate::parse::expressions; +use crate::parse::types; +use crate::string_util; + +/// Parse an enum expression. Returns a description of said expression. +pub fn parse_enum( + x: &substrait::expression::Enum, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse variant. + let variant = proto_required_field!(x, y, enum_kind, |x, y| { + match x { + substrait::expression::r#enum::EnumKind::Specified(x) => { + if x.is_empty() { + diagnostic!(y, Error, IllegalValue, "enum variant name cannot be empty"); + } + Ok(Some(x.clone())) + } + substrait::expression::r#enum::EnumKind::Unspecified(_) => Ok(None), + } + }) + .1 + .flatten(); + + // Describe node. + if let Some(variant) = &variant { + describe!( + y, + Misc, + "Function option variant {}", + string_util::as_ident_or_string(variant) + ); + } else { + describe!(y, Misc, "Default function option variant"); + } + + Ok(expressions::Expression::EnumVariant(variant)) +} + +/// Parse a typecast expression. Returns a description of said expression. +pub fn parse_cast( + x: &substrait::expression::Cast, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse fields. + let data_type = proto_required_field!(x, y, r#type, types::parse_type) + .0 + .data_type(); + let input = proto_boxed_required_field!(x, y, input, expressions::parse_expression) + .1 + .unwrap_or_default(); + let expression = expressions::Expression::Cast(data_type, Box::new(input)); + proto_enum_field!( + x, + y, + failure_behavior, + substrait::expression::cast::FailureBehavior + ); + + // TODO: check if this is a valid typecast. + // FIXME: how? + diagnostic!( + y, + Warning, + NotYetImplemented, + "typecast validation rules are not yet implemented" + ); + + // Describe node. + describe!(y, Expression, "{}", expression); + summary!(y, "Type conversion: {:#}", expression); + Ok(expression) +} diff --git a/rs/src/parse/expressions/mod.rs b/rs/src/parse/expressions/mod.rs new file mode 100644 index 00000000..f6c74070 --- /dev/null +++ b/rs/src/parse/expressions/mod.rs @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for parsing/validating expressions. 
+
+use crate::input::proto::substrait;
+use crate::output::data_type;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::string_util;
+use crate::string_util::Describe;
+use std::sync::Arc;
+
+pub mod conditionals;
+pub mod functions;
+pub mod literals;
+pub mod misc;
+pub mod references;
+pub mod subqueries;
+
+/// Description of an expression.
+#[derive(Clone)]
+pub enum Expression {
+    /// Used for unknown expression types.
+    Unresolved,
+
+    /// Used for literals.
+    Literal(literals::Literal),
+
+    /// Used for references.
+    Reference(Box<references::Reference>),
+
+    /// Used for function calls and conditionals (which, really, are just
+    /// builtin function calls).
+    Function(String, Vec<Expression>),
+
+    /// Used for subqueries, or anything else where the "arguments" are too
+    /// extensive to be reasonably described; the argument list is always
+    /// simply represented with an ellipsis.
+    BigFunction(String),
+
+    /// Used to represent the values of a MultiOrList.
+    Tuple(Vec<Expression>),
+
+    /// Used for type casts.
+    Cast(Arc<data_type::DataType>, Box<Expression>),
+
+    /// Used for function option enum variants. Note that these aren't normal
+    /// expressions, as they have no associated type. See FIXME at the bottom
+    /// of this file.
+    EnumVariant(Option<String>),
+}
+
+impl Default for Expression {
+    fn default() -> Self {
+        Expression::Unresolved
+    }
+}
+
+impl From<literals::Literal> for Expression {
+    fn from(l: literals::Literal) -> Self {
+        Expression::Literal(l)
+    }
+}
+
+impl From<references::Reference> for Expression {
+    fn from(r: references::Reference) -> Self {
+        Expression::Reference(Box::new(r))
+    }
+}
+
+impl Describe for Expression {
+    fn describe(
+        &self,
+        f: &mut std::fmt::Formatter<'_>,
+        limit: string_util::Limit,
+    ) -> std::fmt::Result {
+        match self {
+            Expression::Unresolved => write!(f, "?"),
+            Expression::Literal(x) => x.describe(f, limit),
+            Expression::Reference(x) => x.describe(f, limit),
+            Expression::Function(name, args) => {
+                let (name_limit, args_limit) = limit.split(name.len());
+                string_util::describe_identifier(f, name, name_limit)?;
+                write!(f, "(")?;
+                string_util::describe_sequence(f, args, args_limit, 20, |f, expr, _, limit| {
+                    expr.describe(f, limit)
+                })?;
+                write!(f, ")")
+            }
+            Expression::BigFunction(name) => {
+                string_util::describe_identifier(f, name, limit)?;
+                write!(f, "(...)")
+            }
+            Expression::Tuple(items) => {
+                write!(f, "(")?;
+                string_util::describe_sequence(f, items, limit, 20, |f, expr, _, limit| {
+                    expr.describe(f, limit)
+                })?;
+                write!(f, ")")
+            }
+            Expression::Cast(data_type, expression) => {
+                let (type_limit, expr_limit) = limit.split(10);
+                write!(f, "(")?;
+                data_type.describe(f, type_limit)?;
+                write!(f, ")(")?;
+                expression.describe(f, expr_limit)?;
+                write!(f, ")")
+            }
+            Expression::EnumVariant(Some(x)) => string_util::describe_identifier(f, x, limit),
+            Expression::EnumVariant(None) => write!(f, "-"),
+        }
+    }
+}
+
+impl std::fmt::Display for Expression {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.display().fmt(f)
+    }
+}
+
+impl Expression {
+    /// Shorthand for a new null literal.
+    pub fn new_null(data_type: Arc<data_type::DataType>) -> Expression {
+        literals::Literal::new_null(data_type).into()
+    }
+}
+
+/// Parse an expression type. Returns a description of said expression.
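+/// `enum_allowed` specifies whether a function option enum variant is legal
+/// in this position; it is only set when parsing function call arguments.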
+fn parse_expression_type(
+    x: &substrait::expression::RexType,
+    y: &mut context::Context,
+    enum_allowed: bool,
+) -> diagnostic::Result<Expression> {
+    match x {
+        substrait::expression::RexType::Literal(x) => {
+            literals::parse_literal(x, y).map(Expression::from)
+        }
+        substrait::expression::RexType::Selection(x) => {
+            references::parse_field_reference(x.as_ref(), y).map(Expression::from)
+        }
+        substrait::expression::RexType::ScalarFunction(x) => functions::parse_scalar_function(x, y),
+        substrait::expression::RexType::WindowFunction(x) => functions::parse_window_function(x, y),
+        substrait::expression::RexType::IfThen(x) => conditionals::parse_if_then(x.as_ref(), y),
+        substrait::expression::RexType::SwitchExpression(x) => {
+            conditionals::parse_switch(x.as_ref(), y)
+        }
+        substrait::expression::RexType::SingularOrList(x) => {
+            conditionals::parse_singular_or_list(x.as_ref(), y)
+        }
+        substrait::expression::RexType::MultiOrList(x) => conditionals::parse_multi_or_list(x, y),
+        substrait::expression::RexType::Enum(x) => {
+            if !enum_allowed {
+                diagnostic!(
+                    y,
+                    Error,
+                    IllegalValue,
+                    "function option enum variants are not allowed here"
+                );
+            }
+            misc::parse_enum(x, y)
+        }
+        substrait::expression::RexType::Cast(x) => misc::parse_cast(x.as_ref(), y),
+        substrait::expression::RexType::Subquery(x) => subqueries::parse_subquery(x.as_ref(), y),
+    }
+}
+
+/// Parse an expression. Returns a description of said expression.
+fn parse_expression_internal(
+    x: &substrait::Expression,
+    y: &mut context::Context,
+    enum_allowed: bool,
+) -> diagnostic::Result<Expression> {
+    // Parse the expression.
+    let (n, e) = proto_required_field!(x, y, rex_type, parse_expression_type, enum_allowed);
+    let expression = e.unwrap_or_default();
+    let data_type = n.data_type();
+
+    // Describe node.
+    y.set_data_type(data_type);
+    describe!(y, Expression, "{}", expression);
+    summary!(y, "Expression: {:#}", expression);
+    Ok(expression)
+}
+
+/// Parse a regular expression (anything except a function option enum
+/// variant). Returns a description of said expression.
+pub fn parse_expression(
+    x: &substrait::Expression,
+    y: &mut context::Context,
+) -> diagnostic::Result<Expression> {
+    parse_expression_internal(x, y, false)
+}
+
+/// Parse a predicate expression (a normal expression that yields a boolean).
+/// Returns a description of said expression.
+pub fn parse_predicate(
+    x: &substrait::Expression,
+    y: &mut context::Context,
+) -> diagnostic::Result<Expression> {
+    let expression = parse_expression_internal(x, y, false)?;
+    let data_type = y.data_type();
+    if !matches!(
+        data_type.class(),
+        data_type::Class::Simple(data_type::Simple::Boolean) | data_type::Class::Unresolved
+    ) {
+        diagnostic!(
+            y,
+            Error,
+            TypeMismatch,
+            "predicates must yield booleans, but found {}",
+            data_type
+        );
+    }
+    Ok(expression)
+}
+
+/// Parse a function argument, which can be an expression or an enum option.
+fn parse_function_argument(
+    x: &substrait::Expression,
+    y: &mut context::Context,
+) -> diagnostic::Result<Expression> {
+    parse_expression_internal(x, y, true)
+}
+
+// FIXME: above should really be solved with a oneof, or better yet, by
+// separating the options passed to a function from its arguments.
diff --git a/rs/src/parse/expressions/references/mask.rs b/rs/src/parse/expressions/references/mask.rs
new file mode 100644
index 00000000..fe6d00ed
--- /dev/null
+++ b/rs/src/parse/expressions/references/mask.rs
@@ -0,0 +1,476 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module for parsing/validating mask expressions.
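+//!
+//! Conceptually, a mask expression is a nested projection: it filters and
+//! reorders the fields of a struct, and can recurse into struct, list, and
+//! map types. A rough, hypothetical illustration (not actual syntax):
+//!
+//! ```text
+//! input:  STRUCT<a: i32, b: string, c: STRUCT<x: i32, y: i32>>
+//! mask:   keep fields 0 and 2; within field 2, keep field 0
+//! output: STRUCT<a: i32, c: STRUCT<x: i32>>
+//! ```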
+ +use crate::input::proto::substrait; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::parse::context; +use crate::string_util; +use std::sync::Arc; + +/// Parse a struct item. +fn parse_struct_item( + x: &substrait::expression::mask_expression::StructItem, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result<()> { + // Handle the struct index field. + let data_type = proto_primitive_field!(x, y, field, super::parse_struct_field_index, root) + .1 + .unwrap_or_default(); + + // Set resulting data type. + y.set_data_type(data_type.clone()); + + // Handle child selection, if any, to recursively project the field type + // of the selected struct field. + if x.child.is_some() { + let data_type = proto_required_field!(x, y, child, parse_select, &data_type) + .0 + .data_type(); + + // Update data type. + y.set_data_type(data_type); + + // Describe node. + describe!(y, Expression, "Struct item selection and sub-selection"); + } else { + describe!(y, Expression, "Struct item selection"); + } + + Ok(()) +} + +/// Parse a struct selection, a filter/swizzle for a struct type. +fn parse_struct_select( + x: &substrait::expression::mask_expression::StructSelect, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result<()> { + // Struct selections can only be applied to structs. + if !root.is_unresolved() && !root.is_struct() { + diagnostic!( + y, + Error, + TypeMismatch, + "struct selection requires a struct type, but got a {}", + root.class() + ); + } + + // Parse fields. + let fields = proto_repeated_field!( + x, + y, + struct_items, + parse_struct_item, + |_, _, _, _, _| (), + root + ) + .0 + .iter() + .map(|x| x.data_type()) + .collect::>(); + + // Create struct. + y.set_data_type(data_type::DataType::new_struct(fields, root.nullable())); + + // Describe node. + describe!(y, Expression, "Struct selection"); + Ok(()) +} + +/// Parse a list element selection. +fn parse_list_select_item_element( + x: &substrait::expression::mask_expression::list_select::list_select_item::ListElement, + y: &mut context::Context, +) -> diagnostic::Result<()> { + proto_primitive_field!(x, y, field); + describe!( + y, + Expression, + "Select {} element", + string_util::describe_index(x.field) + ); + Ok(()) +} + +/// Parse a list slice selection. +fn parse_list_select_item_slice( + x: &substrait::expression::mask_expression::list_select::list_select_item::ListSlice, + y: &mut context::Context, +) -> diagnostic::Result<()> { + proto_primitive_field!(x, y, start); + proto_primitive_field!(x, y, end); + + // Raise a diagnostic if the slice is always null, and describe the slice. 
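+    // Note: start and end are inclusive indices, and negative values count
+    // backward from the end of the list, so the slice can only be statically
+    // proven empty when both bounds have the same sign.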
+    let description = if (x.start >= 0) == (x.end >= 0) && x.start > x.end {
+        diagnostic!(y, Info, RedundantListSlice, "slice is always null");
+        String::from("Selects an empty list slice")
+    } else if x.start == 0 {
+        match x.end {
+            i32::MIN..=-3 => format!("Selects all but the last {} elements", -x.end - 1),
+            -2 => String::from("Selects all but the last element"),
+            -1 => String::from("Selects the complete list"),
+            0 => String::from("Selects the first element"),
+            1..=i32::MAX => format!("Selects the first {} elements", x.end + 1),
+        }
+    } else if x.end == -1 {
+        match x.start {
+            i32::MIN..=-2 => format!("Selects the last {} elements", -x.start),
+            -1 => String::from("Selects the last element"),
+            0 => String::from("Selects the complete list"),
+            1 => String::from("Selects all but the first element"),
+            2..=i32::MAX => format!("Selects all but the first {} elements", x.start),
+        }
+    } else {
+        format!(
+            "Selects the {} through the {} element (inclusive)",
+            string_util::describe_index(x.start),
+            string_util::describe_index(x.end)
+        )
+    };
+
+    // Describe the node.
+    describe!(y, Expression, "{}", description);
+    Ok(())
+}
+
+/// Parse a list selection item type.
+fn parse_list_select_item_type(
+    x: &substrait::expression::mask_expression::list_select::list_select_item::Type,
+    y: &mut context::Context,
+) -> diagnostic::Result<()> {
+    match x {
+        substrait::expression::mask_expression::list_select::list_select_item::Type::Item(x) => {
+            parse_list_select_item_element(x, y)
+        }
+        substrait::expression::mask_expression::list_select::list_select_item::Type::Slice(x) => {
+            parse_list_select_item_slice(x, y)
+        }
+    }
+}
+
+/// Parse a list selection item.
+fn parse_list_select_item(
+    x: &substrait::expression::mask_expression::list_select::ListSelectItem,
+    y: &mut context::Context,
+) -> diagnostic::Result<()> {
+    proto_required_field!(x, y, r#type, parse_list_select_item_type);
+    Ok(())
+}
+
+/// Parse a list selection, a filter/swizzle for a list type.
+fn parse_list_select(
+    x: &substrait::expression::mask_expression::ListSelect,
+    y: &mut context::Context,
+    root: &Arc<data_type::DataType>,
+) -> diagnostic::Result<()> {
+    // List selections can only be applied to lists.
+    if !root.is_unresolved() && !root.is_list() {
+        diagnostic!(
+            y,
+            Error,
+            TypeMismatch,
+            "list selection requires a list type, but got a {}",
+            root.class()
+        );
+    }
+
+    // Parse fields.
+    proto_repeated_field!(x, y, selection, parse_list_select_item);
+
+    // Set resulting data type.
+    y.set_data_type(root.clone());
+
+    // Handle child selection, if any, to recursively project the list element
+    // type.
+    if x.child.is_some() {
+        // Get the list element type.
+        let data_type = root.unwrap_list().unwrap_or_default();
+
+        // Apply selection logic recursively.
+        let data_type = proto_boxed_required_field!(x, y, child, parse_select, &data_type)
+            .0
+            .data_type();
+
+        // Create the new type.
+        y.set_data_type(data_type::DataType::new_list(data_type, root.nullable()));
+
+        // Describe node.
+        describe!(y, Expression, "List selection and sub-selection");
+    } else {
+        describe!(y, Expression, "List selection");
+    }
+
+    Ok(())
+}
+
+/// Parse a map single-key selection.
+fn parse_map_select_key(
+    _x: &substrait::expression::mask_expression::map_select::MapKey,
+    y: &mut context::Context,
+    _key_type: &Arc<data_type::DataType>,
+) -> diagnostic::Result<()> {
+    // FIXME: map keys are not necessarily strings. Why is this not a
+    // primitive?
+ diagnostic!( + y, + Error, + NotYetImplemented, + "map key remappings are not yet specified" + ); + describe!(y, Expression, "Single-key map selection"); + Ok(()) +} + +/// Parse a map selection by means of an expression. +fn parse_map_select_expression( + _x: &substrait::expression::mask_expression::map_select::MapKeyExpression, + y: &mut context::Context, + _key_type: &Arc, +) -> diagnostic::Result<()> { + // FIXME: in Rust vernacular, need an Fn(K) -> Option here. I suppose + // there is no structure for that yet? Or are these the regex-type things + // that are not yet specified? + diagnostic!( + y, + Error, + NotYetImplemented, + "map key remappings are not yet specified" + ); + describe!(y, Expression, "Map key remapping"); + Ok(()) +} + +/// Parse a map selection type. +fn parse_map_select_type( + x: &substrait::expression::mask_expression::map_select::Select, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result<()> { + match x { + substrait::expression::mask_expression::map_select::Select::Key(x) => { + parse_map_select_key(x, y, root) + } + substrait::expression::mask_expression::map_select::Select::Expression(x) => { + parse_map_select_expression(x, y, root) + } + } +} + +/// Parse a map selection. +fn parse_map_select( + x: &substrait::expression::mask_expression::MapSelect, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result<()> { + // Map selections can only be applied to maps. + if !root.is_unresolved() && !root.is_map() { + diagnostic!( + y, + Error, + TypeMismatch, + "map selection requires a map type, but got a {}", + root.class() + ); + } + + // Parse selection field. + if x.select.is_some() { + proto_required_field!( + x, + y, + select, + parse_map_select_type, + &root.unwrap_map_key().unwrap_or_default() + ); + } else { + comment!(y, "No select key specified: mapping is left unchanged."); + } + + // Set resulting data type. + y.set_data_type(root.clone()); + + // Handle child selection, if any, to recursively project the map value + // type. + if x.child.is_some() { + // Get the map types. + let value_type = root.unwrap_map().unwrap_or_default(); + let key_type = root.unwrap_map_key().unwrap_or_default(); + + // Apply selection logic recursively. + let value_type = proto_boxed_required_field!(x, y, child, parse_select, &value_type) + .0 + .data_type(); + + // Create the new type. + y.set_data_type(data_type::DataType::new_map( + key_type, + value_type, + root.nullable(), + )); + + // Describe node. + describe!(y, Expression, "Map selection and sub-selection"); + } else { + describe!(y, Expression, "Map selection"); + } + + Ok(()) +} + +/// Parse a selection type. +fn parse_select_type( + x: &substrait::expression::mask_expression::select::Type, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result<()> { + match x { + substrait::expression::mask_expression::select::Type::Struct(x) => { + parse_struct_select(x, y, root) + } + substrait::expression::mask_expression::select::Type::List(x) => { + parse_list_select(x.as_ref(), y, root) + } + substrait::expression::mask_expression::select::Type::Map(x) => { + parse_map_select(x.as_ref(), y, root) + } + } +} + +fn parse_select( + x: &substrait::expression::mask_expression::Select, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result<()> { + let data_type = proto_required_field!(x, y, r#type, parse_select_type, root) + .0 + .data_type(); + y.set_data_type(data_type); + Ok(()) +} + +/// Parses the maintain_singular_struct field of a mask expression. 
is_singular +/// must specify whether the data type is actually a singular struct, while +/// struct_required must specify whether the context of the mask expression +/// requires a struct type. Returns whether the data type is a singular struct +/// and should be unwrapped. +fn parse_maintain_singular_struct( + x: &bool, + y: &mut context::Context, + is_singular: bool, + struct_required: bool, +) -> diagnostic::Result { + let maintain = *x; + match (is_singular, maintain, struct_required) { + (true, true, _) => { + // Okay: maintain struct. + summary!( + y, + "Mask expression yields a singular struct, which is \ + maintained as-is." + ); + Ok(false) + } + (true, false, true) => { + // Error: request to remove struct, but context requires a struct. + summary!( + y, + "Mask expression yields a singular struct, which would be \ + reduced to its element type, but its context does not allow \ + this." + ); + diagnostic!( + y, + Error, + TypeStructRequired, + "context requires a struct type and type is a singular \ + struct, maintain_singular_struct must be set" + ); + Ok(false) + } + (true, false, false) => { + // Okay: remove singular struct wrapper. + summary!( + y, + "Mask expression yields a singular struct, which is reduced \ + to its element type." + ); + Ok(true) + } + (false, true, _) => { + // Okay: not a singular struct, so there is nothing to strip. + summary!( + y, + "Data type of mask expression is not a singular struct, so \ + there is nothing to strip or maintain. The explicit true is \ + redundant." + ); + Ok(false) + } + (false, false, _) => { + // Okay: not a singular struct, so there is nothing to strip. + summary!( + y, + "Data type of mask expression is not a singular struct, so \ + there is nothing to strip or maintain." + ); + Ok(false) + } + } +} + +/// Parse a mask expression; that is, a field selection that can output a +/// nested structure. root specifies the data type being indexed, while +/// struct_required must specify whether the context of the mask expression +/// requires a struct type. +pub fn parse_mask_expression( + x: &substrait::expression::MaskExpression, + y: &mut context::Context, + root: &Arc, + struct_required: bool, +) -> diagnostic::Result<()> { + // Parse the struct selection and get its data type. + let data_type = proto_required_field!(x, y, select, parse_struct_select, root) + .0 + .data_type(); + + // Determine if the data type is a singular struct (i.e. a struct with only + // one item) and its element type if so. + let singular_type = data_type.unwrap_singular_struct().map(|data_type| { + if root.nullable() { + data_type.make_nullable() + } else { + data_type + } + }); + + // Handle the maintain_singular_struct field. + let unwrap = proto_primitive_field!( + x, + y, + maintain_singular_struct, + parse_maintain_singular_struct, + singular_type.is_some(), + struct_required + ) + .1 + .unwrap_or_default(); + + // Set the data type. + y.set_data_type(if unwrap { + singular_type.unwrap() + } else { + data_type + }); + + // Describe node. + describe!( + y, + Expression, + "References fields into a new nested structure" + ); + Ok(()) +} diff --git a/rs/src/parse/expressions/references/mod.rs b/rs/src/parse/expressions/references/mod.rs new file mode 100644 index 00000000..15ef94d9 --- /dev/null +++ b/rs/src/parse/expressions/references/mod.rs @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for parsing/validating references. 
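+//!
+//! Reference paths are assembled back to front while recursing out of the
+//! reference tree, which is why [`ReferencePath`] stores its segments in
+//! reverse order. A minimal sketch (the segment strings are hypothetical):
+//!
+//! ```ignore
+//! let path = ReferencePath::new()
+//!     .prefix(String::from(".[2]"))    // innermost segment is pushed first...
+//!     .prefix(String::from(".field")); // ...then prefixed with outer segments
+//! // Displays in source order as ".field.[2]".
+//! println!("{path}");
+//! ```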
+ +use crate::input::proto::substrait; +use crate::output::comment; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::parse::context; +use crate::parse::expressions; +use crate::string_util; +use crate::string_util::Describe; +use std::sync::Arc; + +pub mod mask; +pub mod scalar; + +/// Description of the root of a reference. +#[derive(Clone)] +enum Root { + Unresolved, + Expression(expressions::Expression), + Schema(usize), +} + +impl From for Root { + fn from(e: expressions::Expression) -> Self { + Root::Expression(e) + } +} + +impl Default for Root { + fn default() -> Self { + Root::Unresolved + } +} + +/// Description of a reference path. +#[derive(Clone)] +pub struct ReferencePath { + // *Reversed* list of segments. + segments: Vec, +} + +impl Default for ReferencePath { + fn default() -> Self { + Self { + segments: vec![String::from(".?")], + } + } +} + +impl ReferencePath { + fn new() -> Self { + Self { segments: vec![] } + } + + fn prefix(mut self, s: String) -> Self { + self.segments.push(s); + self + } + + /// Returns the length of the complete path string. + pub fn len(&self) -> usize { + self.segments.iter().map(String::len).sum() + } +} + +impl Describe for ReferencePath { + fn describe( + &self, + f: &mut std::fmt::Formatter<'_>, + limit: string_util::Limit, + ) -> std::fmt::Result { + let lens = self.segments.iter().map(String::len).collect::>(); + let (n_left, n_right) = limit.split_ns(&lens); + for i in 0..n_left { + write!(f, "{}", self.segments[self.segments.len() - i - 1])?; + } + if let Some(n_right) = n_right { + write!(f, "..")?; + for i in self.segments.len() - n_right..self.segments.len() { + write!(f, "{}", self.segments[self.segments.len() - i - 1])?; + } + } + Ok(()) + } +} + +impl std::fmt::Display for ReferencePath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.display().fmt(f) + } +} + +/// Description of a reference. +#[derive(Clone)] +pub struct Reference { + root: Root, + path: ReferencePath, +} + +impl Default for Reference { + fn default() -> Self { + Self { + root: Root::Schema(0), + path: ReferencePath::default(), + } + } +} + +impl Describe for Reference { + fn describe( + &self, + f: &mut std::fmt::Formatter<'_>, + limit: string_util::Limit, + ) -> std::fmt::Result { + let (path_limit, root_limit) = limit.split(self.path.len()); + match &self.root { + Root::Unresolved => write!(f, "?")?, + Root::Expression(e) => { + write!(f, "(")?; + e.describe(f, root_limit)?; + write!(f, ")")?; + } + Root::Schema(0) => write!(f, "<>")?, + Root::Schema(n) => write!(f, "<{n}>")?, + } + self.path.describe(f, path_limit) + } +} + +impl std::fmt::Display for Reference { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.display().fmt(f) + } +} + +/// Parse a struct field index into its data type. +fn parse_struct_field_index( + x: &i32, + _y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result> { + let index = *x; + if index < 0 { + return Err(cause!( + IllegalValue, + "struct indices cannot be less than zero" + )); + } + let index: usize = index.try_into().unwrap(); + if root.is_struct() { + let size = root.parameters().len(); + root.type_parameter(index) + .ok_or_else(|| cause!(IllegalValue, "struct index out of range (size = {size})")) + } else { + Ok(Arc::default()) + } +} + +/// Parse a reference root. 
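+/// The root is the entity being indexed: the schema of the current query,
+/// the schema of an enclosing query, or the value of another expression.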
+fn parse_root_type( + x: &substrait::expression::field_reference::RootType, + y: &mut context::Context, +) -> diagnostic::Result { + match x { + substrait::expression::field_reference::RootType::Expression(x) => { + expressions::parse_expression(x.as_ref(), y).map(Root::from) + } + substrait::expression::field_reference::RootType::RootReference(_) => { + describe!(y, Misc, "Reference to field of current query"); + y.set_data_type(y.schema(0)?); + Ok(Root::Schema(0)) + } + substrait::expression::field_reference::RootType::OuterReference(x) => { + describe!( + y, + Misc, + "Reference to field of {} outer query", + string_util::describe_nth(x.steps_out) + ); + proto_primitive_field!(x, y, steps_out, |x, y| { + if *x < 1 { + diagnostic!( + y, + Error, + IllegalValue, + "must be at least 1 (use RootReference instead)" + ); + } + Ok(()) + }); + let steps_out = x.steps_out as usize; + y.set_data_type(y.schema(steps_out)?); + Ok(Root::Schema(steps_out)) + } + } +} + +/// Parse a reference path. +fn parse_reference_type( + x: &substrait::expression::field_reference::ReferenceType, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result { + match x { + substrait::expression::field_reference::ReferenceType::DirectReference(x) => { + scalar::parse_reference_segment(x, y, root) + } + substrait::expression::field_reference::ReferenceType::MaskedReference(x) => { + mask::parse_mask_expression(x, y, root, false)?; + Ok(ReferencePath::new().prefix(String::from(".mask(..)"))) + } + } +} + +/// Parse a field reference. Returns a description of the nested reference. +pub fn parse_field_reference( + x: &substrait::expression::FieldReference, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse the root of the reference. + let (root_node, root) = proto_required_field!(x, y, root_type, parse_root_type); + let root = root.unwrap_or_default(); + + // Parse the reference type. + let (path_node, path) = proto_required_field!( + x, + y, + reference_type, + parse_reference_type, + &root_node.data_type() + ); + let path = path.unwrap_or_default(); + + // Set the data type. + y.set_data_type(path_node.data_type()); + + // Describe node. + let reference = Reference { root, path }; + describe!(y, Expression, "Selects {}", &reference); + summary!(y, "Full reference path: {:#}", &reference); + if let Root::Schema(depth) = &reference.root { + let depth = *depth; + y.push_summary(comment::Comment::new().nl()); + if depth == 0 { + summary!( + y, + "Here, <> is used to refer to the row currently being processed." + ); + } else { + summary!( + y, + "Here, <{depth}> is used to refer to the row being processed \ + by the {} outer query.", + string_util::describe_nth(depth as u32) + ); + } + } + Ok(reference) +} diff --git a/rs/src/parse/expressions/references/scalar.rs b/rs/src/parse/expressions/references/scalar.rs new file mode 100644 index 00000000..7c2cfdda --- /dev/null +++ b/rs/src/parse/expressions/references/scalar.rs @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module for parsing/validation scalar references. + +use crate::input::proto::substrait; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::parse::context; +use crate::parse::expressions::literals; +use crate::parse::expressions::references; +use crate::parse::types; +use crate::string_util; +use std::sync::Arc; + +/// Parse a struct field reference. Returns a description of the nested +/// reference. 
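+/// If the struct being indexed is nullable, the resolved field type is made
+/// nullable as well.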
+fn parse_struct_field( + x: &substrait::expression::reference_segment::StructField, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result { + // Struct selections can only be applied to structs. + if !root.is_unresolved() && !root.is_struct() { + diagnostic!( + y, + Error, + TypeMismatch, + "struct selection requires a struct type, but got a {}", + root.class() + ); + } + + // Create description. + let description = format!(".{}", x.field); + + // Determine result data type. + let data_type = proto_primitive_field!(x, y, field, super::parse_struct_field_index, root) + .1 + .unwrap_or_default(); + + // If the struct is nullable, the field must also be nullable. + let data_type = if root.nullable() { + data_type.make_nullable() + } else { + data_type + }; + + // Set resulting data type. + y.set_data_type(data_type.clone()); + + // Handle child selection, if any, to recursively select elements from + // the struct field. + let reference = if x.child.is_some() { + let (node, result) = + proto_boxed_required_field!(x, y, child, parse_reference_segment, &data_type); + + // Update data type. + y.set_data_type(node.data_type()); + + // Generate reference. + result.unwrap_or_default().prefix(description) + } else { + references::ReferencePath::new().prefix(description) + }; + + // Describe node. + describe!(y, Expression, "Selects {}", &reference); + summary!(y, "Full reference path: {:#}", &reference); + Ok(reference) +} + +/// Parse a list element reference. Returns a description of the nested +/// reference. +fn parse_list_element( + x: &substrait::expression::reference_segment::ListElement, + y: &mut context::Context, + root: &Arc, +) -> diagnostic::Result { + // Struct selections can only be applied to lists. + if !root.is_unresolved() && !root.is_list() { + diagnostic!( + y, + Error, + TypeMismatch, + "list selection requires a list type, but got a {}", + root.class() + ); + } + + // Handle the list index field. + proto_primitive_field!(x, y, offset, |x, y| { + describe!( + y, + Misc, + "Selects {} list element", + string_util::describe_index(*x) + ); + Ok(()) + }); + + // Create description. + let description = format!(".[{}]", x.offset); + + // Determine result data type. + let data_type = root.unwrap_list().unwrap_or_default(); + + // If the list is nullable, the selection must also be nullable. + let data_type = if root.nullable() { + data_type.make_nullable() + } else { + data_type + }; + + // FIXME: what is the runtime behavior for index out of range, throw or + // yield null? In the latter case, the return type would always need to + // be nullable. + + // Set resulting data type. + y.set_data_type(data_type.clone()); + + // Handle child selection, if any, to recursively select elements from + // the list element. + let reference = if x.child.is_some() { + let (node, result) = + proto_boxed_required_field!(x, y, child, parse_reference_segment, &data_type); + + // Update data type. + y.set_data_type(node.data_type()); + + // Generate reference. + result.unwrap_or_default().prefix(description) + } else { + references::ReferencePath::new().prefix(description) + }; + + // Describe node. + describe!(y, Expression, "Selects {}", &reference); + summary!(y, "Full reference path: {:#}", &reference); + Ok(reference) +} + +/// Parse a map key reference. Returns a description of the nested +/// reference. 
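+/// The key is given as a literal, whose type must match the key type of the
+/// map being indexed.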
+fn parse_map_key(
+    x: &substrait::expression::reference_segment::MapKey,
+    y: &mut context::Context,
+    root: &Arc<data_type::DataType>,
+) -> diagnostic::Result<references::ReferencePath> {
+    // Map selections can only be applied to maps.
+    if !root.is_unresolved() && !root.is_map() {
+        diagnostic!(
+            y,
+            Error,
+            TypeMismatch,
+            "map selection requires a map type, but got a {}",
+            root.class()
+        );
+    }
+
+    // Handle the map key primitive.
+    let key = proto_required_field!(x, y, map_key, literals::parse_literal)
+        .1
+        .unwrap_or_default();
+
+    // Check the key type.
+    types::assert_equal(
+        y,
+        key.data_type(),
+        &root.unwrap_map_key().unwrap_or_default(),
+        "map key type mismatch",
+    );
+
+    // Create description.
+    let description = format!(".[{}]", key);
+
+    // Determine result data type.
+    let data_type = root.unwrap_map().unwrap_or_default();
+
+    // If the map is nullable, the selection must also be nullable.
+    let data_type = if root.nullable() {
+        data_type.make_nullable()
+    } else {
+        data_type
+    };
+
+    // FIXME: what is the runtime behavior for a key that is not present in
+    // the map, throw or yield null? In the latter case, the return type
+    // would always need to be nullable.
+
+    // Set resulting data type.
+    y.set_data_type(data_type.clone());
+
+    // Handle child selection, if any, to recursively select elements from
+    // the map value.
+    let reference = if x.child.is_some() {
+        let (node, result) =
+            proto_boxed_required_field!(x, y, child, parse_reference_segment, &data_type);
+
+        // Update data type.
+        y.set_data_type(node.data_type());
+
+        // Generate reference.
+        result.unwrap_or_default().prefix(description)
+    } else {
+        references::ReferencePath::new().prefix(description)
+    };
+
+    // Describe node.
+    describe!(y, Expression, "Selects {}", &reference);
+    summary!(y, "Full reference path: {:#}", &reference);
+    Ok(reference)
+}
+
+/// Parse a reference segment type. Returns a description of the nested
+/// reference.
+fn parse_reference_type(
+    x: &substrait::expression::reference_segment::ReferenceType,
+    y: &mut context::Context,
+    root: &Arc<data_type::DataType>,
+) -> diagnostic::Result<references::ReferencePath> {
+    match x {
+        substrait::expression::reference_segment::ReferenceType::StructField(x) => {
+            parse_struct_field(x, y, root)
+        }
+        substrait::expression::reference_segment::ReferenceType::ListElement(x) => {
+            parse_list_element(x, y, root)
+        }
+        substrait::expression::reference_segment::ReferenceType::MapKey(x) => {
+            parse_map_key(x, y, root)
+        }
+    }
+}
+
+/// Parse a reference segment, i.e. a scalar reference into some nested
+/// structure of type root. Returns a description of the nested reference.
+pub fn parse_reference_segment(
+    x: &substrait::expression::ReferenceSegment,
+    y: &mut context::Context,
+    root: &Arc<data_type::DataType>,
+) -> diagnostic::Result<references::ReferencePath> {
+    // Parse the selection.
+    let (node, result) = proto_required_field!(x, y, reference_type, parse_reference_type, root);
+
+    // Set the data type.
+    y.set_data_type(node.data_type());
+
+    // Describe node.
+    let reference = result.unwrap_or_default();
+    describe!(y, Expression, "Selects {}", &reference);
+    summary!(y, "Full reference path: {:#}", &reference);
+    Ok(reference)
+}
diff --git a/rs/src/parse/expressions/subqueries.rs b/rs/src/parse/expressions/subqueries.rs
new file mode 100644
index 00000000..f0f47cd9
--- /dev/null
+++ b/rs/src/parse/expressions/subqueries.rs
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module for parsing/validating subqueries.
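+//!
+//! For intuition only, the four subquery kinds handled here correspond
+//! roughly to the following SQL forms (SQL shown as a sketch, not as part
+//! of Substrait):
+//!
+//! ```text
+//! Scalar:        (SELECT x FROM t)
+//! InPredicate:   (a, b) IN (SELECT x, y FROM t)
+//! SetPredicate:  EXISTS (SELECT ...), UNIQUE (SELECT ...)
+//! SetComparison: a = ANY (SELECT x FROM t), a < ALL (SELECT x FROM t)
+//! ```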
+ +use crate::input::proto::substrait; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::parse::context; +use crate::parse::expressions; +use crate::parse::relations; +use crate::parse::types; +use std::sync::Arc; + +/// Parse a scalar subquery. +fn parse_scalar( + x: &substrait::expression::subquery::Scalar, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse the relation and get its schema. + let schema = y.enter_relation_root(|y| { + proto_boxed_required_field!(x, y, input, relations::parse_rel) + .0 + .data_type() + }); + + // Scalar subqueries must return one row and one column. We can't check the + // row count statically, but we can check the schema. + let return_type = if let Some(return_type) = schema.unwrap_singular_struct() { + return_type + } else { + if !schema.is_unresolved() { + diagnostic!( + y, + Error, + ExpressionIllegalSubquery, + "subquery must return a single column" + ); + } + Arc::default() + }; + + // FIXME: what is the behavior when the query doesn't yield one row? Should + // the returned data type be made nullable? + + // Describe node. + y.set_data_type(return_type); + summary!( + y, + "Executes the contained subquery for each row. The query is expected \ + to return a single row and column, the value of which is returned by \ + the expression." + ); + let expression = expressions::Expression::BigFunction(String::from("scalar_subquery")); + describe!(y, Expression, "{}", expression); + Ok(expression) +} + +/// Parse a containment subquery. +fn parse_in_predicate( + x: &substrait::expression::subquery::InPredicate, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse the needles. + let needle_types = proto_required_repeated_field!(x, y, needles, expressions::parse_expression) + .0 + .iter() + .map(|x| x.data_type()) + .collect::>(); + + // Parse the relation and get its schema. + let schema = y.enter_relation_root(|y| { + proto_boxed_required_field!(x, y, haystack, relations::parse_rel) + .0 + .data_type() + }); + + // Match data types of needles and haystack. + if let Some(field_types) = schema.unwrap_struct() { + if needle_types.len() != field_types.len() { + diagnostic!( + y, + Error, + TypeMismatch, + "column count mismatch between needle and haystack" + ); + } else { + for (index, (field_type, needle_type)) in + field_types.iter().zip(needle_types.iter()).enumerate() + { + types::assert_equal( + y, + field_type, + needle_type, + format!( + "haystack field type does not match needle type for column {}", + index + 1 + ), + ); + } + } + } else { + assert!(schema.is_unresolved()); + } + + // Describe node. + y.set_data_type(data_type::DataType::new_predicate(false)); + summary!( + y, + "Executes the contained subquery for each row. Returns true \ + if and only if the needle expressions match the fields of at \ + least one of the rows returned by the subquery." + ); + let expression = expressions::Expression::BigFunction(String::from("in_subquery")); + describe!(y, Expression, "{}", expression); + Ok(expression) +} + +/// Parse a set predicate subquery. +fn parse_set_predicate( + x: &substrait::expression::subquery::SetPredicate, + y: &mut context::Context, +) -> diagnostic::Result { + use substrait::expression::subquery::set_predicate::PredicateOp; + + // Parse the relation. + y.enter_relation_root(|y| proto_boxed_required_field!(x, y, tuples, relations::parse_rel)); + + // Parse the operation type. 
+ let operation = proto_required_enum_field!(x, y, predicate_op, PredicateOp) + .1 + .unwrap_or_default(); + + // Describe node. + y.set_data_type(data_type::DataType::new_predicate(false)); + let expression = match operation { + PredicateOp::Unspecified => { + expressions::Expression::BigFunction(String::from("invalid_subquery")) + } + PredicateOp::Exists => { + summary!( + y, + "Executes the contained subquery for each row. Returns true \ + if and only if at least one row is returned by the subquery." + ); + expressions::Expression::BigFunction(String::from("subquery_exists")) + } + PredicateOp::Unique => { + summary!( + y, + "Executes the contained subquery for each row. Returns true \ + if and only if no duplicate rows are returned." + ); + expressions::Expression::BigFunction(String::from("subquery_unique")) + } + }; + describe!(y, Expression, "{}", expression); + Ok(expression) +} + +/// Parse a set comparison subquery. +fn parse_set_comparison( + x: &substrait::expression::subquery::SetComparison, + y: &mut context::Context, +) -> diagnostic::Result { + use substrait::expression::subquery::set_comparison::ComparisonOp; + use substrait::expression::subquery::set_comparison::ReductionOp; + + // Parse the left-hand side. + let (n, e) = proto_boxed_required_field!(x, y, left, expressions::parse_expression); + let left_type = n.data_type(); + let left_expression = e.unwrap_or_default(); + + // Parse the operation type. + let comparison_op = proto_required_enum_field!(x, y, comparison_op, ComparisonOp) + .1 + .unwrap_or_default(); + let reduction_op = proto_required_enum_field!(x, y, reduction_op, ReductionOp) + .1 + .unwrap_or_default(); + + // Parse the right-hand side. + let right_schema = y.enter_relation_root(|y| { + proto_boxed_required_field!(x, y, right, relations::parse_rel) + .0 + .data_type() + }); + + // Right-hand side must return a single column. + let right_type = if let Some(right_type) = right_schema.unwrap_singular_struct() { + right_type + } else { + if !right_schema.is_unresolved() { + diagnostic!( + y, + Error, + ExpressionIllegalSubquery, + "subquery must return a single column" + ); + } + Arc::default() + }; + + // Check that the data types match. + types::assert_equal( + y, + &right_type, + &left_type, + "subquery field type does not match expression type", + ); + + // Describe node. + y.set_data_type(data_type::DataType::new_predicate(false)); + let expression = expressions::Expression::BigFunction(format!( + "{}_{}_subquery", + match comparison_op { + ComparisonOp::Unspecified => "invalid", + ComparisonOp::Eq => "equal", + ComparisonOp::Ne => "not_equal", + ComparisonOp::Lt => "less_than", + ComparisonOp::Gt => "greater_than", + ComparisonOp::Le => "less_equal", + ComparisonOp::Ge => "greater_equal", + }, + match reduction_op { + ReductionOp::Unspecified => "invalid", + ReductionOp::Any => "any", + ReductionOp::All => "all", + }, + )); + summary!( + y, + "Executes the contained subquery for each row. 
Returns true if" + ); + summary!( + y, + "{}", + match reduction_op { + ReductionOp::Unspecified => "", + ReductionOp::Any => "any", + ReductionOp::All => "all", + } + ); + summary!( + y, + "rows returned are {}", + match comparison_op { + ComparisonOp::Unspecified => "", + ComparisonOp::Eq => "equal to", + ComparisonOp::Ne => "not equal to", + ComparisonOp::Lt => "less than", + ComparisonOp::Gt => "greater than", + ComparisonOp::Le => "less than or equal to", + ComparisonOp::Ge => "greater than or equal to", + } + ); + summary!(y, "{:#}.", left_expression); + describe!(y, Expression, "{}", expression); + Ok(expression) +} + +/// Parse a particular subquery type. +fn parse_subquery_type( + x: &substrait::expression::subquery::SubqueryType, + y: &mut context::Context, +) -> diagnostic::Result { + match x { + substrait::expression::subquery::SubqueryType::Scalar(x) => parse_scalar(x, y), + substrait::expression::subquery::SubqueryType::InPredicate(x) => parse_in_predicate(x, y), + substrait::expression::subquery::SubqueryType::SetPredicate(x) => parse_set_predicate(x, y), + substrait::expression::subquery::SubqueryType::SetComparison(x) => { + parse_set_comparison(x, y) + } + } +} + +/// Parse a subquery. +pub fn parse_subquery( + x: &substrait::expression::Subquery, + y: &mut context::Context, +) -> diagnostic::Result { + // Parse fields. + let (n, e) = proto_required_field!(x, y, subquery_type, parse_subquery_type); + let return_type = n.data_type(); + let expression = e.unwrap_or_default(); + + // Describe node. + y.set_data_type(return_type); + describe!(y, Expression, "{}", expression); + Ok(expression) +} diff --git a/rs/src/parse/extensions/advanced.rs b/rs/src/parse/extensions/advanced.rs new file mode 100644 index 00000000..4ac72265 --- /dev/null +++ b/rs/src/parse/extensions/advanced.rs @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for advanced extensions, i.e. +//! those based around protobuf Any values. + +use crate::input::proto::substrait; +use crate::output::diagnostic::Result; +use crate::parse::context; + +/// Parse a protobuf "any" type declaration. +#[allow(clippy::ptr_arg)] +fn parse_expected_type_url(x: &String, y: &mut context::Context) -> Result<()> { + if let Err(path) = y.define_proto_any_type(x) { + diagnostic!( + y, + Info, + RedundantProtoAnyDeclaration, + "message type {x} redeclared" + ); + link!(y, path, "Previous declaration was here."); + } + Ok(()) +} + +/// Parse a protobuf "any" message that consumers may ignore. +pub fn parse_hint_any(x: &prost_types::Any, y: &mut context::Context) -> Result<()> { + let (allowed, path) = y.resolve_proto_any(x); + if allowed { + diagnostic!( + y, + Info, + ProtoAny, + "explicitly allowed hint of type {}", + x.type_url + ); + } else { + diagnostic!( + y, + Info, + ProtoAny, + "ignoring unknown hint of type {}", + x.type_url + ); + } + if let Some(path) = path { + link!(y, path, "Type URL declaration is here."); + } + Ok(()) +} + +/// Parse a protobuf "any" message that consumers are not allowed to ignore. 
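+/// Unlike hints, an unrecognized enhancement downgrades the plan's validity:
+/// only consumers that recognize the type URL can execute the plan, hence
+/// the warning below.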
+pub fn parse_functional_any(x: &prost_types::Any, y: &mut context::Context) -> Result<()> {
+    let (allowed, path) = y.resolve_proto_any(x);
+    if allowed {
+        diagnostic!(
+            y,
+            Info,
+            ProtoAny,
+            "explicitly allowed enhancement of type {}",
+            x.type_url
+        );
+    } else {
+        diagnostic!(
+            y,
+            Warning,
+            ProtoAny,
+            "unknown enhancement of type {}; plan is only valid \
+            for consumers recognizing this enhancement",
+            x.type_url
+        );
+    }
+    if let Some(path) = path {
+        link!(y, path, "Type URL declaration is here.");
+    }
+    Ok(())
+}
+
+/// Parse an advanced extension message (based on protobuf "any" messages).
+/// Returns whether an enhancement was specified.
+pub fn parse_advanced_extension(
+    x: &substrait::extensions::AdvancedExtension,
+    y: &mut context::Context,
+) -> Result<bool> {
+    proto_field!(x, y, optimization, parse_hint_any);
+    Ok(proto_field!(x, y, enhancement, parse_functional_any)
+        .0
+        .is_some())
+}
+
+/// Parses the advanced extension information in a plan.
+pub fn parse_plan(x: &substrait::Plan, y: &mut context::Context) {
+    proto_repeated_field!(x, y, expected_type_urls, parse_expected_type_url);
+    proto_field!(x, y, advanced_extensions, parse_advanced_extension);
+}
+
+/// Generate Info diagnostics for any extension definitions that weren't used.
+pub fn check_unused_definitions(y: &mut context::Context) {
+    for (uri, _, path) in y
+        .proto_any_types()
+        .iter_unused()
+        .collect::<Vec<_>>()
+        .into_iter()
+    {
+        diagnostic!(
+            y,
+            Info,
+            RedundantProtoAnyDeclaration,
+            "message type {uri} is not present in the plan"
+        );
+        link!(y, path, "Declaration was here.");
+    }
+}
diff --git a/rs/src/parse/extensions/mod.rs b/rs/src/parse/extensions/mod.rs
new file mode 100644
index 00000000..82f0a82e
--- /dev/null
+++ b/rs/src/parse/extensions/mod.rs
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions relating to extensions.
+
+use crate::input::proto::substrait;
+use crate::parse::context;
+
+pub mod advanced;
+pub mod simple;
+
+/// Parses the extension information in a plan.
+pub fn parse_plan(x: &substrait::Plan, y: &mut context::Context) {
+    advanced::parse_plan(x, y);
+    simple::parse_plan(x, y);
+}
+
+/// Generate Info diagnostics for any extension definitions that weren't used.
+pub fn check_unused_definitions(y: &mut context::Context) {
+    advanced::check_unused_definitions(y);
+    simple::check_unused_definitions(y);
+}
diff --git a/rs/src/parse/extensions/simple/function_decls.rs b/rs/src/parse/extensions/simple/function_decls.rs
new file mode 100644
index 00000000..c8176f15
--- /dev/null
+++ b/rs/src/parse/extensions/simple/function_decls.rs
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for parsing YAML function
+//! declarations.
+
+use crate::input::yaml;
+use crate::output::diagnostic::Result;
+use crate::parse::context;
+
+/// Parse a scalar function declaration.
+pub fn parse_scalar_function(_x: &yaml::Value, _y: &mut context::Context) -> Result<()> {
+    // TODO
+    Ok(())
+}
+
+/// Parse an aggregate function declaration.
+pub fn parse_aggregate_function(_x: &yaml::Value, _y: &mut context::Context) -> Result<()> {
+    // TODO
+    Ok(())
+}
diff --git a/rs/src/parse/extensions/simple/mod.rs b/rs/src/parse/extensions/simple/mod.rs
new file mode 100644
index 00000000..f5dc8881
--- /dev/null
+++ b/rs/src/parse/extensions/simple/mod.rs
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for simple extensions, i.e.
+//! those based around YAML files. + +use crate::input::proto::substrait; +use crate::output::diagnostic::Result; +use crate::output::extension; +use crate::parse::context; +use std::sync::Arc; + +mod function_decls; +mod type_decls; +mod type_variation_decls; +mod yaml; + +/// Parse a user-defined name. Note that names are matched case-insensitively +/// because we return the name as lowercase. +#[allow(clippy::ptr_arg)] +pub fn parse_name(x: &String, _y: &mut context::Context) -> Result { + // FIXME: nothing seems to say anything about the validity of names for + // things, but this seems rather important to define. + if x.is_empty() { + Err(cause!(IllegalValue, "names cannot be empty")) + } else { + Ok(x.to_lowercase()) + } +} + +/// "Parse" an anchor. This just reports an error if the anchor is 0. +fn parse_anchor(x: &u32, _y: &mut context::Context) -> Result { + if *x == 0 { + Err(cause!( + IllegalValue, + "anchor 0 is reserved to disambiguate unspecified optional references" + )) + } else { + Ok(*x) + } +} + +/// Parse a mapping from a URI anchor to a YAML extension. +fn parse_simple_extension_yaml_uri_mapping( + x: &substrait::extensions::SimpleExtensionUri, + y: &mut context::Context, +) -> Result<()> { + // Parse the fields. + let anchor = proto_primitive_field!(x, y, extension_uri_anchor, parse_anchor).1; + let yaml_data = proto_primitive_field!(x, y, uri, yaml::parse_uri) + .1 + .unwrap(); + + // If the specified anchor is valid, insert a mapping for it. + if let Some(anchor) = anchor { + if let Err((prev_data, prev_path)) = y.define_extension_uri(anchor, yaml_data) { + diagnostic!( + y, + Error, + IllegalValue, + "anchor {anchor} is already in use for URI {}", + prev_data.uri() + ); + link!(y, prev_path, "Previous definition was here."); + } + } + + Ok(()) +} + +/// Parse an URI reference and resolve it. +fn parse_uri_reference(x: &u32, y: &mut context::Context) -> Result> { + match y.extension_uris().resolve(x).cloned() { + Some((yaml_data, path)) => { + describe!(y, Misc, "{}", yaml_data.uri()); + link!(y, path, "URI anchor is defined here"); + Ok(yaml_data) + } + None => { + describe!(y, Misc, "Unresolved URI"); + Err(cause!(LinkMissingAnchor, "URI anchor {x} does not exist")) + } + } +} + +/// Adds a description to a resolved function/type/variation reference node. +fn describe_reference(y: &mut context::Context, reference: &Arc>) { + describe!(y, Misc, "{}", reference); +} + +/// Parse a type variation reference and resolve it. +pub fn parse_type_variation_reference( + x: &u32, + y: &mut context::Context, +) -> Result>> { + match y.tvars().resolve(x).cloned() { + Some((variation, path)) => { + describe_reference(y, &variation); + link!(y, path, "Type variation anchor is defined here"); + Ok(variation) + } + None => { + describe!(y, Misc, "Unresolved type variation"); + Err(cause!( + LinkMissingAnchor, + "Type variation anchor {x} does not exist" + )) + } + } +} + +/// Parse a type reference and resolve it. +pub fn parse_type_reference( + x: &u32, + y: &mut context::Context, +) -> Result>> { + match y.types().resolve(x).cloned() { + Some((data_type, path)) => { + describe_reference(y, &data_type); + link!(y, path, "Type anchor is defined here"); + Ok(data_type) + } + None => { + describe!(y, Misc, "Unresolved type"); + Err(cause!(LinkMissingAnchor, "Type anchor {x} does not exist")) + } + } +} + +/// Parse a function reference and resolve it. 
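+/// As with the type and type variation variants above, an unresolvable
+/// anchor yields a `LinkMissingAnchor` error rather than a panic.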
+pub fn parse_function_reference( + x: &u32, + y: &mut context::Context, +) -> Result>> { + match y.fns().resolve(x).cloned() { + Some((function, path)) => { + describe_reference(y, &function); + link!(y, path, "Function anchor is defined here"); + Ok(function) + } + None => { + describe!(y, Misc, "Unresolved function"); + Err(cause!( + LinkMissingAnchor, + "Function anchor {x} does not exist" + )) + } + } +} + +/// Parse a mapping from a function/type/variation anchor to an extension. +fn parse_extension_mapping_data( + x: &substrait::extensions::simple_extension_declaration::MappingType, + y: &mut context::Context, +) -> Result<()> { + match x { + substrait::extensions::simple_extension_declaration::MappingType::ExtensionType(x) => { + + // Parse the fields. + let yaml_info = proto_primitive_field!(x, y, extension_uri_reference, parse_uri_reference).1; + let anchor = proto_primitive_field!(x, y, type_anchor, parse_anchor).1; + let name = proto_primitive_field!(x, y, name, parse_name).1; + + // If we successfully resolved the URI reference to a URI, resolved + // that URI, and managed to parse the YAML it pointed to, try to + // resolve the data type in it. + let data_type = yaml_info.as_ref().and_then(|yaml_info| { + yaml_info.data().and_then(|data| { + name.as_ref().and_then(|name| { + let data_type = data.types.get(name).cloned(); + if data_type.is_none() { + // TODO: Error, LinkMissingTypeName + diagnostic!(y, Warning, NotYetImplemented, "failed to resolve data type {name:?} in {yaml_info}"); + } + data_type + }) + }) + }); + + // Construct a reference for this data type. + let reference = Arc::new(extension::Reference { + name: extension::NamedReference::new(name, Some(y.path_buf())), + uri: yaml_info.as_ref().map(|x| x.uri().clone()).unwrap_or_default(), + definition: data_type + }); + + // If the specified anchor is valid, insert a mapping for it. + if let Some(anchor) = anchor { + if let Err((prev_data, prev_path)) = y.define_type(anchor, reference) { + diagnostic!( + y, + Error, + IllegalValue, + "anchor {anchor} is already in use for data type {prev_data}" + ); + link!(y, prev_path, "Previous definition was here."); + } + } + + } + substrait::extensions::simple_extension_declaration::MappingType::ExtensionTypeVariation(x) => { + + // Parse the fields. + let yaml_info = proto_primitive_field!(x, y, extension_uri_reference, parse_uri_reference).1; + let anchor = proto_primitive_field!(x, y, type_variation_anchor, parse_anchor).1; + let name = proto_primitive_field!(x, y, name, parse_name).1; + + // If we successfully resolved the URI reference to a URI, resolved + // that URI, and managed to parse the YAML it pointed to, try to + // resolve the type variation in it. + let type_variation = yaml_info.as_ref().and_then(|yaml_info| { + yaml_info.data().and_then(|data| { + name.as_ref().and_then(|name| { + let type_variation = data.type_variations.get(name).cloned(); + if type_variation.is_none() { + // TODO: Error, LinkMissingTypeVariationName + diagnostic!(y, Warning, NotYetImplemented, "failed to resolve type variation {name:?} in {yaml_info}"); + } + type_variation + }) + }) + }); + + // Construct a reference for this type variation. + let reference = Arc::new(extension::Reference { + name: extension::NamedReference::new(name, Some(y.path_buf())), + uri: yaml_info.as_ref().map(|x| x.uri().clone()).unwrap_or_default(), + definition: type_variation + }); + + // If the specified anchor is valid, insert a mapping for it. 
+ if let Some(anchor) = anchor { + if let Err((prev_data, prev_path)) = y.define_tvar(anchor, reference) { + diagnostic!( + y, + Error, + IllegalValue, + "anchor {anchor} is already in use for type variation {prev_data}" + ); + link!(y, prev_path, "Previous definition was here."); + } + } + + } + substrait::extensions::simple_extension_declaration::MappingType::ExtensionFunction(x) => { + + // Parse the fields. + let yaml_info = proto_primitive_field!(x, y, extension_uri_reference, parse_uri_reference).1; + let anchor = proto_primitive_field!(x, y, function_anchor, parse_anchor).1; + let name = proto_primitive_field!(x, y, name).1; + + // If we successfully resolved the URI reference to a URI, resolved + // that URI, and managed to parse the YAML it pointed to, try to + // resolve the data type in it. + let function = yaml_info.as_ref().and_then(|yaml_info| { + yaml_info.data().and_then(|data| { + name.as_ref().and_then(|name| { + let function = data.functions.get(name).cloned(); + if function.is_none() { + // TODO: Error, LinkMissingFunctionName + diagnostic!(y, Warning, NotYetImplemented, "failed to resolve function {name:?} in {yaml_info}"); + } + function + }) + }) + }); + + // Construct a reference for this data type. + let reference = Arc::new(extension::Reference { + name: extension::NamedReference::new(name, Some(y.path_buf())), + uri: yaml_info.as_ref().map(|x| x.uri().clone()).unwrap_or_default(), + definition: function + }); + + // If the specified anchor is valid, insert a mapping for it. + if let Some(anchor) = anchor { + if let Err((prev_data, prev_path)) = y.define_fn(anchor, reference) { + diagnostic!( + y, + Error, + IllegalValue, + "anchor {anchor} is already in use for function {prev_data}" + ); + link!(y, prev_path, "Previous definition was here."); + } + } + + } + }; + Ok(()) +} + +/// Parse a mapping from a function/type/variation anchor to an extension. +fn parse_extension_mapping( + x: &substrait::extensions::SimpleExtensionDeclaration, + y: &mut context::Context, +) -> Result<()> { + proto_required_field!(x, y, mapping_type, parse_extension_mapping_data); + Ok(()) +} + +/// Parses the simple extension information in a plan. +pub fn parse_plan(x: &substrait::Plan, y: &mut context::Context) { + proto_repeated_field!( + x, + y, + extension_uris, + parse_simple_extension_yaml_uri_mapping + ); + proto_repeated_field!(x, y, extensions, parse_extension_mapping); +} + +/// Generate Info diagnostics for any extension definitions that weren't used. +pub fn check_unused_definitions(y: &mut context::Context) { + // List unused function declarations. + for (anchor, info, path) in y.fns().iter_unused().collect::>().into_iter() { + diagnostic!( + y, + Info, + RedundantFunctionDeclaration, + "anchor {anchor} for function {info} is not present in the plan" + ); + link!(y, path, "Declaration was here."); + } + + // List unused type declarations. + for (anchor, info, path) in y.types().iter_unused().collect::>().into_iter() { + diagnostic!( + y, + Info, + RedundantTypeDeclaration, + "anchor {anchor} for type {info} is not present in the plan" + ); + link!(y, path, "Declaration was here."); + } + + // List unused type variation declarations. 
+ for (anchor, info, path) in y.tvars().iter_unused().collect::>().into_iter() { + diagnostic!( + y, + Info, + RedundantTypeVariationDeclaration, + "anchor {anchor} for type variation {info} is not present in the plan" + ); + link!(y, path, "Declaration was here."); + } +} diff --git a/rs/src/parse/extensions/simple/type_decls.rs b/rs/src/parse/extensions/simple/type_decls.rs new file mode 100644 index 00000000..9f493fbe --- /dev/null +++ b/rs/src/parse/extensions/simple/type_decls.rs @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for parsing YAML type +//! declarations. + +use crate::input::yaml; +use crate::output::diagnostic::Result; +use crate::parse::context; + +/// Parse a type declaration. +pub fn parse_type(_x: &yaml::Value, _y: &mut context::Context) -> Result<()> { + // TODO + Ok(()) +} diff --git a/rs/src/parse/extensions/simple/type_variation_decls.rs b/rs/src/parse/extensions/simple/type_variation_decls.rs new file mode 100644 index 00000000..eecb13d5 --- /dev/null +++ b/rs/src/parse/extensions/simple/type_variation_decls.rs @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for parsing YAML type variation +//! declarations. + +use crate::input::yaml; +use crate::output::diagnostic::Result; +use crate::parse::context; + +/// Parse a type variation declaration. +pub fn parse_type_variation(_x: &yaml::Value, _y: &mut context::Context) -> Result<()> { + // TODO + Ok(()) +} diff --git a/rs/src/parse/extensions/simple/yaml.rs b/rs/src/parse/extensions/simple/yaml.rs new file mode 100644 index 00000000..344ab6c3 --- /dev/null +++ b/rs/src/parse/extensions/simple/yaml.rs @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for parsing YAML extension +//! files. + +use crate::input::yaml; +use crate::output::diagnostic::Result; +use crate::output::extension; +use crate::output::path; +use crate::parse::context; +use crate::parse::extensions::simple::function_decls; +use crate::parse::extensions::simple::type_decls; +use crate::parse::extensions::simple::type_variation_decls; +use crate::parse::traversal; +use crate::string_util; +use std::sync::Arc; + +/// Toplevel parse function for a simple extension YAML file. +fn parse_root(x: &yaml::Value, y: &mut context::Context) -> Result<()> { + yaml_repeated_field!(x, y, "types", type_decls::parse_type)?; + yaml_repeated_field!( + x, + y, + "type_variations", + type_variation_decls::parse_type_variation + )?; + yaml_repeated_field!( + x, + y, + "scalar_functions", + function_decls::parse_scalar_function + )?; + yaml_repeated_field!( + x, + y, + "aggregate_functions", + function_decls::parse_aggregate_function + )?; + Ok(()) +} + +/// Parse a YAML extension URI string. +pub fn parse_uri>( + x: &S, + y: &mut context::Context, +) -> Result> { + // Check URI syntax. + let x = x.as_ref(); + if let Err(e) = string_util::check_uri(x) { + diagnostic!(y, Error, e); + } + + // The schema for YAML extension files. 
+    static SCHEMA: once_cell::sync::Lazy<jsonschema::JSONSchema> =
+        once_cell::sync::Lazy::new(|| {
+            jsonschema::JSONSchema::compile(
+                &yaml::yaml_to_json(
+                    yaml_rust::YamlLoader::load_from_str(include_str!(
+                        "../../../resources/text/simple_extensions_schema.yaml"
+                    ))
+                    .unwrap()
+                    .pop()
+                    .unwrap(),
+                    &path::Path::default(),
+                )
+                .unwrap(),
+            )
+            .unwrap()
+        });
+
+    Ok(traversal::parse_yaml(x, y, Some(&SCHEMA), parse_root))
+}
diff --git a/rs/src/parse/mod.rs b/rs/src/parse/mod.rs
new file mode 100644
index 00000000..d35e493b
--- /dev/null
+++ b/rs/src/parse/mod.rs
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Parsing/validation module.
+//!
+//! Roughly speaking, this module takes a Substrait plan represented using the
+//! types provided by the [`input`](crate::input) module, and transforms it to
+//! an equivalent plan represented using the types provided by the
+//! [`output`](crate::output) module. In doing so, it parses and validates the
+//! plan.
+//!
+//! # Traversal
+//!
+//! Most of the boilerplate code for tree traversal is handled by the
+//! [`traversal`] module. What remains are "parse functions" of the form
+//! `(x: &T, y: &mut Context) -> Result<R>`, where:
+//!
+//!  - `x` is a reference to the JSON/YAML value or the prost wrapper for
+//!    the protobuf message that is to be parsed and validated;
+//!  - `y` is the parse context ([`context::Context`], see next section); and
+//!  - `R` is any desired return type.
+//!
+//! The body of the parse function can use a wide variety of function-like
+//! macros from [`traversal`] to traverse the children of `x` in the
+//! appropriate order and with the appropriate parse functions. The macros
+//! return a tuple of a reference to the created
+//! [`Node`](crate::output::tree::Node) and the `R` returned by the parse
+//! function (depending on the macro, these may be wrapped in [`Option`]s or
+//! [`Vec`]s). Note that any children not traversed by the parse function will
+//! automatically be traversed by [`traversal`] (along with a warning
+//! diagnostic that these children were not validated), and that traversing a
+//! child twice is illegal (this will panic).
+//!
+//! # Parser context
+//!
+//! The mutable [`context::Context`] reference that is passed into every parse
+//! function and is needed for every traversal macro stores all contextual
+//! information needed for parsing, except for the input. Any and all results
+//! of the parse process need to eventually end up in here, and as such it has
+//! quite a few functions defined on it. It also has a reference to the
+//! configuration structure; it's kind of the odd one out here, since the
+//! configuration is more of an input than output or state. It's simply
+//! convenient to pass it along with the context object to save on some typing
+//! when defining parse functions.
+//!
+//! Besides macros strictly intended for traversal, the [`traversal`] module
+//! also defines some convenience macros for pushing things other than child
+//! nodes into the context, particularly for things that regularly involve
+//! [format!].
+//!
+//! ## Diagnostics
+//!
+//! Rather than just passing `Result`s around, diagnostics are used to
+//! communicate whether a plan is valid or not. This solves two problems:
+//!
+//!  - distinguishing between messages signalling provable invalidity
+//!    (errors), messages signalling inability to determine validity
+//!    (warnings), and messages that are just intended to provide extra
+//!    information to the user;
+//!  - returning as many diagnostics as possible, rather than just stopping
+//!    at the first sight of trouble.
+//!
+//! Diagnostics can be pushed into the parser context using the [`diagnostic!`]
+//! and [`ediagnostic!`] macros. The latter allows third-party `Err` types to
+//! be pushed as the message, the former uses a [format!] syntax. However,
+//! sometimes it is also very useful to just use the `?` operator for
+//! something. Therefore, parse functions also return
+//! [`diagnostic::Result`](crate::output::diagnostic::Result). This result
+//! is taken care of by the traversal macros; when `Err`, the diagnostic cause
+//! is simply pushed as an error. This also suppresses the usual "unknown
+//! field" warning emitted when a parse function failed to traverse all its
+//! children; after all, it probably exited early.
+//!
+//! More information about all the information recorded in a diagnostic can be
+//! found in the docs for the [diagnostic](crate::output::diagnostic) module.
+//!
+//! Beyond diagnostics, it's also possible to push comments into the context.
+//! This can be done using the [`comment!`] and [`link!`] macros, or, for more
+//! control, by pushing a [`Comment`](crate::output::comment::Comment) object
+//! directly.
+//!
+//! ## Data types
+//!
+//! Data type information gets some special treatment, because it is so
+//! important for validation. It's also very useful to have when debugging a
+//! tree. It's considered so important that each
+//! [`Node`](crate::output::tree::Node) has a place where it can store its
+//! "return type". What this type actually represents depends on the type of
+//! node:
+//!
+//!  - type nodes: the represented type;
+//!  - expression nodes: the returned type;
+//!  - relation nodes: the schema (automatically set by
+//!    [`set_schema()`](context::Context::set_schema())).
+//!
+//! The data type can be set using the
+//! [`set_data_type()`](context::Context::set_data_type()) method. Note that
+//! all of the parsers for the above node types should call
+//! [`set_data_type()`](context::Context::set_data_type()) at
+//! least once, even if they're unable to determine what the actual type is;
+//! in the latter case they can just push an unresolved type (for example
+//! using `Default`; additional information can be attached using
+//! [`new_unresolved()`](crate::output::data_type::DataType::new_unresolved())).
+//!
+//! [`set_data_type()`](context::Context::set_data_type()) may be called more
+//! than once for a single node. The data type of the node will simply be the
+//! last one that was set when parsing for that node completes. However, each
+//! call also records the data type as a special type of child of the node,
+//! making the complete history of
+//! [`set_data_type()`](context::Context::set_data_type()) calls visible in the
+//! resulting parse tree.
+//!
+//! ## Schemas
+//!
+//! Perhaps even more important than data types in general are schemas; in
+//! general, in order to be able to determine the data type returned by an
+//! expression, contextual information about the schema(s) of the data
+//! stream(s) being operated on needs to be known. Moreover, the context in
+//! which an expression is evaluated may contain more than one schema when
+//! subqueries get involved.
+//!
+//! This information is tracked in the schema stack. The stack can be
+//! manipulated using the following functions.
+//!
+//!  - The root node of a relation tree must be parsed within the context
+//!    created by
+//!    [`enter_relation_root()`](context::Context::enter_relation_root()). This
+//!    method ensures that a schema is pushed onto the stack prior to traversal
+//!    of the relation tree, and popped after traversal completes. Initially,
+//!    the schema is set to an unresolved type, but the actual type should not
+//!    matter at this stage, because it semantically doesn't exist until the
+//!    first leaf in the relation tree is parsed.
+//!  - All relations call [`clear_schema()`](context::Context::clear_schema())
+//!    prior to any relation-specific logic (this is done by the RelType parse
+//!    function), because semantically, no schema exists prior to parsing a
+//!    relation.
+//!  - [`set_schema()`](context::Context::set_schema()) sets or updates the
+//!    current schema. It must be called every time the data stream is
+//!    functionally updated, and just after the data stream is first created
+//!    by leaf relations. Relations that combine data streams should call it
+//!    just after traversal of their data sources completes (otherwise the
+//!    active schema will be whatever the schema of the most recently parsed
+//!    data source turned out to be). Doing so will also push the data type
+//!    corresponding to the schema to the node, such that the final tree
+//!    contains a type node for every semantic change of the data stream for
+//!    debugging/documentation purposes.
+//!
+//! The current schema information can be retrieved using
+//! [`schema()`](context::Context::schema()). Its integer argument specifies
+//! how many subqueries to break out of; 0 is used to refer to the schema of
+//! the current (sub)query, 1 is its parent query, 2 is its grandparent, and
+//! so on.
+//!
+//! ## How the parser context works
+//!
+//! A context object contains the following things:
+//!
+//!  - [`output: &mut tree::Node`](crate::output::tree::Node), a mutable
+//!    reference to the node in the output tree that we're writing to. Note
+//!    that the [`traversal`] macros create a
+//!    [`Node`](crate::output::tree::Node) already populated with the default
+//!    [`NodeType`](crate::output::tree::NodeType) before calling the parse
+//!    function, including a copy of the primitive data element for leaf nodes,
+//!    and almost everything else can be added using the [`traversal`] macros,
+//!    so you shouldn't normally have to mutate this. Exceptions exist, however,
+//!    for example when an integer primitive needs to be upgraded to an anchor
+//!    reference.
+//!  - [`state: &mut context::State`](context::State), a mutable reference to a
+//!    global state structure for the parser. This includes, for instance,
+//!    lookup tables for things previously defined in the plan, such as
+//!    function declarations. The state object is initially constructed by
+//!    [`traversal`] using [`Default`], and is then just recursively passed to
+//!    every parse function.
+//!  - [`breadcrumb: &mut context::Breadcrumb`](context::Breadcrumb). This
+//!    fulfills a similar purpose to `state`, but using a stack-like structure:
+//!    for every child node, a new [`Breadcrumb`](context::Breadcrumb) is
+//!    pushed onto the stack. Note that only the top of the stack is mutable.
+//!    This is mostly used for keeping track of the current
+//!    [`Path`](crate::output::path::Path) and internally by the [`traversal`]
+//!    module; the parse functions can and should just use local variables when
+//!    they need to store something this way.
+//!  - [`config: &config::Config`](config::Config), a reference to the
+//!    configuration structure that the validator was called with.
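+//!
+//! As an illustrative sketch only (the message type is hypothetical, and the
+//! exact macro signatures are simplified), a typical parse function looks
+//! roughly like this:
+//!
+//! ```ignore
+//! fn parse_example(x: &substrait::Example, y: &mut context::Context) -> Result<()> {
+//!     // Traverse and validate a primitive child field.
+//!     proto_primitive_field!(x, y, name);
+//!
+//!     // Push a diagnostic when something is provably wrong.
+//!     if x.name.is_empty() {
+//!         diagnostic!(y, Error, IllegalValue, "name must not be empty");
+//!     }
+//!     Ok(())
+//! }
+//! ```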
+
+#[macro_use]
+pub mod traversal;
+
+#[macro_use]
+pub mod context;
+
+mod expressions;
+mod extensions;
+mod plan;
+mod relations;
+mod sorts;
+mod types;
+
+use crate::input::config;
+use crate::input::proto;
+use crate::output::parse_result;
+
+/// Validates the given substrait.Plan message and returns the parse tree.
+pub fn parse<B: prost::bytes::Buf>(
+    buffer: B,
+    config: &config::Config,
+) -> parse_result::ParseResult {
+    traversal::parse_proto::<proto::substrait::Plan, _, _>(
+        buffer,
+        "plan",
+        plan::parse_plan,
+        &mut context::State::default(),
+        config,
+    )
+}
diff --git a/rs/src/parse/plan.rs b/rs/src/parse/plan.rs
new file mode 100644
index 00000000..1fe93790
--- /dev/null
+++ b/rs/src/parse/plan.rs
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing toplevel parse/validation functions for plans.
+
+use crate::input::proto::substrait;
+use crate::output::data_type;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::parse::extensions;
+use crate::parse::relations;
+use std::sync::Arc;
+
+// Parse a relation root, i.e. a toplevel relation that includes field name
+// information.
+fn parse_rel_root(x: &substrait::RelRoot, y: &mut context::Context) -> diagnostic::Result<()> {
+    // Parse the fields.
+    let schema = proto_required_field!(x, y, input, relations::parse_rel)
+        .0
+        .data_type();
+    proto_repeated_field!(x, y, names);
+
+    // Relate the names to the schema.
+    let schema = schema
+        .apply_field_names(&x.names)
+        .map_err(|x| diagnostic!(y, Error, x))
+        .unwrap_or_default();
+    y.set_schema(schema);
+
+    // Describe the node.
+    describe!(y, Misc, "Named relation root");
+    summary!(y, "Attaches names to result schema");
+    Ok(())
+}
+
+// Parse a relation type.
+fn parse_rel_type(
+    x: &substrait::plan_rel::RelType,
+    y: &mut context::Context,
+) -> diagnostic::Result<Arc<data_type::DataType>> {
+    match x {
+        substrait::plan_rel::RelType::Rel(x) => {
+            relations::parse_rel(x, y)?;
+            Ok(y.data_type().strip_field_names())
+        }
+        substrait::plan_rel::RelType::Root(x) => {
+            parse_rel_root(x, y)?;
+            Ok(y.data_type())
+        }
+    }
+}
+
+/// Parse a PlanRel node.
+fn parse_plan_rel(x: &substrait::PlanRel, y: &mut context::Context) -> diagnostic::Result<()> {
+    let data_type = y.enter_relation_root(|y| {
+        proto_required_field!(x, y, rel_type, parse_rel_type)
+            .1
+            .unwrap_or_default()
+    });
+
+    // Describe the node.
+    y.set_data_type(data_type);
+    describe!(y, Misc, "Relation root");
+    Ok(())
+}
+
+/// Toplevel parse function for a plan.
+pub fn parse_plan(x: &substrait::Plan, y: &mut context::Context) -> diagnostic::Result<()> {
+    // Handle extensions first, because we'll need their declarations to
+    // correctly interpret the relations.
+    extensions::parse_plan(x, y);
+
+    // Handle the relations.
+    let num_relations = proto_repeated_field!(x, y, relations, parse_plan_rel)
+        .0
+        .len();
+    if num_relations == 0 {
+        diagnostic!(
+            y,
+            Error,
+            RelationRootMissing,
+            "a plan must have at least one relation"
+        );
+    }
+
+    // Generate an Info diagnostic for every extension definition that wasn't
+    // used at any point, and can thus be safely removed.
+    extensions::check_unused_definitions(y);
+
+    Ok(())
+}
diff --git a/rs/src/parse/relations/aggregate.rs b/rs/src/parse/relations/aggregate.rs
new file mode 100644
index 00000000..15ed4d63
--- /dev/null
+++ b/rs/src/parse/relations/aggregate.rs
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for aggregate relations.
+//!
+//! The aggregate operation groups input data on one or more sets of grouping
+//! keys, calculating each measure for each combination of grouping key.
+//!
+//! See
+
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use crate::input::proto::substrait;
+use crate::output::comment;
+use crate::output::data_type;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::parse::expressions;
+use crate::parse::expressions::functions;
+
+/// Type of output field.
+enum FieldType {
+    /// A field passed straight on from the input, but uniquified.
+    GroupedField,
+
+    /// Like GroupedField, but grouping sets exist that this field is not a
+    /// part of. Null will be returned for such rows.
+    NullableGroupedField,
+
+    /// An aggregate function applied to the input rows that were combined for
+    /// the current output row.
+    Measure,
+
+    /// The index of the grouping set that the result corresponds to.
+    GroupingSetIndex,
+}
+
+/// A grouping or aggregate expression returned by the aggregate relation.
+struct Field {
+    /// Description of the grouping or aggregate expression.
+    expression: expressions::Expression,
+
+    /// Data type returned by the expression.
+    data_type: Arc<data_type::DataType>,
+
+    /// The type of field.
+    field_type: FieldType,
+}
+
+/// Parse a measure.
+fn parse_measure(
+    x: &substrait::aggregate_rel::Measure,
+    y: &mut context::Context,
+) -> diagnostic::Result<expressions::Expression> {
+    // Parse the aggregate function.
+    let (n, e) = proto_required_field!(x, y, measure, functions::parse_aggregate_function);
+    let data_type = n.data_type();
+    let expression = e.unwrap_or_default();
+    y.set_data_type(data_type);
+
+    // Parse the filter and describe the node.
+    if x.filter.is_some() {
+        let filter = proto_required_field!(x, y, filter, expressions::parse_predicate)
+            .1
+            .unwrap_or_default();
+        summary!(
+            y,
+            "Applies aggregate function {expression:#} to all rows for \
+             which {filter:#} returns true."
+        );
+        let filtered_expression =
+            expressions::Expression::Function(String::from("filter"), vec![filter, expression]);
+        describe!(
+            y,
+            Expression,
+            "Filtered aggregate function: {filtered_expression}"
+        );
+        Ok(filtered_expression)
+    } else {
+        summary!(y, "Applies aggregate function {expression:#} to all rows.");
+        describe!(y, Expression, "Aggregate function: {expression}");
+        Ok(expression)
+    }
+}
+
+/// Parse aggregate relation.
+pub fn parse_aggregate_rel(
+    x: &substrait::AggregateRel,
+    y: &mut context::Context,
+) -> diagnostic::Result<()> {
+    // Parse input.
+    let in_type = handle_rel_input!(x, y);
+
+    // Set schema context for the grouping and measure expressions.
+    y.set_schema(in_type);
+
+    // Parse grouping sets.
+    let mut grouping_set_expressions: Vec<substrait::Expression> = vec![];
+    let mut fields = vec![];
+    let mut sets = vec![];
+    proto_repeated_field!(x, y, groupings, |x, y| {
+        sets.push(vec![]);
+        proto_repeated_field!(x, y, grouping_expressions, |x, y| {
+            let result = expressions::parse_expression(x, y);
+
+            // See if we parsed this expression before. If not, add it to the
+            // field list. Return the index in the field list.
+            let index = grouping_set_expressions
+                .iter()
+                .enumerate()
+                .find(|(_, e)| e == &x)
+                .map(|(i, _)| i)
+                .unwrap_or_else(|| {
+                    // Create new field.
+                    grouping_set_expressions.push(x.clone());
+                    fields.push(Field {
+                        expression: result.as_ref().cloned().unwrap_or_default(),
+                        data_type: y.data_type(),
+                        field_type: FieldType::NullableGroupedField,
+                    });
+
+                    fields.len() - 1
+                });
+
+            // Add index of uniquified field to grouping set.
+            sets.last_mut().unwrap().push(index);
+
+            result
+        });
+        match x.grouping_expressions.len() {
+            0 => summary!(y, "A grouping set that aggregates all rows."),
+            1 => summary!(
+                y,
+                "A grouping set that aggregates all rows for which \
+                 the expression yields the same value."
+            ),
+            x => summary!(
+                y,
+                "A grouping set that aggregates all rows for which \
+                 the {x} expressions yield the same tuple of values."
+            ),
+        }
+        Ok(())
+    });
+    drop(grouping_set_expressions);
+    let sets = sets;
+
+    // Each field that is part of all sets will never be made nullable by the
+    // aggregate relation, so its type does not need to be made nullable.
+    let mut set_iter = sets.iter();
+    if let Some(first_set) = set_iter.next() {
+        let mut fields_in_all_sets = first_set.iter().cloned().collect::<HashSet<_>>();
+        for set in set_iter {
+            fields_in_all_sets = &fields_in_all_sets & &set.iter().cloned().collect::<HashSet<_>>();
+        }
+        for index in fields_in_all_sets {
+            fields[index].field_type = FieldType::GroupedField;
+        }
+    }
+
+    // Parse measures.
+    proto_repeated_field!(x, y, measures, |x, y| {
+        let result = parse_measure(x, y);
+        fields.push(Field {
+            expression: result.as_ref().cloned().unwrap_or_default(),
+            data_type: y.data_type(),
+            field_type: FieldType::Measure,
+        });
+        result
+    });
+
+    // The relation is invalid if no fields result from it.
+    if fields.is_empty() {
+        diagnostic!(
+            y,
+            Error,
+            RelationInvalid,
+            "aggregate relations must have at least one grouping expression or measure"
+        );
+    }
+
+    // Add the column for the grouping set index.
+    // FIXME: this field makes no sense for aggregate relations that only have
+    // measures. It's also disputable whether it should exist when there is
+    // only one grouping set.
+    fields.push(Field {
+        expression: expressions::Expression::Function(String::from("group_index"), vec![]),
+        data_type: data_type::DataType::new_integer(false),
+        field_type: FieldType::GroupingSetIndex,
+    });
+    let fields = fields;
+
+    // Derive schema.
+    y.set_schema(data_type::DataType::new_struct(
+        fields.iter().map(|x| {
+            if matches!(x.field_type, FieldType::NullableGroupedField) {
+                x.data_type.make_nullable()
+            } else {
+                x.data_type.clone()
+            }
+        }),
+        false,
+    ));
+
+    // Describe the relation.
+    if x.groupings.is_empty() {
+        describe!(y, Relation, "Aggregate");
+        summary!(
+            y,
+            "This relation computes {} aggregate function(s) over all rows, \
+             returning a single row.",
+            x.measures.len()
+        );
+    } else if x.measures.is_empty() {
+        describe!(y, Relation, "Group");
+        summary!(
+            y,
+            "This relation groups rows from the input by the result of some \
+             expression(s)."
+        );
+    } else {
+        describe!(y, Relation, "Group & aggregate");
+        summary!(
+            y,
+            "This relation groups rows from the input by the result of some \
+             expression(s), and also computes {} aggregate function(s) over \
+             each group.",
+            x.measures.len()
+        );
+    }
+    let mut comment = comment::Comment::new()
+        .plain("The significance of the returned field(s) is:")
+        .lo();
+    for (index, field) in fields.iter().enumerate() {
+        comment = comment.li().plain(match field.field_type {
+            FieldType::GroupedField => format!(
+                "Field {index}: value of grouping expression {:#}.",
+                field.expression
+            ),
+            FieldType::NullableGroupedField => format!(
+                "Field {index}: value of grouping expression {:#} if it is \
+                 part of the grouping set being returned, null otherwise.",
+                field.expression
+            ),
+            FieldType::Measure => {
+                if x.groupings.is_empty() {
+                    format!(
+                        "Field {index}: result of aggregate function {:#} \
+                         applied to all input rows.",
+                        field.expression
+                    )
+                } else {
+                    format!(
+                        "Field {index}: result of aggregate function {:#} \
+                         applied to the rows from the current group.",
+                        field.expression
+                    )
+                }
+            }
+            FieldType::GroupingSetIndex => {
+                if x.groupings.is_empty() {
+                    format!(
+                        "Field {index}: undefined value, reserved for grouping \
+                         set index."
+                    )
+                } else if x.groupings.len() == 1 {
+                    format!(
+                        "Field {index}: always zero, representing the index of the \
+                         matched grouping set (of which there is only one here)."
+                    )
+                } else {
+                    format!(
+                        "Field {index}: integer between 0 and {} inclusive, \
+                         representing the index of the matched grouping set.",
+                        x.groupings.len() - 1
+                    )
+                }
+            }
+        });
+    }
+    y.push_summary(comment.lc());
+
+    // Handle the common field.
+    handle_rel_common!(x, y);
+
+    // Handle the advanced extension field.
+    handle_advanced_extension!(x, y);
+
+    Ok(())
+}
diff --git a/rs/src/parse/relations/common.rs b/rs/src/parse/relations/common.rs
new file mode 100644
index 00000000..fe36b00c
--- /dev/null
+++ b/rs/src/parse/relations/common.rs
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module for parsing logic common to all relation types.
+
+use std::sync::Arc;
+
+use crate::input::proto::substrait;
+use crate::output::data_type;
+use crate::output::diagnostic;
+use crate::parse::context;
+
+/// Parse a stats node.
+fn parse_stats(
+    x: &substrait::rel_common::hint::Stats,
+    y: &mut context::Context,
+) -> diagnostic::Result<()> {
+    proto_primitive_field!(x, y, row_count, |x, y| {
+        if *x < 0.0 {
+            diagnostic!(
+                y,
+                Error,
+                IllegalValueInHint,
+                "negative row counts are nonsensical"
+            );
+        }
+        Ok(())
+    });
+    proto_primitive_field!(x, y, record_size, |x, y| {
+        if *x < 0.0 {
+            diagnostic!(
+                y,
+                Error,
+                IllegalValueInHint,
+                "negative record sizes are nonsensical"
+            );
+        }
+        Ok(())
+    });
+    proto_field!(
+        x,
+        y,
+        advanced_extension,
+        crate::parse::extensions::advanced::parse_advanced_extension
+    );
+    Ok(())
+}
+
+/// Parse a constraints node.
+fn parse_runtime_constraint(
+    x: &substrait::rel_common::hint::RuntimeConstraint,
+    y: &mut context::Context,
+) -> diagnostic::Result<()> {
+    proto_field!(
+        x,
+        y,
+        advanced_extension,
+        crate::parse::extensions::advanced::parse_advanced_extension
+    );
+    Ok(())
+}
+
+/// Parse a hint node.
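+/// Hints carry optional statistics and runtime constraints. They are
+/// validated here, but they do not affect the derived schema in any way.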
+fn parse_hint(x: &substrait::rel_common::Hint, y: &mut context::Context) -> diagnostic::Result<()> {
+    proto_field!(x, y, stats, parse_stats);
+    proto_field!(x, y, constraint, parse_runtime_constraint);
+    proto_field!(
+        x,
+        y,
+        advanced_extension,
+        crate::parse::extensions::advanced::parse_advanced_extension
+    );
+    Ok(())
+}
+
+/// Parse emit mapping. Takes the data type of the schema thus far as argument.
+fn parse_emit_mapping(
+    x: &i32,
+    _: &mut context::Context,
+    data_type: Arc<data_type::DataType>,
+) -> diagnostic::Result<Arc<data_type::DataType>> {
+    let x: usize = (*x)
+        .try_into()
+        .map_err(|_| cause!(TypeInvalidSwizzle, "index cannot be negative"))?;
+    data_type
+        .index_struct(x)
+        .ok_or_else(|| cause!(TypeInvalidSwizzle, "index out of range"))
+}
+
+/// Parse emit kind. Takes the data type of the schema thus far as argument.
+fn parse_emit_kind(
+    x: &substrait::rel_common::EmitKind,
+    y: &mut context::Context,
+    data_type: Arc<data_type::DataType>,
+) -> diagnostic::Result<Arc<data_type::DataType>> {
+    match x {
+        substrait::rel_common::EmitKind::Direct(_) => Ok(data_type),
+        substrait::rel_common::EmitKind::Emit(x) => {
+            let fields = proto_repeated_field!(
+                x,
+                y,
+                output_mapping,
+                parse_emit_mapping,
+                |_, _, _, _, _| (),
+                data_type.clone()
+            )
+            .1
+            .into_iter()
+            .map(|x| x.unwrap_or_default())
+            .collect::<Vec<_>>();
+            Ok(data_type::DataType::new_struct(fields, false))
+        }
+    }
+}
+
+/// Parse RelCommon node. This should be processed after the rest of the
+/// relation has been processed, as it can transmute the data type.
+pub fn parse_rel_common(
+    x: &substrait::RelCommon,
+    y: &mut context::Context,
+    data_type: Arc<data_type::DataType>,
+) -> diagnostic::Result<Arc<data_type::DataType>> {
+    // Handle hint.
+    proto_field!(x, y, hint, parse_hint);
+
+    // Handle advanced extension.
+    let data_type = if proto_field!(
+        x,
+        y,
+        advanced_extension,
+        crate::parse::extensions::advanced::parse_advanced_extension
+    )
+    .1
+    .unwrap_or_default()
+    {
+        data_type::DataType::new_unresolved()
+    } else {
+        data_type
+    };
+
+    // Parse emit kind.
+    let data_type = proto_field!(x, y, emit_kind, parse_emit_kind, data_type.clone())
+        .1
+        .unwrap_or(data_type);
+
+    Ok(data_type)
+}
+
+/// Handle the common field for a relation. This should be processed after the
+/// rest of the relation has been processed, as it can transmute the data type.
+macro_rules! handle_rel_common {
+    ($input:expr, $context:expr) => {
+        let data_type = $context.data_type();
+
+        // Call the parser.
+        let result = proto_field!(
+            $input,
+            $context,
+            common,
+            crate::parse::relations::common::parse_rel_common,
+            data_type
+        )
+        .1;
+
+        // If common was populated and its parser succeeded (it should always
+        // do that), update the type information.
+        if let Some(data_type) = result {
+            $context.set_schema(data_type);
+        }
+    };
+}
+
+/// Handle the advanced extension field for a builtin relation.
+macro_rules! handle_advanced_extension {
+    ($input:expr, $context:expr) => {
+        if proto_field!(
+            $input,
+            $context,
+            advanced_extension,
+            crate::parse::extensions::advanced::parse_advanced_extension
+        )
+        .1
+        .unwrap_or_default()
+        {
+            $context.set_schema(std::sync::Arc::default());
+        }
+    };
+}
+
+/// Shorthand for handling the input field of a relation. Returns the data
+/// type corresponding to the schema returned by the relation.
+macro_rules! handle_rel_input {
handle_rel_input { + ($input:expr, $context:expr) => { + handle_rel_input!($input, $context, input) + }; + ($input:expr, $context:expr, $field:ident) => { + proto_boxed_required_field!($input, $context, $field, crate::parse::relations::parse_rel) + .0 + .data_type() + }; +} + +/// Shorthand for handling the input fields of a relation that takes a flexible +/// amount of inputs. Returns an iterator to references to the data types +/// corresponding to the schemas returned by the relations. Each data type can +/// be None if schema type deduction failed. +macro_rules! handle_rel_inputs { + ($input:expr, $context:expr) => { + handle_rel_inputs!($input, $context, inputs) + }; + ($input:expr, $context:expr, $field:ident) => { + proto_repeated_field!($input, $context, $field, crate::parse::relations::parse_rel) + .0 + .iter() + .map(|x| x.data_type()) + }; +} diff --git a/rs/src/parse/relations/cross.rs b/rs/src/parse/relations/cross.rs new file mode 100644 index 00000000..cbbdfaac --- /dev/null +++ b/rs/src/parse/relations/cross.rs @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for cross relations. +//! +//! The cross product operation will combine two separate inputs into a single +//! output. It pairs every record from the left input with every record of the +//! right input. +//! +//! See + +use std::sync::Arc; + +use crate::input::proto::substrait; +use crate::output::data_type; +use crate::output::diagnostic; +use crate::parse::context; + +/// Parse cross relation. +pub fn parse_cross_rel( + x: &substrait::CrossRel, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // Parse input. + let left = handle_rel_input!(x, y, left); + let right = handle_rel_input!(x, y, right); + + // Derive schema. + if let (Some(mut fields), Some(additional_fields)) = + (left.unwrap_struct(), right.unwrap_struct()) + { + fields.extend(additional_fields.into_iter()); + let schema = data_type::DataType::new_struct(fields, false); + y.set_schema(schema); + } else { + y.set_schema(Arc::default()); + } + + // Describe the relation. + describe!(y, Relation, "Cross product"); + summary!( + y, + "This relation computes the cross product of its two input datasets." + ); + + // Handle the common field. + handle_rel_common!(x, y); + + // Handle the advanced extension field. + handle_advanced_extension!(x, y); + + Ok(()) +} diff --git a/rs/src/parse/relations/extension.rs b/rs/src/parse/relations/extension.rs new file mode 100644 index 00000000..bb22f849 --- /dev/null +++ b/rs/src/parse/relations/extension.rs @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for relational algebra +//! extensions. + +use std::sync::Arc; + +use crate::input::proto::substrait; +use crate::output::diagnostic; +use crate::parse::context; +use crate::parse::extensions; + +/// Parse one to one extension. +pub fn parse_extension_single_rel( + x: &substrait::ExtensionSingleRel, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // Parse input. + let _in_type = handle_rel_input!(x, y); + + // Set schema to an unresolved type. + y.set_schema(Arc::default()); + + // Parse the extension data. + proto_required_field!(x, y, detail, extensions::advanced::parse_functional_any); + + // Describe the relation. + if let Some(x) = &x.detail { + describe!(y, Relation, "{} extension", x.type_url); + } else { + describe!(y, Relation, "Unknown extension"); + } + + // Handle the common field. 
+ handle_rel_common!(x, y); + + Ok(()) +} + +/// Parse many to one extension. +pub fn parse_extension_multi_rel( + x: &substrait::ExtensionMultiRel, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // Parse inputs. + let _in_types: Vec<_> = handle_rel_inputs!(x, y).collect(); + + // Set schema to an unresolved type. + y.set_schema(Arc::default()); + + // Parse the extension data. + proto_required_field!(x, y, detail, extensions::advanced::parse_functional_any); + + // Describe the relation. + if let Some(x) = &x.detail { + describe!(y, Relation, "{} extension", x.type_url); + } else { + describe!(y, Relation, "Unknown extension"); + } + + // Handle the common field. + handle_rel_common!(x, y); + + Ok(()) +} + +/// Parse input extension. +pub fn parse_extension_leaf_rel( + x: &substrait::ExtensionLeafRel, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // Set schema to an unresolved type. + y.set_schema(Arc::default()); + + // Parse the extension data. + proto_required_field!(x, y, detail, extensions::advanced::parse_functional_any); + + // Describe the relation. + if let Some(x) = &x.detail { + describe!(y, Relation, "{} extension", x.type_url); + } else { + describe!(y, Relation, "Unknown extension"); + } + + // Handle the common field. + handle_rel_common!(x, y); + + Ok(()) +} diff --git a/rs/src/parse/relations/fetch.rs b/rs/src/parse/relations/fetch.rs new file mode 100644 index 00000000..f9616cf4 --- /dev/null +++ b/rs/src/parse/relations/fetch.rs @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for fetch relations. +//! +//! The fetch operation eliminates records outside a desired window. Typically +//! corresponds to a fetch/offset SQL clause. +//! +//! See + +use crate::input::proto::substrait; +use crate::output::diagnostic; +use crate::parse::context; +use crate::string_util; + +/// Parse fetch relation. +pub fn parse_fetch_rel( + x: &substrait::FetchRel, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // Parse input. + let in_type = handle_rel_input!(x, y); + + // Filters pass through their input schema unchanged. + y.set_schema(in_type); + + // Parse offset and count. + proto_primitive_field!(x, y, offset, |x, y| { + if *x < 0 { + diagnostic!(y, Error, IllegalValue, "offsets cannot be negative"); + } + Ok(()) + }); + proto_primitive_field!(x, y, count, |x, y| { + if *x < 0 { + diagnostic!(y, Error, IllegalValue, "count cannot be negative"); + } + Ok(()) + }); + + // Describe the relation. + if x.count == 1 { + describe!( + y, + Relation, + "Propagate only the {} row", + (x.offset + 1) + .try_into() + .map(string_util::describe_nth) + .unwrap_or_else(|_| String::from("?")) + ); + } else if x.count > 1 { + if x.offset > 1 { + describe!( + y, + Relation, + "Propagate only {} rows, starting from the {}", + x.count, + (x.offset + 1) + .try_into() + .map(string_util::describe_nth) + .unwrap_or_else(|_| String::from("?")) + ); + } else { + describe!(y, Relation, "Propagate only the first {} rows", x.count); + } + } else if x.offset == 0 { + describe!(y, Relation, "Fetch all rows"); + } else if x.offset == 1 { + describe!(y, Relation, "Discard the first row"); + } else if x.offset > 1 { + describe!(y, Relation, "Discard the first {} rows", x.offset); + } else { + describe!(y, Relation, "Invalid fetch relation"); + } + + // Handle the common field. + handle_rel_common!(x, y); + + // Handle the advanced extension field. 
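+    // If the advanced extension carries an enhancement, the schema can no
+    // longer be deduced, so it is reset to an unresolved type.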
+    handle_advanced_extension!(x, y);
+
+    Ok(())
+}
diff --git a/rs/src/parse/relations/filter.rs b/rs/src/parse/relations/filter.rs
new file mode 100644
index 00000000..d327ccd6
--- /dev/null
+++ b/rs/src/parse/relations/filter.rs
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for filter relations.
+//!
+//! The filter operator eliminates one or more records from the input data
+//! based on a boolean filter expression.
+//!
+//! See
+
+use crate::input::proto::substrait;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::parse::expressions;
+
+/// Parse filter relation.
+pub fn parse_filter_rel(
+    x: &substrait::FilterRel,
+    y: &mut context::Context,
+) -> diagnostic::Result<()> {
+    // Parse input.
+    let in_type = handle_rel_input!(x, y);
+
+    // Filters pass through their input schema unchanged.
+    y.set_schema(in_type);
+
+    // Check the filter predicate.
+    let (n, e) = proto_boxed_required_field!(x, y, condition, expressions::parse_predicate);
+    let predicate = e.unwrap_or_default();
+    let nullable = n.data_type().nullable();
+
+    // Describe the relation.
+    describe!(y, Relation, "Filter by {}", &predicate);
+    summary!(
+        y,
+        "This relation discards all rows for which {} yields false.",
+        &predicate
+    );
+    if nullable {
+        // FIXME: what's the behavior when a filter condition is nullable and
+        // yields null? Same applies for all other usages of parse_predicate().
+        summary!(y, "Behavior for a null condition is unspecified.");
+    }
+
+    // Handle the common field.
+    handle_rel_common!(x, y);
+
+    // Handle the advanced extension field.
+    handle_advanced_extension!(x, y);
+
+    Ok(())
+}
diff --git a/rs/src/parse/relations/join.rs b/rs/src/parse/relations/join.rs
new file mode 100644
index 00000000..a31772a4
--- /dev/null
+++ b/rs/src/parse/relations/join.rs
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for join relations.
+//!
+//! The join operation will combine two separate inputs into a single output,
+//! based on a join expression. A common subtype of joins is an equality join,
+//! where the join expression is constrained to a list of equality (or
+//! equality + null equality) conditions between the two inputs of the join.
+//!
+//! See
+
+use std::sync::Arc;
+
+use crate::input::proto::substrait;
+use crate::output::comment;
+use crate::output::data_type;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::parse::expressions;
+
+/// Parse join relation.
+pub fn parse_join_rel(x: &substrait::JoinRel, y: &mut context::Context) -> diagnostic::Result<()> {
+    use substrait::join_rel::JoinType;
+
+    // Parse input.
+    let left = handle_rel_input!(x, y, left);
+    let right = handle_rel_input!(x, y, right);
+
+    // Derive schema with which the join expression is evaluated.
+    if let (Some(mut fields), Some(additional_fields)) =
+        (left.unwrap_struct(), right.unwrap_struct())
+    {
+        fields.extend(additional_fields.into_iter());
+        let schema = data_type::DataType::new_struct(fields, false);
+        y.set_schema(schema);
+    } else {
+        y.set_schema(Arc::default());
+    }
+
+    // Parse join expression.
+    let join_expression =
+        proto_boxed_required_field!(x, y, expression, expressions::parse_predicate)
+            .1
+            .unwrap_or_default();
+
+    // Parse join type.
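+    // Falls back to the default variant when the join type cannot be resolved.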
+ let join_type = proto_required_enum_field!(x, y, r#type, JoinType) + .1 + .unwrap_or_default(); + + // Determine whether the join can null the left and/or right side, and + // whether the right side is returned at all. + let (left_nullable, right_nullable) = match join_type { + JoinType::Unspecified => (false, Some(false)), + JoinType::Inner => (false, Some(false)), + JoinType::Outer => (true, Some(true)), + JoinType::Left => (false, Some(true)), + JoinType::Right => (true, Some(false)), + JoinType::Semi => (false, None), + JoinType::Anti => (false, None), + JoinType::Single => (false, Some(true)), + }; + + // Derive final schema. + if let (Some(left_fields), Some(right_fields)) = (left.unwrap_struct(), right.unwrap_struct()) { + let mut fields = Vec::with_capacity(left_fields.len() + right_fields.len()); + if left_nullable { + fields.extend(left_fields.into_iter().map(|x| x.make_nullable())) + } else { + fields.extend(left_fields.into_iter()) + } + if let Some(right_nullable) = right_nullable { + if right_nullable { + fields.extend(right_fields.into_iter().map(|x| x.make_nullable())) + } else { + fields.extend(right_fields.into_iter()) + } + } + let schema = data_type::DataType::new_struct(fields, false); + y.set_schema(schema); + } else { + y.set_schema(Arc::default()); + } + + // Handle optional post-join filter. + let filter_expression = + proto_boxed_field!(x, y, post_join_filter, expressions::parse_predicate).1; + + // Describe the relation. + let prefix = match (join_type, x.post_join_filter.is_some()) { + (JoinType::Unspecified, _) => "Unknown", + (JoinType::Inner, true) => "Filtered inner", + (JoinType::Inner, false) => "Inner", + (JoinType::Outer, true) => "Filtered outer", + (JoinType::Outer, false) => "Outer", + (JoinType::Left, true) => "Filtered left", + (JoinType::Left, false) => "Left", + (JoinType::Right, true) => "Filtered right", + (JoinType::Right, false) => "Right", + (JoinType::Semi, true) => "Filtered semi", + (JoinType::Semi, false) => "Semi", + (JoinType::Anti, true) => "Filtered anti", + (JoinType::Anti, false) => "Anti", + (JoinType::Single, true) => "Filtered single", + (JoinType::Single, false) => "Single", + }; + describe!(y, Relation, "{prefix} join by {join_expression}"); + summary!(y, "{prefix} join by {join_expression:#}."); + y.push_summary(comment::Comment::new().nl().plain(match join_type { + JoinType::Unspecified => "", + JoinType::Inner => concat!( + " Returns rows combining the row from the left and right ", + "input for each pair where the join expression yields true.", + ), + JoinType::Outer => concat!( + " Returns rows combining the row from the left and right ", + "input for each pair where the join expression yields true. ", + "If the join expression never yields true for any left or ", + "right row, this returns a row anyway, with the fields ", + "corresponding to the other input set to null.", + ), + JoinType::Left => concat!( + " Returns rows combining the row from the left and right ", + "input for each pair where the join expression yields true. ", + "If the join expression never yields true for a row from the ", + "left, this returns a row anyway, with the fields corresponding ", + "to the right input set to null.", + ), + JoinType::Right => concat!( + " Returns rows combining the row from the left and right ", + "input for each pair where the join expression yields true. 
", + "If the join expression never yields true for a row from the ", + "right, this returns a row anyway, with the fields corresponding ", + "to the left input set to null.", + ), + JoinType::Semi => concat!( + " Filters rows from the left input, propagating a row only if ", + "the join expression yields true for that row combined with ", + "any row from the right input.", + ), + JoinType::Anti => concat!( + " Filters rows from the left input, propagating a row only if ", + "the join expression does not yield true for that row combined ", + "with any row from the right input.", + ), + JoinType::Single => concat!( + " Returns a row for each row from the left input, concatenating ", + "it with the row from the right input for which the join ", + "expression yields true. If the expression never yields true for ", + "a left input, the fields corresponding to the right input are ", + "set to null. If the expression yields true for a left row and ", + "multiple right rows, this may return the first pair encountered ", + "or throw an error." + ), + })); + if let Some(filter_expression) = filter_expression { + y.push_summary( + comment::Comment::new() + .nl() + .plain(format!("The result is filtered by {filter_expression:#}.")), + ); + } + + // Handle the common field. + handle_rel_common!(x, y); + + // Handle the advanced extension field. + handle_advanced_extension!(x, y); + + Ok(()) +} diff --git a/rs/src/parse/relations/mod.rs b/rs/src/parse/relations/mod.rs new file mode 100644 index 00000000..f5b24452 --- /dev/null +++ b/rs/src/parse/relations/mod.rs @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for relational algebra. + +#[macro_use] +mod common; +mod aggregate; +mod cross; +mod extension; +mod fetch; +mod filter; +mod join; +mod project; +mod read; +mod set; +mod sort; + +use crate::input::proto::substrait; +use crate::input::traits::ProtoOneOf; +use crate::output::diagnostic; +use crate::parse::context; + +/// Parse a relation type. +fn parse_rel_type(x: &substrait::rel::RelType, y: &mut context::Context) -> diagnostic::Result<()> { + // Ensure that the top of the schema stack exists and it set to an + // unresolved type. + y.clear_schema(); + + // Set a basic description, to ensure that these nodes are always marked + // as relations. + describe!(y, Relation, "{} relation", x.proto_oneof_variant()); + + // NOTE: if you're here because you added a relation type and now CI is + // failing, you can just add "_ => Ok(())," to the end of this list. The + // validator will then automatically throw a "not yet implemented" warning + // if it finds that relation type in a plan. 
+    match x {
+        substrait::rel::RelType::Read(x) => read::parse_read_rel(x, y),
+        substrait::rel::RelType::Filter(x) => filter::parse_filter_rel(x, y),
+        substrait::rel::RelType::Fetch(x) => fetch::parse_fetch_rel(x, y),
+        substrait::rel::RelType::Aggregate(x) => aggregate::parse_aggregate_rel(x, y),
+        substrait::rel::RelType::Sort(x) => sort::parse_sort_rel(x, y),
+        substrait::rel::RelType::Join(x) => join::parse_join_rel(x, y),
+        substrait::rel::RelType::Project(x) => project::parse_project_rel(x, y),
+        substrait::rel::RelType::Set(x) => set::parse_set_rel(x, y),
+        substrait::rel::RelType::ExtensionSingle(x) => extension::parse_extension_single_rel(x, y),
+        substrait::rel::RelType::ExtensionMulti(x) => extension::parse_extension_multi_rel(x, y),
+        substrait::rel::RelType::ExtensionLeaf(x) => extension::parse_extension_leaf_rel(x, y),
+        substrait::rel::RelType::Cross(x) => cross::parse_cross_rel(x, y),
+        // _ => Ok(()),
+    }
+}
+
+/// Parse a relation node. Sets the current schema to the schema returned by
+/// the relation.
+pub fn parse_rel(x: &substrait::Rel, y: &mut context::Context) -> diagnostic::Result<()> {
+    let schema = proto_required_field!(x, y, rel_type, parse_rel_type)
+        .0
+        .data_type();
+    y.set_schema(schema);
+    Ok(())
+}
diff --git a/rs/src/parse/relations/project.rs b/rs/src/parse/relations/project.rs
new file mode 100644
index 00000000..ed3d3d84
--- /dev/null
+++ b/rs/src/parse/relations/project.rs
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for project relations.
+//!
+//! The project operation will produce one or more additional expressions based
+//! on the inputs of the dataset.
+//!
+//! See
+
+use std::sync::Arc;
+
+use crate::input::proto::substrait;
+use crate::output::data_type;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::parse::expressions;
+
+/// Parse project relation.
+pub fn parse_project_rel(
+    x: &substrait::ProjectRel,
+    y: &mut context::Context,
+) -> diagnostic::Result<()> {
+    // Parse input.
+    let mut schema = handle_rel_input!(x, y);
+
+    // Start with the input schema.
+    y.set_schema(schema.clone());
+
+    // Parse the expressions that are to be appended to the schema.
+    let expressions = proto_required_repeated_field!(
+        x,
+        y,
+        expressions,
+        expressions::parse_expression,
+        |_x, y, _i, n, _r| {
+            // Update the schema.
+            if let Some(mut fields) = schema.unwrap_struct() {
+                fields.push(n.data_type());
+                schema = data_type::DataType::new_struct(fields, false);
+                y.set_schema(schema.clone());
+            } else {
+                y.set_schema(Arc::default());
+            }
+        }
+    )
+    .1;
+
+    // Describe the relation.
+    describe!(y, Relation, "Projection");
+    if expressions.len() > 1 {
+        summary!(
+            y,
+            "This relation generates {} new columns by projecting the existing columns using scalar expressions.",
+            expressions.len()
+        );
+    } else {
+        summary!(
+            y,
+            "This relation generates a new column by projecting the existing columns using a scalar expression."
+        );
+    }
+
+    // Handle the common field.
+    handle_rel_common!(x, y);
+
+    // Handle the advanced extension field.
+    handle_advanced_extension!(x, y);
+
+    Ok(())
+}
diff --git a/rs/src/parse/relations/read.rs b/rs/src/parse/relations/read.rs
new file mode 100644
index 00000000..df65de7e
--- /dev/null
+++ b/rs/src/parse/relations/read.rs
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for read relations.
+//!
+//! The read operator is an operator that produces one output.
+//! A simple example would be the reading of a Parquet file.
+//!
+//! See
+
+use std::sync::Arc;
+
+use crate::input::proto::substrait;
+use crate::output::data_type;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::parse::expressions;
+use crate::parse::expressions::literals;
+use crate::parse::expressions::references::mask;
+use crate::parse::extensions;
+use crate::parse::types;
+use crate::string_util;
+
+/// Information about a data source.
+struct SourceInfo {
+    /// Short description of the data source, used in the brief of the read
+    /// relation.
+    pub name: String,
+
+    /// The schema of the data, if not context-sensitive.
+    pub data_type: Option<Arc<data_type::DataType>>,
+}
+
+/// Parse virtual table.
+fn parse_virtual_table(
+    x: &substrait::read_rel::VirtualTable,
+    y: &mut context::Context,
+) -> diagnostic::Result<SourceInfo> {
+    let mut data_type: Arc<data_type::DataType> = Arc::default();
+
+    // Parse rows, ensuring that they all have the same type.
+    proto_repeated_field!(x, y, values, |x, y| {
+        let result = literals::parse_struct(x, y, false);
+        data_type = types::assert_equal(
+            y,
+            &y.data_type(),
+            &data_type,
+            "virtual table rows must have the same type",
+        );
+        result
+    });
+
+    // Describe the node.
+    describe!(y, Misc, "Virtual table");
+    Ok(SourceInfo {
+        name: String::from("virtual table"),
+        data_type: Some(data_type),
+    })
+}
+
+/// Parse a file path. Returns whether this matches multiple files.
+fn parse_path_type(
+    x: &substrait::read_rel::local_files::file_or_files::PathType,
+    y: &mut context::Context,
+) -> diagnostic::Result<bool> {
+    // FIXME: I'm not sure these paths should even be URIs. These are supposed
+    // to be local files after all, so shouldn't they just be paths? But they
+    // really shouldn't be called URIs if they're not going to conform to the
+    // standard governing them, and if they're paths, there should still be
+    // some specification about what kind of paths they can be (POSIX? Windows
+    // with slashes? UNC? etc).
+    //
+    // Note that the diagnostics for this have their own code, so if a user
+    // disagrees with the syntax they can just downgrade these warnings to
+    // infos.
+    use substrait::read_rel::local_files::file_or_files::PathType;
+    match x {
+        PathType::UriPath(x) => {
+            if let Err(e) = string_util::check_uri(x) {
+                diagnostic!(y, Error, e);
+            }
+            Ok(false)
+        }
+        PathType::UriPathGlob(x) => {
+            if let Err(e) = string_util::check_uri_glob(x) {
+                diagnostic!(y, Error, e);
+            }
+            Ok(true)
+        }
+        PathType::UriFile(x) => {
+            if let Err(e) = string_util::check_uri(x) {
+                diagnostic!(y, Error, e);
+            }
+            Ok(false)
+        }
+        PathType::UriFolder(x) => {
+            if let Err(e) = string_util::check_uri(x) {
+                diagnostic!(y, Error, e);
+            }
+            Ok(true)
+        }
+    }
+}
+
+/// Parse file entry.
+fn parse_file_or_files(
+    x: &substrait::read_rel::local_files::FileOrFiles,
+    y: &mut context::Context,
+    extension_present: bool,
+) -> diagnostic::Result<()> {
+    // Parse path.
+    let multiple = proto_required_field!(x, y, path_type, parse_path_type)
+        .1
+        .unwrap_or_default();
+
+    // Parse read configuration.
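+    // The file format may only be left unspecified when an advanced extension
+    // enhancement is present to define the behavior instead.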
+    let format = proto_enum_field!(
+        x,
+        y,
+        format,
+        substrait::read_rel::local_files::file_or_files::FileFormat,
+        |x, y| {
+            if !extension_present
+                && matches!(
+                    x,
+                    substrait::read_rel::local_files::file_or_files::FileFormat::Unspecified
+                )
+            {
+                diagnostic!(
+                    y,
+                    Error,
+                    IllegalValue,
+                    "file format must be specified when no enhancement extension is present"
+                );
+            }
+            Ok(*x)
+        }
+    )
+    .1
+    .unwrap_or_default();
+    proto_primitive_field!(x, y, partition_index);
+    proto_primitive_field!(x, y, start);
+    proto_primitive_field!(x, y, length);
+
+    // Having nonzero file offsets makes no sense when this entry refers to
+    // multiple files.
+    if multiple && (x.start > 0 || x.length > 0) {
+        diagnostic!(
+            y,
+            Error,
+            IllegalValue,
+            "file offsets are not allowed in conjunction with multiple files"
+        );
+    }
+
+    // Describe the node.
+    if multiple {
+        describe!(y, Misc, "Multiple files");
+    } else {
+        describe!(y, Misc, "Single file");
+    }
+    summary!(y, "Read");
+    if x.partition_index != 0 {
+        summary!(y, "partition {}", x.partition_index);
+    }
+    summary!(y, "from");
+    if multiple {
+        summary!(y, "multiple");
+    } else {
+        if x.start > 0 {
+            if x.length > 0 {
+                summary!(y, "byte offset {} to {} of", x.start, x.start + x.length);
+            } else {
+                summary!(y, "byte offset {} to the end of", x.start);
+            }
+        } else if x.length > 0 {
+            summary!(y, "the first {} byte(s) of", x.length);
+        }
+        summary!(y, "a single");
+    }
+    match format {
+        substrait::read_rel::local_files::file_or_files::FileFormat::Unspecified => {}
+        substrait::read_rel::local_files::file_or_files::FileFormat::Parquet => {
+            summary!(y, "Parquet");
+        }
+    }
+    if multiple {
+        summary!(y, "files");
+    } else {
+        summary!(y, "file");
+    }
+
+    Ok(())
+}
+
+/// Parse local files.
+fn parse_local_files(
+    x: &substrait::read_rel::LocalFiles,
+    y: &mut context::Context,
+) -> diagnostic::Result<SourceInfo> {
+    // Parse fields.
+    let extension_present = x
+        .advanced_extension
+        .as_ref()
+        .and_then(|x| x.enhancement.as_ref())
+        .is_some();
+    proto_required_repeated_field!(
+        x,
+        y,
+        items,
+        parse_file_or_files,
+        |_, _, _, _, _| (),
+        extension_present
+    );
+    proto_field!(
+        x,
+        y,
+        advanced_extension,
+        extensions::advanced::parse_advanced_extension
+    );
+
+    // Describe the node.
+    describe!(y, Misc, "Table from file(s)");
+    Ok(SourceInfo {
+        name: String::from("local files"),
+        data_type: None,
+    })
+}
+
+/// Parse named table.
+fn parse_named_table(
+    x: &substrait::read_rel::NamedTable,
+    y: &mut context::Context,
+) -> diagnostic::Result<SourceInfo> {
+    // Parse fields.
+    proto_required_repeated_field!(x, y, names);
+    proto_field!(
+        x,
+        y,
+        advanced_extension,
+        extensions::advanced::parse_advanced_extension
+    );
+
+    // Determine and check consistency of the table name.
+    let name = if x.names.is_empty() {
+        String::from("?")
+    } else {
+        if x.names.len() > 1 {
+            // FIXME: what does this mean?
+            diagnostic!(
+                y,
+                Warning,
+                NotYetImplemented,
+                "named tables with multiple names"
+            );
+        }
+        string_util::as_ident_or_string(x.names.first().unwrap())
+    };
+
+    // Describe the node.
+    describe!(y, Misc, "Named table {}", name);
+    Ok(SourceInfo {
+        name,
+        data_type: None,
+    })
+}
+
+/// Parse extension table.
+fn parse_extension_table(
+    x: &substrait::read_rel::ExtensionTable,
+    y: &mut context::Context,
+) -> diagnostic::Result<SourceInfo> {
+    proto_required_field!(x, y, detail, extensions::advanced::parse_functional_any);
+
+    // Describe the node.
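+    // Fall back to a generic description when the detail message is missing.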
+    describe!(
+        y,
+        Misc,
+        "{} extension",
+        x.detail
+            .as_ref()
+            .map(|x| x.type_url.clone())
+            .unwrap_or_else(|| String::from("Unknown"))
+    );
+    Ok(SourceInfo {
+        name: x
+            .detail
+            .as_ref()
+            .map(|x| x.type_url.to_string())
+            .unwrap_or_else(|| String::from("extension")),
+        data_type: None,
+    })
+}
+
+/// Parse read type.
+fn parse_read_type(
+    x: &substrait::read_rel::ReadType,
+    y: &mut context::Context,
+) -> diagnostic::Result<SourceInfo> {
+    match x {
+        substrait::read_rel::ReadType::VirtualTable(x) => parse_virtual_table(x, y),
+        substrait::read_rel::ReadType::LocalFiles(x) => parse_local_files(x, y),
+        substrait::read_rel::ReadType::NamedTable(x) => parse_named_table(x, y),
+        substrait::read_rel::ReadType::ExtensionTable(x) => parse_extension_table(x, y),
+    }
+}
+
+/// Parse read relation.
+pub fn parse_read_rel(x: &substrait::ReadRel, y: &mut context::Context) -> diagnostic::Result<()> {
+    // Handle read type field.
+    let source = proto_required_field!(x, y, read_type, parse_read_type)
+        .1
+        .unwrap_or(SourceInfo {
+            name: String::from("unknown source"),
+            data_type: None,
+        });
+
+    // Handle schema field.
+    let schema = proto_required_field!(x, y, base_schema, types::parse_named_struct)
+        .0
+        .data_type
+        .clone();
+
+    // If both data_type and schema are known, verify that they are the same.
+    let mut schema = match (source.data_type, schema) {
+        (Some(data_type), Some(schema)) => {
+            types::assert_equal(y, &schema, &data_type, "data differs from schema")
+        }
+        (Some(data_type), None) => data_type,
+        (None, Some(schema)) => schema,
+        (None, None) => Arc::default(),
+    };
+
+    // The outer struct of a schema should not be nullable.
+    if !schema.is_unresolved() && schema.nullable() {
+        diagnostic!(
+            y,
+            Error,
+            TypeMismatchedNullability,
+            "the outer struct representing a schema must not be nullable"
+        );
+    }
+
+    // Set the schema to the merged data type.
+    y.set_schema(schema.clone());
+
+    // Handle filter.
+    proto_boxed_field!(x, y, filter, expressions::parse_predicate);
+
+    // Handle projection.
+    if x.projection.is_some() {
+        schema =
+            proto_required_field!(x, y, projection, mask::parse_mask_expression, &schema, true)
+                .0
+                .data_type();
+        y.set_schema(schema.clone());
+    }
+
+    // Describe the relation.
+    match (x.filter.is_some(), x.projection.is_some()) {
+        (false, false) => describe!(y, Relation, "Read from {}", source.name),
+        (false, true) => describe!(y, Relation, "Partial read from {}", source.name),
+        (true, false) => describe!(y, Relation, "Filtered read from {}", source.name),
+        (true, true) => describe!(y, Relation, "Filtered partial read from {}", source.name),
+    }
+
+    // Handle the common field.
+    handle_rel_common!(x, y);
+
+    // Handle the advanced extension field.
+    handle_advanced_extension!(x, y);
+
+    Ok(())
+}
diff --git a/rs/src/parse/relations/set.rs b/rs/src/parse/relations/set.rs
new file mode 100644
index 00000000..e5a8239a
--- /dev/null
+++ b/rs/src/parse/relations/set.rs
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for set relations.
+//!
+//! The set operation encompasses several set-level operations that support
+//! combining datasets, possibly excluding records based on various types of
+//! record-level matching.
+//!
+//!
See + +use std::sync::Arc; + +use crate::input::proto::substrait; +use crate::output::diagnostic; +use crate::parse::context; +use crate::parse::types; + +enum Operation { + Invalid, + Subtract, + SubtractByUnion, + SubtractByIntersection, + Intersect, + IntersectWithUnion, + Union, + Merge, +} + +/// Parse set relation. +pub fn parse_set_rel(x: &substrait::SetRel, y: &mut context::Context) -> diagnostic::Result<()> { + use substrait::set_rel::SetOp; + + // Parse inputs. + let in_types: Vec<_> = handle_rel_inputs!(x, y).collect(); + + // Check inputs and derive schema. + if in_types.len() < 2 { + diagnostic!( + y, + Error, + RelationMissing, + "set operations require at least two input relations" + ); + } + let mut schema = Arc::default(); + for in_type in in_types.iter() { + schema = types::assert_equal( + y, + &in_type.strip_field_names(), + &schema, + "all set inputs must have matching schemas", + ); + } + y.set_schema(schema); + + // Check set operation. + let op = proto_required_enum_field!(x, y, op, SetOp) + .1 + .unwrap_or_default(); + let op = match (op, in_types.len() > 2) { + (SetOp::Unspecified, _) => Operation::Invalid, + (SetOp::MinusPrimary, true) => Operation::SubtractByUnion, + (SetOp::MinusPrimary, false) => Operation::Subtract, + (SetOp::MinusMultiset, true) => Operation::SubtractByIntersection, + (SetOp::MinusMultiset, false) => Operation::Subtract, + (SetOp::IntersectionPrimary, true) => Operation::IntersectWithUnion, + (SetOp::IntersectionPrimary, false) => Operation::Intersect, + (SetOp::IntersectionMultiset, _) => Operation::Intersect, + (SetOp::UnionDistinct, _) => Operation::Union, + (SetOp::UnionAll, _) => Operation::Merge, + }; + + // Describe the relation. + match op { + Operation::Invalid => { + describe!(y, Relation, "Invalid set operation"); + } + Operation::Subtract => { + describe!(y, Relation, "Set subtraction"); + summary!( + y, + "Yields all rows from the first dataset that do not exist \ + in the second dataset." + ); + } + Operation::SubtractByUnion => { + describe!(y, Relation, "Set subtract by union"); + summary!( + y, + "Yields all rows from the first dataset that do not exist \ + in any of the other datasets." + ); + } + Operation::SubtractByIntersection => { + describe!(y, Relation, "Set subtract by intersection"); + summary!( + y, + "Yields all rows from the first dataset that do not exist in \ + all of the other datasets." + ); + } + Operation::Intersect => { + describe!(y, Relation, "Set intersection"); + summary!( + y, + "Yields all rows from the first dataset that exist in all \ + datasets." + ); + } + Operation::IntersectWithUnion => { + describe!(y, Relation, "Set intersect with union"); + summary!( + y, + "Yields all rows from the first dataset that exist in any of \ + the other datasets." + ); + } + Operation::Union => { + describe!(y, Relation, "Set union"); + summary!( + y, + "Yields all rows that exist in any dataset, removing duplicates." + ); + } + Operation::Merge => { + describe!(y, Relation, "Merge"); + summary!(y, "Yields all rows from all incoming datasets."); + } + }; + + // Handle the common field. + handle_rel_common!(x, y); + + // Handle the advanced extension field. + handle_advanced_extension!(x, y); + + Ok(()) +} diff --git a/rs/src/parse/relations/sort.rs b/rs/src/parse/relations/sort.rs new file mode 100644 index 00000000..a4ccba29 --- /dev/null +++ b/rs/src/parse/relations/sort.rs @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for sort relations. +//! +//! 
The sort operator reorders a dataset based on one or more identified
+//! sort fields as well as a sorting function.
+//!
+//! See the sort operation section of the Substrait specification.
+
+use crate::input::proto::substrait;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::parse::sorts;
+
+/// Parse sort relation.
+pub fn parse_sort_rel(x: &substrait::SortRel, y: &mut context::Context) -> diagnostic::Result<()> {
+    // Parse input.
+    let in_type = handle_rel_input!(x, y);
+
+    // Sorts pass through their input schema unchanged.
+    y.set_schema(in_type);
+
+    // Check the sorts.
+    let keys = proto_required_repeated_field!(x, y, sorts, sorts::parse_sort_field).1;
+
+    // Describe the relation.
+    describe!(
+        y,
+        Relation,
+        "Order by {}",
+        keys.first().cloned().flatten().unwrap_or_default()
+    );
+    if x.sorts.len() > 1 {
+        summary!(
+            y,
+            "This relation reorders or coalesces a dataset based on {} keys. \
+             For sorts, the first key has greatest priority; only if the first \
+             key is equivalent for two rows will the next key be checked.",
+            x.sorts.len()
+        );
+    } else {
+        summary!(
+            y,
+            "This relation reorders or coalesces a dataset based on the value of {}.",
+            keys.first().cloned().flatten().unwrap_or_default()
+        );
+    }
+
+    // Handle the common field.
+    handle_rel_common!(x, y);
+
+    // Handle the advanced extension field.
+    handle_advanced_extension!(x, y);
+
+    Ok(())
+}
diff --git a/rs/src/parse/sorts.rs b/rs/src/parse/sorts.rs
new file mode 100644
index 00000000..9db76984
--- /dev/null
+++ b/rs/src/parse/sorts.rs
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module providing parse/validation functions for sort fields.
+
+use std::sync::Arc;
+
+use crate::input::proto::substrait;
+use crate::input::traits::ProtoEnum;
+use crate::output::comment;
+use crate::output::data_type;
+use crate::output::diagnostic;
+use crate::parse::context;
+use crate::parse::expressions;
+use crate::parse::expressions::functions;
+use crate::parse::extensions;
+
+/// Parse a sort direction.
+fn parse_sort_direction(x: &i32, y: &mut context::Context) -> diagnostic::Result<&'static str> {
+    use substrait::sort_field::SortDirection;
+    match SortDirection::proto_enum_from_i32(*x) {
+        None => {
+            diagnostic!(
+                y,
+                Error,
+                IllegalValue,
+                "unknown value {x} for {}",
+                SortDirection::proto_enum_type()
+            );
+            Ok("Invalid sort by")
+        }
+        Some(SortDirection::Unspecified) => {
+            diagnostic!(y, Error, ProtoMissingField, "direction");
+            Ok("Invalid sort by")
+        }
+        Some(SortDirection::AscNullsFirst) => {
+            describe!(y, Misc, "Sort ascending, nulls first");
+            Ok("Ascending sort by")
+        }
+        Some(SortDirection::AscNullsLast) => {
+            describe!(y, Misc, "Sort ascending, nulls last");
+            Ok("Ascending sort by")
+        }
+        Some(SortDirection::DescNullsFirst) => {
+            describe!(y, Misc, "Sort descending, nulls first");
+            Ok("Descending sort by")
+        }
+        Some(SortDirection::DescNullsLast) => {
+            describe!(y, Misc, "Sort descending, nulls last");
+            Ok("Descending sort by")
+        }
+        Some(SortDirection::Clustered) => {
+            describe!(y, Misc, "Coalesce equal values");
+            summary!(
+                y,
+                "Equal values are grouped together, but no ordering is defined between clusters."
+            );
+            Ok("Coalesce")
+        }
+    }
+}
+
+/// Parse a function reference that should resolve to a comparison function
+/// (i.e. one usable for sorts) for the given type.
+fn parse_comparison_function_reference(
+    x: &u32,
+    y: &mut context::Context,
+    data_type: &Arc<data_type::DataType>,
+) -> diagnostic::Result<&'static str> {
+    // Resolve the reference as normal.
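+    // (That is, through the same simple-extension lookup used for ordinary
+    // function calls; only the checks below are specific to sort usage.)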
+    let function = extensions::simple::parse_function_reference(x, y)?;
+
+    // Check the function.
+    if let Some(function) = &function.definition {
+        let return_type =
+            functions::check_function(y, function, &[], &[data_type.clone(), data_type.clone()]);
+        if !matches!(
+            return_type.class(),
+            data_type::Class::Simple(data_type::Simple::Boolean)
+                | data_type::Class::Simple(data_type::Simple::I8)
+                | data_type::Class::Simple(data_type::Simple::I16)
+                | data_type::Class::Simple(data_type::Simple::I32)
+                | data_type::Class::Simple(data_type::Simple::I64)
+                | data_type::Class::Unresolved
+        ) {
+            diagnostic!(
+                y,
+                Error,
+                TypeMismatch,
+                "comparison functions must yield booleans (a < b) or integers (a ?= b), but found {}",
+                return_type
+            );
+        }
+    } else {
+        diagnostic!(
+            y,
+            Warning,
+            ExpressionFunctionDefinitionUnavailable,
+            "cannot check validity of comparison function"
+        );
+    }
+
+    // Describe how the function is to be interpreted.
+    y.push_summary(
+        comment::Comment::new()
+            .plain("Comparison function for sorting. Taking two elements as input,")
+            .plain("it must determine the correct sort order. Comparison functions")
+            .plain("may return booleans or integers, interpreted as follows:")
+            .lo()
+            .plain("f(a, b) => true or negative: a sorts before b;")
+            .li()
+            .plain("f(a, b) => false or positive: b sorts before a;")
+            .li()
+            .plain("f(a, b) => 0 or null: a and b have no defined sort order.")
+            .lc()
+            .plain("This corresponds to f: a < b or f: a ?= b."),
+    );
+
+    Ok("Custom sort")
+}
+
+/// Parse a sort kind, applicable to elements of the given data type.
+fn parse_sort_kind(
+    x: &substrait::sort_field::SortKind,
+    y: &mut context::Context,
+    data_type: &Arc<data_type::DataType>,
+) -> diagnostic::Result<&'static str> {
+    match x {
+        substrait::sort_field::SortKind::Direction(x) => parse_sort_direction(x, y),
+        substrait::sort_field::SortKind::ComparisonFunctionReference(x) => {
+            parse_comparison_function_reference(x, y, data_type)
+        }
+    }
+}
+
+/// Parse a sort field.
+pub fn parse_sort_field(
+    x: &substrait::SortField,
+    y: &mut context::Context,
+) -> diagnostic::Result<expressions::Expression> {
+    // Parse fields.
+    let (n, e) = proto_required_field!(x, y, expr, expressions::parse_expression);
+    let expression = e.unwrap_or_default();
+    let method = proto_required_field!(x, y, sort_kind, parse_sort_kind, &n.data_type())
+        .1
+        .unwrap_or("Invalid sort by");
+
+    // Describe node.
+    describe!(y, Misc, "{method} {expression}");
+    summary!(y, "{method} {expression:#}.");
+    Ok(expression)
+}
diff --git a/rs/src/parse/traversal.rs b/rs/src/parse/traversal.rs
new file mode 100644
index 00000000..c61e4038
--- /dev/null
+++ b/rs/src/parse/traversal.rs
@@ -0,0 +1,1251 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Module for the boilerplate code involved with traversing an input
+//! protobuf/YAML tree to form the output [tree](tree::Node).
+//!
+//! Refer to the documentation for [`parse`](mod@crate::parse) for more
+//! information.
+
+// TODO: remove once validation code is finished.
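+//
+// As a rough usage sketch (with hypothetical message, field, and parser
+// names; the real entry points live in the parse submodules), a typical
+// parse function built on the macros below looks like:
+//
+//     fn parse_foo(x: &substrait::Foo, y: &mut context::Context)
+//         -> diagnostic::Result<()> {
+//         // Pushes a child node for x.bar, runs parse_bar on it, and emits
+//         // a ProtoMissingField diagnostic if the field is unset.
+//         proto_required_field!(x, y, bar, parse_bar);
+//         Ok(())
+//     }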
+#![allow(dead_code)]
+#![allow(unused_macros)]
+
+use crate::input::config;
+use crate::input::traits::InputNode;
+use crate::input::traits::ProtoEnum;
+use crate::input::yaml;
+use crate::output::diagnostic;
+use crate::output::extension;
+use crate::output::parse_result;
+use crate::output::path;
+use crate::output::primitive_data;
+use crate::output::tree;
+use crate::parse::context;
+use std::sync::Arc;
+
+//=============================================================================
+// Type definitions
+//=============================================================================
+
+// Return value for parse macros for optional fields. The first element refers
+// to the node for the field, if the field was present. The second is the
+// return value of the supplied parse function, if it was called and didn't
+// fail.
+type OptionalResult<TR> = (Option<Arc<tree::Node>>, Option<TR>);
+
+// Return value for parse macros for required fields. The first element refers
+// to the node for the field; if the required field wasn't actually specified,
+// a dummy node would have been made, so this is not an Option. The second is
+// the return value of the supplied parse function, if it was called and didn't
+// fail, just like for OptionalResult.
+type RequiredResult<TR> = (Arc<tree::Node>, Option<TR>);
+
+// Return value for parse macros for repeated fields. Same as RequiredResult,
+// but with each tuple entry wrapped in a vector. Both vectors will have equal
+// length.
+type RepeatedResult<TR> = (Vec<Arc<tree::Node>>, Vec<Option<TR>>);
+
+//=============================================================================
+// Macros for pushing annotations
+//=============================================================================
+
+/// Convenience/shorthand macro for pushing diagnostic messages to a node.
+macro_rules! diagnostic {
+    ($context:expr, $level:ident, $class:ident, $($args:expr),*) => {
+        diagnostic!($context, $level, cause!($class, $($args),*))
+    };
+    ($context:expr, $level:ident, $cause:expr) => {
+        crate::parse::traversal::push_diagnostic($context, crate::output::diagnostic::Level::$level, $cause)
+    };
+    ($context:expr, $diag:expr) => {
+        $context.push_diagnostic($diag)
+    };
+}
+macro_rules! ediagnostic {
+    ($context:expr, $level:ident, $class:ident, $err:expr) => {
+        diagnostic!($context, $level, ecause!($class, $err))
+    };
+}
+
+/// Pushes a diagnostic message to the node information list.
+pub fn push_diagnostic(
+    context: &mut context::Context,
+    level: diagnostic::Level,
+    cause: diagnostic::Cause,
+) {
+    context.push_diagnostic(diagnostic::RawDiagnostic {
+        cause,
+        level,
+        path: context.path_buf(),
+    });
+}
+
+/// Convenience/shorthand macro for pushing formatted comments to a node.
+macro_rules! comment {
+    ($context:expr, $($fmts:expr),*) => {
+        $context.push_comment(format!($($fmts),*))
+    };
+}
+
+/// Convenience/shorthand macro for pushing formatted comments that link to
+/// some path to a node.
+macro_rules! link {
+    ($context:expr, $path:expr, $($fmts:expr),*) => {
+        $context.push_comment(crate::output::comment::Comment::new().link(format!($($fmts),*), $path))
+    };
+}
+
+/// Convenience/shorthand macro for setting descriptive information for a node.
+macro_rules! describe {
+    ($context:expr, $class:ident, $($fmts:expr),*) => {
+        $context.set_description(crate::output::tree::Class::$class, Some(format!($($fmts),*)))
+    };
+}
+
+/// Convenience/shorthand macro for appending plain text to the summary of a
+/// node.
+macro_rules! summary {
+    ($context:expr, $($fmts:expr),*) => {
+        $context.push_summary(format!($($fmts),*))
+    };
+}
+
+//=============================================================================
+// Generic code for field handling
+//=============================================================================
+
+/// Parses a child node and pushes it into the provided parent context.
+fn push_child<TF, TR, FP>(
+    context: &mut context::Context,
+    child: &TF,
+    path_element: path::PathElement,
+    unknown_subtree: bool,
+    parser: FP,
+) -> RequiredResult<TR>
+where
+    TF: InputNode,
+    FP: FnOnce(&TF, &mut context::Context) -> diagnostic::Result<TR>,
+{
+    // Create the node for the child.
+    let mut field_output = child.data_to_node();
+
+    // Create the context for calling the parse function for the child.
+    let mut field_context = context.child(&mut field_output, path_element.clone());
+
+    // Call the provided parser function.
+    let result = parser(child, &mut field_context)
+        .map_err(|cause| {
+            diagnostic!(&mut field_context, Error, cause);
+        })
+        .ok();
+
+    // Handle any fields not handled by the provided parse function. Only
+    // generate a warning diagnostic for unhandled children if the parse
+    // function succeeded and we're not already in an unknown subtree.
+    handle_unknown_children(
+        child,
+        &mut field_context,
+        result.is_some() && !unknown_subtree,
+    );
+
+    // Push and return the completed node.
+    let field_output = Arc::new(field_output);
+    context.push(tree::NodeData::Child(tree::Child {
+        path_element,
+        node: field_output.clone(),
+        recognized: !unknown_subtree,
+    }));
+
+    (field_output, result)
+}
+
+/// Handle all children that haven't already been handled. If with_diagnostic
+/// is set, this also generates a diagnostic message if there were
+/// populated/non-default unhandled fields.
+fn handle_unknown_children<T: InputNode>(
+    input: &T,
+    context: &mut context::Context,
+    with_diagnostic: bool,
+) {
+    if input.parse_unknown(context) && with_diagnostic {
+        let mut fields = vec![];
+        for data in context.node_data().iter() {
+            if let tree::NodeData::Child(child) = data {
+                if !child.recognized {
+                    fields.push(child.path_element.to_string_without_dot());
+                }
+            }
+        }
+        if !fields.is_empty() {
+            let fields: String =
+                itertools::Itertools::intersperse(fields.into_iter(), ", ".to_string()).collect();
+            diagnostic!(
+                context,
+                Warning,
+                NotYetImplemented,
+                "the following child nodes were not recognized by the validator: {fields}"
+            );
+        }
+    }
+}
+
+//=============================================================================
+// Protobuf optional field handling
+//=============================================================================
+
+/// Convenience/shorthand macro for parsing optional protobuf fields.
+macro_rules! proto_field {
+    ($input:expr, $context:expr, $field:ident) => {
+        proto_field!($input, $context, $field, |_, _| Ok(()))
+    };
+    ($input:expr, $context:expr, $field:ident, $parser:expr) => {
+        crate::parse::traversal::push_proto_field(
+            $context,
+            &$input.$field.as_ref(),
+            crate::input::proto::cook_ident(stringify!($field)),
+            false,
+            $parser,
+        )
+    };
+    ($input:expr, $context:expr, $field:ident, $parser:expr, $($args:expr),*) => {
+        proto_field!($input, $context, $field, |x, y| $parser(x, y, $($args),*))
+    };
+}
+
+/// Convenience/shorthand macro for parsing optional protobuf fields that were
+/// wrapped in a Box by prost.
+macro_rules!
proto_boxed_field { + ($input:expr, $context:expr, $field:ident) => { + proto_boxed_field!($input, $context, $field, |_, _| Ok(())) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr) => { + crate::parse::traversal::push_proto_field( + $context, + &$input.$field, + crate::input::proto::cook_ident(stringify!($field)), + false, + $parser, + ) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr, $($args:expr),*) => { + proto_boxed_field!($input, $context, $field, |x, y| $parser(x, y, $($args),*)) + }; +} + +/// Parse and push a protobuf optional field. +pub fn push_proto_field( + context: &mut context::Context, + field: &Option>, + field_name: &'static str, + unknown_subtree: bool, + parser: FP, +) -> OptionalResult +where + TF: InputNode, + FP: FnOnce(&TF, &mut context::Context) -> diagnostic::Result, +{ + if !context.set_field_parsed(field_name) { + panic!("field {field_name} was parsed multiple times"); + } + + if let Some(field_input) = field { + let path_element = if let Some(variant) = field_input.oneof_variant() { + path::PathElement::Variant(field_name.to_string(), variant.to_string()) + } else { + path::PathElement::Field(field_name.to_string()) + }; + let (field_output, result) = push_child( + context, + field_input.deref(), + path_element, + unknown_subtree, + parser, + ); + (Some(field_output), result) + } else { + (None, None) + } +} + +//============================================================================= +// Protobuf required and primitive field handling +//============================================================================= + +/// Convenience/shorthand macro for parsing required protobuf fields. +macro_rules! proto_required_field { + ($input:expr, $context:expr, $field:ident) => { + proto_required_field!($input, $context, $field, |_, _| Ok(())) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr) => { + crate::parse::traversal::push_proto_required_field( + $context, + &$input.$field.as_ref(), + crate::input::proto::cook_ident(stringify!($field)), + false, + $parser, + ) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr, $($args:expr),*) => { + proto_required_field!($input, $context, $field, |x, y| $parser(x, y, $($args),*)) + }; +} + +/// Convenience/shorthand macro for parsing required protobuf fields that were +/// wrapped in a Box by prost. +macro_rules! proto_boxed_required_field { + ($input:expr, $context:expr, $field:ident) => { + proto_boxed_required_field!($input, $context, $field, |_, _| Ok(())) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr) => { + crate::parse::traversal::push_proto_required_field( + $context, + &$input.$field, + crate::input::proto::cook_ident(stringify!($field)), + false, + $parser, + ) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr, $($args:expr),*) => { + proto_boxed_required_field!($input, $context, $field, |x, y| $parser(x, y, $($args),*)) + }; +} + +/// Convenience/shorthand macro for parsing primitive protobuf fields. +macro_rules! 
proto_primitive_field { + ($input:expr, $context:expr, $field:ident) => { + proto_primitive_field!($input, $context, $field, |x, _| Ok(x.to_owned())) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr) => { + crate::parse::traversal::push_proto_required_field( + $context, + &Some(&$input.$field), + crate::input::proto::cook_ident(stringify!($field)), + false, + $parser, + ) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr, $($args:expr),*) => { + proto_primitive_field!($input, $context, $field, |x, y| $parser(x, y, $($args),*)) + }; +} + +/// Parse and push a required field of some message type. If the field is +/// not populated, a MissingField diagnostic is pushed automatically, and +/// an empty node is returned as an error recovery placeholder. +pub fn push_proto_required_field( + context: &mut context::Context, + field: &Option>, + field_name: &'static str, + unknown_subtree: bool, + parser: FP, +) -> RequiredResult +where + TF: InputNode, + FP: FnOnce(&TF, &mut context::Context) -> diagnostic::Result, +{ + if let (Some(node), result) = + push_proto_field(context, field, field_name, unknown_subtree, parser) + { + (node, result) + } else { + ediagnostic!(context, Error, ProtoMissingField, field_name); + (Arc::new(TF::type_to_node()), None) + } +} + +/// Convenience/shorthand macro for parsing enumeration protobuf fields. +macro_rules! proto_enum_field { + ($input:expr, $context:expr, $field:ident, $typ:ty) => { + proto_enum_field!($input, $context, $field, $typ, |x, _| Ok(x.to_owned())) + }; + ($input:expr, $context:expr, $field:ident, $typ:ty, $parser:expr) => { + crate::parse::traversal::push_proto_enum_field::<$typ, _, _>( + $context, + $input.$field, + crate::input::proto::cook_ident(stringify!($field)), + false, + $parser, + ) + }; + ($input:expr, $context:expr, $field:ident, $typ:ty, $parser:expr, $($args:expr),*) => { + proto_enum_field!($input, $context, $field, $typ, |x, y| $parser(x, y, $($args),*)) + }; +} + +/// Parse and push an enumeration field of some message type. The i32 in the +/// struct generated by prost is automatically converted to the enum; if the +/// value is out of range, an error is generated. +pub fn push_proto_enum_field( + context: &mut context::Context, + field: i32, + field_name: &'static str, + unknown_subtree: bool, + parser: FP, +) -> RequiredResult +where + TF: ProtoEnum, + FP: FnOnce(&TF, &mut context::Context) -> diagnostic::Result, +{ + if let Some(field) = TF::proto_enum_from_i32(field) { + push_proto_required_field(context, &Some(&field), field_name, unknown_subtree, parser) + } else { + ( + push_proto_required_field( + context, + &Some(&field), + field_name, + unknown_subtree, + |x, y| { + diagnostic!( + y, + Error, + IllegalValue, + "unknown value {x} for {}", + TF::proto_enum_type() + ); + Ok(()) + }, + ) + .0, + None, + ) + } +} + +/// Convenience/shorthand macro for parsing enumeration protobuf fields of +/// which the value must be specified. +macro_rules! 
proto_required_enum_field { + ($input:expr, $context:expr, $field:ident, $typ:ty) => { + proto_required_enum_field!($input, $context, $field, $typ, |x, _| Ok(x.to_owned())) + }; + ($input:expr, $context:expr, $field:ident, $typ:ty, $parser:expr) => { + crate::parse::traversal::push_proto_required_enum_field::<$typ, _, _>( + $context, + $input.$field, + crate::input::proto::cook_ident(stringify!($field)), + false, + $parser, + ) + }; + ($input:expr, $context:expr, $field:ident, $typ:ty, $parser:expr, $($args:expr),*) => { + proto_required_enum_field!($input, $context, $field, $typ, |x, y| $parser(x, y, $($args),*)) + }; +} + +/// Parse and push an enumeration field of some message type. The i32 in the +/// struct generated by prost is automatically converted to the enum; if the +/// value is out of range, an error is generated. +pub fn push_proto_required_enum_field( + context: &mut context::Context, + field: i32, + field_name: &'static str, + unknown_subtree: bool, + parser: FP, +) -> RequiredResult +where + TF: ProtoEnum, + FP: FnOnce(&TF, &mut context::Context) -> diagnostic::Result, +{ + push_proto_enum_field(context, field, field_name, unknown_subtree, |x, y| { + if field == 0 { + diagnostic!( + y, + Error, + IllegalValue, + "this enum may not be left unspecified" + ); + } + parser(x, y) + }) +} + +//============================================================================= +// Protobuf repeated field handling +//============================================================================= + +/// Convenience/shorthand macro for parsing repeated protobuf fields. +macro_rules! proto_repeated_field { + ($input:expr, $context:expr, $field:ident) => { + proto_repeated_field!($input, $context, $field, |_, _| Ok(())) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr) => { + proto_repeated_field!($input, $context, $field, $parser, |_, _, _, _, _| ()) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr, $validator:expr) => { + crate::parse::traversal::push_proto_repeated_field( + $context, + &$input.$field, + crate::input::proto::cook_ident(stringify!($field)), + false, + $parser, + $validator, + ) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr, $validator:expr, $($args:expr),*) => { + proto_repeated_field!($input, $context, $field, |x, y| $parser(x, y, $($args),*), $validator) + }; +} + +/// Parse and push a repeated field of some message type. +pub fn push_proto_repeated_field( + context: &mut context::Context, + field: &[TF], + field_name: &'static str, + unknown_subtree: bool, + mut parser: FP, + mut validator: FV, +) -> RepeatedResult +where + TF: InputNode, + FP: FnMut(&TF, &mut context::Context) -> diagnostic::Result, + FV: FnMut(&TF, &mut context::Context, usize, &Arc, Option<&TR>), +{ + if !context.set_field_parsed(field_name) { + panic!("field {field_name} was parsed multiple times"); + } + + field + .iter() + .enumerate() + .map(|(index, child)| { + let (node, result) = push_child( + context, + child, + path::PathElement::Repeated(field_name.to_string(), index), + unknown_subtree, + &mut parser, + ); + validator(child, context, index, &node, result.as_ref()); + (node, result) + }) + .unzip() +} + +/// Convenience/shorthand macro for parsing repeated protobuf fields for which +/// at least one element must exist. +macro_rules! 
proto_required_repeated_field { + ($input:expr, $context:expr, $field:ident) => { + proto_required_repeated_field!($input, $context, $field, |_, _| Ok(())) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr) => { + proto_required_repeated_field!($input, $context, $field, $parser, |_, _, _, _, _| ()) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr, $validator:expr) => { + crate::parse::traversal::push_proto_required_repeated_field( + $context, + &$input.$field, + crate::input::proto::cook_ident(stringify!($field)), + false, + $parser, + $validator, + ) + }; + ($input:expr, $context:expr, $field:ident, $parser:expr, $validator:expr, $($args:expr),*) => { + proto_required_repeated_field!($input, $context, $field, |x, y| $parser(x, y, $($args),*), $validator) + }; +} + +/// Parse and push a repeated field of some message type, and check that at +/// least one element exists. +pub fn push_proto_required_repeated_field( + context: &mut context::Context, + field: &[TF], + field_name: &'static str, + unknown_subtree: bool, + parser: FP, + validator: FV, +) -> RepeatedResult +where + TF: InputNode, + FP: FnMut(&TF, &mut context::Context) -> diagnostic::Result, + FV: FnMut(&TF, &mut context::Context, usize, &Arc, Option<&TR>), +{ + if field.is_empty() { + ediagnostic!(context, Error, ProtoMissingField, field_name); + } + push_proto_repeated_field( + context, + field, + field_name, + unknown_subtree, + parser, + validator, + ) +} + +//============================================================================= +// Protobuf root message handling +//============================================================================= + +/// Parses a serialized protobuf message using the given root parse function, +/// initial state, and configuration. +pub fn parse_proto( + buffer: B, + root_name: &'static str, + root_parser: F, + state: &mut context::State, + config: &config::Config, +) -> parse_result::ParseResult +where + T: prost::Message + InputNode + Default, + F: FnOnce(&T, &mut context::Context) -> diagnostic::Result<()>, + B: prost::bytes::Buf, +{ + match T::decode(buffer) { + Err(err) => { + // Create a minimal root node with just the decode error + // diagnostic. + let mut root = T::type_to_node(); + + // Create a root context for it. + let mut context = context::Context::new(root_name, &mut root, state, config); + + // Push the diagnostic using the context. + context.push_diagnostic(diagnostic::RawDiagnostic { + cause: ecause!(ProtoParseFailed, err), + level: diagnostic::Level::Error, + path: path::PathBuf { + root: root_name, + elements: vec![], + }, + }); + + parse_result::ParseResult { root } + } + Ok(input) => { + // Create the root node. + let mut root = input.data_to_node(); + + // Create the root context. + let mut context = context::Context::new(root_name, &mut root, state, config); + + // Call the provided parser function. + let success = root_parser(&input, &mut context) + .map_err(|cause| { + diagnostic!(&mut context, Error, cause); + }) + .is_ok(); + + // Handle any fields not handled by the provided parse function. + // Only generate a warning diagnostic for unhandled children if the + // parse function succeeded. 
+            handle_unknown_children(&input, &mut context, success);
+
+            parse_result::ParseResult { root }
+        }
+    }
+}
+
+//=============================================================================
+// YAML object handling
+//=============================================================================
+
+/// Convenience/shorthand macro for parsing optional YAML fields.
+macro_rules! yaml_field {
+    ($input:expr, $context:expr, $field:expr) => {
+        yaml_field!($input, $context, $field, |_, _| Ok(()))
+    };
+    ($input:expr, $context:expr, $field:expr, $parser:expr) => {
+        crate::parse::traversal::push_yaml_field($input, $context, $field, false, $parser)
+    };
+}
+
+/// Parse and push an optional YAML field.
+pub fn push_yaml_field<TR, TS, FP>(
+    input: &yaml::Value,
+    context: &mut context::Context,
+    field_name: TS,
+    unknown_subtree: bool,
+    parser: FP,
+) -> diagnostic::Result<OptionalResult<TR>>
+where
+    TS: AsRef<str>,
+    FP: FnOnce(&yaml::Value, &mut context::Context) -> diagnostic::Result<TR>,
+{
+    if let serde_json::Value::Object(input) = input {
+        let field_name = field_name.as_ref();
+        if !context.set_field_parsed(field_name) {
+            panic!("field {field_name} was parsed multiple times");
+        }
+
+        if let Some(child) = input.get(field_name) {
+            let (field_output, result) = push_child(
+                context,
+                child,
+                path::PathElement::Field(field_name.to_string()),
+                unknown_subtree,
+                parser,
+            );
+            Ok((Some(field_output), result))
+        } else {
+            Ok((None, None))
+        }
+    } else {
+        Err(cause!(YamlInvalidType, "object expected"))
+    }
+}
+
+/// Convenience/shorthand macro for parsing required YAML fields.
+macro_rules! yaml_required_field {
+    ($input:expr, $context:expr, $field:expr) => {
+        yaml_required_field!($input, $context, $field, |_, _| Ok(()))
+    };
+    ($input:expr, $context:expr, $field:expr, $parser:expr) => {
+        crate::parse::traversal::push_yaml_required_field($input, $context, $field, false, $parser)
+    };
+}
+
+/// Parse and push a required field of a YAML object. If the field does not
+/// exist, a MissingField diagnostic is pushed automatically, and an empty node
+/// is returned as an error recovery placeholder.
+pub fn push_yaml_required_field<TR, TS, FP>(
+    input: &yaml::Value,
+    context: &mut context::Context,
+    field_name: TS,
+    unknown_subtree: bool,
+    parser: FP,
+) -> diagnostic::Result<RequiredResult<TR>>
+where
+    TS: AsRef<str>,
+    FP: FnOnce(&yaml::Value, &mut context::Context) -> diagnostic::Result<TR>,
+{
+    let field_name = field_name.as_ref();
+    if let (Some(node), result) =
+        push_yaml_field(input, context, field_name, unknown_subtree, parser)?
+    {
+        Ok((node, result))
+    } else {
+        ediagnostic!(context, Error, YamlMissingKey, field_name);
+        Ok((
+            Arc::new(tree::NodeType::YamlPrimitive(primitive_data::PrimitiveData::Null).into()),
+            None,
+        ))
+    }
+}
+
+//=============================================================================
+// YAML array handling
+//=============================================================================
+
+/// Convenience/shorthand macro for parsing a YAML array that may be empty.
+macro_rules! yaml_array {
+    ($input:expr, $context:expr) => {
+        yaml_array!($input, $context, |_, _| Ok(()))
+    };
+    ($input:expr, $context:expr, $parser:expr) => {
+        yaml_array!($input, $context, $parser, 0)
+    };
+    ($input:expr, $context:expr, $parser:expr, $min_size:expr) => {
+        crate::parse::traversal::push_yaml_array($input, $context, $min_size, false, $parser)
+    };
+}
+
+/// Convenience/shorthand macro for parsing a YAML array that must have at
+/// least one value.
+macro_rules!
yaml_required_array { + ($input:expr, $context:expr) => { + yaml_required_array!($input, $context, |_, _| Ok(())) + }; + ($input:expr, $context:expr, $parser:expr) => { + yaml_array!($input, $context, $parser, 1) + }; +} + +/// Parse and push an optional YAML array element. +pub fn push_yaml_element( + input: &yaml::Array, + context: &mut context::Context, + index: usize, + unknown_subtree: bool, + parser: FP, +) -> OptionalResult +where + FP: FnOnce(&yaml::Value, &mut context::Context) -> diagnostic::Result, +{ + if !context.set_field_parsed(index) { + panic!("element {index} was parsed multiple times"); + } + + if let Some(child) = input.get(index) { + let (field_output, result) = push_child( + context, + child, + path::PathElement::Index(index), + unknown_subtree, + parser, + ); + (Some(field_output), result) + } else { + (None, None) + } +} + +/// Parse and push a required element of a YAML array. If the element does not +/// exist, a MissingElement diagnostic is pushed automatically, and an empty node +/// is returned as an error recovery placeholder. +pub fn push_yaml_required_element( + input: &yaml::Array, + context: &mut context::Context, + index: usize, + unknown_subtree: bool, + parser: FP, +) -> RequiredResult +where + FP: FnOnce(&yaml::Value, &mut context::Context) -> diagnostic::Result, +{ + if let (Some(node), result) = push_yaml_element(input, context, index, unknown_subtree, parser) + { + (node, result) + } else { + diagnostic!(context, Error, YamlMissingElement, "index {index}"); + ( + Arc::new(tree::NodeType::YamlPrimitive(primitive_data::PrimitiveData::Null).into()), + None, + ) + } +} + +/// Parse and push a complete YAML array. If a required element does not exist, +/// a MissingElement diagnostic is pushed automatically, and an empty node is +/// returned as an error recovery placeholder. +pub fn push_yaml_array( + input: &yaml::Value, + context: &mut context::Context, + min_size: usize, + unknown_subtree: bool, + mut parser: FP, +) -> diagnostic::Result> +where + FP: FnMut(&yaml::Value, &mut context::Context) -> diagnostic::Result, +{ + if let serde_json::Value::Array(input) = input { + let size = std::cmp::max(min_size, input.len()); + Ok((0..size) + .into_iter() + .map(|index| { + push_yaml_required_element(input, context, index, unknown_subtree, &mut parser) + }) + .unzip()) + } else { + Err(cause!(YamlInvalidType, "array expected")) + } +} + +/// Shorthand for fields that must be arrays if specified. +macro_rules! yaml_repeated_field { + ($input:expr, $context:expr, $field:expr) => { + yaml_repeated_field!($input, $context, $field, |_, _| Ok(())) + }; + ($input:expr, $context:expr, $field:expr, $parser:expr) => { + yaml_repeated_field!($input, $context, $field, $parser, 0) + }; + ($input:expr, $context:expr, $field:expr, $parser:expr, $min_size:expr) => { + crate::parse::traversal::push_yaml_repeated_field( + $input, $context, $field, false, $min_size, false, $parser, + ) + }; +} + +/// Shorthand for fields that must be arrays. +macro_rules! 
yaml_required_repeated_field {
+    ($input:expr, $context:expr, $field:expr) => {
+        yaml_required_repeated_field!($input, $context, $field, |_, _| Ok(()))
+    };
+    ($input:expr, $context:expr, $field:expr, $parser:expr) => {
+        yaml_required_repeated_field!($input, $context, $field, $parser, 1)
+    };
+    ($input:expr, $context:expr, $field:expr, $parser:expr, $min_size:expr) => {
+        crate::parse::traversal::push_yaml_repeated_field(
+            $input, $context, $field, true, $min_size, false, $parser,
+        )
+    };
+}
+
+/// Parse and push a complete YAML array field. If a required element does not
+/// exist, a MissingElement diagnostic is pushed automatically, and an empty
+/// node is returned as an error recovery placeholder.
+pub fn push_yaml_repeated_field<TR, FP>(
+    input: &yaml::Value,
+    context: &mut context::Context,
+    field_name: &'static str,
+    field_required: bool,
+    min_size: usize,
+    unknown_subtree: bool,
+    parser: FP,
+) -> diagnostic::Result<RepeatedResult<TR>>
+where
+    FP: FnMut(&yaml::Value, &mut context::Context) -> diagnostic::Result<TR>,
+{
+    Ok(if field_required {
+        push_yaml_required_field(input, context, field_name, unknown_subtree, |x, y| {
+            yaml_array!(x, y, parser, min_size)
+        })?
+        .1
+    } else {
+        push_yaml_field(input, context, field_name, unknown_subtree, |x, y| {
+            yaml_array!(x, y, parser, min_size)
+        })?
+        .1
+    }
+    .unwrap_or_else(|| (vec![], vec![])))
+}
+
+//=============================================================================
+// YAML primitive handling
+//=============================================================================
+
+/// Convenience/shorthand macro for parsing YAML primitive values.
+macro_rules! yaml_prim {
+    ($typ:ident) => {
+        |x, y| crate::parse::traversal::yaml_primitive_parsers::$typ(x, y, |x, _| Ok(x.to_owned()))
+    };
+    ($typ:ident, $parser:expr) => {
+        |x, y| crate::parse::traversal::yaml_primitive_parsers::$typ(x, y, $parser)
+    };
+}
+
+pub mod yaml_primitive_parsers {
+    use super::*;
+
+    /// Boolean primitive helper.
+    pub fn bool<TR, FP>(
+        x: &yaml::Value,
+        y: &mut context::Context,
+        parser: FP,
+    ) -> diagnostic::Result<TR>
+    where
+        FP: FnOnce(&bool, &mut context::Context) -> diagnostic::Result<TR>,
+    {
+        if let serde_json::Value::Bool(x) = x {
+            parser(x, y)
+        } else {
+            Err(cause!(YamlInvalidType, "boolean expected"))
+        }
+    }
+
+    /// Signed integer primitive helper.
+    pub fn i64<TR, FP>(
+        x: &yaml::Value,
+        y: &mut context::Context,
+        parser: FP,
+    ) -> diagnostic::Result<TR>
+    where
+        FP: FnOnce(&i64, &mut context::Context) -> diagnostic::Result<TR>,
+    {
+        if let serde_json::Value::Number(x) = x {
+            if let Some(x) = x.as_i64() {
+                return parser(&x, y);
+            }
+        }
+        Err(cause!(YamlInvalidType, "signed integer expected"))
+    }
+
+    /// Unsigned integer primitive helper.
+    pub fn u64<TR, FP>(
+        x: &yaml::Value,
+        y: &mut context::Context,
+        parser: FP,
+    ) -> diagnostic::Result<TR>
+    where
+        FP: FnOnce(&u64, &mut context::Context) -> diagnostic::Result<TR>,
+    {
+        if let serde_json::Value::Number(x) = x {
+            if let Some(x) = x.as_u64() {
+                return parser(&x, y);
+            }
+        }
+        Err(cause!(YamlInvalidType, "unsigned integer expected"))
+    }
+
+    /// Float primitive helper.
+    pub fn f64<TR, FP>(
+        x: &yaml::Value,
+        y: &mut context::Context,
+        parser: FP,
+    ) -> diagnostic::Result<TR>
+    where
+        FP: FnOnce(&f64, &mut context::Context) -> diagnostic::Result<TR>,
+    {
+        if let serde_json::Value::Number(x) = x {
+            if let Some(x) = x.as_f64() {
+                return parser(&x, y);
+            }
+        }
+        Err(cause!(YamlInvalidType, "floating point number expected"))
+    }
+
+    /// String primitive helper.
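+    ///
+    /// For example (hypothetical field name), an optional string field of a
+    /// YAML object can be parsed with:
+    ///
+    /// ```ignore
+    /// yaml_field!(x, y, "name", yaml_prim!(str))
+    /// ```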
+ pub fn str( + x: &yaml::Value, + y: &mut context::Context, + parser: FP, + ) -> diagnostic::Result + where + FP: FnOnce(&str, &mut context::Context) -> diagnostic::Result, + { + if let serde_json::Value::String(x) = x { + parser(x, y) + } else { + Err(cause!(YamlInvalidType, "string expected")) + } + } +} + +//============================================================================= +// YAML root handling +//============================================================================= + +/// Attempts to resolve a URI. +fn resolve_uri( + uri: &str, + context: &mut context::Context, +) -> diagnostic::Result { + // Apply yaml_uri_overrides configuration. + let remapped_uri = context + .config + .uri_overrides + .iter() + .find_map(|(pattern, mapping)| { + if pattern.matches(uri) { + Some(mapping.as_ref().map(|x| &x[..])) + } else { + None + } + }); + let is_remapped = remapped_uri.is_some(); + let remapped_uri = remapped_uri.unwrap_or(Some(uri)); + + let remapped_uri = if let Some(remapped_uri) = remapped_uri { + remapped_uri.to_owned() + } else { + return Err(cause!( + YamlResolutionDisabled, + "YAML resolution for {uri} was disabled" + )); + }; + if is_remapped { + diagnostic!(context, Info, Yaml, "URI was remapped to {remapped_uri}"); + } + + // If a custom download function is specified, use it to resolve. + if let Some(ref resolver) = context.config.uri_resolver { + return resolver(&remapped_uri) + .map_err(|x| ecause!(YamlResolutionFailed, x.as_ref().to_string())); + } + + // Parse as a URL. + let url = match url::Url::parse(&remapped_uri) { + Ok(url) => url, + Err(e) => { + return Err(if is_remapped { + cause!( + YamlResolutionFailed, + "configured URI remapping ({remapped_uri}) did not parse as URL: {e}" + ) + } else { + cause!( + YamlResolutionFailed, + "failed to parse {remapped_uri} as URL: {e}" + ) + }); + } + }; + + // Reject anything that isn't file://-based. + if url.scheme() != "file" { + return Err(if is_remapped { + cause!( + YamlResolutionFailed, + "configured URI remapping ({remapped_uri}) does not use file:// scheme" + ) + } else { + cause!(YamlResolutionFailed, "URI does not use file:// scheme") + }); + } + + // Convert to path. + let path = match url.to_file_path() { + Ok(path) => path, + Err(_) => { + return Err(if is_remapped { + cause!( + YamlResolutionFailed, + "configured URI remapping ({remapped_uri}) could not be converted to file path" + ) + } else { + cause!( + YamlResolutionFailed, + "URI could not be converted to file path" + ) + }); + } + }; + + // Read the file. + std::fs::read(path) + .map_err(|e| { + if is_remapped { + cause!( + YamlResolutionFailed, + "failed to file remapping for URI ({remapped_uri}): {e}" + ) + } else { + ecause!(YamlResolutionFailed, e) + } + }) + .map(|d| -> config::BinaryData { Box::new(d) }) +} + +/// Resolves a URI to a YAML file, parses the YAML syntax, and optionally +/// validates it using the given JSON schema. +fn load_yaml( + uri: &str, + context: &mut context::Context, + schema: Option<&jsonschema::JSONSchema>, +) -> Option { + // Try to resolve the YAML file. Note that failure to resolve is a warning, + // not an error; it means the plan isn't valid in the current environment, + // but it might still be valid in another one, in particular for consumers + // that don't need to be able to resolve the YAML files to use the plan. + let binary_data = match resolve_uri(uri, context) { + Err(e) => { + diagnostic!(context, Warning, e); + return None; + } + Ok(x) => x, + }; + + // Parse as UTF-8. 
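+    // (The resolver hands back raw bytes; anything that is not valid UTF-8
+    // is rejected here with a parse-failure diagnostic rather than being
+    // transcoded.)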
+ let string_data = match std::str::from_utf8(binary_data.as_ref().as_ref()) { + Err(e) => { + ediagnostic!(context, Error, YamlParseFailed, e); + return None; + } + Ok(x) => x, + }; + + // Parse as YAML. + let yaml_data = match yaml_rust::YamlLoader::load_from_str(string_data) { + Err(e) => { + ediagnostic!(context, Error, YamlParseFailed, e); + return None; + } + Ok(x) => { + if x.len() > 1 { + diagnostic!( + context, + Warning, + YamlParseFailed, + "YAML file contains multiple documents; ignoring all but the first" + ); + } + match x.into_iter().next() { + None => { + diagnostic!( + context, + Error, + YamlParseFailed, + "YAML file contains zero documents" + ); + return None; + } + Some(x) => x, + } + } + }; + + // Convert to JSON DOM. + let json_data = match yaml::yaml_to_json(yaml_data, context.path()) { + Err(e) => { + diagnostic!(context, e); + return None; + } + Ok(x) => x, + }; + + // Validate with schema. + if let Some(schema) = schema { + if let Err(es) = schema.validate(&json_data) { + for e in es { + ediagnostic!(context, Error, YamlSchemaValidationFailed, e); + } + return None; + } + } + + Some(json_data) +} + +/// Attempt to load and parse a YAML file using the given root parse function, +/// initial state, and configuration. +pub fn parse_yaml( + uri: TS, + context: &mut context::Context, + schema: Option<&jsonschema::JSONSchema>, + parser: FP, +) -> Arc +where + TS: AsRef, + FP: Fn(&yaml::Value, &mut context::Context) -> diagnostic::Result<()>, +{ + let uri = uri.as_ref(); + let uri_reference = extension::NamedReference::new(Some(uri), context.parent_path_buf()); + + // Resolve the YAML file. + let yaml_info = Arc::new(if let Some(root_input) = load_yaml(uri, context, schema) { + // Create an empty YamlData object. + *context.yaml_data_opt() = Some(extension::YamlData::new(uri_reference)); + + // Create the node for the YAML data root. + let mut root_output = root_input.data_to_node(); + + // Create the path element for referring to the YAML data root. + let path_element = path::PathElement::Field("data".to_string()); + + // Create the context for the YAML data root. + let mut root_context = context.child(&mut root_output, path_element.clone()); + + // Create a PathBuf for the root node. + let root_path = root_context.path_buf(); + + // Call the provided root parser. + let success = parser(&root_input, &mut root_context) + .map_err(|cause| { + diagnostic!(&mut root_context, Error, cause); + }) + .is_ok(); + + // Handle any fields not handled by the provided parse function. + handle_unknown_children(&root_input, &mut root_context, success); + + // Push and return the completed node. + let root_output = Arc::new(root_output); + context.push(tree::NodeData::Child(tree::Child { + path_element, + node: root_output.clone(), + recognized: true, + })); + + // Take the constructed YAML data object from the context. + let mut yaml_data = context.yaml_data_opt().take().unwrap(); + + // Configure the reference to the root node in the YamlData object. + yaml_data.data.path = root_path; + yaml_data.data.node = root_output; + + // Wrap the completed YAML data object in an Arc. + let yaml_data = Arc::new(yaml_data); + + // The node type will have been set as if this is a normal string + // primitive. We want extra information though, namely the contents of + // the YAML file. So we change the node type. + context.replace_node_type(tree::NodeType::YamlReference(yaml_data.clone())); + + // Construct the YAML information object. 
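+        // (Resolved carries the parsed YAML data so later references can
+        // traverse it; the Unresolved variant below only records the URI.)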
+ extension::YamlInfo::Resolved(yaml_data) + } else { + extension::YamlInfo::Unresolved(uri_reference) + }); + + yaml_info +} diff --git a/rs/src/parse/types.rs b/rs/src/parse/types.rs new file mode 100644 index 00000000..3ebb5194 --- /dev/null +++ b/rs/src/parse/types.rs @@ -0,0 +1,1156 @@ +// SPDX-License-Identifier: Apache-2.0 + +//! Module providing parse/validation functions for types. + +use std::sync::Arc; + +use crate::input::proto::substrait; +use crate::output::comment; +use crate::output::data_type; +use crate::output::data_type::ParameterInfo; +use crate::output::diagnostic; +use crate::output::extension; +use crate::parse::context; +use crate::parse::extensions; +use crate::string_util; + +/// Parses a required nullability enum. +fn parse_required_nullability( + x: &substrait::r#type::Nullability, + _: &mut context::Context, +) -> diagnostic::Result { + match x { + substrait::r#type::Nullability::Nullable => Ok(true), + substrait::r#type::Nullability::Required => Ok(false), + substrait::r#type::Nullability::Unspecified => Err(cause!( + IllegalValue, + "nullability information is required in this context" + )), + } +} + +/// Parses an optional type variation reference. +fn parse_type_variation_reference( + x: &u32, + y: &mut context::Context, +) -> diagnostic::Result { + if *x == 0 { + Ok(None) + } else { + Some(extensions::simple::parse_type_variation_reference(x, y)).transpose() + } +} + +/// Parses an unsigned integer type parameter. +fn parse_integral_type_parameter( + x: &i32, + _: &mut context::Context, +) -> diagnostic::Result { + Ok(u64::try_from(*x) + .map_err(|_| cause!(IllegalValue, "integral type parameters cannot be negative"))? + .into()) +} + +/// Macro for simple types, since they're all the same. +macro_rules! parse_simple_type { + ($input:expr, $context:expr, $typ:ident) => {{ + // Parse fields. + let nullable = proto_enum_field!( + $input, + $context, + nullability, + substrait::r#type::Nullability, + parse_required_nullability + ) + .1; + let variation = proto_primitive_field!( + $input, + $context, + type_variation_reference, + parse_type_variation_reference + ) + .1; + + // Convert to internal type object. + let data_type = if let (Some(nullable), Some(variation)) = (nullable, variation) { + data_type::DataType::new( + data_type::Class::Simple(data_type::Simple::$typ), + nullable, + variation, + vec![], + ) + .map_err(|e| diagnostic!($context, Error, e)) + .unwrap_or_default() + } else { + Arc::default() + }; + + // Attach the type to the node. + $context.set_data_type(data_type); + + Ok(()) + }}; +} + +/// Parses a boolean type. +pub fn parse_boolean( + x: &substrait::r#type::Boolean, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_simple_type!(x, y, Boolean) +} + +/// Parses a i8 type. +pub fn parse_i8(x: &substrait::r#type::I8, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, I8) +} + +/// Parses a i16 type. +pub fn parse_i16(x: &substrait::r#type::I16, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, I16) +} + +/// Parses a i32 type. +pub fn parse_i32(x: &substrait::r#type::I32, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, I32) +} + +/// Parses a i64 type. +pub fn parse_i64(x: &substrait::r#type::I64, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, I64) +} + +/// Parses a fp32 type. 
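+///
+/// (Like the other simple-type parsers above, this is a thin wrapper around
+/// `parse_simple_type!`, which checks nullability and the type variation
+/// reference and attaches the resulting data type to the node.)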
+pub fn parse_fp32(x: &substrait::r#type::Fp32, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, Fp32) +} + +/// Parses a fp64 type. +pub fn parse_fp64(x: &substrait::r#type::Fp64, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, Fp64) +} + +/// Parses a string type. +pub fn parse_string( + x: &substrait::r#type::String, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_simple_type!(x, y, String) +} + +/// Parses a binary type. +pub fn parse_binary( + x: &substrait::r#type::Binary, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_simple_type!(x, y, Binary) +} + +/// Parses a timestamp type. +pub fn parse_timestamp( + x: &substrait::r#type::Timestamp, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_simple_type!(x, y, Timestamp) +} + +/// Parses a date type. +pub fn parse_date(x: &substrait::r#type::Date, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, Date) +} + +/// Parses a time type. +pub fn parse_time(x: &substrait::r#type::Time, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, Time) +} + +/// Parses a interval-year type. +pub fn parse_interval_year( + x: &substrait::r#type::IntervalYear, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_simple_type!(x, y, IntervalYear) +} + +/// Parses a interval-day type. +pub fn parse_interval_day( + x: &substrait::r#type::IntervalDay, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_simple_type!(x, y, IntervalDay) +} + +/// Parses a timestamp-tz type. +pub fn parse_timestamp_tz( + x: &substrait::r#type::TimestampTz, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_simple_type!(x, y, TimestampTz) +} + +/// Parses a UUID type. +pub fn parse_uuid(x: &substrait::r#type::Uuid, y: &mut context::Context) -> diagnostic::Result<()> { + parse_simple_type!(x, y, Uuid) +} + +/// Macro for compound types with just a length, since they're all the same. +macro_rules! parse_compound_type_with_length { + ($input:expr, $context:expr, $typ:ident) => {{ + // Parse fields. + let length = + proto_primitive_field!($input, $context, length, parse_integral_type_parameter).1; + let nullable = proto_enum_field!( + $input, + $context, + nullability, + substrait::r#type::Nullability, + parse_required_nullability + ) + .1; + let variation = proto_primitive_field!( + $input, + $context, + type_variation_reference, + parse_type_variation_reference + ) + .1; + + // Convert to internal type object. + let data_type = if let (Some(length), Some(nullable), Some(variation)) = + (length, nullable, variation) + { + data_type::DataType::new( + data_type::Class::Compound(data_type::Compound::$typ), + nullable, + variation, + vec![length], + ) + .map_err(|e| diagnostic!($context, Error, e)) + .unwrap_or_default() + } else { + Arc::default() + }; + + // Attach the type to the node. + $context.set_data_type(data_type); + + Ok(()) + }}; +} + +/// Parses a fixed-char type. +pub fn parse_fixed_char( + x: &substrait::r#type::FixedChar, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_compound_type_with_length!(x, y, FixedChar) +} + +/// Parses a varchar type. +pub fn parse_var_char( + x: &substrait::r#type::VarChar, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_compound_type_with_length!(x, y, VarChar) +} + +/// Parses a fixed-binary type. 
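+///
+/// (Uses `parse_compound_type_with_length!`, shared with fixed-char and
+/// varchar; the single length field becomes the type's only parameter.)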
+pub fn parse_fixed_binary( + x: &substrait::r#type::FixedBinary, + y: &mut context::Context, +) -> diagnostic::Result<()> { + parse_compound_type_with_length!(x, y, FixedBinary) +} + +/// Parses a decimal type. +pub fn parse_decimal( + x: &substrait::r#type::Decimal, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // Parse fields. + let precision = proto_primitive_field!(x, y, precision, parse_integral_type_parameter).1; + let scale = proto_primitive_field!(x, y, scale, parse_integral_type_parameter).1; + let nullable = proto_enum_field!( + x, + y, + nullability, + substrait::r#type::Nullability, + parse_required_nullability + ) + .1; + let variation = proto_primitive_field!( + x, + y, + type_variation_reference, + parse_type_variation_reference + ) + .1; + + // Convert to internal type object. + let data_type = if let (Some(precision), Some(scale), Some(nullable), Some(variation)) = + (precision, scale, nullable, variation) + { + data_type::DataType::new( + data_type::Class::Compound(data_type::Compound::Decimal), + nullable, + variation, + vec![precision, scale], + ) + .map_err(|e| diagnostic!(y, Error, e)) + .unwrap_or_default() + } else { + Arc::default() + }; + + // Attach the type to the node. + y.set_data_type(data_type); + + Ok(()) +} + +/// Parses a struct type. +pub fn parse_struct( + x: &substrait::r#type::Struct, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // Parse fields. + let types = proto_repeated_field!(x, y, types, parse_type) + .0 + .iter() + .map(|n| n.data_type.clone().unwrap_or_default().into()) + .collect(); + let nullable = proto_enum_field!( + x, + y, + nullability, + substrait::r#type::Nullability, + parse_required_nullability + ) + .1; + let variation = proto_primitive_field!( + x, + y, + type_variation_reference, + parse_type_variation_reference + ) + .1; + + // Convert to internal type object. + let data_type = if let (Some(nullable), Some(variation)) = (nullable, variation) { + data_type::DataType::new( + data_type::Class::Compound(data_type::Compound::Struct), + nullable, + variation, + types, + ) + .map_err(|e| diagnostic!(y, Error, e)) + .unwrap_or_default() + } else { + Arc::default() + }; + + // Attach the type to the node. + y.set_data_type(data_type); + + Ok(()) +} + +/// Parses a list type. +pub fn parse_list(x: &substrait::r#type::List, y: &mut context::Context) -> diagnostic::Result<()> { + // Parse fields. + let element_type = proto_boxed_required_field!(x, y, r#type, parse_type) + .0 + .data_type + .clone() + .unwrap_or_default(); + let nullable = proto_enum_field!( + x, + y, + nullability, + substrait::r#type::Nullability, + parse_required_nullability + ) + .1; + let variation = proto_primitive_field!( + x, + y, + type_variation_reference, + parse_type_variation_reference + ) + .1; + + // Convert to internal type object. + let data_type = if let (Some(nullable), Some(variation)) = (nullable, variation) { + data_type::DataType::new( + data_type::Class::Compound(data_type::Compound::List), + nullable, + variation, + vec![element_type.into()], + ) + .map_err(|e| diagnostic!(y, Error, e)) + .unwrap_or_default() + } else { + Arc::default() + }; + + // Attach the type to the node. + y.set_data_type(data_type); + + Ok(()) +} + +/// Parses a map type. +pub fn parse_map(x: &substrait::r#type::Map, y: &mut context::Context) -> diagnostic::Result<()> { + // Parse fields. 
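+    // (The key and value fields are recursive Type messages, which prost
+    // wraps in a Box; hence the boxed variant of the required-field macro.)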
+ let key_type = proto_boxed_required_field!(x, y, key, parse_type) + .0 + .data_type + .clone() + .unwrap_or_default(); + let value_type = proto_boxed_required_field!(x, y, value, parse_type) + .0 + .data_type + .clone() + .unwrap_or_default(); + let nullable = proto_enum_field!( + x, + y, + nullability, + substrait::r#type::Nullability, + parse_required_nullability + ) + .1; + let variation = proto_primitive_field!( + x, + y, + type_variation_reference, + parse_type_variation_reference + ) + .1; + + // Convert to internal type object. + let data_type = if let (Some(nullable), Some(variation)) = (nullable, variation) { + data_type::DataType::new( + data_type::Class::Compound(data_type::Compound::Map), + nullable, + variation, + vec![key_type.into(), value_type.into()], + ) + .map_err(|e| diagnostic!(y, Error, e)) + .unwrap_or_default() + } else { + Arc::default() + }; + + // Attach the type to the node. + y.set_data_type(data_type); + + Ok(()) +} + +/// Parses a user-defined type. +pub fn parse_user_defined(x: &u32, y: &mut context::Context) -> diagnostic::Result<()> { + // Parse fields. + let user_type = extensions::simple::parse_type_reference(x, y) + .map_err(|e| diagnostic!(y, Error, e)) + .ok(); + + // Convert to internal type object. + let data_type = if let Some(user_type) = user_type { + data_type::DataType::new( + data_type::Class::UserDefined(user_type), + false, + None, + vec![], + ) + .map_err(|e| diagnostic!(y, Error, e)) + .unwrap_or_default() + } else { + Arc::default() + }; + + // Attach the type to the node. + y.set_data_type(data_type); + + Ok(()) +} + +/// Parses a type kind. +pub fn parse_type_kind( + x: &substrait::r#type::Kind, + y: &mut context::Context, +) -> diagnostic::Result<()> { + match x { + substrait::r#type::Kind::Bool(x) => parse_boolean(x, y), + substrait::r#type::Kind::I8(x) => parse_i8(x, y), + substrait::r#type::Kind::I16(x) => parse_i16(x, y), + substrait::r#type::Kind::I32(x) => parse_i32(x, y), + substrait::r#type::Kind::I64(x) => parse_i64(x, y), + substrait::r#type::Kind::Fp32(x) => parse_fp32(x, y), + substrait::r#type::Kind::Fp64(x) => parse_fp64(x, y), + substrait::r#type::Kind::String(x) => parse_string(x, y), + substrait::r#type::Kind::Binary(x) => parse_binary(x, y), + substrait::r#type::Kind::Timestamp(x) => parse_timestamp(x, y), + substrait::r#type::Kind::Date(x) => parse_date(x, y), + substrait::r#type::Kind::Time(x) => parse_time(x, y), + substrait::r#type::Kind::IntervalYear(x) => parse_interval_year(x, y), + substrait::r#type::Kind::IntervalDay(x) => parse_interval_day(x, y), + substrait::r#type::Kind::TimestampTz(x) => parse_timestamp_tz(x, y), + substrait::r#type::Kind::Uuid(x) => parse_uuid(x, y), + substrait::r#type::Kind::FixedChar(x) => parse_fixed_char(x, y), + substrait::r#type::Kind::Varchar(x) => parse_var_char(x, y), + substrait::r#type::Kind::FixedBinary(x) => parse_fixed_binary(x, y), + substrait::r#type::Kind::Decimal(x) => parse_decimal(x, y), + substrait::r#type::Kind::Struct(x) => parse_struct(x, y), + substrait::r#type::Kind::List(x) => parse_list(x, y), + substrait::r#type::Kind::Map(x) => parse_map(x, y), + substrait::r#type::Kind::UserDefinedTypeReference(x) => parse_user_defined(x, y), + } +} + +fn describe_type(y: &mut context::Context, data_type: &Arc) { + let mut brief = match &data_type.class() { + data_type::Class::Simple(data_type::Simple::Boolean) => { + summary!(y, "Values of this type can be either true or false."); + String::from("boolean type") + } + data_type::Class::Simple(data_type::Simple::I8) => { 
+ summary!( + y, + "Implementations of this type must support all integers in \ + the range [-2^7, 2^7)." + ); + String::from("8-bit signed integer type") + } + data_type::Class::Simple(data_type::Simple::I16) => { + summary!( + y, + "Implementations of this type must support all integers in \ + the range [-2^15, 2^15)." + ); + String::from("16-bit signed integer type") + } + data_type::Class::Simple(data_type::Simple::I32) => { + summary!( + y, + "Implementations of this type must support all integers in \ + the range [-2^31, 2^31)." + ); + String::from("32-bit signed integer type") + } + data_type::Class::Simple(data_type::Simple::I64) => { + summary!( + y, + "Implementations of this type must support all integers in \ + the range [-2^63, 2^63)." + ); + String::from("64-bit signed integer type") + } + data_type::Class::Simple(data_type::Simple::Fp32) => { + summary!( + y, + "Implementations of this type must support a superset of the \ + values representable using IEEE 754 binary32." + ); + String::from("single-precision float type") + } + data_type::Class::Simple(data_type::Simple::Fp64) => { + summary!( + y, + "Implementations of this type must support a superset of the \ + values representable using IEEE 754 binary64." + ); + String::from("double-precision float type") + } + data_type::Class::Simple(data_type::Simple::String) => { + summary!( + y, + "Implementations of this type must support all strings \ + representable using UTF-8 encoding and up to 2^31-1 bytes of \ + storage." + ); + String::from("Unicode string type") + } + data_type::Class::Simple(data_type::Simple::Binary) => { + summary!( + y, + "Implementations of this type must support all byte strings \ + of up to 2^31-1 bytes in length." + ); + String::from("Binary string type") + } + data_type::Class::Simple(data_type::Simple::Timestamp) => { + summary!( + y, + "Implementations of this type must support all timestamps \ + within the range [1000-01-01 00:00:00.000000, \ + 9999-12-31 23:59:59.999999] with microsecond precision. \ + Timezone information is however not encoded, so contextual \ + information would be needed to map the timestamp to a fixed \ + point in time." + ); + String::from("Timezone-naive timestamp type") + } + data_type::Class::Simple(data_type::Simple::TimestampTz) => { + summary!( + y, + "Implementations of this type must support all timestamps \ + within the range [1000-01-01 00:00:00.000000 UTC, \ + 9999-12-31 23:59:59.999999 UTC] with microsecond precision." + ); + String::from("Timezone-aware timestamp type") + } + data_type::Class::Simple(data_type::Simple::Date) => { + summary!( + y, + "Implementations of this type must support all dates within \ + the range [1000-01-01, 9999-12-31]." + ); + String::from("Date type") + } + data_type::Class::Simple(data_type::Simple::Time) => { + summary!( + y, + "Implementations of this type must support all times of day \ + with microsecond precision, not counting leap seconds; that \ + is, any integer number of microseconds since the start of a \ + day in the range [0, 24*60*60*10^6]." + ); + String::from("Time-of-day type") + } + data_type::Class::Simple(data_type::Simple::IntervalYear) => { + // FIXME: the way this type is defined makes no sense; its + // definition conflicts with the analog representations of at least + // Arrow as specified on the website (assuming INTERVAL_MONTHS was + // intended), and intuitively does not make sense either. 
The way + // it's written, for example [10000y, -120000m] necessarily encodes + // a semantically different value [0y, 0m], rather than that these + // can just be aliases of each other. Wouldn't it be better to + // define it as needing to represent all integer numbers of months + // in the range [-120000, 120000]? If someone then really wants the + // current semantics, they can just use + // + // NSTRUCT + // + // with some additional constraints. However, an implementation + // that wants to encode this interval type as an integer number of + // years plus an integer number of months still complies with the + // [-120000, 120000] months requirement just fine. + // + // Renaming it to interval_month makes a lot more sense then too, + // i.e. a signed interval with at least month precision and + // +/- 10000 year range, and that's it. + summary!( + y, + "Implementations of this type must support a range of any \ + combination of years and months that total less than or equal \ + to 10000 years. Each component can be specified as positive or \ + negative." + ); + String::from("Year/month interval type") + } + data_type::Class::Simple(data_type::Simple::IntervalDay) => { + // FIXME: see note for IntervalYear, making this + // interval_microsecond, i.e. a signed interval with at least + // microsecond precision and +/- 10000 year range. + // + // Worth noting in addition that 2^63 nanoseconds is a lot more + // than 10000 years. It doesn't make much sense to me to use + // I64 limits (for a different precision to boot) when all the + // other limits are based around +/- 10000 years. + summary!( + y, + "Implementations of this type must support a range of any \ + combination of [-365*10000, 365*10000] days and \ + [ceil(-2^63/1000), floor(2^63/1000)] integer microseconds." + ); + String::from("Day/microsecond interval type") + } + data_type::Class::Simple(data_type::Simple::Uuid) => { + summary!( + y, + "Implementations of this type must support 2^128 different \ + values, typically represented using the following hex format: \ + c48ffa9e-64f4-44cb-ae47-152b4e60e77b." + ); + String::from("128-bit identifier type") + } + data_type::Class::Compound(data_type::Compound::FixedChar) => { + let length = data_type + .parameters() + .get(0) + .map(|x| x.to_string()) + .unwrap_or_else(|| String::from("?")); + summary!( + y, + "Implementations of this type must support all unicode \ + strings with exactly {length} characters (i.e. code points). \ + Values shorter than that must be right-padded with spaces." + ); + format!("Fixed-length ({length}) unicode string type") + } + data_type::Class::Compound(data_type::Compound::VarChar) => { + let length = data_type + .parameters() + .get(0) + .map(|x| x.to_string()) + .unwrap_or_else(|| String::from("?")); + summary!( + y, + "Implementations of this type must support all unicode \ + strings with 0 to {length} characters (i.e. code points)." + ); + format!("Variable-length ({length}) unicode string type") + } + data_type::Class::Compound(data_type::Compound::FixedBinary) => { + let length = data_type + .parameters() + .get(0) + .map(|x| x.to_string()) + .unwrap_or_else(|| String::from("?")); + summary!( + y, + "Implementations of this type must support all binary \ + strings of exactly {length} bytes in length. Values shorter \ + than that must be right-padded with zero bytes." 
+ ); + format!("Fixed-length ({length}) binary string type") + } + data_type::Class::Compound(data_type::Compound::Decimal) => { + let precision = data_type.int_parameter(0); + let scale = data_type.int_parameter(1); + let (p, i, s) = if let (Some(precision), Some(scale)) = (precision, scale) { + ( + precision.to_string(), + (precision - scale).to_string(), + scale.to_string(), + ) + } else { + (String::from("?"), String::from("?"), String::from("?")) + }; + summary!( + y, + "Implementations of this type must support all decimal \ + numbers with {i} integer digits and {s} fractional digits \ + (precision = {p}, scale = {s})." + ); + format!("Decimal number type with {i} integer and {s} fractional digits") + } + data_type::Class::Compound(data_type::Compound::Struct) + | data_type::Class::Compound(data_type::Compound::NamedStruct) => { + let n = data_type.parameters().len(); + if n == 1 { + summary!(y, "Structure with one field."); + String::from("Structure with one field") + } else { + summary!(y, "Structure with {n} fields."); + format!("Structure with {n} fields") + } + } + data_type::Class::Compound(data_type::Compound::List) => { + let e = data_type + .type_parameter(0) + .map(|t| t.to_string()) + .unwrap_or_else(|| String::from("?")); + summary!( + y, + "Implementations of this type must support all sequences of \ + 0 to 2^31-1 {e} elements." + ); + String::from("List type") + } + data_type::Class::Compound(data_type::Compound::Map) => { + // FIXME: the definition in the spec is technically a multimap, + // because it says nothing about key uniqueness, but that's + // probably not intentional (how would references work, then?). + // Also, unlike all the other types, there's no specified size + // limit here. Assuming the other size limits are 2^31-1 for + // Java compatibility, the same would need to apply here. + let k = data_type + .type_parameter(0) + .map(|t| t.to_string()) + .unwrap_or_else(|| String::from("?")); + let v = data_type + .type_parameter(1) + .map(|t| t.to_string()) + .unwrap_or_else(|| String::from("?")); + summary!( + y, + "Implementations of this type must support any mapping from \ + {k} keys to {v} values, consisting of up to 2^31-1 key-value \ + pairs. No key uniqueness check is required on insertion, but \ + resolving the mapping for a key for which multiple values are \ + defined is undefined behavior." + ); + String::from("Map type") + } + data_type::Class::UserDefined(u) => { + summary!(y, "Extension type {u}."); + if let Some(x) = &u.definition { + y.push_summary( + comment::Comment::new() + .plain("Internal structure corresponds to:") + .lo(), + ); + let mut first = true; + for (name, class) in &x.structure { + if first { + first = false; + } else { + y.push_summary(comment::Comment::new().li()); + } + summary!(y, "{}: {}", string_util::as_ident_or_string(name), class); + } + y.push_summary(comment::Comment::new().lc()); + } + format!("Extension type {}", u.name) + } + data_type::Class::Unresolved => { + summary!( + y, + "Failed to resolve information about this type due to \ + validation errors." + ); + String::from("Unresolved type") + } + }; + if data_type.nullable() { + brief += ", nullable"; + summary!( + y, + "Values of this type are optional, i.e. this type is nullable." + ); + } else { + summary!( + y, + "Values of this type are required, i.e. the type is not nullable." 
+ ); + } + let variation = if let Some(u) = data_type.variation() { + let mut variation = format!("This is the {u} variation of this type"); + if let Some(tv) = &u.definition { + if tv.function_behavior == extension::FunctionBehavior::Inherits { + variation += + ", which behaves the same as the base type w.r.t. overload resolution."; + } else { + variation += ", which behaves as a separate type w.r.t. overload resolution."; + } + } else { + variation += "."; + } + variation + } else { + String::from("This is the base variation of this type.") + }; + summary!(y, "{}", variation); + describe!(y, Type, "{}", brief); +} + +/// Parses a type. +pub fn parse_type(x: &substrait::Type, y: &mut context::Context) -> diagnostic::Result<()> { + // Parse fields. + let data_type = proto_required_field!(x, y, kind, parse_type_kind) + .0 + .data_type(); + + // Describe the data type. + describe_type(y, &data_type); + + // Attach the type to the node. + y.set_data_type(data_type); + + Ok(()) +} + +/// Parses a named struct. +pub fn parse_named_struct( + x: &substrait::NamedStruct, + y: &mut context::Context, +) -> diagnostic::Result<()> { + // Parse fields. + proto_repeated_field!(x, y, names); + let node = proto_required_field!(x, y, r#struct, parse_struct).0; + + // Try to apply the names to the data type. + let data_type = match node.data_type().apply_field_names(&x.names) { + Err(e) => { + diagnostic!(y, Error, e); + node.data_type() + } + Ok(data_type) => data_type, + }; + + // Describe the data type. + describe_type(y, &data_type); + + // Attach the type to the node. + y.set_data_type(data_type); + + Ok(()) +} + +/// Asserts that two types are equal, and returns the combined type, pushing +/// diagnostics if there is a mismatch. Warnings are used for field name +/// mismatches, errors are used for any other difference. If either type is +/// unresolved at any point in the tree, the other is returned. If both are +/// unresolved, base is returned. +fn assert_equal_internal( + context: &mut context::Context, + other: &Arc, + promote_other: bool, + base: &Arc, + promote_base: bool, + message: &str, + path: &str, +) -> Arc { + if other.is_unresolved() { + base.clone() + } else if base.is_unresolved() { + other.clone() + } else { + // Match base types. + let base_types_match = match (other.class(), base.class()) { + ( + data_type::Class::Compound(data_type::Compound::Struct), + data_type::Class::Compound(data_type::Compound::NamedStruct), + ) => true, + ( + data_type::Class::Compound(data_type::Compound::NamedStruct), + data_type::Class::Compound(data_type::Compound::Struct), + ) => true, + (a, b) => a == b, + }; + if !base_types_match { + diagnostic!( + context, + Error, + TypeMismatch, + "{message}: {} vs. {}{path}", + other.class(), + base.class() + ); + + // No sense in comparing parameters if the base type is already + // different, so just return here. + return base.clone(); + } + + // Match nullability. + let nullable = match (other.nullable(), base.nullable()) { + (true, false) => { + if promote_base { + true + } else { + diagnostic!( + context, + Error, + TypeMismatchedNullability, + "{message}: nullable vs. required{path}" + ); + false + } + } + (false, true) => { + if !promote_other { + diagnostic!( + context, + Error, + TypeMismatchedNullability, + "{message}: required vs. nullable{path}" + ); + } + true + } + (_, x) => x, + }; + + // Match variations. 
+ match (other.variation(), base.variation()) { + (Some(other), Some(base)) => { + if base != other { + diagnostic!( + context, + Error, + TypeMismatchedVariation, + "{message}: variation {other} vs. {base}{path}" + ); + } + } + (Some(other), None) => diagnostic!( + context, + Error, + TypeMismatchedVariation, + "{message}: variation {other} vs. no variation{path}" + ), + (None, Some(base)) => diagnostic!( + context, + Error, + TypeMismatchedVariation, + "{message}: no variation vs. variation {base}{path}" + ), + (None, None) => {} + } + + // Match parameter count. + let other_len = other.parameters().len(); + let base_len = base.parameters().len(); + if other_len != base_len { + diagnostic!( + context, + Error, + TypeMismatch, + "{message}: {other_len} parameter(s) vs. {base_len} parameter(s){path}" + ); + return base.clone(); + } + + // Now match the parameters. We call ourselves recursively for each + // type parameter, using the combined type to form the new type + // parameter, such that information present in only one of the + // parameters ends up in the final parameter, regardless of which + // it is. + let parameters = other + .parameters() + .iter() + .zip(base.parameters().iter()) + .enumerate() + .map(|(index, (other_param, base_param))| { + let path_element = base_param + .get_name() + .or_else(|| other_param.get_name()) + .map(String::from) + .or_else(|| base.class().parameter_name(index)) + .unwrap_or_else(|| String::from("!")); + let path = if path.is_empty() { + format!(" on parameter path {path_element}") + } else { + format!("{path}.{path_element}") + }; + match (other_param, base_param) { + (data_type::Parameter::Type(other), data_type::Parameter::Type(base)) => { + data_type::Parameter::Type(assert_equal_internal( + context, + other, + promote_other, + base, + promote_base, + message, + &path, + )) + } + ( + data_type::Parameter::Type(other), + data_type::Parameter::NamedType(name, base), + ) => data_type::Parameter::NamedType( + name.clone(), + assert_equal_internal( + context, + other, + promote_other, + base, + promote_base, + message, + &path, + ), + ), + ( + data_type::Parameter::NamedType(name, other), + data_type::Parameter::Type(base), + ) => data_type::Parameter::NamedType( + name.clone(), + assert_equal_internal( + context, + other, + promote_other, + base, + promote_base, + message, + &path, + ), + ), + ( + data_type::Parameter::NamedType(other_name, other), + data_type::Parameter::NamedType(base_name, base), + ) => { + if other_name != base_name { + diagnostic!( + context, + Warning, + TypeMismatch, + "{message}: field name {} vs. {}{path}", + string_util::as_ident_or_string(&other_name), + string_util::as_ident_or_string(&base_name) + ); + } + data_type::Parameter::NamedType( + base_name.clone(), + assert_equal_internal( + context, + other, + promote_other, + base, + promote_base, + message, + &path, + ), + ) + } + (other, base) => { + if other != base { + diagnostic!( + context, + Error, + TypeMismatch, + "{message}: {other} vs. {base}{path}" + ); + } + base.clone() + } + } + }) + .collect(); + + // If either type is a named struct, the result should be a named + // struct, since we'll have taken the field names from the type that + // has them in the loop above. 
+        let class = match (other.class(), base.class()) {
+            (
+                data_type::Class::Compound(data_type::Compound::Struct),
+                data_type::Class::Compound(data_type::Compound::NamedStruct),
+            ) => data_type::Class::Compound(data_type::Compound::NamedStruct),
+            (
+                data_type::Class::Compound(data_type::Compound::NamedStruct),
+                data_type::Class::Compound(data_type::Compound::Struct),
+            ) => data_type::Class::Compound(data_type::Compound::NamedStruct),
+            (a, _) => a.clone(),
+        };
+
+        data_type::DataType::new(class, nullable, base.variation().clone(), parameters)
+            .expect("assert_equal() failed to correctly combine types")
+    }
+}
+
+/// Asserts that two types are equal, and returns the combined type, pushing
+/// diagnostics if there is a mismatch. Warnings are used for field name
+/// mismatches, errors are used for any other difference. If either type is
+/// unresolved at any point in the tree, the other is returned. If both are
+/// unresolved, base is returned.
+pub fn assert_equal<S: AsRef<str>>(
+    context: &mut context::Context,
+    other: &Arc<data_type::DataType>,
+    base: &Arc<data_type::DataType>,
+    message: S,
+) -> Arc<data_type::DataType> {
+    assert_equal_internal(context, other, false, base, false, message.as_ref(), "")
+}
+
+/// Like assert_equal(), but will first promote either input to try to make
+/// them match.
+pub fn promote_and_assert_equal<S: AsRef<str>>(
+    context: &mut context::Context,
+    other: &Arc<data_type::DataType>,
+    base: &Arc<data_type::DataType>,
+    message: S,
+) -> Arc<data_type::DataType> {
+    assert_equal_internal(context, other, true, base, true, message.as_ref(), "")
+}
diff --git a/rs/src/string_util.rs b/rs/src/string_util.rs
new file mode 100644
index 00000000..a42a8c8f
--- /dev/null
+++ b/rs/src/string_util.rs
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Some misc. string utility functions.
+
+use crate::output::diagnostic;
+
+/// Returns whether the given string is a valid identifier.
+pub fn is_identifier(s: &str) -> bool {
+    static IDENTIFIER_RE: once_cell::sync::Lazy<regex::Regex> =
+        once_cell::sync::Lazy::new(|| {
+            regex::Regex::new("^[a-zA-Z_][a-zA-Z0-9_]*$").unwrap()
+        });
+    IDENTIFIER_RE.is_match(s)
+}
+
+/// Checks a URI for validity.
+pub fn check_uri(s: &str) -> diagnostic::Result<uriparse::URIReference> {
+    uriparse::URIReference::try_from(s).map_err(|e| ecause!(IllegalUri, e))
+}
+
+/// Checks a URI that may include glob syntax in its path for validity.
+pub fn check_uri_glob(s: &str) -> diagnostic::Result<()> {
+    // Parse as URI first, then obtain the path.
+    let uri = check_uri(s)?;
+    let path = uri.path().to_string();
+
+    // The glob characters `?`, `[`, and `]` are reserved in URIs, so they must
+    // be percent-encoded. So, in order to check the glob syntax, we must first
+    // percent-decode the string. Without loss of generality we use the lossy
+    // decode function, because we don't really care about characters other
+    // than `*?[]` for syntax-checking the glob.
+    let decoded_path = percent_encoding::percent_decode_str(&path).decode_utf8_lossy();
+
+    // Check the glob syntax.
+    glob::Pattern::new(&decoded_path).map_err(|e| ecause!(IllegalGlob, e))?;
+
+    Ok(())
+}
+
+/// Returns the given string as a quoted string.
+pub fn as_quoted_string<S: AsRef<str>>(s: S) -> String {
+    let s = s.as_ref();
+    let mut result = String::with_capacity(s.len() + 2);
+    result.push('"');
+    for c in s.chars() {
+        match c {
+            '\\' => result += "\\\\",
+            '"' => result += "\\\"",
+            c => result.push(c),
+        }
+    }
+    result.push('"');
+    result
+}
+
+/// Returns the given string as-is if it's a valid identifier (i.e. if it
+/// matches `[a-zA-Z_][a-zA-Z0-9_]*`), or returns it as an escaped string
+/// otherwise, using (only) \" and \\ as escape sequences.
+pub fn as_ident_or_string<S: AsRef<str>>(s: S) -> String {
+    let s = s.as_ref();
+    if is_identifier(s) {
+        s.to_string()
+    } else {
+        as_quoted_string(s)
+    }
+}
+
+/// Returns the given number as an English ordinal ("first", "second", ...),
+/// using the correct suffix for the number.
+pub fn describe_nth(index: u32) -> String {
+    // Overkill? Yes. Couldn't help myself.
+    match index {
+        0 => String::from("zeroth"),
+        1 => String::from("first"),
+        2 => String::from("second"),
+        3 => String::from("third"),
+        4 => String::from("fourth"),
+        5 => String::from("fifth"),
+        6 => String::from("sixth"),
+        7 => String::from("seventh"),
+        8 => String::from("eighth"),
+        9 => String::from("ninth"),
+        10 => String::from("tenth"),
+        11 => String::from("eleventh"),
+        12 => String::from("twelfth"),
+        13 => String::from("thirteenth"),
+        14 => String::from("fourteenth"),
+        15 => String::from("fifteenth"),
+        16 => String::from("sixteenth"),
+        17 => String::from("seventeenth"),
+        18 => String::from("eighteenth"),
+        19 => String::from("nineteenth"),
+        20 => String::from("twentieth"),
+        _ => match (index % 100, index % 10) {
+            // 111, 112, 113 and the like take "th", not "st"/"nd"/"rd".
+            (11..=13, _) => format!("{index}th"),
+            (_, 1) => format!("{index}st"),
+            (_, 2) => format!("{index}nd"),
+            (_, 3) => format!("{index}rd"),
+            _ => format!("{index}th"),
+        },
+    }
+}
+
+/// Describes an index.
+pub fn describe_index(index: i32) -> String {
+    match index {
+        i32::MIN..=-2 => format!("the {} to last", describe_nth(index.unsigned_abs())),
+        -1 => String::from("the last"),
+        0..=i32::MAX => format!("the {}", describe_nth(index as u32 + 1)),
+    }
+}
+
+/// Representation of an approximate character limit for printing descriptions.
+#[derive(Clone, Copy, Debug)]
+pub struct Limit {
+    limit: Option<usize>,
+}
+
+impl Default for Limit {
+    /// Creates a limit object for the default number of characters.
+    fn default() -> Self {
+        Self { limit: Some(100) }
+    }
+}
+
+impl Limit {
+    /// Creates a limit object for the given target number of characters.
+    pub fn new(limit: usize) -> Self {
+        Self { limit: Some(limit) }
+    }
+
+    /// Creates a limit object signifying a lack of a character limit (i.e.
+    /// print everything).
+    pub fn unlimited() -> Self {
+        Self { limit: None }
+    }
+
+    /// Returns the character limit in number of characters.
+    pub fn chars(&self) -> usize {
+        self.limit.unwrap_or(usize::MAX)
+    }
+
+    /// Splits this limit up into two limits. The first limit will use all
+    /// available characters up to min_amount, and the remainder will go to
+    /// the second.
+    pub fn split(self, min_amount: usize) -> (Self, Self) {
+        if let Some(limit) = self.limit {
+            if limit < min_amount {
+                (Self::new(limit), Self::new(0))
+            } else {
+                (Self::new(min_amount), Self::new(limit - min_amount))
+            }
+        } else {
+            (Self::unlimited(), Self::unlimited())
+        }
+    }
+
+    /// Heuristically divides the current limit up into a number of elements,
+    /// each allocated a number of characters, being at least min_element_size.
+    /// If enough characters are available to give that amount of characters to
+    /// each element, this returns (num_elements, None, element_limit); if not,
+    /// this returns (left, Some(right), min_element_limit), where left and
+    /// right define how many of the elements on the left/right side of the
+    /// sequence should be printed. In this case, left + right < num_elements.
+    pub fn split_n(
+        self,
+        num_elements: usize,
+        min_element_size: usize,
+    ) -> (usize, Option<usize>, Limit) {
+        if let Some(limit) = self.limit {
+            let n = limit.checked_div(min_element_size).unwrap_or(usize::MAX);
+            if n < num_elements {
+                // Apply heuristics for how many elements to print on either
+                // side. For some small values, this yields:
+                // - 0 -> ..
+                // - 1 -> a, ..
+                // - 2 -> a, .., z
+                // - 3 -> a, b, .., z
+                // - 4 -> a, b, c, .., z
+                // - 5 -> a, b, c, .., y, z
+                // - 10 -> a, b, c, d, e, f, g, .., x, y, z
+                // That is, roughly twice as many elements are printed on the
+                // left as on the right.
+                let n_right = (n + 1) / 3;
+                let n_left = n - n_right;
+                let limit = Self::new(limit.checked_div(n).unwrap_or(limit));
+                (n_left, Some(n_right), limit)
+            } else {
+                (
+                    num_elements,
+                    None,
+                    Self::new(limit.checked_div(num_elements).unwrap_or(limit)),
+                )
+            }
+        } else {
+            (num_elements, None, Self::unlimited())
+        }
+    }
+
+    /// Same as split_n(), but with the element size specified per element.
+    pub fn split_ns(self, elements: &[usize]) -> (usize, Option<usize>) {
+        if let Some(limit) = self.limit {
+            if elements.iter().cloned().sum::<usize>() > limit {
+                // Allocate roughly a third of the budget to elements on the
+                // right-hand side, taking elements for as long as they fit...
+                let mut remain = (limit + 1) / 3;
+                let mut total = 0;
+                let mut n_right = 0;
+                for size in elements.iter().rev() {
+                    let size = *size;
+                    if size <= remain {
+                        n_right += 1;
+                        remain -= size;
+                        total += size;
+                    } else {
+                        break;
+                    }
+                }
+                // ...and whatever budget is left to elements on the left-hand
+                // side.
+                let mut remain = limit - total;
+                let mut n_left = 0;
+                for size in elements.iter() {
+                    let size = *size;
+                    if size <= remain {
+                        n_left += 1;
+                        remain -= size;
+                    } else {
+                        break;
+                    }
+                }
+                return (n_left, Some(n_right));
+            }
+        }
+        (elements.len(), None)
+    }
+}
+
+/// Like Display, but with a heuristic character limit.
+pub trait Describe {
+    fn describe(&self, f: &mut std::fmt::Formatter<'_>, limit: Limit) -> std::fmt::Result;
+    fn display(&self) -> Describer<Self> {
+        Describer(self)
+    }
+}
+
+pub struct Describer<'a, T: Describe + ?Sized>(&'a T);
+
+impl<'a, T: Describe> std::fmt::Display for Describer<'a, T> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.describe(
+            f,
+            if f.alternate() {
+                Limit::unlimited()
+            } else {
+                Limit::default()
+            },
+        )
+    }
+}
+
+/// Represent data as an identifier. If the identifier is too long, abbreviate
+/// it. limit specifies the rough resulting string length that is considered
+/// to be "too long."
+pub fn describe_identifier(
+    f: &mut std::fmt::Formatter<'_>,
+    data: &str,
+    limit: Limit,
+) -> std::fmt::Result {
+    if is_identifier(data) {
+        let (n_left, n_right, _) = limit.split_n(data.len(), 1);
+        if n_left > 0 || n_right.is_none() {
+            write!(f, "{}", &data[..n_left])?;
+        }
+        if let Some(n_right) = n_right {
+            write!(f, "..")?;
+            if n_right > 0 {
+                write!(f, "{}", &data[data.len() - n_right..])?;
+            }
+        }
+        Ok(())
+    } else {
+        describe_string(f, data, limit)
+    }
+}
+
+/// Represent data as a quoted string. If the string is too long, abbreviate
+/// it. limit specifies the rough resulting string length that is considered
+/// to be "too long."
+pub fn describe_string(
+    f: &mut std::fmt::Formatter<'_>,
+    data: &str,
+    limit: Limit,
+) -> std::fmt::Result {
+    let (n_left, n_right, _) = limit.split_n(data.len(), 1);
+    if n_left > 0 || n_right.is_none() {
+        write!(f, "{}", as_quoted_string(&data[..n_left]))?;
+    }
+    if let Some(n_right) = n_right {
+        write!(f, "..")?;
+        if n_right > 0 {
+            write!(f, "{}", as_quoted_string(&data[data.len() - n_right..]))?;
+        }
+    }
+    Ok(())
+}
+
+/// Represent data as a complete hexdump.
+fn describe_binary_all(f: &mut std::fmt::Formatter<'_>, data: &[u8]) -> std::fmt::Result {
+    let mut first = true;
+    for byte in data {
+        if first {
+            first = false;
+        } else {
+            write!(f, " ")?;
+        }
+        write!(f, "{byte:02X}")?;
+    }
+    Ok(())
+}
+
+/// Represent data as a hexdump. If the resulting dump is too long, abbreviate
+/// it.
+/// limit specifies the rough resulting string length that is considered
+/// to be "too long."
+pub fn describe_binary(
+    f: &mut std::fmt::Formatter<'_>,
+    data: &[u8],
+    limit: Limit,
+) -> std::fmt::Result {
+    let (n_left, n_right, _) = limit.split_n(data.len(), 3);
+    describe_binary_all(f, &data[..n_left])?;
+    if let Some(n_right) = n_right {
+        write!(f, "..")?;
+        describe_binary_all(f, &data[data.len() - n_right..])?;
+    }
+    Ok(())
+}
+
+/// Represent the given sequence completely.
+fn describe_sequence_all<T, F>(
+    f: &mut std::fmt::Formatter<'_>,
+    values: &[T],
+    offset: usize,
+    el_limit: Limit,
+    repr: &F,
+) -> std::fmt::Result
+where
+    F: Fn(&mut std::fmt::Formatter<'_>, &T, usize, Limit) -> std::fmt::Result,
+{
+    let mut first = true;
+    for (index, value) in values.iter().enumerate() {
+        if first {
+            first = false;
+        } else {
+            write!(f, ", ")?;
+        }
+        repr(f, value, index + offset, el_limit)?;
+    }
+    Ok(())
+}
+
+/// Represent the given sequence with heuristic length limits.
+pub fn describe_sequence<T, F>(
+    f: &mut std::fmt::Formatter<'_>,
+    values: &[T],
+    limit: Limit,
+    element_size: usize,
+    repr: F,
+) -> std::fmt::Result
+where
+    F: Fn(&mut std::fmt::Formatter<'_>, &T, usize, Limit) -> std::fmt::Result,
+{
+    let (n_left, n_right, el_limit) = limit.split_n(values.len(), element_size);
+    describe_sequence_all(f, &values[..n_left], 0, el_limit, &repr)?;
+    if let Some(n_right) = n_right {
+        if n_left > 0 {
+            write!(f, ", ")?;
+        }
+        write!(f, "..")?;
+        if n_right > 0 {
+            write!(f, ", ")?;
+        }
+        let offset = values.len() - n_right;
+        describe_sequence_all(f, &values[offset..], offset, el_limit, &repr)?;
+    }
+    Ok(())
+}
diff --git a/substrait b/substrait
new file mode 160000
index 00000000..88463636
--- /dev/null
+++ b/substrait
@@ -0,0 +1 @@
+Subproject commit 88463636b22a503adeddd9cb4da1295bbc5b15be
diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 00000000..bf36bb59
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1,4 @@
+substrait/
+*.test
+*.test.html
+*.test.*.yaml
diff --git a/tests/Cargo.toml b/tests/Cargo.toml
new file mode 100644
index 00000000..068c231e
--- /dev/null
+++ b/tests/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "test-runner"
+version = "0.0.1"
+edition = "2018"
+license = "Apache-2.0"
+default-run = "runner"
+
+[[bin]]
+name = "runner"
+path = "src/runner.rs"
+
+[[bin]]
+name = "find_protoc"
+path = "src/find_protoc.rs"
+
+[dependencies]
+substrait-validator = { path = "../rs", version = "0.0.1" }
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+walkdir = "2"
+glob = "0.3"
+prost-build = "0.9"
+rayon = "1.5"
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..c29b0368
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,142 @@
+Validation tests
+================
+
+This folder is dedicated to testing the output of the validator, while the
+more local tests only exercise the API. The test runner is written such that
+there are no (long*) recompilation times every time a test changes: the tests
+are parsed by the test runner on-the-fly.
+
+*the Python-level parser isn't smart enough to do dependency checks, so it
+just reruns protoc and re-parses each test every time it's run. At present,
+this still happens in the blink of an eye, but as this slows down, it should
+be made smarter.
+
+Usage
+-----
+
+To run the tests, you will need:
+
+ - Rust/cargo, in order to compile the validator and the test runner; and
+ - Python 3.x, with `protobuf`, `pyyaml`, and `click` installed.
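+
+If needed, the Python dependencies can typically be installed with pip, for
+example like this (the exact invocation may differ per environment):
+
+    python3 -m pip install protobuf pyyaml click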
+
+After that, just run `python3 runner.py run`. This will "compile" the test
+descriptions in the tests folder to a format that is easier for the Rust code
+to understand (notably, the Rust bindings for protobuf can't read protobuf
+JSON serializations) and then run the test runner using cargo.
+
+A whole bunch of files are generated by this process:
+
+ - the `substrait` folder, containing the Python protobuf files for Substrait,
+   used by the compiler; and
+ - for each test:
+    - a `<name>.test` file, containing the intermediate format passed to
+      the Rust test runner;
+    - a `<name>.test.plan.yaml` file, containing the YAML representation of
+      the protobuf JSON serialization of the plan, as generated from the test
+      description;
+    - possibly some `<name>.test.*.yaml` files, representing the extension
+      YAML files as generated from the test description; and
+    - a `<name>.test.html` file, generated by the validator when the test
+      is run (useful when debugging and writing tests).
+
+You can remove all generated files by running `python3 runner.py clean`.
+
+Test descriptions
+-----------------
+
+Tests can currently only be described using YAML (TODO: also support JDOT when
+this stabilizes). Any `*.yaml` file found recursively in the `tests` folder
+(that isn't a generated file, so not `*.test.*.yaml`) is a test case.
+
+The expected input format is structured as follows.
+
+    {
+      "name": "<name>",
+      "diags"?: [
+        {
+          "code": <code>,
+          "min": "<level>",
+          "max": "<level>",
+        }*
+      ],
+      "plan": <plan>
+    }
+
+The `"name"` key specifies a friendly name for the test, which is printed by
+the runner when it is run and again at the end if it fails.
+
+The `"diags"` key allows diagnostic levels to be overridden. `<code>` is just
+the integer diagnostic code, and the error levels can be either `"e"` for
+error, `"w"` for warning, or `"i"` for info.
+
+The contents of the `"plan"` key, roughly speaking, correspond to the JSON
+serialization structure of the `substrait.Plan` protobuf message. However,
+additional keys may be added to objects to give instructions to the test
+runner, and YAML extensions can be embedded into the plans.
+
+The checks that the runner must perform are embedded in the plan structure
+using `"[sub-path]__test"` keys. Usually, `[sub-path]` is left blank, which
+means that the embedded checks relate to the dictionary that the key is a part
+of, but it may also be set to a period-separated list of subkeys and/or
+list indices, to allow `__test` data to be attached to non-dict values. The
+data associated with `__test` keys must be a list of dictionaries with the
+following format:
+
+    {
+      "level"?: [<level>*],
+      "diag"?: {
+        "code"?: <code>,
+        "level"?: <level>,
+        "original_level"?: <level>,
+        "msg"?: <pattern>,
+        "before"?: <path-element>,
+        "after"?: <path-element>
+      },
+      "type"?: "expected-type"
+    }
+
+Exactly one key must be specified for the outermost dictionary:
+
+ - `"level"` matches the (recursive) error level of the current node, failing
+   the test if the actual level is not in the list.
+ - `"diag"` removes the first diagnostic attached to the node that complies
+   with all patterns from the node's diagnostics, while failing the test if
+   no such diagnostic exists. Because the diagnostic is effectively
+   removed, subsequent "level" checks can be used to ensure that no
+   unexpected diagnostics remain.
+   The following checks can be added:
+    - `"code"` matches the diagnostic code exactly;
+    - `"level"` matches the adjusted error level exactly;
+    - `"original_level"` matches the original error level exactly;
+    - `"msg"` matches the error message, using `*` for zero or more characters,
+      and `**` as an escape for matching `*` literally;
+    - `"before"` only checks for diagnostics that occur before the child with
+      the given path element was written;
+    - `"after"` only checks for diagnostics that occur after the child with
+      the given path element was written.
+   The path elements have the following syntax:
+    - `"field"`: regular field named "field";
+    - `"field[x]"`: element x of repeated field named "field";
+    - `"field<variant>"`: oneof field named "field" with variant "variant";
+    - `"[x]"`: YAML list index x.
+   Fields and variants that aren't valid identifiers can be specified using
+   double-quoted strings, using `\"` and `\\` as escape sequences, but be
+   aware that the serialization format you're writing may want non-identifier
+   things to be quoted, too. For example, in YAML, a field named `!` would be
+   written as `'"!"'`, the single quotes delimiting the YAML string.
+ - `"type"` matches the (final) data type attached to the node with the given
+   string. There's no intelligence here; the string must match exactly.
+
+Evaluation order is depth-first, so diagnostics attached to child nodes are
+removed before the level of their parent node is checked.
+
+`<key>__yaml` keys may be used in place of URI keys to embed extension YAML
+files. The key will be replaced with `"<key>"`, set to the string
+`"test:<index>.yaml"`. The corresponding YAML file is written to
+`"<name>.test.<index>.yaml"`. The test runner installs a custom URI handler
+with the validator to ensure that the extension file will be linked up
+appropriately.
+
+Just like the protobuf message structure, the embedded YAML data may have
+`__test` tags associated with it, so check instructions can also be attached
+to the extension files. An illustrative example follows below.
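+
+Example
+-------
+
+To tie the above together, here is a small, hypothetical test description.
+The plan contents, the diagnostic code, the expected levels, and the message
+pattern are all made up purely for illustration; they do not correspond to an
+actual test case in this suite:
+
+    name: example-test
+    diags:
+      - code: 1        # hypothetical diagnostic code whose level is clamped
+        min: i
+        max: w
+    plan:
+      __test:
+        - level: [i, w]   # the plan as a whole may be info or warning
+      relations__test:
+        - diag:           # expect a warning diagnostic on plan.relations
+            level: w
+            msg: "*made-up message pattern*"
+      relations: []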
diff --git a/tests/runner.py b/tests/runner.py new file mode 100755 index 00000000..65b2fef0 --- /dev/null +++ b/tests/runner.py @@ -0,0 +1,580 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 + +import os +import pathlib +import subprocess +import shutil +import sys +import click +import yaml +import json +import re + + +def destringify_ident(s): + """Converts potentially stringified identifiers to strings.""" + if s.startswith('"') and s.endswith('"'): + return s[1:-1].replace('\\"', '"').replace("\\\\", "\\") + return s + + +def path_element_field(f): + """Returns the serialization of a field path element.""" + return {"Field": {"field": f}} + + +def path_element_repeated(f, i): + """Returns the serialization of a repeated field path element.""" + return {"Repeated": {"field": f, "index": i}} + + +def path_element_oneof(f, v): + """Returns the serialization of a oneof field path element.""" + return {"Oneof": {"field": f, "variant": v}} + + +def path_element_index(i): + """Returns the serialization of an array index path element.""" + return {"Index": {"index": i}} + + +def convert_if_int(x): + """Convert string x to an integer of possible, otherwise keep it as or + convert it to a string.""" + try: + return int(x) + except ValueError: + return str(x) + + +def parse_path_element(s): + """Parses the Rust path element syntax to its serialized form.""" + ident_re = r'([a-zA-Z_][a-zA-Z0-9_]*|"(?:[^\\]|\\[\\"])*")' + index_re = r"\[([1-9][0-9]*|0)\]" + field_mat = re.fullmatch(ident_re, s) + if field_mat: + return path_element_field(destringify_ident(field_mat.group(1))) + oneof_mat = re.fullmatch(ident_re + "<" + ident_re + ">", s) + if oneof_mat: + return path_element_oneof( + destringify_ident(oneof_mat.group(1)), destringify_ident(oneof_mat.group(2)) + ) + repeated_mat = re.fullmatch(ident_re + index_re, s) + if repeated_mat: + return path_element_repeated( + destringify_ident(repeated_mat.group(1)), int(repeated_mat.group(2)) + ) + index_mat = re.fullmatch(index_re, s) + if index_mat: + return path_element_index(int(index_mat.group(1))) + raise ValueError(f"failed to parse {s} as path element") + + +def parse_diag_overrides(diags): + """Parses and checks the input syntax for diagnostic overrides into the + Rust/serde syntax.""" + diag_overrides = [] + if diags is not None: + if not isinstance(diags, list): + raise Exception("diags key must map to a list") + for diag in diags: + diag_data = {} + + code = diag.pop("code", None) + if not isinstance(code, int): + raise Exception("diags[].code must be an integer") + diag_data["code"] = code + + level = diag.pop("min", "i") + if level not in ("i", "w", "e"): + raise Exception('diags[].min must be either "i", "w", or "e"') + diag_data["min"] = level + + level = diag.pop("max", "e") + if level not in ("i", "w", "e"): + raise Exception('diags[].max must be either "i", "w", or "e"') + diag_data["max"] = level + + if diag: + raise Exception( + "Found unknown key(s) in diag[]: {}".format(", ".join(diag.keys())) + ) + diag_overrides.append(diag_data) + return diag_overrides + + +def strip_test_tags(data, path=(), yaml_counter=None): + """ + Modifies data recursively, yielding a flattened set of instruction triple: + + - Pops all "[sub_path]__test" keys from the given data. For each popped + value, yields a ('test', path + sub_path, test_data) triple. sub_path + may be left blank, or may be a .-separated list of key names and list + indices. 
+ - Replaces all "__yaml" keys with "", replacing their value + with "test:.yaml", where index is a unique integer index within + the plan. For each replaced value, the original yaml data is recursively + stripped using 'data' for the path element (this is how it will appear + in the validator output tree) and then yielded in the form of a + ('yaml', index, data) triple. + """ + if yaml_counter is None: + yaml_counter = [0] + if isinstance(data, dict): + # Handle __test keys. + keys = [] + for key in data.keys(): + if not isinstance(key, str): + raise Exception( + "found non-string key at {}".format(".".join(map(str, path))) + ) + if key.endswith("__test"): + keys.append(key) + for key in keys: + test_data = data.pop(key) + sub_path = tuple(map(convert_if_int, key.rsplit("__")[0].split("."))) + if sub_path == ("",): + sub_path = () + yield ("test", path + sub_path, test_data) + + # Handle __yaml keys. + keys = [] + for key in data.keys(): + if key.endswith("__yaml"): + keys.append(key) + for key in keys: + index = yaml_counter[0] + yaml_counter[0] += 1 + yaml_data = data.pop(key) + new_key = key.rsplit("__")[0] + data[new_key] = f"test:{index}.yaml" + for x in strip_test_tags(yaml_data, path + (new_key, "data"), yaml_counter): + yield x + yield ("yaml", index, yaml_data) + + # Traverse into dict. + for key, value in data.items(): + for x in strip_test_tags(value, path + (key,), yaml_counter): + yield x + elif isinstance(data, list): + # Traverse into list. + for index, value in enumerate(data): + for x in strip_test_tags(value, path + (index,), yaml_counter): + yield x + + +def resolve_path(path, msg_desc): + """Converts a JSON path to the protobuf path elements that Rust derives + from the prost-generated structures.""" + while path: + el, *path = path + if isinstance(el, int): + if msg_desc is None: + yield path_element_index(el) + else: + raise Exception( + f"unexpected integer in path description, currently at {msg_desc.full_name}" + ) + elif msg_desc is None: + yield path_element_field(el) + else: + field_desc = msg_desc.fields_by_camelcase_name.get(el, None) + if field_desc is None: + field_desc = msg_desc.fields_by_name.get(el, None) + if field_desc is None: + raise Exception(f"unknown field {el} for {msg_desc.full_name}") + if field_desc.label == field_desc.LABEL_REPEATED: + if not path: + raise Exception( + f"ran out of path elements for repeated {msg_desc.full_name}" + ) + el2, *path = path + if not isinstance(el2, int): + raise Exception( + f"found non-index path element for repeated {msg_desc.full_name}" + ) + yield path_element_repeated(field_desc.name, el2) + else: + if field_desc.containing_oneof is not None: + yield path_element_oneof( + field_desc.containing_oneof.name, field_desc.name + ) + else: + yield path_element_field(field_desc.name) + msg_desc = field_desc.message_type + + +def parse_level_instruction(allowed_levels, path): + """Parses an error level instruction in the input format into the + Rust/serde instruction syntax.""" + if allowed_levels is None: + return [] + + if not isinstance(allowed_levels, (list, str)): + raise Exception("__test.level must be a list or string") + allowed_levels = list(allowed_levels) + for level in allowed_levels: + if level not in ("i", "w", "e"): + raise Exception('__test.level[] must be either "i", "w", or "e"') + return [dict(Level=dict(path=path, allowed_levels=allowed_levels))] + + +def parse_diag_instruction(diag_data, path): + """Parses a diagnostic matching instruction in the input format into the + Rust/serde instruction 
syntax.""" + if diag_data is None: + return [] + + rust_diag_data = {} + if not isinstance(diag_data, dict): + raise Exception("__test.diag must be a dict") + + code = diag_data.pop("code", None) + if code is not None: + if not isinstance(code, int): + raise Exception("__test.diag.code must be an int") + rust_diag_data["code"] = code + + level = diag_data.pop("level", None) + if level is not None: + if level not in ("i", "w", "e"): + raise Exception('__test.diag.level must be either "i", "w", or "e"') + rust_diag_data["level"] = level + + level = diag_data.pop("original_level", None) + if level is not None: + if level not in ("i", "w", "e"): + raise Exception( + '__test.diag.original_level must be either "i", "w", or "e"' + ) + rust_diag_data["original_level"] = level + + msg_pattern = diag_data.pop("msg", None) + if msg_pattern is not None: + if not isinstance(msg_pattern, str): + raise Exception("__test.diag.msg must be a string") + # Convert to full glob pattern... We don't use the full + # pattern syntax in the description because escape + # sequences are needed for some rather common characters + # in messages (i.e. '[', ']', and '?'). + i = 0 + glob_pattern = "" + while i < len(msg_pattern): + if msg_pattern[i : i + 2] == "**": + glob_pattern += "[*]" + i += 1 + break + c = msg_pattern[i] + if c in ("?", "[", "]"): + glob_pattern += f"[{c}]" + else: + glob_pattern += c + i += 1 + rust_diag_data["msg"] = glob_pattern + + element = diag_data.pop("before", None) + if element is not None: + if not isinstance(element, str): + raise Exception("__test.diag.before must be a path element string") + rust_diag_data["before"] = parse_path_element(element) + + element = diag_data.pop("after", None) + if element is not None: + if not isinstance(element, str): + raise Exception("__test.diag.after must be a path element string") + rust_diag_data["after"] = parse_path_element(element) + + if diag_data: + raise Exception( + "Found unknown __test.diag key(s): {}".format(", ".join(diag_data.keys())) + ) + return [dict(Diag=dict(path=path, **rust_diag_data))] + + +def parse_type_instruction(type_str, path): + """Parses a data type check instruction in the input format into the + Rust/serde instruction syntax.""" + if type_str is None: + return [] + + if not isinstance(type_str, str): + raise Exception("__test.type must be a string") + return [dict(DataType=dict(path=path, data_type=type_str))] + + +def parse_instructions(test_tags, fname, proto_desc): + """Parses and checks the syntax for instructions in the input format into + the Rust/serde instruction syntax.""" + instructions = [] + for insn, loc, data in test_tags: + if insn == "test": + path = list(resolve_path(loc, proto_desc)) + for insn_type in data: + + # Handle level instructions. + instructions.extend( + parse_level_instruction(insn_type.pop("level", None), path) + ) + + # Handle diag instructions. + instructions.extend( + parse_diag_instruction(insn_type.pop("diag", None), path) + ) + + # Handle type instructions. 
+ instructions.extend( + parse_type_instruction(insn_type.pop("type", None), path) + ) + + if insn_type: + raise Exception( + "Found unknown __test key(s): {}".format( + ", ".join(insn_type.keys()) + ) + ) + + if insn == "yaml": + with open(f"{fname}.{loc}.yaml", "w") as f: + f.write(yaml.safe_dump(data)) + + return instructions + + +def compile_test(fname, data, proto_parse, proto_desc): + """Compile test data into a bunch of test files, of which fname itself is + the main test file and the remainder are of the form fname..yaml, + containing supplementary information. proto_parse should be a function + that parses a Python dict representation of the JSON corresponding to a + Substrait plan into its binary representation, and proto_desc must point + to the descriptor for substrait.Plan. + + See README.md for format information.""" + + # Get name. + name = data.pop("name", None) + if not isinstance(name, str): + raise Exception("Missing valid test name") + + # Parse diagnostic overrides. + diag_overrides = parse_diag_overrides(data.pop("diags", None)) + + # Get plan data. + plan = data.pop("plan", None) + if not isinstance(plan, dict): + raise Exception("Missing Substrait plan") + + if data: + raise Exception( + "Found unknown key(s) in root: {}".format(", ".join(data.keys())) + ) + + # Strip test tags from the test data. + test_tags = list(strip_test_tags(plan)) + + # strip_test_tags does post-order tree traversal, but we need the + # instructions ordered pre-order. Easiest way to do that is to just reverse + # the list. + test_tags.reverse() + + # Write the converted plan for debugging purposes. + with open(f"{fname}.plan.yaml", "w") as f: + f.write(yaml.safe_dump(plan)) + + # Parse and serialize the stripped plan using protobuf. + plan = proto_parse(plan) + + # Parse the instructions derived from the test tags now that we know the + # protobuf structure was found to be valid by protobuf (it generates far + # better error messages than the path resolver does, in case something is + # wrong in the test description). + instructions = parse_instructions(test_tags, fname, proto_desc) + + # Write output file. + with open(fname, "w") as f: + f.write( + json.dumps( + dict( + name=name, + plan=list(plan), + diag_overrides=diag_overrides, + instructions=instructions, + ) + ) + ) + + +def mtime(path) -> float: + """Yields the mtime of the given path, or 0 if it doesn't exist.""" + try: + if os.path.isfile(path): + return os.path.getmtime(path) + except OSError: + pass + return 0.0 + + +@click.group() +def cli(): + pass + + +@cli.command( + short_help="Runs the test suite", + help=( + "Runs the test suite, or only run tests matching the " + "given glob-capable filter (matching test case names)." + ), +) +@click.argument("filter", required=False, default="*") +@click.option( + "--release/--no-release", + default=False, + help=( + "Build Rust application in --release mode. Recompilation " + "will take longer, but rerunning the tests will be much " + "faster." + ), +) +@click.option( + "--html/--no-html", + default=True, + help=( + "Enables or disables exporting HTML for the plans under " + "test. Enabled by default." + ), +) +def run(filter, release, html): + + # Build and run with optimizations if --release is passed. + if release: + release = ["--release"] + else: + release = [] + + # Run cargo build without capturing output. + code = subprocess.run(["cargo", "build"] + release).returncode + if code: + sys.exit(code) + + # Find all proto files and check if they've changed since the last run. 
+ click.echo("Scanning for proto files...") + script_path = os.path.dirname(os.path.realpath(__file__)) + repo_path = os.path.realpath(os.path.join(script_path, "..")) + proto_paths = [ + os.path.join(repo_path, "proto"), + os.path.join(repo_path, "substrait", "proto"), + ] + proto_files = [] + proto_path_args = [] + for proto_path in proto_paths: + proto_files.extend( + pathlib.Path(os.path.join(proto_path, "substrait")).rglob("*.proto") + ) + proto_path_args.extend(("-I", proto_path)) + proto_mtime = max(map(mtime, proto_files)) + output_path = os.path.join(script_path, "substrait") + stamp_path = os.path.join(output_path, "__init__.py") + stamp_mtime = mtime(stamp_path) + if proto_mtime < stamp_mtime: + click.echo("Protobuf bindings are up-to-date.") + else: + + # Find the path to a protoc executable. We rely on prost for this, which is + # capable of shipping it for most operating systems. + click.echo("Finding protoc location...") + protoc = subprocess.run( + ["cargo", "run"] + release + ["-q", "--bin", "find_protoc"], + capture_output=True, + ).stdout.strip() + + # (Re)generate and import protobuf files and import them. + click.echo("Generating protobuf bindings...") + if os.path.isdir(output_path): + shutil.rmtree(output_path) + subprocess.check_call( + [protoc, *proto_path_args, "--python_out", script_path, *proto_files] + ) + for subdir in (".", "extensions", "validator"): + fname = os.path.join(output_path, subdir, "__init__.py") + with open(fname, "w") as f: + f.write("\n") + + # Import the generated protobuf bindings. + from substrait import plan_pb2 + + assert os.path.samefile(plan_pb2.__file__, os.path.join(output_path, "plan_pb2.py")) + from google.protobuf.json_format import ParseDict + + proto_desc = plan_pb2.Plan.DESCRIPTOR + + def proto_parse(data): + return ParseDict(data, plan_pb2.Plan()).SerializeToString() + + # Rather than failing immediately when the first error occurs, store errors + # here. The output for test files that compile without errors will then + # still be written. + errors = {} + + # Deserialize test input files (multiple input formats can be added here). + click.echo("Scanning for test description files...") + suite_path = os.path.join(script_path, "tests") + test_inputs = {} + for fname in pathlib.Path(suite_path).rglob("*.yaml"): + if ".test." in fname.name: + continue + try: + output_fname = str(fname) + ".test" + if mtime(fname) >= mtime(output_fname): + with open(fname, "r") as f: + test_inputs[fname] = (yaml.safe_load(f.read()), output_fname) + except Exception as e: + errors[fname] = ("reading", e) + + # Compile the contents of the test input files. + if not test_inputs: + click.echo("All test descriptions are up-to-date.") + else: + click.echo(f"Parsing {len(test_inputs)} test description(s)...") + for fname, (test_input, output_fname) in test_inputs.items(): + try: + compile_test(output_fname, test_input, proto_parse, proto_desc) + except Exception as e: + if os.path.isfile(output_fname): + os.remove(output_fname) + errors[fname] = ("compiling", e) + + # Fail if there were any errors. + if errors: + for fname, (action, error) in errors.items(): + rel_path = os.path.relpath(fname, suite_path) + click.echo(f"{type(error).__name__} while {action} {rel_path}: {error}") + sys.exit(1) + + # Now run the test suite. + sys.exit( + subprocess.run( + ["cargo", "run"] + release + ["-q", suite_path, str(int(html)), filter] + ).returncode + ) + + +@cli.command( + short_help="Removes all generated files", help="Removes all generated files." 
+)
+def clean():
+    script_path = os.path.dirname(os.path.realpath(__file__))
+
+    # Remove generated protobuf files.
+    proto_output_path = os.path.join(script_path, "substrait")
+    if os.path.isdir(proto_output_path):
+        shutil.rmtree(proto_output_path)
+
+    # Remove compiled test files and test results.
+    suite_path = os.path.join(script_path, "tests")
+    for fname in pathlib.Path(suite_path).rglob("*.test*"):
+        os.remove(fname)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/tests/src/find_protoc.rs b/tests/src/find_protoc.rs
new file mode 100644
index 00000000..08b8f874
--- /dev/null
+++ b/tests/src/find_protoc.rs
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Prost has some magic for finding the path to protoc, so let's use that in
+//! the Python code as well...
+
+fn main() {
+    println!("{}", prost_build::protoc().display());
+}
diff --git a/tests/src/runner.rs b/tests/src/runner.rs
new file mode 100644
index 00000000..45259a62
--- /dev/null
+++ b/tests/src/runner.rs
@@ -0,0 +1,580 @@
+// SPDX-License-Identifier: Apache-2.0
+
+//! Test runner for the [substrait_validator] crate.
+
+use rayon::prelude::*;
+use std::collections::HashMap;
+use std::collections::HashSet;
+use substrait_validator as sv;
+
+#[derive(serde::Deserialize, PartialEq, Eq, Hash, Debug, Clone, Copy)]
+enum ErrorLevel {
+    #[serde(rename(deserialize = "e"))]
+    Error,
+    #[serde(rename(deserialize = "w"))]
+    Warning,
+    #[serde(rename(deserialize = "i"))]
+    Info,
+}
+
+impl From<ErrorLevel> for sv::Level {
+    fn from(l: ErrorLevel) -> Self {
+        match l {
+            ErrorLevel::Error => sv::Level::Error,
+            ErrorLevel::Warning => sv::Level::Warning,
+            ErrorLevel::Info => sv::Level::Info,
+        }
+    }
+}
+
+#[derive(serde::Deserialize, Debug, Clone)]
+enum PathElement {
+    Field { field: String },
+    Oneof { field: String, variant: String },
+    Repeated { field: String, index: usize },
+    Index { index: usize },
+}
+
+impl From<PathElement> for sv::output::path::PathElement {
+    fn from(e: PathElement) -> Self {
+        match e {
+            PathElement::Field { field } => sv::output::path::PathElement::Field(field),
+            PathElement::Oneof { field, variant } => {
+                sv::output::path::PathElement::Variant(field, variant)
+            }
+            PathElement::Repeated { field, index } => {
+                sv::output::path::PathElement::Repeated(field, index)
+            }
+            PathElement::Index { index } => sv::output::path::PathElement::Index(index),
+        }
+    }
+}
+
+fn convert_path(path: &[PathElement]) -> sv::output::path::PathBuf {
+    sv::output::path::PathBuf {
+        root: "plan",
+        elements: path.iter().map(|x| x.clone().into()).collect(),
+    }
+}
+
+#[derive(serde::Deserialize, Debug)]
+struct LevelTest {
+    pub path: Vec<PathElement>,
+    pub allowed_levels: HashSet<ErrorLevel>,
+}
+
+#[derive(serde::Deserialize, Debug)]
+struct DiagnosticTest {
+    pub path: Vec<PathElement>,
+    pub code: Option<u32>,
+    pub level: Option<ErrorLevel>,
+    pub original_level: Option<ErrorLevel>,
+    pub msg: Option<String>,
+    pub before: Option<PathElement>,
+    pub after: Option<PathElement>,
+}
+
+#[derive(serde::Deserialize, Debug)]
+struct DataTypeTest {
+    pub path: Vec<PathElement>,
+    pub data_type: String,
+}
+
+impl DiagnosticTest {
+    pub fn matches(&self, diag: &sv::Diagnostic) -> bool {
+        // Check code.
+        if let Some(code) = &self.code {
+            if diag.cause.classification.code() != *code {
+                return false;
+            }
+        }
+
+        // Check adjusted level.
+        if let Some(level) = &self.level {
+            let level = sv::Level::from(*level);
+            if diag.adjusted_level != level {
+                return false;
+            }
+        }
+
+        // Check original level.
+        if let Some(level) = &self.original_level {
+            let level = sv::Level::from(*level);
+            if diag.original_level != level {
+                return false;
+            }
+        }
+
+        // Check message.
+        if let Some(msg) = &self.msg {
+            let msg = glob::Pattern::new(msg).unwrap();
+            if !msg.matches(&diag.cause.to_string()) {
+                return false;
+            }
+        }
+
+        true
+    }
+}
+
+/// A validation result checking instruction.
+#[derive(serde::Deserialize, Debug)]
+enum Instruction {
+    Level(LevelTest),
+    Diag(DiagnosticTest),
+    DataType(DataTypeTest),
+}
+
+/// A diagnostic level override command.
+#[derive(serde::Deserialize, Debug)]
+struct DiagOverride {
+    code: u32,
+    min: ErrorLevel,
+    max: ErrorLevel,
+}
+
+/// Test case description structure.
+#[derive(serde::Deserialize, Debug)]
+struct TestDescription {
+    /// Test case name.
+    pub name: String,
+
+    /// List of diagnostic level overrides to apply.
+    pub diag_overrides: Vec<DiagOverride>,
+
+    /// The binary serialization of the plan.
+    pub plan: Vec<u8>,
+
+    /// The instructions for checking the validation result.
+    pub instructions: Vec<Instruction>,
+}
+
+/// The result of a test case including messages.
+#[derive(Default)]
+struct TestResult {
+    /// Log messages generated while running the test.
+    pub messages: Vec<String>,
+
+    /// Whether there were failures in this test case.
+    pub failed: bool,
+
+    /// Whether the test case was skipped.
+    pub skipped: bool,
+}
+
+impl TestResult {
+    pub fn log<S: std::fmt::Display>(&mut self, msg: S) {
+        self.messages.push(msg.to_string());
+    }
+
+    pub fn error<S: std::fmt::Display>(&mut self, msg: S) {
+        self.failed = true;
+        self.log(format!("Error: {msg}"));
+    }
+
+    pub fn handle_result<T, E, F, S>(&mut self, e: Result<T, E>, msg: F) -> Option<T>
+    where
+        F: FnOnce() -> S,
+        S: std::fmt::Display,
+        E: std::error::Error,
+    {
+        match e {
+            Ok(x) => Some(x),
+            Err(e) => {
+                let msg = msg();
+                self.error(format!("{msg}: {e}"));
+                None
+            }
+        }
+    }
+
+    pub fn handle_option<T, F, S>(&mut self, option: Option<T>, msg: F) -> Option<T>
+    where
+        F: FnOnce() -> S,
+        S: std::fmt::Display,
+    {
+        if option.is_none() {
+            let msg = msg();
+            self.error(format!("{msg}"));
+        }
+        option
+    }
+}
+
+/// Configuration structure for the test runner.
+struct Configuration {
+    /// Skip test cases for which the name does not match this pattern.
+    pub filter: glob::Pattern,
+
+    /// Whether HTML output files should be written.
+    pub enable_html: bool,
+}
+
+/// All information related to a test case, including its result.
+struct TestCase {
+    /// Path to the test case input file.
+    pub path: std::path::PathBuf,
+
+    /// The test description file, if parsing succeeded.
+    pub description: Option<TestDescription>,
+
+    /// The result of the test.
+    pub result: TestResult,
+}
+
+impl TestCase {
+    /// Traverse the given path within the given node tree, and then apply f
+    /// on the selected node.
+    fn traverse<'a, I, F>(
+        result: &mut TestResult,
+        node: &mut sv::output::tree::Node,
+        mut path: I,
+        f: F,
+    ) where
+        I: Iterator<Item = &'a sv::output::path::PathElement>,
+        F: FnOnce(&mut TestResult, &mut sv::output::tree::Node),
+    {
+        match path.next() {
+            Some(path_element) => {
+                for data in node.data.iter_mut() {
+                    if let sv::output::tree::NodeData::Child(c) = data {
+                        if &c.path_element == path_element {
+                            let mut node = c.node.as_ref().clone();
+                            Self::traverse(result, &mut node, path, f);
+                            c.node = std::sync::Arc::new(node);
+                            return;
+                        }
+                    }
+                }
+                result.error(format!("missing child node {path_element}"));
+            }
+            None => f(result, node),
+        }
+    }
+
+    /// Searches for the child node of node at the given path element and
+    /// returns its index. If the child does not exist, None is returned, and
+    /// an error is pushed.
+ fn find_child_index( + result: &mut TestResult, + node: &mut sv::output::tree::Node, + desc: &PathElement, + ) -> Option { + let path_element = sv::output::path::PathElement::from(desc.clone()); + result.handle_option( + node.data.iter().enumerate().find_map(|(index, data)| { + if let sv::output::tree::NodeData::Child(c) = data { + if c.path_element == path_element { + return Some(index); + } + } + None + }), + || format!("child {path_element} does not exist"), + ) + } + + /// Runs the given level test instruction. + fn run_level_test( + result: &mut TestResult, + root: &mut sv::output::tree::Node, + desc: &LevelTest, + ) { + let path = convert_path(&desc.path); + result.log(format!("Checking level at {path}...")); + Self::traverse(result, root, path.elements.iter(), |result, node| { + let actual_level = node + .get_diagnostic() + .map(|d| d.adjusted_level) + .unwrap_or(sv::Level::Info); + if !desc + .allowed_levels + .iter() + .any(|l| sv::Level::from(*l) == actual_level) + { + result.error(format!("unexpected error level {actual_level:?}")); + } + }); + } + + /// Runs the given diagnostic test instruction. + fn run_diag_test( + result: &mut TestResult, + root: &mut sv::output::tree::Node, + desc: &DiagnosticTest, + ) { + let path = convert_path(&desc.path); + result.log(format!("Checking diagnostic at {path}...")); + Self::traverse(result, root, path.elements.iter(), |result, node| { + // Find node data start index based on after (if specified). + let start_index = desc + .after + .as_ref() + .and_then(|path_element| Self::find_child_index(result, node, path_element)) + .unwrap_or(0); + + // Find node data end index based on before (if specified). + let end_index = desc + .before + .as_ref() + .and_then(|path_element| Self::find_child_index(result, node, path_element)) + .unwrap_or(node.data.len()); + + // Look for diagnostics within that range. + let diag_index = result.handle_option( + node.data[start_index..end_index] + .iter() + .enumerate() + .find_map(|(index, data)| { + if let sv::output::tree::NodeData::Diagnostic(diag) = data { + if desc.matches(diag) { + return Some(index); + } + } + None + }), + || "no diagnostic found that matches expectations", + ); + + // Remove the diagnostic we found from the tree. + if let Some(diag_index) = diag_index { + node.data.remove(diag_index); + } + }); + } + + /// Runs the given data type test instruction. + fn run_data_type_test( + result: &mut TestResult, + root: &mut sv::output::tree::Node, + desc: &DataTypeTest, + ) { + let path = convert_path(&desc.path); + result.log(format!("Checking data type at {path}...")); + Self::traverse(result, root, path.elements.iter(), |result, node| { + let actual = format!("{:#}", node.data_type()); + if actual != desc.data_type { + result.error(format!("data type mismatch; found {actual}")); + } + }) + } + + /// Runs the given test case, updating result. + fn run( + result: &mut TestResult, + path: &std::path::Path, + desc: &TestDescription, + cfg: &Configuration, + ) { + // Create validator configuration. 
+        let mut validator_config = sv::Config::new();
+        for diag_override in desc.diag_overrides.iter() {
+            validator_config.override_diagnostic_level(
+                result
+                    .handle_option(sv::Classification::from_code(diag_override.code), || {
+                        format!("invalid error code {}", diag_override.code)
+                    })
+                    .unwrap_or_default(),
+                diag_override.min.into(),
+                diag_override.max.into(),
+            );
+        }
+        let path_os_str = path.as_os_str().to_owned();
+        validator_config.add_uri_resolver(move |uri| {
+            if let Some(name) = uri.strip_prefix("test:") {
+                let mut yaml_path = path_os_str.clone();
+                yaml_path.push(".");
+                yaml_path.push(name);
+                let yaml_path = std::path::PathBuf::from(yaml_path);
+                std::fs::read(yaml_path)
+            } else if let Some(uri) = uri.strip_prefix('/') {
+                std::fs::read(std::path::PathBuf::from("../substrait/extensions").join(uri))
+            } else {
+                Err(std::io::Error::new(
+                    std::io::ErrorKind::Other,
+                    "non-test URI",
+                ))
+            }
+        });
+
+        // Parse the plan.
+        let parse_result = sv::parse(&desc.plan[..], &validator_config);
+
+        // Export result to HTML for debugging.
+        if cfg.enable_html {
+            let mut html_path = path.as_os_str().to_owned();
+            html_path.push(".html");
+            result.handle_result(
+                std::fs::File::create(html_path)
+                    .and_then(|mut f| parse_result.export(&mut f, sv::export::Format::Html)),
+                || "Error while attempting to write HTML output",
+            );
+        }
+
+        // Execute test instructions.
+        let mut root = parse_result.root;
+        for insn in desc.instructions.iter() {
+            match insn {
+                Instruction::Level(level) => Self::run_level_test(result, &mut root, level),
+                Instruction::Diag(diag) => Self::run_diag_test(result, &mut root, diag),
+                Instruction::DataType(data_type) => {
+                    Self::run_data_type_test(result, &mut root, data_type)
+                }
+            }
+        }
+    }
+
+    /// Loads a test case from the given file and runs it, returning the result.
+    pub fn load_and_run<P: Into<std::path::PathBuf>>(
+        path: P,
+        cfg: &Configuration,
+    ) -> Box<TestCase> {
+        // Construct the path.
+        let path = path.into();
+
+        // Construct the result object.
+        let mut result = TestResult::default();
+
+        // Read input file.
+        let input = result.handle_result(std::fs::read_to_string(&path), || {
+            "failed to read test file"
+        });
+
+        // Parse input file.
+        let description = input.and_then(|input| {
+            result.handle_result(serde_json::from_str::<TestDescription>(&input), || {
+                "failed to parse test file"
+            })
+        });
+
+        // Match test case filter.
+        let skip = description
+            .as_ref()
+            .map(|d| !cfg.filter.matches(&d.name))
+            .unwrap_or_default();
+
+        // Run the test case.
+        if skip {
+            result.skipped = true;
+        } else if let Some(desc) = &description {
+            Self::run(&mut result, &path, desc, cfg);
+        }
+
+        // Log the result.
+        result.log(format!(
+            "Test case {} ({}): {}",
+            description.as_ref().map(|d| &d.name[..]).unwrap_or("?"),
+            path.display(),
+            if result.skipped {
+                "skipped"
+            } else if result.failed {
+                "FAILED"
+            } else {
+                "passed"
+            }
+        ));
+
+        Box::new(TestCase {
+            path,
+            description,
+            result,
+        })
+    }
+}
+
+fn print_usage_and_fail() -> ! {
+    let me = std::env::args()
+        .next()
+        .unwrap_or_else(|| String::from("test_runner"));
+    println!("Usage: {me} <test-directory> <enable-html: 0|1> <name-filter-pattern>");
+    println!("Runs all *.test files in the test directory for which the name matches the pattern.");
+    println!("NOTE: you should be running this with runner.py.");
+    std::process::exit(2);
+}
+
+pub fn main() {
+    // "Parse" command line arguments.
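+    // Expected arguments, normally supplied by runner.py: the directory to
+    // scan for *.test files, a 0/1 flag enabling HTML debug output, and a
+    // glob pattern that test case names must match.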
+    let args: Vec<_> = std::env::args().collect();
+    if args.len() != 4 {
+        print_usage_and_fail();
+    }
+    let cfg = Configuration {
+        filter: glob::Pattern::new(&args[3]).expect("invalid filter pattern"),
+        enable_html: match &args[2][..] {
+            "1" => true,
+            "0" => false,
+            _ => print_usage_and_fail(),
+        },
+    };
+
+    // Find all test cases and run them.
+    println!("Running test suite...");
+    let paths = walkdir::WalkDir::new(&args[1])
+        .into_iter()
+        .filter_map(|e| e.ok())
+        .filter(|e| {
+            e.path().extension() == Some(std::ffi::OsStr::new("test"))
+                && e.metadata().unwrap().is_file()
+        })
+        .map(|e| e.into_path())
+        .collect::<Vec<_>>();
+    let test_cases = paths
+        .par_iter()
+        .map(|p| TestCase::load_and_run(p, &cfg))
+        .collect::<Vec<_>>();
+
+    // Print test name collisions.
+    let mut names = HashMap::new();
+    for test_case in test_cases.iter() {
+        if let Some(desc) = &test_case.description {
+            if let Some(previous) = names.insert(&desc.name, &test_case.path) {
+                println!(
+                    "Warning: duplicate test name {}: {} and {}",
+                    &desc.name,
+                    test_case.path.display(),
+                    previous.display()
+                );
+            }
+        }
+    }
+
+    // Print logs for failing tests.
+    for test_case in test_cases.iter().filter(|x| x.result.failed) {
+        println!();
+        if let Some(desc) = &test_case.description {
+            println!("Test {} ({}) FAILED:", desc.name, test_case.path.display());
+        } else {
+            println!("Test {} FAILED:", test_case.path.display());
+        }
+        for msg in test_case.result.messages.iter() {
+            println!("  {msg}");
+        }
+    }
+
+    // Print summary.
+    let n_total = test_cases.len();
+    let n_run = test_cases.iter().filter(|x| !x.result.skipped).count();
+    let n_failed = test_cases.iter().filter(|x| x.result.failed).count();
+    if n_total == 0 {
+        println!("FAIL: no test cases were found. Did you run me using runner.py?");
+        std::process::exit(1);
+    } else if n_run == 0 {
+        println!("FAIL: none of the {n_total} test case(s) matched the specified filter.");
+        std::process::exit(1);
+    } else if n_failed == 0 {
+        if n_run != n_total {
+            println!("PASS: all {n_run}/{n_total} matching test case(s) passed.");
+        } else {
+            println!("PASS: all {n_run} test case(s) passed.");
+        }
+        std::process::exit(0);
+    } else {
+        println!();
+        if n_run != n_total {
+            println!("FAIL: {n_failed} out of {n_run}/{n_total} matching test case(s) failed.");
+        } else {
+            println!("FAIL: {n_failed} out of {n_run} test case(s) failed.");
+        }
+        std::process::exit(1);
+    }
+}
diff --git a/tests/tests/README.md b/tests/tests/README.md
new file mode 100644
index 00000000..ccf0f241
--- /dev/null
+++ b/tests/tests/README.md
@@ -0,0 +1,60 @@
+State of test suite coverage
+============================
+
+ - [ ] TPC-H (integration tests)
+   - [x] 1
+   - [x] 2
+   - [x] 3
+   - [x] 4
+   - [x] 5
+   - [x] 6
+   - [x] 7
+   - [x] 8
+   - [x] 9
+   - [x] 10
+   - [ ] 11
+   - [ ] 12
+   - [ ] 13
+   - [x] 14
+   - [ ] 15
+   - [ ] 16
+   - [ ] 17
+   - [ ] 18
+   - [x] 19
+   - [ ] 20
+   - [ ] 21
+   - [ ] 22
+
+   Note: Isthmus crashes on the other queries, so they'll have to be written
+   manually or generated with another tool. Note also: since validation for
+   functions doesn't really work yet, some diagnostics are currently
+   blanket-disabled.
+ - [x] Expressions
+   - [x] Literals
+   - [x] References
+   - [x] Conditionals
+   - [x] Subqueries
+ - [x] Relations
+   - [x] Relation root
+   - [x] Common logic
+   - [x] Read
+     - [x] Virtual data source
+     - [x] Named data source
+     - [x] File data source
+     - [x] Extension data source
+     - [x] Base schema
+     - [x] Filter
+     - [x] Projection
+   - [x] Filter
+   - [x] Sort
+   - [x] Project
+   - [x] Cross
+   - [x] Join
+   - [x] Set
+   - [x] Fetch
+   - [x] Aggregate
+   - [x] Extensions
+ - [ ] Extensions
+   - [ ] Types
+   - [ ] Type variations
+   - [ ] Functions
+   - [x] Advanced extensions
diff --git a/tests/tests/expressions/conditionals/if-else.yaml b/tests/tests/expressions/conditionals/if-else.yaml
new file mode 100644
index 00000000..222472ce
--- /dev/null
+++ b/tests/tests/expressions/conditionals/if-else.yaml
@@ -0,0 +1,133 @@
+name: if-else
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      project:
+        input:
+          read:
+            baseSchema:
+              names: [a, b, c, d, e]
+              struct:
+                nullability: NULLABILITY_REQUIRED
+                types:
+                - bool: { nullability: NULLABILITY_REQUIRED }
+                - i8: { nullability: NULLABILITY_REQUIRED }
+                - i16: { nullability: NULLABILITY_REQUIRED }
+                - i16: { nullability: NULLABILITY_NULLABLE }
+                - bool: { nullability: NULLABILITY_NULLABLE }
+            namedTable:
+              names:
+              - test
+        expressions:
+        - ifThen:
+            ifs:
+            - if:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 0 } }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 1 } }
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 1 } }
+          __test: [ type: "i8" ]
+        - ifThen:
+            ifs:
+            - if:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 0 } }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 1 } }
+          __test: [ type: "i8?" ]
+        - ifThen:
+            ifs:
+            - if:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 4 } }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 1 } }
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 1 } }
+          __test: [ type: "i8?" ]
+        - ifThen:
+            ifs:
+            - if:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 0 } }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 2 } }
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 3 } }
+          __test: [ type: "i16?" ]
+        - ifThen:
+            ifs:
+            - if:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 0 } }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 2 } }
+            - if:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 0 } }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 3 } }
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 2 } }
+          __test: [ type: "i16?" ]
+        - ifThen:
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 1 } }
+          __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: ifs*" } ]
+        - ifThen:
+            ifs:
+            - if:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 1 } }
+                  __test: [ diag: { level: e, code: 4005, msg: "*predicates must yield booleans, but found i8*" } ]
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 1 } }
+            - if:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 0 } }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 2 } }
+                  __test: [ diag: { level: e, code: 4005, msg: "*branches must yield the same type: i16 vs. i8*" } ]
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 2 } }
+                __test: [ diag: { level: e, code: 4005, msg: "*branches must yield the same type: i16 vs. i8*" } ]
+          __test: [ type: "i8" ]
diff --git a/tests/tests/expressions/conditionals/matches-scalar.yaml b/tests/tests/expressions/conditionals/matches-scalar.yaml
new file mode 100644
index 00000000..84c0b6cb
--- /dev/null
+++ b/tests/tests/expressions/conditionals/matches-scalar.yaml
@@ -0,0 +1,50 @@
+name: matches-scalar
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      project:
+        input:
+          read:
+            baseSchema:
+              names: [a, b, c, d]
+              struct:
+                nullability: NULLABILITY_REQUIRED
+                types:
+                - bool: { nullability: NULLABILITY_REQUIRED }
+                - i8: { nullability: NULLABILITY_REQUIRED }
+                - i16: { nullability: NULLABILITY_REQUIRED }
+                - i16: { nullability: NULLABILITY_NULLABLE }
+            namedTable:
+              names:
+              - test
+        expressions:
+        - singularOrList:
+            value:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 2 } }
+            options:
+            - literal: { i16: 1 }
+            - literal: { i16: 2 }
+            - literal: { i16: 3 }
+          __test: [ type: "boolean" ]
+        - singularOrList:
+            value:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 2 } }
+            options: []
+            __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: options*" } ]
+        - singularOrList:
+            value:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 2 } }
+            options:
+            - literal: { i16: 1 }
+            - literal: { i16: 2, nullable: true }
+              __test: [ diag: { level: e, code: 4008, msg: "*nullable vs. required*" } ]
+            - literal: { i32: 3 }
+              __test: [ diag: { level: e, code: 4005, msg: "*i32 vs. i16*" } ]
i16*" } ] + __test: [ type: "boolean" ] diff --git a/tests/tests/expressions/conditionals/matches-vector.yaml b/tests/tests/expressions/conditionals/matches-vector.yaml new file mode 100644 index 00000000..fc8f9f7e --- /dev/null +++ b/tests/tests/expressions/conditionals/matches-vector.yaml @@ -0,0 +1,74 @@ +name: matches-vector +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [a, b, c, d] + struct: + nullability: NULLABILITY_REQUIRED + types: + - bool: { nullability: NULLABILITY_REQUIRED } + - i8: { nullability: NULLABILITY_REQUIRED } + - i16: { nullability: NULLABILITY_REQUIRED } + - i16: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + expressions: + - multiOrList: + value: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + - selection: + rootReference: {} + directReference: { structField: { field: 2 } } + options: + - fields: + - literal: { i8: 1 } + - literal: { i16: 2 } + - fields: + - literal: { i8: 3 } + - literal: { i16: 4 } + - fields: + - literal: { i8: 5 } + - literal: { i16: 6 } + __test: [ type: "boolean" ] + - multiOrList: + value: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + - selection: + rootReference: {} + directReference: { structField: { field: 2 } } + options: [] + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: options*" } ] + - multiOrList: + value: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + - selection: + rootReference: {} + directReference: { structField: { field: 2 } } + options: + - fields: + - literal: { i8: 1 } + - literal: { i16: 2 } + - fields: + - literal: { i8: 3 } + - literal: { i16: 2, nullable: true } + __test: [ diag: { level: e, code: 4008, msg: "*nullable vs. required*" } ] + - fields: + - literal: { i8: 5 } + - literal: { i32: 3 } + __test: [ diag: { level: e, code: 4005, msg: "*i32 vs. i16*" } ] + - fields: + - literal: { i8: 5 } + __test: [ diag: { level: e, code: 4005, msg: "*numbers of fields differ*" } ] + __test: [ type: "boolean" ] diff --git a/tests/tests/expressions/conditionals/switch.yaml b/tests/tests/expressions/conditionals/switch.yaml new file mode 100644 index 00000000..48691ca9 --- /dev/null +++ b/tests/tests/expressions/conditionals/switch.yaml @@ -0,0 +1,121 @@ +name: switch +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [a, b, c, d] + struct: + nullability: NULLABILITY_REQUIRED + types: + - i32: { nullability: NULLABILITY_REQUIRED } + - i8: { nullability: NULLABILITY_REQUIRED } + - i16: { nullability: NULLABILITY_REQUIRED } + - i16: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + expressions: + - switchExpression: + match: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + ifs: + - if: { i32: 0 } + then: + selection: + rootReference: {} + directReference: { structField: { field: 1 } } + else: + selection: + rootReference: {} + directReference: { structField: { field: 1 } } + __test: [ type: "i8" ] + - switchExpression: + match: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + ifs: + - if: { i32: 0 } + then: + selection: + rootReference: {} + directReference: { structField: { field: 1 } } + __test: [ type: "i8?" 
+        - switchExpression:
+            match:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 0 } }
+            ifs:
+            - if: { i32: 0 }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 2 } }
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 3 } }
+          __test: [ type: "i16?" ]
+        - switchExpression:
+            match:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 0 } }
+            ifs:
+            - if: { i32: 0 }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 2 } }
+            - if: { i32: 1 }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 3 } }
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 2 } }
+          __test: [ type: "i16?" ]
+        - switchExpression:
+            match:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 0 } }
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 1 } }
+          __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: ifs*" } ]
+        - switchExpression:
+            match:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 0 } }
+            ifs:
+            - if: { i16: 0 }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 1 } }
+              __test:
+              - diag: { level: e, code: 4005, msg: "*literal type must match switch expression: i16 vs. i32*" }
+            - if: { i32: 1 }
+              then:
+                selection:
+                  rootReference: {}
+                  directReference: { structField: { field: 2 } }
+                  __test: [ diag: { level: e, code: 4005, msg: "*branches must yield the same type: i16 vs. i8*" } ]
+            else:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 2 } }
+                __test: [ diag: { level: e, code: 4005, msg: "*branches must yield the same type: i16 vs. i8*" } ]
+          __test: [ type: "i8" ]
diff --git a/tests/tests/expressions/field-refs/README.md b/tests/tests/expressions/field-refs/README.md
new file mode 100644
index 00000000..e025a9a6
--- /dev/null
+++ b/tests/tests/expressions/field-refs/README.md
@@ -0,0 +1 @@
+This directory contains corner case tests for field references.
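+
+As elsewhere in this suite, each `.yaml` file is a Substrait plan in protobuf
+JSON form, annotated with `__test` keys that the test runner strips out and
+checks against the validator's output. A minimal sketch of the convention
+(the field index and expected type are illustrative only):
+
+    selection:
+      rootReference: {}
+      directReference: { structField: { field: 0 } }
+      __test: [ type: "boolean?" ]  # assert the data type resolved for this node
+
+`diag:` entries assert that a matching diagnostic (level, code, and an
+optional glob over the message) is attached at the annotated node, and a
+`foo__test:` key attaches the assertion to the sibling field `foo` instead.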
diff --git a/tests/tests/expressions/field-refs/mask-ref.yaml b/tests/tests/expressions/field-refs/mask-ref.yaml
new file mode 100644
index 00000000..9eda6cea
--- /dev/null
+++ b/tests/tests/expressions/field-refs/mask-ref.yaml
@@ -0,0 +1,114 @@
+name: mask-ref
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      project:
+        input:
+          read:
+            common:
+              direct: {}
+            baseSchema:
+              names: [a, b, c, d, x, y, e]
+              struct:
+                nullability: NULLABILITY_REQUIRED
+                types:
+                - bool: { nullability: NULLABILITY_REQUIRED }
+                - i8: { nullability: NULLABILITY_REQUIRED }
+                - list:
+                    nullability: NULLABILITY_REQUIRED
+                    type: { i16: { nullability: NULLABILITY_REQUIRED } }
+                - struct:
+                    nullability: NULLABILITY_REQUIRED
+                    types:
+                    - i32: { nullability: NULLABILITY_REQUIRED }
+                    - i64: { nullability: NULLABILITY_REQUIRED }
+                - map:
+                    nullability: NULLABILITY_REQUIRED
+                    key: { string: { nullability: NULLABILITY_REQUIRED } }
+                    value: { date: { nullability: NULLABILITY_REQUIRED } }
+            namedTable:
+              names:
+              - test
+        expressions:
+        - selection:
+            rootReference: {}
+            maskedReference:
+              select:
+                structItems:
+                - field: 0
+                - field: 4
+                  child:
+                    map:
+                      key:
+                        mapKey: key
+                        __test:
+                        - diag: { level: e, code: 1 }
+                        - diag: { level: w, code: 1 }
+                - field: 3
+                  child:
+                    struct:
+                      structItems:
+                      - field: 1
+                - field: 2
+                  child:
+                    list:
+                      selection:
+                      - item: { field: -1 }
+                      - slice: { start: 3, end: -2 }
+                - field: 5
+                  field__test: [ diag: { level: e, code: 2, msg: "*out of range*" } ]
+          __test: [ type: "STRUCT<boolean, MAP<string, date>, STRUCT<i64>, LIST<i16>, !>" ]
+        - selection:
+            rootReference: {}
+            maskedReference:
+              select: []
+          __test: [ type: "STRUCT<>" ]
+        - selection:
+            rootReference: {}
+            maskedReference:
+              select:
+                structItems:
+                - field: 3
+                  child:
+                    struct:
+                      structItems:
+                      - field: 1
+          __test: [ type: "STRUCT<i64>" ]
+        - selection:
+            rootReference: {}
+            maskedReference:
+              maintainSingularStruct: true
+              select:
+                structItems:
+                - field: 3
+                  child:
+                    struct:
+                      structItems:
+                      - field: 1
+          __test: [ type: "STRUCT<STRUCT<i64>>" ]
+        - selection:
+            rootReference: {}
+            maskedReference:
+              select:
+                structItems:
+                - field: 2
+                  child:
+                    map:
+                      __test: [ diag: { level: e, code: 4005, msg: "*requires a map*LIST*" } ]
+                      key:
+                        mapKey: key
+                        __test:
+                        - diag: { level: e, code: 1 }
+                        - diag: { level: w, code: 1 }
+                - field: 0
+                  child:
+                    struct:
+                      __test: [ diag: { level: e, code: 4005, msg: "*requires a struct*boolean*" } ]
+                      structItems: []
+                - field: 4
+                  child:
+                    list:
+                      __test: [ diag: { level: e, code: 4005, msg: "*requires a list*MAP*" } ]
+                      selection:
+                      - item: { field: -1 }
diff --git a/tests/tests/expressions/field-refs/outer-ref.yaml b/tests/tests/expressions/field-refs/outer-ref.yaml
new file mode 100644
index 00000000..a14189f5
--- /dev/null
+++ b/tests/tests/expressions/field-refs/outer-ref.yaml
@@ -0,0 +1,72 @@
+name: outer-ref
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      project:
+        input:
+          read:
+            baseSchema:
+              names: [a, b, c]
+              struct:
+                nullability: NULLABILITY_REQUIRED
+                types:
+                - bool: { nullability: NULLABILITY_REQUIRED }
+                - i8: { nullability: NULLABILITY_REQUIRED }
+                - i16: { nullability: NULLABILITY_REQUIRED }
+            namedTable:
+              names:
+              - test
+        expressions:
+        - subquery:
+            scalar:
+              input:
+                project:
+                  common: { emit: { outputMapping: [2] } }
+                  input:
+                    read:
+                      baseSchema:
+                        names: [x, y]
+                        struct:
+                          nullability: NULLABILITY_REQUIRED
+                          types:
+                          - string: { nullability: NULLABILITY_REQUIRED }
+                          - string: { nullability: NULLABILITY_REQUIRED }
+                      namedTable:
+                        names:
+                        - test2
+                  expressions:
+                  - ifThen:
+                      ifs:
+                      - if:
+                          selection:
+                            outerReference:
+                              stepsOut: 1
+                              __test: [ type: "NSTRUCT<a: boolean, b: i8, c: i16>" ]
+                            directReference: { structField: { field: 0 } }
+                        then:
+                          selection:
+                            rootReference: {}
+                            directReference: { structField: { field: 1 } }
+                      else:
+                        selection:
+                          rootReference: {}
+                          directReference: { structField: { field: 0 } }
+        - selection:
+            outerReference:
+              stepsOut: 2
+              __test:
+              - diag: { level: e, code: 6001, msg: "*indexing query beyond current query depth (2)*" }
+            directReference: { structField: { field: 0 } }
+        - selection:
+            outerReference:
+              stepsOut: 1
+              __test:
+              - diag: { level: e, code: 6001, msg: "*indexing query beyond current query depth (1)*" }
+            directReference: { structField: { field: 0 } }
+        - selection:
+            outerReference:
+              stepsOut: 0
+              stepsOut__test:
+              - diag: { level: e, code: 2, msg: "*must be at least 1 (use RootReference instead)*" }
+            directReference: { structField: { field: 0 } }
diff --git a/tests/tests/expressions/field-refs/scalar-list-ref.yaml b/tests/tests/expressions/field-refs/scalar-list-ref.yaml
new file mode 100644
index 00000000..d8fc5450
--- /dev/null
+++ b/tests/tests/expressions/field-refs/scalar-list-ref.yaml
@@ -0,0 +1,96 @@
+name: scalar-list-ref
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      project:
+        input:
+          read:
+            common:
+              direct: {}
+            baseSchema:
+              names: [a, b]
+              struct:
+                nullability: NULLABILITY_REQUIRED
+                types:
+                - list:
+                    nullability: NULLABILITY_REQUIRED
+                    type:
+                      list:
+                        nullability: NULLABILITY_NULLABLE
+                        type:
+                          bool:
+                            nullability: NULLABILITY_REQUIRED
+                - list:
+                    nullability: NULLABILITY_REQUIRED
+                    type:
+                      bool:
+                        nullability: NULLABILITY_REQUIRED
+            namedTable:
+              names:
+              - test
+        expressions:
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+          __test: [ type: "LIST<LIST?<boolean>>" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+                child:
+                  listElement:
+                    offset: 0
+          __test: [ type: "LIST?<boolean>" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+                child:
+                  listElement:
+                    offset: -1
+          __test: [ type: "LIST?<boolean>" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+                child:
+                  listElement:
+                    offset: 0
+                    child:
+                      listElement:
+                        offset: 0
+          __test: [ type: "boolean?" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 1
+                child:
+                  listElement:
+                    offset: 0
+          # FIXME: should this yield nullable or not? That is, is out-of-range
+          # a runtime error or does it yield null? Or does that depend on the
+          # nullability of the field type?
+          __test: [ type: "boolean" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+                child:
+                  listElement:
+                    offset: 0
+                    child:
+                      listElement:
+                        offset: 0
+                        child:
+                          listElement:
+                            offset: 0
+                            __test: [ diag: { level: e, code: 4005, msg: "*requires a list type*boolean*" } ]
+          __test: [ type: "!?" ]
diff --git a/tests/tests/expressions/field-refs/scalar-map-ref.yaml b/tests/tests/expressions/field-refs/scalar-map-ref.yaml
new file mode 100644
index 00000000..44bff7dc
--- /dev/null
+++ b/tests/tests/expressions/field-refs/scalar-map-ref.yaml
@@ -0,0 +1,96 @@
+name: scalar-map-ref
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      project:
+        input:
+          read:
+            common:
+              direct: {}
+            baseSchema:
+              names: [a, b]
+              struct:
+                nullability: NULLABILITY_REQUIRED
+                types:
+                - map:
+                    nullability: NULLABILITY_REQUIRED
+                    key: { i8: { nullability: NULLABILITY_REQUIRED } }
+                    value:
+                      map:
+                        nullability: NULLABILITY_NULLABLE
+                        key: { string: { nullability: NULLABILITY_REQUIRED } }
+                        value: { bool: { nullability: NULLABILITY_REQUIRED } }
+                - map:
+                    nullability: NULLABILITY_REQUIRED
+                    key: { i32: { nullability: NULLABILITY_REQUIRED } }
+                    value: { bool: { nullability: NULLABILITY_REQUIRED } }
+            namedTable:
+              names:
+              - test
+        expressions:
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+          __test: [ type: "MAP<i8, MAP?<string, boolean>>" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+                child:
+                  mapKey:
+                    mapKey: { i8: 0, nullable: false }
+          __test: [ type: "MAP?<string, boolean>" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+                child:
+                  mapKey:
+                    mapKey: { i16: 0, nullable: false }
+                    __test: [ diag: { level: e, code: 4005, msg: "*map key type mismatch*" } ]
+          __test: [ type: "MAP?<string, boolean>" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+                child:
+                  mapKey:
+                    mapKey: { i8: 0, nullable: false }
+                    child:
+                      mapKey:
+                        mapKey: { string: hello, nullable: false }
+          __test: [ type: "boolean?" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 1
+                child:
+                  mapKey:
+                    mapKey: { i32: 0, nullable: false }
+          # FIXME: should this yield nullable or not? That is, are missing keys
+          # a runtime error or does it yield null? Or does that depend on the
+          # nullability of the value type?
+          __test: [ type: "boolean" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 0
+                child:
+                  mapKey:
+                    mapKey: { i8: 0, nullable: false }
+                    child:
+                      mapKey:
+                        mapKey: { string: hello, nullable: false }
+                        child:
+                          mapKey:
+                            mapKey: { string: hello, nullable: false }
+                            __test: [ diag: { level: e, code: 4005, msg: "*requires a map type*boolean*" } ]
+          __test: [ type: "!?" ]
diff --git a/tests/tests/expressions/field-refs/scalar-struct-ref.yaml b/tests/tests/expressions/field-refs/scalar-struct-ref.yaml
new file mode 100644
index 00000000..24326ae7
--- /dev/null
+++ b/tests/tests/expressions/field-refs/scalar-struct-ref.yaml
@@ -0,0 +1,110 @@
+name: scalar-struct-ref
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      project:
+        input:
+          read:
+            common:
+              direct: {}
+            baseSchema:
+              names: [a, b, c, d, x, y]
+              struct:
+                nullability: NULLABILITY_REQUIRED
+                types:
+                - bool: { nullability: NULLABILITY_NULLABLE }
+                - i8: { nullability: NULLABILITY_REQUIRED }
+                - i16: { nullability: NULLABILITY_REQUIRED }
+                - struct:
+                    nullability: NULLABILITY_NULLABLE
+                    types:
+                    - i32: { nullability: NULLABILITY_REQUIRED }
+                    - i64: { nullability: NULLABILITY_REQUIRED }
+            namedTable:
+              names:
+              - test
+        expressions:
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 4
+                field__test: [ diag: { level: e, code: 2, msg: "*out of range*" } ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: -1
+                field__test: [ diag: { level: e, code: 2, msg: "*cannot be less than zero*" } ]
+        - selection:
+            rootReference: {}
+            directReference: { structField: { field: 0 }}
+          __test: [ type: "boolean?" ]
+        - selection:
+            rootReference: {}
+            directReference: { structField: { field: 1 }}
+          __test: [ type: "i8" ]
+        - selection:
+            rootReference: {}
+            directReference: { structField: { field: 2 }}
+          __test: [ type: "i16" ]
+        - selection:
+            rootReference: {}
+            directReference: { structField: { field: 3 }}
+          __test: [ type: "NSTRUCT?<x: i32, y: i64>" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 3
+                child:
+                  structField:
+                    field: 0
+          __test: [ type: "i32?" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 3
+                child:
+                  structField:
+                    field: 1
+          __test: [ type: "i64?" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 3
+                child:
+                  structField:
+                    field: 2
+                    field__test: [ diag: { level: e, code: 2, msg: "*out of range*" } ]
+          __test: [ type: "!?" ]
+        - selection:
+            rootReference: {}
+            directReference:
+              structField:
+                field: 2
+                child:
+                  structField:
+                    field: 0
+                    __test: [ diag: { level: e, code: 4005, msg: "*requires a struct type*i16*" } ]
+          __test: [ type: "!" ]
+        - selection:
+            expression:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 3 }}
+            directReference: { structField: { field: 0 }}
+          __test: [ type: "i32?" ]
+        - selection:
+            expression:
+              selection:
+                rootReference: {}
+                directReference: { structField: { field: 2 }}
+            directReference:
+              structField:
+                field: 0
+                __test: [ diag: { level: e, code: 4005, msg: "*requires a struct type*i16*" } ]
+          __test: [ type: "!" ]
diff --git a/tests/tests/expressions/literals/README.md b/tests/tests/expressions/literals/README.md
new file mode 100644
index 00000000..174cc77d
--- /dev/null
+++ b/tests/tests/expressions/literals/README.md
@@ -0,0 +1 @@
+This directory contains corner case tests for all literals and types.
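+
+The `type:` assertions at the end of these plans use the validator's textual
+type notation, e.g. (a sketch, not an exhaustive grammar):
+
+    __test: [ type: "NSTRUCT<x: i8, y: i8?>" ]
+
+Simple types are lowercase, compound types are uppercase with their parameters
+in angle brackets, `?` marks nullability, and `!` marks a type that could not
+be resolved because of an earlier error.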
diff --git a/tests/tests/expressions/literals/binary.yaml b/tests/tests/expressions/literals/binary.yaml
new file mode 100644
index 00000000..945e31ab
--- /dev/null
+++ b/tests/tests/expressions/literals/binary.yaml
@@ -0,0 +1,34 @@
+name: binary
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - binary: { nullability: NULLABILITY_REQUIRED }
+            - binary: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - binary: Zm9v # base64(foo)
+              nullable: false
+            - binary: YmFy # base64(bar)
+              nullable: true
+          - fields:
+            - binary: ""
+              nullable: false
+            - binary: BAgVFiNC # base64(04 08 15 16 23 42)
+              nullable: true
+          - fields:
+            - "null":
+                binary: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                binary: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: binary, y: binary?>" ]
diff --git a/tests/tests/expressions/literals/boolean.yaml b/tests/tests/expressions/literals/boolean.yaml
new file mode 100644
index 00000000..b20ef75c
--- /dev/null
+++ b/tests/tests/expressions/literals/boolean.yaml
@@ -0,0 +1,30 @@
+name: boolean
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - bool: { nullability: NULLABILITY_REQUIRED }
+            - bool: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - { boolean: false, nullable: false }
+            - { boolean: false, nullable: true }
+          - fields:
+            - { boolean: true, nullable: false }
+            - { boolean: true, nullable: true }
+          - fields:
+            - "null":
+                bool: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                bool: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: boolean, y: boolean?>" ]
diff --git a/tests/tests/expressions/literals/date.yaml b/tests/tests/expressions/literals/date.yaml
new file mode 100644
index 00000000..b6cf1245
--- /dev/null
+++ b/tests/tests/expressions/literals/date.yaml
@@ -0,0 +1,48 @@
+name: date
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - date: { nullability: NULLABILITY_REQUIRED }
+            - date: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - date: 0 # 1970 epoch
+              nullable: false
+            - date: 18231 # 2019-12-01
+              nullable: true
+          - fields:
+            - date: -354285 # 1000-01-01
+              nullable: false
+            - date: -354286 # 999-12-31
+              date__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - date: 2932897 # 10000-01-01
+              date__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: false
+            - date: 2932896 # 9999-12-31
+              nullable: true
+          - fields:
+            - date: -2147483648 # i32 min (not representable with chrono::NaiveDate; don't panic!)
+              date__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: false
+            - date: 2147483647 # i32 max (not representable with chrono::NaiveDate; don't panic!)
+              date__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - "null":
+                date: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                date: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: date, y: date?>" ]
diff --git a/tests/tests/expressions/literals/decimal.yaml b/tests/tests/expressions/literals/decimal.yaml
new file mode 100644
index 00000000..69be98b2
--- /dev/null
+++ b/tests/tests/expressions/literals/decimal.yaml
@@ -0,0 +1,156 @@
+name: decimal
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - decimal: { scale: 2, precision: 10, nullability: NULLABILITY_REQUIRED }
+            - decimal: { scale: 5, precision: 5, nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - decimal:
+                value: OTAAAAAAAAAAAAAAAAAAAA== # 123.45
+                scale: 2
+                precision: 10
+              nullable: false
+            - decimal:
+                value: OTAAAAAAAAAAAAAAAAAAAA== # .12345
+                scale: 5
+                precision: 5
+              nullable: true
+          - fields:
+            - decimal:
+                value: /+MLVAIAAAAAAAAAAAAAAA== # 99999999.99
+                scale: 2
+                precision: 10
+              nullable: false
+            - decimal:
+                value: n4YBAAAAAAAAAAAAAAAAAA== # .99999
+                scale: 5
+                precision: 5
+              nullable: true
+          - fields:
+            - decimal:
+                value: AOQLVAIAAAAAAAAAAAAAAA== # 100000000.00
+                scale: 2
+                precision: 10
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: false
+            - decimal:
+                value: oIYBAAAAAAAAAAAAAAAAAA== # 1.00000
+                scale: 5
+                precision: 5
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - decimal:
+                value: ARz0q/3//////////////w== # -99999999.99
+                scale: 2
+                precision: 10
+              nullable: false
+            - decimal:
+                value: YXn+/////////////////w== # -.99999
+                scale: 5
+                precision: 5
+              nullable: true
+          - fields:
+            - decimal:
+                value: ABz0q/3//////////////w== # -100000000.00
+                scale: 2
+                precision: 10
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: false
+            - decimal:
+                value: YHn+/////////////////w== # -1.00000
+                scale: 5
+                precision: 5
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - decimal:
+                value: /////z8iigl6xIZaqEw7Sw== # 99999999999999999999999999999999999999 (max)
+                scale: 0
+                precision: 38
+              nullable: false
+            - decimal:
+                value: AQAAAMDddfaFO3mlV7PEtA== # -99999999999999999999999999999999999999 (min)
+                scale: 0
+                precision: 38
+              nullable: true
+            __test:
+            - diag: { level: e, code: 4005 }
+            - diag: { level: e, code: 4005 }
+            - diag: { level: e, code: 4005 }
+            - diag: { level: e, code: 4005 }
+          - fields:
+            - decimal:
+                value: AAAAAAAAAAAAAAAAAAAA # 15 bytes
+                value__test: [ diag: { level: e, code: 6002, msg: "*16 bytes*15*" } ]
+                scale: 2
+                precision: 10
+              nullable: false
+            - decimal:
+                value: AAAAAAAAAAAAAAAAAAAAAAA= # 17 bytes
+                value__test: [ diag: { level: e, code: 6002, msg: "*16 bytes*17*" } ]
+                scale: 5
+                precision: 5
+              nullable: true
+          - fields:
+            - "null":
+                decimal: { scale: 2, precision: 10, nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                decimal: { scale: 5, precision: 5, nullability: NULLABILITY_NULLABLE }
+          - fields:
+            - "null":
+                decimal:
+                  scale: -2147483648 # i32 minimum
+                  scale__test: [ diag: { level: e, code: 2, msg: "*parameters cannot be negative*" } ]
+                  precision: -2147483648 # i32 minimum
+                  precision__test: [ diag: { level: e, code: 2, msg: "*parameters cannot be negative*" } ]
+                  nullability: NULLABILITY_NULLABLE
+            - "null":
+                decimal:
+                  scale: 0
+                  precision: 0 # 0 digits doesn't make sense
+                  nullability: NULLABILITY_NULLABLE
+              __test: [ diag: { level: e, code: 4002, msg: "*out of range*" } ]
+          - fields:
+            - "null":
+                decimal:
+                  scale: 0
+                  precision: 1 # minimum precision
+                  nullability: NULLABILITY_NULLABLE
+            - "null":
+                decimal:
+                  scale: 0
+                  precision: 38 # maximum precision
+                  nullability: NULLABILITY_NULLABLE
+            __test:
+            - diag: { level: e, code: 4008 }
+            - diag: { level: e, code: 4005 }
+            - diag: { level: e, code: 4005 }
+            - diag: { level: e, code: 4005 }
+            - diag: { level: e, code: 4005 }
+          - fields:
+            - "null":
+                decimal:
+                  scale: 0
+                  precision: 39 # beyond maximum precision
+                  nullability: NULLABILITY_NULLABLE
+              __test: [ diag: { level: e, code: 4002, msg: "*out of range*" } ]
+            - "null":
+                decimal:
+                  scale: 6 # scale is not allowed to be greater than precision
+                  precision: 5
+                  nullability: NULLABILITY_NULLABLE
+              __test: [ diag: { level: e, code: 4002, msg: "*out of range 0..5*" } ]
+      __test: [ type: "NSTRUCT<x: DECIMAL<10, 2>, y: DECIMAL?<5, 5>>" ]
diff --git a/tests/tests/expressions/literals/fixed_binary.yaml b/tests/tests/expressions/literals/fixed_binary.yaml
new file mode 100644
index 00000000..41a9543e
--- /dev/null
+++ b/tests/tests/expressions/literals/fixed_binary.yaml
@@ -0,0 +1,71 @@
+name: fixed_binary
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - fixed_binary: { length: 3, nullability: NULLABILITY_REQUIRED }
+            - fixed_binary: { length: 5, nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - fixed_binary: Zm9v # base64("foo")
+              nullable: false
+            - fixed_binary: aGVsbG8= # base64("hello")
+              nullable: true
+          - fields:
+            - fixed_binary: YmFy # base64("bar")
+              nullable: false
+            - fixed_binary: d29ybGQ= # base64("world")
+              nullable: true
+          - fields:
+            - fixed_binary: d3JvbmcgbGVuZ3Ro # base64("wrong length")
+              nullable: false
+            - fixed_binary: AAECAwQ= # base64(00 01 02 03 04)
+              nullable: true
+            __test: [ diag: { level: e, code: 4005, msg: "*12 vs. 3*"} ]
3*"} ] + - fields: + - "null": + fixed_binary: { length: 3, nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + fixed_binary: { length: 5, nullability: NULLABILITY_NULLABLE } + - fields: + - "null": + fixed_binary: + length: -2147483648 # i32 minimum + length__test: [ diag: { level: e, code: 2, msg: "*parameters cannot be negative*" } ] + nullability: NULLABILITY_NULLABLE + - "null": + fixed_binary: + length: 0 # size 0 not allowed + nullability: NULLABILITY_NULLABLE + __test: [ diag: { level: e, code: 4002, msg: "*out of range*" } ] + - fields: + - "null": + fixed_binary: + length: 1 # minimum size + nullability: NULLABILITY_NULLABLE + - "null": + fixed_binary: + length: 2147483647 # maximum size + nullability: NULLABILITY_NULLABLE + __test: + - diag: { level: e, code: 4008 } + - diag: { level: e, code: 4005 } + - diag: { level: e, code: 4005 } + - fields: + - fixed_binary: "" + fixed_binary__test: [ diag: { level: e, code: 4002, msg: "*out of range*" } ] + nullable: false + - fixed_binary: AA== # base64(00) + nullable: true + __test: [ diag: { level: e, code: 4005 } ] + __test: [ type: "NSTRUCT, y: FIXEDBINARY?<5>>" ] diff --git a/tests/tests/expressions/literals/fixed_char.yaml b/tests/tests/expressions/literals/fixed_char.yaml new file mode 100644 index 00000000..680afc95 --- /dev/null +++ b/tests/tests/expressions/literals/fixed_char.yaml @@ -0,0 +1,71 @@ +name: fixed_char +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fixed_char: { length: 3, nullability: NULLABILITY_REQUIRED } + - fixed_char: { length: 5, nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - fixed_char: "foo" + nullable: false + - fixed_char: "hello" + nullable: true + - fields: + - fixed_char: "bar" + nullable: false + - fixed_char: "world" + nullable: true + - fields: + - fixed_char: "wrong length" + nullable: false + - fixed_char: "!@#$%" + nullable: true + __test: [ diag: { level: e, code: 4005, msg: "*12 vs. 
3*"} ] + - fields: + - "null": + fixed_char: { length: 3, nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + fixed_char: { length: 5, nullability: NULLABILITY_NULLABLE } + - fields: + - "null": + fixed_char: + length: -2147483648 # i32 minimum + length__test: [ diag: { level: e, code: 2, msg: "*parameters cannot be negative*" } ] + nullability: NULLABILITY_NULLABLE + - "null": + fixed_char: + length: 0 # size 0 not allowed + nullability: NULLABILITY_NULLABLE + __test: [ diag: { level: e, code: 4002, msg: "*out of range*" } ] + - fields: + - "null": + fixed_char: + length: 1 # minimum size + nullability: NULLABILITY_NULLABLE + - "null": + fixed_char: + length: 2147483647 # maximum size + nullability: NULLABILITY_NULLABLE + __test: + - diag: { level: e, code: 4008 } + - diag: { level: e, code: 4005 } + - diag: { level: e, code: 4005 } + - fields: + - fixed_char: "" + fixed_char__test: [ diag: { level: e, code: 4002, msg: "*out of range*" } ] + nullable: false + - fixed_char: " " + nullable: true + __test: [ diag: { level: e, code: 4005 } ] + __test: [ type: "NSTRUCT, y: FIXEDCHAR?<5>>" ] diff --git a/tests/tests/expressions/literals/fp32.yaml b/tests/tests/expressions/literals/fp32.yaml new file mode 100644 index 00000000..d8b14454 --- /dev/null +++ b/tests/tests/expressions/literals/fp32.yaml @@ -0,0 +1,34 @@ +name: fp32 +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - fp32: 0.0 + nullable: false + - fp32: 3.1415926535897932384626433832795028841971 + nullable: true + - fields: + - fp32: -100000000000000000000000.0 + nullable: false + - fp32: 100000000000000000000000.0 + nullable: true + - fields: + - "null": + fp32: { nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + fp32: { nullability: NULLABILITY_NULLABLE } + __test: [ type: "NSTRUCT" ] diff --git a/tests/tests/expressions/literals/fp64.yaml b/tests/tests/expressions/literals/fp64.yaml new file mode 100644 index 00000000..4bc1c157 --- /dev/null +++ b/tests/tests/expressions/literals/fp64.yaml @@ -0,0 +1,34 @@ +name: fp64 +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp64: { nullability: NULLABILITY_REQUIRED } + - fp64: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - fp64: 0.0 + nullable: false + - fp64: 3.1415926535897932384626433832795028841971 + nullable: true + - fields: + - fp64: -100000000000000000000000.0 + nullable: false + - fp64: 100000000000000000000000.0 + nullable: true + - fields: + - "null": + fp64: { nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + fp64: { nullability: NULLABILITY_NULLABLE } + __test: [ type: "NSTRUCT" ] diff --git a/tests/tests/expressions/literals/i16.yaml b/tests/tests/expressions/literals/i16.yaml new file mode 100644 index 00000000..84f4e1b1 --- /dev/null +++ b/tests/tests/expressions/literals/i16.yaml @@ -0,0 +1,41 @@ +name: i16 +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + 
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - i16: { nullability: NULLABILITY_REQUIRED }
+            - i16: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - i16: 0
+              nullable: false
+            - i16: 0
+              nullable: true
+          - fields:
+            - i16: -32768
+              nullable: false
+            - i16: -32769
+              i16__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - i16: 32768
+              i16__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: false
+            - i16: 32767
+              nullable: true
+          - fields:
+            - "null":
+                i16: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                i16: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: i16, y: i16?>" ]
diff --git a/tests/tests/expressions/literals/i32.yaml b/tests/tests/expressions/literals/i32.yaml
new file mode 100644
index 00000000..f5c49b55
--- /dev/null
+++ b/tests/tests/expressions/literals/i32.yaml
@@ -0,0 +1,34 @@
+name: i32
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - i32: { nullability: NULLABILITY_REQUIRED }
+            - i32: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - i32: 0
+              nullable: false
+            - i32: 0
+              nullable: true
+          - fields:
+            - i32: -2147483648
+              nullable: false
+            - i32: 2147483647
+              nullable: true
+          - fields:
+            - "null":
+                i32: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                i32: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: i32, y: i32?>" ]
diff --git a/tests/tests/expressions/literals/i64.yaml b/tests/tests/expressions/literals/i64.yaml
new file mode 100644
index 00000000..ee559a2f
--- /dev/null
+++ b/tests/tests/expressions/literals/i64.yaml
@@ -0,0 +1,34 @@
+name: i64
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - i64: { nullability: NULLABILITY_REQUIRED }
+            - i64: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - i64: 0
+              nullable: false
+            - i64: 0
+              nullable: true
+          - fields:
+            - i64: -9223372036854775808
+              nullable: false
+            - i64: 9223372036854775807
+              nullable: true
+          - fields:
+            - "null":
+                i64: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                i64: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: i64, y: i64?>" ]
diff --git a/tests/tests/expressions/literals/i8.yaml b/tests/tests/expressions/literals/i8.yaml
new file mode 100644
index 00000000..1323d751
--- /dev/null
+++ b/tests/tests/expressions/literals/i8.yaml
@@ -0,0 +1,41 @@
+name: i8
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - i8: { nullability: NULLABILITY_REQUIRED }
+            - i8: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - i8: 0
+              nullable: false
+            - i8: 0
+              nullable: true
+          - fields:
+            - i8: -128
+              nullable: false
+            - i8: -129
+              i8__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - i8: 128
+              i8__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: false
+            - i8: 127
+              nullable: true
+          - fields:
+            - "null":
+                i8: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                i8: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: i8, y: i8?>" ]
diff --git a/tests/tests/expressions/literals/interval_day.yaml b/tests/tests/expressions/literals/interval_day.yaml
new file mode 100644
index 00000000..5adedae2
--- /dev/null
+++ b/tests/tests/expressions/literals/interval_day.yaml
@@ -0,0 +1,73 @@
+name: interval_day
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - interval_day: { nullability: NULLABILITY_REQUIRED }
+            - interval_day: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - interval_day_to_second:
+                days: 0
+                seconds: 0
+              nullable: false
+            - interval_day_to_second:
+                days: 123
+                seconds: 456
+              nullable: true
+          - fields:
+            - interval_day_to_second:
+                days: 3650000
+                seconds: 0
+              nullable: false
+            - interval_day_to_second:
+                days: 3650001
+                days__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                seconds: 0
+              nullable: true
+          - fields:
+            - interval_day_to_second:
+                days: -3650000
+                seconds: 0
+              nullable: false
+            - interval_day_to_second:
+                days: -3650001
+                days__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                seconds: 0
+              nullable: true
+          - fields:
+            - interval_day_to_second:
+                days: -2147483648
+                days__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                seconds: 0
+              nullable: false
+            - interval_day_to_second:
+                days: 2147483647
+                days__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                seconds: 0
+              nullable: true
+          - fields:
+            - interval_day_to_second:
+                days: 0
+                seconds: -2147483648
+              nullable: false
+            - interval_day_to_second:
+                days: 0
+                seconds: 2147483647
+              nullable: true
+          - fields:
+            - "null":
+                interval_day: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                interval_day: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: interval_day, y: interval_day?>" ]
diff --git a/tests/tests/expressions/literals/interval_year.yaml b/tests/tests/expressions/literals/interval_year.yaml
new file mode 100644
index 00000000..f3ab2bb5
--- /dev/null
+++ b/tests/tests/expressions/literals/interval_year.yaml
@@ -0,0 +1,101 @@
+name: interval_year
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - interval_year: { nullability: NULLABILITY_REQUIRED }
+            - interval_year: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - interval_year_to_month:
+                years: 0
+                months: 0
+              nullable: false
+            - interval_year_to_month:
+                years: 123
+                months: 456
+              nullable: true
+          - fields:
+            - interval_year_to_month:
+                years: 10000
+                months: -120000
+              nullable: false
+            - interval_year_to_month:
+                years: 10001
+                years__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                months: -120001
+                months__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - interval_year_to_month:
+                years: -10000
+                months: 120000
+              nullable: false
+            - interval_year_to_month:
+                years: -10001
+                years__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                months: 120001
+                months__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - interval_year_to_month:
+                years: 5000
+                months: 60000
+              nullable: false
+            - interval_year_to_month:
+                years: 5000
+                months: 60001
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - interval_year_to_month:
+                years: -5000
+                months: -60000
+              nullable: false
+            - interval_year_to_month:
+                years: -5001
+                months: -60000
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - interval_year_to_month:
+                years: -2147483648
+                years__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                months: 0
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: false
+            - interval_year_to_month:
+                years: 2147483647
+                years__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                months: 0
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - interval_year_to_month:
+                years: 0
+                months: -2147483648
+                months__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: false
+            - interval_year_to_month:
+                years: 0
+                months: 2147483647
+                months__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+                __test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ]
+              nullable: true
+          - fields:
+            - "null":
+                interval_year: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                interval_year: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: interval_year, y: interval_year?>" ]
diff --git a/tests/tests/expressions/literals/list.yaml b/tests/tests/expressions/literals/list.yaml
new file mode 100644
index 00000000..a9eed041
--- /dev/null
+++ b/tests/tests/expressions/literals/list.yaml
@@ -0,0 +1,61 @@
+name: list
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - list:
+                nullability: NULLABILITY_REQUIRED
+                type:
+                  bool: { nullability: NULLABILITY_NULLABLE }
+            - list:
+                nullability: NULLABILITY_NULLABLE
+                type:
+                  bool: { nullability: NULLABILITY_REQUIRED }
+        virtualTable:
+          values:
+          - fields:
+            - list:
+                values:
+                - boolean: false
+                  nullable: true
+                - boolean: true
+                  nullable: true
+              nullable: false
+            - list:
+                values:
+                - boolean: false
+                  nullable: false
+                - boolean: true
+                  nullable: false
+              nullable: true
+          - fields:
+            - list:
+                values: []
+                __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: values*" } ]
+              nullable: false
+            - empty_list:
+                nullability: NULLABILITY_NULLABLE
+                type:
+                  bool: { nullability: NULLABILITY_REQUIRED }
+              nullable: true
+          - fields:
+            - "null":
+                list:
+                  nullability: NULLABILITY_REQUIRED
+                  type:
+                    bool: { nullability: NULLABILITY_NULLABLE }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                list:
+                  nullability: NULLABILITY_NULLABLE
+                  type:
+                    bool: { nullability: NULLABILITY_REQUIRED }
+      __test: [ type: "NSTRUCT<x: LIST<boolean?>, y: LIST?<boolean>>" ]
diff --git a/tests/tests/expressions/literals/map.yaml b/tests/tests/expressions/literals/map.yaml
new file mode 100644
index 00000000..d4a5404c
--- /dev/null
+++ b/tests/tests/expressions/literals/map.yaml
@@ -0,0 +1,77 @@
+name: map
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - map:
+                nullability: NULLABILITY_REQUIRED
+                key: { string: { nullability: NULLABILITY_NULLABLE } }
+                value: { bool: { nullability: NULLABILITY_REQUIRED } }
+            - map:
+                nullability: NULLABILITY_NULLABLE
+                key: { bool: { nullability: NULLABILITY_REQUIRED } }
+                value: { string: { nullability: NULLABILITY_NULLABLE } }
+        virtualTable:
+          values:
+          - fields:
+            - map:
+                key_values:
+                - key:
+                    string: hello
+                    nullable: true
+                  value:
+                    boolean: false
+                    nullable: false
+                - key:
+                    string: world
+                    nullable: true
+                  value:
+                    boolean: true
+                    nullable: false
+              nullable: false
+            - map:
+                key_values:
+                - key:
+                    boolean: false
+                    nullable: false
+                  value:
+                    string: hello
+                    nullable: true
+                - key:
+                    boolean: true
+                    nullable: false
+                  value:
+                    string: world
+                    nullable: true
+              nullable: true
+          - fields:
+            - map:
+                key_values: []
+                __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: key_values*" } ]
+              nullable: false
+            - empty_map:
+                nullability: NULLABILITY_NULLABLE
+                key: { bool: { nullability: NULLABILITY_REQUIRED } }
+                value: { string: { nullability: NULLABILITY_NULLABLE } }
+              nullable: true
+          - fields:
+            - "null":
+                map:
+                  nullability: NULLABILITY_REQUIRED
+                  key: { string: { nullability: NULLABILITY_NULLABLE } }
+                  value: { bool: { nullability: NULLABILITY_REQUIRED } }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                map:
+                  nullability: NULLABILITY_NULLABLE
+                  key: { bool: { nullability: NULLABILITY_REQUIRED } }
+                  value: { string: { nullability: NULLABILITY_NULLABLE } }
+      __test: [ type: "NSTRUCT<x: MAP<string?, boolean>, y: MAP?<boolean, string?>>" ]
diff --git a/tests/tests/expressions/literals/string.yaml b/tests/tests/expressions/literals/string.yaml
new file mode 100644
index 00000000..23f66cb4
--- /dev/null
+++ b/tests/tests/expressions/literals/string.yaml
@@ -0,0 +1,34 @@
+name: string
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - string: { nullability: NULLABILITY_REQUIRED }
+            - string: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - string: "foo"
+              nullable: false
+            - string: "bar"
+              nullable: true
+          - fields:
+            - string: ""
+              nullable: false
+            - string: "!@#$%^&*()_+<>"
+              nullable: true
+          - fields:
+            - "null":
+                string: { nullability: NULLABILITY_REQUIRED }
+              __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ]
+            - "null":
+                string: { nullability: NULLABILITY_NULLABLE }
+      __test: [ type: "NSTRUCT<x: string, y: string?>" ]
diff --git a/tests/tests/expressions/literals/struct.yaml b/tests/tests/expressions/literals/struct.yaml
new file mode 100644
index 00000000..e8942608
--- /dev/null
+++ b/tests/tests/expressions/literals/struct.yaml
@@ -0,0 +1,40 @@
+name: struct
+plan:
+  __test: [ level: i ]
+  relations:
+  - rel:
+      read:
+        common:
+          direct: {}
+        baseSchema:
+          names: [x, a, b, y]
+          struct:
+            nullability: NULLABILITY_REQUIRED
+            types:
+            - struct:
+                nullability: NULLABILITY_REQUIRED
+                types:
+                - bool: { nullability: NULLABILITY_REQUIRED }
+                - bool: { nullability: NULLABILITY_NULLABLE }
+            - struct: { nullability: NULLABILITY_NULLABLE }
+        virtualTable:
+          values:
+          - fields:
+            - struct:
+                fields:
+                - { boolean: false, nullable: false }
+                - { boolean: false, nullable: true }
+              nullable: false
+            - struct: {}
+              nullable: true
+          - fields:
+            - "null":
+                struct:
+                  nullability: NULLABILITY_REQUIRED
+                  types:
+                  - bool: { nullability: NULLABILITY_REQUIRED }
+                  - bool: { nullability: NULLABILITY_NULLABLE }
NULLABILITY_NULLABLE } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + struct: { nullability: NULLABILITY_NULLABLE } + __test: [ type: "NSTRUCT, y: NSTRUCT?<>>" ] diff --git a/tests/tests/expressions/literals/time.yaml b/tests/tests/expressions/literals/time.yaml new file mode 100644 index 00000000..55cbca4a --- /dev/null +++ b/tests/tests/expressions/literals/time.yaml @@ -0,0 +1,48 @@ +name: time +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - time: { nullability: NULLABILITY_REQUIRED } + - time: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - time: 61200000000 # 17:00:00.000000 + nullable: false + - time: 45296789876 # 12:34:56.789876 + nullable: true + - fields: + - time: 0 # 00:00:00.000000 (lowest value) + nullable: false + - time: -1 # before start of day + time__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: true + - fields: + - time: 86400000000 # after end of day (leap seconds not supported) + time__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: false + - time: 86399999999 # 23:59:59.999999 (highest value) + nullable: true + - fields: + - time: -9223372036854775808 # i64 min + time__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: false + - time: 9223372036854775807 # i64 max + time__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: true + - fields: + - "null": + time: { nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + time: { nullability: NULLABILITY_NULLABLE } + __test: [ type: "NSTRUCT" ] diff --git a/tests/tests/expressions/literals/timestamp.yaml b/tests/tests/expressions/literals/timestamp.yaml new file mode 100644 index 00000000..5410c4df --- /dev/null +++ b/tests/tests/expressions/literals/timestamp.yaml @@ -0,0 +1,48 @@ +name: timestamp +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - timestamp: { nullability: NULLABILITY_REQUIRED } + - timestamp: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - timestamp: 0 # 1970 epoch + nullable: false + - timestamp: 1575158400000000 # 2019-12-00 00:00:00.000000 + nullable: true + - fields: + - timestamp: -30610224000000000 # 1000-01-01 00:00:00.000000 + nullable: false + - timestamp: -30610224000000001 # 999-12-31 23:59:59.999999 + timestamp__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: true + - fields: + - timestamp: 253402300800000000 # 10000-01-01 00:00:00.000000 + timestamp__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: false + - timestamp: 253402300799999999 # 9999-12-31 23:59:59.999999 + nullable: true + - fields: + - timestamp: -9223372036854775808 # i64 min (not representable with chrono::NaiveDate; don't panic!) + timestamp__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: false + - timestamp: 9223372036854775807 # i64 max (not representable with chrono::NaiveDate; don't panic!) 
+ timestamp__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: true + - fields: + - "null": + timestamp: { nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + timestamp: { nullability: NULLABILITY_NULLABLE } + __test: [ type: "NSTRUCT" ] diff --git a/tests/tests/expressions/literals/timestamp_tz.yaml b/tests/tests/expressions/literals/timestamp_tz.yaml new file mode 100644 index 00000000..27f230e2 --- /dev/null +++ b/tests/tests/expressions/literals/timestamp_tz.yaml @@ -0,0 +1,48 @@ +name: timestamp_tz +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - timestamp_tz: { nullability: NULLABILITY_REQUIRED } + - timestamp_tz: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - timestamp_tz: 0 # 1970 epoch + nullable: false + - timestamp_tz: 1575158400000000 # 2019-12-00 00:00:00.000000 + nullable: true + - fields: + - timestamp_tz: -30610224000000000 # 1000-01-01 00:00:00.000000 + nullable: false + - timestamp_tz: -30610224000000001 # 999-12-31 23:59:59.999999 + timestamp_tz__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: true + - fields: + - timestamp_tz: 253402300800000000 # 10000-01-01 00:00:00.000000 + timestamp_tz__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: false + - timestamp_tz: 253402300799999999 # 9999-12-31 23:59:59.999999 + nullable: true + - fields: + - timestamp_tz: -9223372036854775808 # i64 min (not representable with chrono::NaiveDate; don't panic!) + timestamp_tz__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: false + - timestamp_tz: 9223372036854775807 # i64 max (not representable with chrono::NaiveDate; don't panic!) 
+ timestamp_tz__test: [ diag: { level: e, code: 6002, msg: "*out of range*" } ] + nullable: true + - fields: + - "null": + timestamp_tz: { nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + timestamp_tz: { nullability: NULLABILITY_NULLABLE } + __test: [ type: "NSTRUCT" ] diff --git a/tests/tests/expressions/literals/uuid.yaml b/tests/tests/expressions/literals/uuid.yaml new file mode 100644 index 00000000..1993392a --- /dev/null +++ b/tests/tests/expressions/literals/uuid.yaml @@ -0,0 +1,36 @@ +name: uuid +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - uuid: { nullability: NULLABILITY_REQUIRED } + - uuid: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - uuid: K2lgaYZgQDqtouub8GqT+g== # base64(2b696069-8660-403a-ada2-eb9bf06a93fa) + nullable: false + - uuid: AAAAAAAAAAAAAAAAAAAAAA== # base64(00000000-0000-0000-0000-000000000000) + nullable: true + - fields: + - uuid: AAAAAAAAAAAAAAAAAAAA # 15 bytes + uuid__test: [ diag: { level: e, code: 6002, msg: "*16 bytes*15*" } ] + nullable: false + - uuid: AAAAAAAAAAAAAAAAAAAAAAA= # 17 bytes + uuid__test: [ diag: { level: e, code: 6002, msg: "*16 bytes*17*" } ] + nullable: true + - fields: + - "null": + uuid: { nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + uuid: { nullability: NULLABILITY_NULLABLE } + __test: [ type: "NSTRUCT" ] diff --git a/tests/tests/expressions/literals/var_char.yaml b/tests/tests/expressions/literals/var_char.yaml new file mode 100644 index 00000000..34100737 --- /dev/null +++ b/tests/tests/expressions/literals/var_char.yaml @@ -0,0 +1,76 @@ +name: varchar +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - varchar: { length: 3, nullability: NULLABILITY_REQUIRED } + - varchar: { length: 6, nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - var_char: + value: "foo" + length: 3 + nullable: false + - var_char: + value: "hello" + length: 6 + nullable: true + - fields: + - var_char: + value: "" + length: 3 + nullable: false + - var_char: + value: "world!" 
+ length: 6 + nullable: true + - fields: + - var_char: + value: "too long" + value__test: [ diag: { level: e, code: 6002, msg: "*longer than specified length*" } ] + length: 3 + nullable: false + - var_char: + value: "!@#$%^" + length: 6 + nullable: true + - fields: + - "null": + varchar: { length: 3, nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4008, msg: "*type of null literal must be nullable*" } ] + - "null": + varchar: { length: 6, nullability: NULLABILITY_NULLABLE } + - fields: + - "null": + varchar: + length: -2147483648 # i32 minimum + length__test: [ diag: { level: e, code: 2, msg: "*parameters cannot be negative*" } ] + nullability: NULLABILITY_NULLABLE + - "null": + varchar: + length: 0 # size 0 not allowed + nullability: NULLABILITY_NULLABLE + __test: [ diag: { level: e, code: 4002, msg: "*out of range*" } ] + - fields: + - "null": + varchar: + length: 1 # minimum size + nullability: NULLABILITY_NULLABLE + - "null": + varchar: + length: 2147483647 # maximum size + nullability: NULLABILITY_NULLABLE + __test: + - diag: { level: e, code: 4008 } + - diag: { level: e, code: 4005 } + - diag: { level: e, code: 4005 } + __test: [ type: "NSTRUCT, y: VARCHAR?<6>>" ] diff --git a/tests/tests/expressions/subqueries/comparison.yaml b/tests/tests/expressions/subqueries/comparison.yaml new file mode 100644 index 00000000..486c7dc5 --- /dev/null +++ b/tests/tests/expressions/subqueries/comparison.yaml @@ -0,0 +1,72 @@ +name: comparison-subquery +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: { names: [ test ] } + expressions: + - subquery: + setComparison: + right: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + left: { literal: { string: test } } + comparisonOp: COMPARISON_OP_EQ + reductionOp: REDUCTION_OP_ALL + __test: [ type: boolean ] + - subquery: + setComparison: + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test3 + __test: [ diag: { level: e, code: 6004, msg: "*subquery must return a single column*" } ] + left: { literal: { string: test } } + comparisonOp: COMPARISON_OP_UNSPECIFIED + comparisonOp__test: [ diag: { level: e, code: 2, msg: "*this enum may not be left unspecified*" } ] + reductionOp: REDUCTION_OP_UNSPECIFIED + reductionOp__test: [ diag: { level: e, code: 2, msg: "*this enum may not be left unspecified*" } ] + __test: [ type: boolean ] + - subquery: + setComparison: + right: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test4 + __test: [ diag: { level: e, code: 4005, msg: "*string vs. 
i32*" } ] + left: { literal: { i32: 0 } } + comparisonOp: COMPARISON_OP_GE + reductionOp: REDUCTION_OP_ANY + __test: [ type: boolean ] diff --git a/tests/tests/expressions/subqueries/in-predicate.yaml b/tests/tests/expressions/subqueries/in-predicate.yaml new file mode 100644 index 00000000..65bb3528 --- /dev/null +++ b/tests/tests/expressions/subqueries/in-predicate.yaml @@ -0,0 +1,69 @@ +name: in-subquery +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: { names: [ test ] } + expressions: + - subquery: + inPredicate: + haystack: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + needles: + - literal: { string: test } + - literal: { i32: 0 } + __test: [ type: boolean ] + - subquery: + inPredicate: + haystack: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test3 + needles: + - literal: { i16: 0 } + __test: [ diag: { level: e, code: 4005, msg: "*column 1*i32 vs. i16*" } ] + __test: [ type: boolean ] + - subquery: + inPredicate: + haystack: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test3 + needles: + - literal: { i32: 0 } + - literal: { i16: 0 } + __test: [ diag: { level: e, code: 4005, msg: "*column count mismatch*" } ] + __test: [ type: boolean ] diff --git a/tests/tests/expressions/subqueries/scalar.yaml b/tests/tests/expressions/subqueries/scalar.yaml new file mode 100644 index 00000000..cbf2382a --- /dev/null +++ b/tests/tests/expressions/subqueries/scalar.yaml @@ -0,0 +1,45 @@ +name: scalar-subquery +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: { names: [ test ] } + expressions: + - subquery: + scalar: + input: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + __test: [ type: string ] + - subquery: + scalar: + input: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - date: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test3 + __test: [ diag: { level: e, code: 6004, msg: "*subquery must return a single column*" } ] diff --git a/tests/tests/expressions/subqueries/set.yaml b/tests/tests/expressions/subqueries/set.yaml new file mode 100644 index 00000000..306500e3 --- /dev/null +++ b/tests/tests/expressions/subqueries/set.yaml @@ -0,0 +1,65 @@ +name: set-subquery +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [x] + struct: + nullability: NULLABILITY_REQUIRED + types: + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: { names: [ test ] } + expressions: + - subquery: + setPredicate: + tuples: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - 
i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + predicateOp: PREDICATE_OP_EXISTS + __test: [ type: boolean ] + - subquery: + setPredicate: + tuples: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + predicateOp: PREDICATE_OP_UNIQUE + __test: [ type: boolean ] + - subquery: + setPredicate: + tuples: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + predicateOp: PREDICATE_OP_UNSPECIFIED + predicateOp__test: [ diag: { level: e, code: 2, msg: "*this enum may not be left unspecified*" } ] + __test: [ type: boolean ] diff --git a/tests/tests/extensions/advanced/enhancement-not-declared.yaml b/tests/tests/extensions/advanced/enhancement-not-declared.yaml new file mode 100644 index 00000000..7c679487 --- /dev/null +++ b/tests/tests/extensions/advanced/enhancement-not-declared.yaml @@ -0,0 +1,21 @@ +name: adv-ext-enhancement-not-declared +plan: + __test: [ level: w ] + relations: + - rel: + read: + common: + advancedExtension: + enhancement: + "@type": substrait.Plan + __test: [ diag: { level: e, code: 1006, msg: '*missing protobuf "any" declaration*' } ] + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test diff --git a/tests/tests/extensions/advanced/enhancement.yaml b/tests/tests/extensions/advanced/enhancement.yaml new file mode 100644 index 00000000..3f0d7566 --- /dev/null +++ b/tests/tests/extensions/advanced/enhancement.yaml @@ -0,0 +1,22 @@ +name: adv-ext-enhancement +plan: + __test: [ level: w ] + relations: + - rel: + read: + common: + advancedExtension: + enhancement: + "@type": substrait.Plan + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/extensions/advanced/missing-declaration.yaml b/tests/tests/extensions/advanced/missing-declaration.yaml new file mode 100644 index 00000000..d217392d --- /dev/null +++ b/tests/tests/extensions/advanced/missing-declaration.yaml @@ -0,0 +1,21 @@ +name: adv-ext-missing-declaration +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + advancedExtension: + optimization: + "@type": substrait.Plan + __test: [ diag: { level: e, code: 1006, msg: '*missing protobuf "any" declaration: substrait.Plan*' } ] + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test diff --git a/tests/tests/extensions/advanced/optimization-not-declared.yaml b/tests/tests/extensions/advanced/optimization-not-declared.yaml new file mode 100644 index 00000000..5f52547a --- /dev/null +++ b/tests/tests/extensions/advanced/optimization-not-declared.yaml @@ -0,0 +1,21 @@ +name: adv-ext-optimization-not-declared +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + advancedExtension: + optimization: + "@type": substrait.Plan + 
__test: [ diag: { level: e, code: 1006, msg: '*missing protobuf "any" declaration*' } ] + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test diff --git a/tests/tests/extensions/advanced/optimization.yaml b/tests/tests/extensions/advanced/optimization.yaml new file mode 100644 index 00000000..1972de0c --- /dev/null +++ b/tests/tests/extensions/advanced/optimization.yaml @@ -0,0 +1,22 @@ +name: adv-ext-optimization +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + advancedExtension: + optimization: + "@type": substrait.Plan + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/extensions/advanced/unused-declaration.yaml b/tests/tests/extensions/advanced/unused-declaration.yaml new file mode 100644 index 00000000..432e1c5a --- /dev/null +++ b/tests/tests/extensions/advanced/unused-declaration.yaml @@ -0,0 +1,24 @@ +name: adv-ext-unused-declaration +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + advancedExtension: + optimization: + "@type": substrait.Plan + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + expectedTypeUrls: + - substrait.Plan + - not.Used + __test: [ diag: { level: i, code: 7001, msg: '*not.Used is not present in the plan*' } ] diff --git a/tests/tests/relations/aggregate/measure-and-group.yaml b/tests/tests/relations/aggregate/measure-and-group.yaml new file mode 100644 index 00000000..cd2acdff --- /dev/null +++ b/tests/tests/relations/aggregate/measure-and-group.yaml @@ -0,0 +1,45 @@ +name: aggregate-measure-and-group +plan: + __test: [ level: iw ] + extensionUris: + - extensionUriAnchor: 1 + uri__yaml: + aggregate_functions: + - name: "count" + description: Count number of rows + impls: + - args: [] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64 + return: i64 + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: "count:" + relations: + - rel: + aggregate: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + groupings: + - groupingExpressions: + - selection: + rootReference: {} + directReference: { structField: { field: 0 } } + measures: + - measure: + functionReference: 1 + output_type: { i64: { nullability: NULLABILITY_REQUIRED } } + __test: [ type: "STRUCT<string, i64>" ] diff --git a/tests/tests/relations/aggregate/measure.yaml b/tests/tests/relations/aggregate/measure.yaml new file mode 100644 index 00000000..9dc2dc35 --- /dev/null +++ b/tests/tests/relations/aggregate/measure.yaml @@ -0,0 +1,40 @@ +name: aggregate-measure +plan: + __test: [ level: iw ] + extensionUris: + - extensionUriAnchor: 1 + uri__yaml: + aggregate_functions: + - name: "count" + description: Count number of rows + impls: + - args: [] + nullability: DECLARED_OUTPUT + decomposable: MANY + intermediate: i64 + return: i64 + extensions: + - extensionFunction: + 
extensionUriReference: 1 + functionAnchor: 1 + name: "count:" + relations: + - rel: + aggregate: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + measures: + - measure: + functionReference: 1 + output_type: { i64: { nullability: NULLABILITY_REQUIRED } } + __test: [ type: "STRUCT<i64>" ] diff --git a/tests/tests/relations/aggregate/missing-set-or-measure.yaml b/tests/tests/relations/aggregate/missing-set-or-measure.yaml new file mode 100644 index 00000000..deb680a6 --- /dev/null +++ b/tests/tests/relations/aggregate/missing-set-or-measure.yaml @@ -0,0 +1,19 @@ +name: aggregate-missing-set-or-measure +plan: + __test: [ level: i ] + relations: + - rel: + aggregate: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ diag: { level: e, code: 5003, msg: "*must have at least one grouping expression or measure*" } ] diff --git a/tests/tests/relations/aggregate/multi-set-reused.yaml b/tests/tests/relations/aggregate/multi-set-reused.yaml new file mode 100644 index 00000000..d9214a3b --- /dev/null +++ b/tests/tests/relations/aggregate/multi-set-reused.yaml @@ -0,0 +1,31 @@ +name: aggregate-multi-set-reused +plan: + __test: [ level: i ] + relations: + - rel: + aggregate: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + groupings: + - groupingExpressions: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + - selection: + rootReference: {} + directReference: { structField: { field: 0 } } + - groupingExpressions: + - selection: + rootReference: {} + directReference: { structField: { field: 0 } } + __test: [ type: "STRUCT<fp32?, string?, i32>" ] diff --git a/tests/tests/relations/aggregate/multi-set.yaml b/tests/tests/relations/aggregate/multi-set.yaml new file mode 100644 index 00000000..4a6a5add --- /dev/null +++ b/tests/tests/relations/aggregate/multi-set.yaml @@ -0,0 +1,28 @@ +name: aggregate-multi-set +plan: + __test: [ level: i ] + relations: + - rel: + aggregate: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + groupings: + - groupingExpressions: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + - groupingExpressions: + - selection: + rootReference: {} + directReference: { structField: { field: 0 } } + __test: [ type: "STRUCT<fp32?, string?, i32>" ] diff --git a/tests/tests/relations/aggregate/single-set-one-expr.yaml b/tests/tests/relations/aggregate/single-set-one-expr.yaml new file mode 100644 index 00000000..aee213da --- /dev/null +++ b/tests/tests/relations/aggregate/single-set-one-expr.yaml @@ -0,0 +1,24 @@ +name: aggregate-single-set-one-expr +plan: + __test: [ level: i ] + relations: + - rel: + aggregate: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + 
groupings: + - groupingExpressions: + - selection: + rootReference: {} + directReference: { structField: { field: 0 } } + __test: [ type: "STRUCT<string>" ] diff --git a/tests/tests/relations/aggregate/single-set-two-expr.yaml b/tests/tests/relations/aggregate/single-set-two-expr.yaml new file mode 100644 index 00000000..9b3d8a4a --- /dev/null +++ b/tests/tests/relations/aggregate/single-set-two-expr.yaml @@ -0,0 +1,27 @@ +name: aggregate-single-set-two-expr +plan: + __test: [ level: i ] + relations: + - rel: + aggregate: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + groupings: + - groupingExpressions: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + - selection: + rootReference: {} + directReference: { structField: { field: 0 } } + __test: [ type: "STRUCT<fp32, string>" ] diff --git a/tests/tests/relations/aggregate/single-set-zero-expr.yaml b/tests/tests/relations/aggregate/single-set-zero-expr.yaml new file mode 100644 index 00000000..a87b9529 --- /dev/null +++ b/tests/tests/relations/aggregate/single-set-zero-expr.yaml @@ -0,0 +1,22 @@ +name: aggregate-single-set-missing-expr +plan: + __test: [ level: i ] + relations: + - rel: + aggregate: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - fp32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + groupings: + - groupingExpressions: [] + __test: + - diag: { level: e, code: 5003, msg: "*must have at least one grouping expression or measure*" } diff --git a/tests/tests/relations/common/direct.yaml b/tests/tests/relations/common/direct.yaml new file mode 100644 index 00000000..52376ae7 --- /dev/null +++ b/tests/tests/relations/common/direct.yaml @@ -0,0 +1,18 @@ +name: rel-common-direct +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + direct: {} + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ type: "NSTRUCT<a: string>" ] diff --git a/tests/tests/relations/common/emit-basic.yaml b/tests/tests/relations/common/emit-basic.yaml new file mode 100644 index 00000000..4acb3a81 --- /dev/null +++ b/tests/tests/relations/common/emit-basic.yaml @@ -0,0 +1,22 @@ +name: rel-common-emit-basic +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + emit: + outputMapping: + - 1 + - 0 + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ type: "STRUCT<i32, string>" ] diff --git a/tests/tests/relations/common/emit-empty.yaml b/tests/tests/relations/common/emit-empty.yaml new file mode 100644 index 00000000..797774d1 --- /dev/null +++ b/tests/tests/relations/common/emit-empty.yaml @@ -0,0 +1,20 @@ +name: rel-common-emit-empty +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + emit: + outputMapping: [] + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ type: "STRUCT<>" ] diff --git a/tests/tests/relations/common/emit-out-of-range.yaml 
b/tests/tests/relations/common/emit-out-of-range.yaml new file mode 100644 index 00000000..6d3f7277 --- /dev/null +++ b/tests/tests/relations/common/emit-out-of-range.yaml @@ -0,0 +1,21 @@ +name: rel-common-emit-out-of-range +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + emit: + outputMapping: + - 2 + outputMapping.0__test: [ diag: { level: e, code: 4004, msg: "*index out of range*" } ] + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test diff --git a/tests/tests/relations/common/enhancement.yaml b/tests/tests/relations/common/enhancement.yaml new file mode 100644 index 00000000..eba8d7c1 --- /dev/null +++ b/tests/tests/relations/common/enhancement.yaml @@ -0,0 +1,23 @@ +name: rel-common-enhancement +plan: + __test: [ level: w ] + relations: + - rel: + read: + common: + advancedExtension: + enhancement: + '@type': substrait.Plan + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + # enhancements may affect schema, so validator does not know type + __test: [ type: "!" ] + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/common/hints-functional.yaml b/tests/tests/relations/common/hints-functional.yaml new file mode 100644 index 00000000..b4efbf34 --- /dev/null +++ b/tests/tests/relations/common/hints-functional.yaml @@ -0,0 +1,26 @@ +name: rel-common-hints-functional +plan: + __test: [ level: w ] + relations: + - rel: + read: + common: + hint: + stats: + rowCount: 100 + recordSize: 100 + advancedExtension: + enhancement: + '@type': substrait.Plan + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ type: "NSTRUCT<a: string>" ] + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/common/hints.yaml b/tests/tests/relations/common/hints.yaml new file mode 100644 index 00000000..0c11dbe4 --- /dev/null +++ b/tests/tests/relations/common/hints.yaml @@ -0,0 +1,26 @@ +name: rel-common-hints +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + hint: + stats: + rowCount: 100 + recordSize: 100 + advancedExtension: + optimization: + '@type': substrait.Plan + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ type: "NSTRUCT<a: string>" ] + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/common/omitted.yaml b/tests/tests/relations/common/omitted.yaml new file mode 100644 index 00000000..fcdc42dd --- /dev/null +++ b/tests/tests/relations/common/omitted.yaml @@ -0,0 +1,16 @@ +name: rel-common-omitted +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ type: "NSTRUCT<a: string>" ] diff --git a/tests/tests/relations/common/optimization.yaml b/tests/tests/relations/common/optimization.yaml new file mode 100644 index 00000000..2476e9c9 --- /dev/null +++ b/tests/tests/relations/common/optimization.yaml @@ -0,0 +1,22 @@ +name: rel-common-optimization +plan: + __test: [ level: i ] + relations: + - rel: + read: + common: + advancedExtension: + 
optimization: + '@type': substrait.Plan + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ type: "NSTRUCT<a: string>" ] + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/cross/basic.yaml b/tests/tests/relations/cross/basic.yaml new file mode 100644 index 00000000..92ee81d0 --- /dev/null +++ b/tests/tests/relations/cross/basic.yaml @@ -0,0 +1,31 @@ +name: cross-basic +plan: + __test: [ level: i ] + relations: + - rel: + cross: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + __test: [ type: "STRUCT<string, i32, fp32, boolean>" ] diff --git a/tests/tests/relations/cross/unknown-schema.yaml b/tests/tests/relations/cross/unknown-schema.yaml new file mode 100644 index 00000000..df1499b0 --- /dev/null +++ b/tests/tests/relations/cross/unknown-schema.yaml @@ -0,0 +1,25 @@ +name: cross-unknown-schema +plan: + __test: [ level: i ] + relations: + - rel: + cross: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + namedTable: + names: + - test2 + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: base_schema*" } ] + __test: [ type: "!" ] diff --git a/tests/tests/relations/extensions/leaf-missing-detail.yaml b/tests/tests/relations/extensions/leaf-missing-detail.yaml new file mode 100644 index 00000000..d1bc266d --- /dev/null +++ b/tests/tests/relations/extensions/leaf-missing-detail.yaml @@ -0,0 +1,9 @@ +name: rel-extension-leaf-missing-detail +plan: + __test: [ level: i ] + relations: + - rel: + extensionLeaf: + __test: + - diag: { level: e, code: 1002, msg: "*missing required protobuf field: detail*" } + - type: "!" diff --git a/tests/tests/relations/extensions/leaf.yaml b/tests/tests/relations/extensions/leaf.yaml new file mode 100644 index 00000000..5c188f3d --- /dev/null +++ b/tests/tests/relations/extensions/leaf.yaml @@ -0,0 +1,11 @@ +name: rel-extension-leaf +plan: + __test: [ level: w ] + relations: + - rel: + extensionLeaf: + detail: + '@type': substrait.Plan + __test: [ type: "!" ] + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/extensions/multi-missing-detail.yaml b/tests/tests/relations/extensions/multi-missing-detail.yaml new file mode 100644 index 00000000..e7924d85 --- /dev/null +++ b/tests/tests/relations/extensions/multi-missing-detail.yaml @@ -0,0 +1,22 @@ +name: rel-extension-multi-missing-detail +plan: + __test: [ level: i ] + relations: + - rel: + extensionMulti: + inputs: + - read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: + - diag: { level: e, code: 1002, msg: "*missing required protobuf field: detail*" } + - type: "!" 
+ expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/extensions/multi-without-inputs.yaml b/tests/tests/relations/extensions/multi-without-inputs.yaml new file mode 100644 index 00000000..82e30896 --- /dev/null +++ b/tests/tests/relations/extensions/multi-without-inputs.yaml @@ -0,0 +1,12 @@ +name: rel-extension-multi-without-inputs +plan: + __test: [ level: w ] + relations: + - rel: + extensionMulti: + inputs: [] + detail: + '@type': substrait.Plan + __test: [ type: "!" ] + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/extensions/multi.yaml b/tests/tests/relations/extensions/multi.yaml new file mode 100644 index 00000000..26318821 --- /dev/null +++ b/tests/tests/relations/extensions/multi.yaml @@ -0,0 +1,22 @@ +name: rel-extension-multi +plan: + __test: [ level: w ] + relations: + - rel: + extensionMulti: + inputs: + - read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + detail: + '@type': substrait.Plan + __test: [ type: "!" ] + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/extensions/single-missing-detail.yaml b/tests/tests/relations/extensions/single-missing-detail.yaml new file mode 100644 index 00000000..005edc6b --- /dev/null +++ b/tests/tests/relations/extensions/single-missing-detail.yaml @@ -0,0 +1,22 @@ +name: rel-extension-single-missing-detail +plan: + __test: [ level: i ] + relations: + - rel: + extensionSingle: + input: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: + - diag: { level: e, code: 1002, msg: "*missing required protobuf field: detail*" } + - type: "!" + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/extensions/single.yaml b/tests/tests/relations/extensions/single.yaml new file mode 100644 index 00000000..b86248d6 --- /dev/null +++ b/tests/tests/relations/extensions/single.yaml @@ -0,0 +1,22 @@ +name: rel-extension-single +plan: + __test: [ level: w ] + relations: + - rel: + extensionSingle: + input: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + detail: + '@type': substrait.Plan + __test: [ type: "!" 
] + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/fetch/all.yaml b/tests/tests/relations/fetch/all.yaml new file mode 100644 index 00000000..b8ffb6c9 --- /dev/null +++ b/tests/tests/relations/fetch/all.yaml @@ -0,0 +1,18 @@ +name: fetch-all +plan: + __test: [ level: i ] + relations: + - rel: + fetch: + input: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ type: "NSTRUCT<a: string>" ] diff --git a/tests/tests/relations/fetch/discard-first-n.yaml b/tests/tests/relations/fetch/discard-first-n.yaml new file mode 100644 index 00000000..e5901a2b --- /dev/null +++ b/tests/tests/relations/fetch/discard-first-n.yaml @@ -0,0 +1,19 @@ +name: fetch-discard-first-n +plan: + __test: [ level: i ] + relations: + - rel: + fetch: + input: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + offset: 100 + __test: [ type: "NSTRUCT<a: string>" ] diff --git a/tests/tests/relations/fetch/discard-first.yaml b/tests/tests/relations/fetch/discard-first.yaml new file mode 100644 index 00000000..2184b98b --- /dev/null +++ b/tests/tests/relations/fetch/discard-first.yaml @@ -0,0 +1,19 @@ +name: fetch-discard-first +plan: + __test: [ level: i ] + relations: + - rel: + fetch: + input: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + offset: 1 + __test: [ type: "NSTRUCT<a: string>" ] diff --git a/tests/tests/relations/fetch/n-rows.yaml b/tests/tests/relations/fetch/n-rows.yaml new file mode 100644 index 00000000..b86186f2 --- /dev/null +++ b/tests/tests/relations/fetch/n-rows.yaml @@ -0,0 +1,20 @@ +name: fetch-n-rows +plan: + __test: [ level: i ] + relations: + - rel: + fetch: + input: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + offset: 100 + count: 50 + __test: [ type: "NSTRUCT<a: string>" ] diff --git a/tests/tests/relations/fetch/single-row.yaml b/tests/tests/relations/fetch/single-row.yaml new file mode 100644 index 00000000..6b2fefef --- /dev/null +++ b/tests/tests/relations/fetch/single-row.yaml @@ -0,0 +1,20 @@ +name: fetch-single-row +plan: + __test: [ level: i ] + relations: + - rel: + fetch: + input: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + offset: 100 + count: 1 + __test: [ type: "NSTRUCT<a: string>" ] diff --git a/tests/tests/relations/filter/basic.yaml b/tests/tests/relations/filter/basic.yaml new file mode 100644 index 00000000..f4b7d334 --- /dev/null +++ b/tests/tests/relations/filter/basic.yaml @@ -0,0 +1,23 @@ +name: filter-basic +plan: + __test: [ level: i ] + relations: + - rel: + filter: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + condition: + selection: + rootReference: {} + directReference: { structField: { field: 1 } } + __test: [ type: "NSTRUCT<a: string, b: boolean>" ] diff --git a/tests/tests/relations/filter/missing.yaml b/tests/tests/relations/filter/missing.yaml new file mode 100644 index 00000000..965f2dab --- /dev/null 
+++ b/tests/tests/relations/filter/missing.yaml @@ -0,0 +1,19 @@ +name: filter-missing +plan: + __test: [ level: i ] + relations: + - rel: + filter: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: condition*" } ] diff --git a/tests/tests/relations/filter/not-bool.yaml b/tests/tests/relations/filter/not-bool.yaml new file mode 100644 index 00000000..5961eb3b --- /dev/null +++ b/tests/tests/relations/filter/not-bool.yaml @@ -0,0 +1,24 @@ +name: filter-not-bool +plan: + __test: [ level: i ] + relations: + - rel: + filter: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + condition: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + __test: [ diag: { level: e, code: 4005, msg: "*must yield booleans*string*" } ] + __test: [ type: "NSTRUCT<a: string, b: boolean>" ] diff --git a/tests/tests/relations/filter/nullable.yaml b/tests/tests/relations/filter/nullable.yaml new file mode 100644 index 00000000..8ee210f8 --- /dev/null +++ b/tests/tests/relations/filter/nullable.yaml @@ -0,0 +1,23 @@ +name: filter-nullable +plan: + __test: [ level: i ] + relations: + - rel: + filter: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + condition: + selection: + rootReference: {} + directReference: { structField: { field: 1 } } + __test: [ type: "NSTRUCT<a: string, b: boolean?>" ] diff --git a/tests/tests/relations/join/anti.yaml b/tests/tests/relations/join/anti.yaml new file mode 100644 index 00000000..d6ebdd78 --- /dev/null +++ b/tests/tests/relations/join/anti.yaml @@ -0,0 +1,36 @@ +name: join-anti +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + type: JOIN_TYPE_ANTI + __test: [ type: "STRUCT<string, i32>" ] diff --git a/tests/tests/relations/join/expr-not-bool.yaml b/tests/tests/relations/join/expr-not-bool.yaml new file mode 100644 index 00000000..dd69c4ef --- /dev/null +++ b/tests/tests/relations/join/expr-not-bool.yaml @@ -0,0 +1,36 @@ +name: join-expr-not-bool +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + 
namedTable: + names: + - test2 + expression: + __test: [ diag: { level: e, code: 4005, msg: "*predicates must yield booleans*fp32*" } ] + selection: + rootReference: {} + directReference: { structField: { field: 2 } } + type: JOIN_TYPE_INNER diff --git a/tests/tests/relations/join/filter-not-bool.yaml b/tests/tests/relations/join/filter-not-bool.yaml new file mode 100644 index 00000000..b4595ae9 --- /dev/null +++ b/tests/tests/relations/join/filter-not-bool.yaml @@ -0,0 +1,40 @@ +name: join-filter-not-bool +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + type: JOIN_TYPE_INNER + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + postJoinFilter: + __test: [ diag: { level: e, code: 4005, msg: "*predicates must yield booleans*fp32*" } ] + selection: + rootReference: {} + directReference: { structField: { field: 2 } } diff --git a/tests/tests/relations/join/filter-range.yaml b/tests/tests/relations/join/filter-range.yaml new file mode 100644 index 00000000..87b55d52 --- /dev/null +++ b/tests/tests/relations/join/filter-range.yaml @@ -0,0 +1,42 @@ +name: join-filter-range +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + type: JOIN_TYPE_SEMI + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + postJoinFilter: + selection: + rootReference: {} + directReference: + structField: + field: 3 + field__test: [ diag: { level: e, code: 2, msg: "*struct index out of range (size = 2)*" } ] diff --git a/tests/tests/relations/join/filter.yaml b/tests/tests/relations/join/filter.yaml new file mode 100644 index 00000000..f58f54cf --- /dev/null +++ b/tests/tests/relations/join/filter.yaml @@ -0,0 +1,39 @@ +name: join-filter +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + type: JOIN_TYPE_INNER + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + postJoinFilter: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } diff --git a/tests/tests/relations/join/inner.yaml b/tests/tests/relations/join/inner.yaml new file mode 100644 index 00000000..b8fdbfb6 --- 
/dev/null +++ b/tests/tests/relations/join/inner.yaml @@ -0,0 +1,36 @@ +name: join-inner +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + type: JOIN_TYPE_INNER + __test: [ type: "STRUCT<string, i32, fp32, boolean>" ] diff --git a/tests/tests/relations/join/left.yaml b/tests/tests/relations/join/left.yaml new file mode 100644 index 00000000..53880dae --- /dev/null +++ b/tests/tests/relations/join/left.yaml @@ -0,0 +1,36 @@ +name: join-left +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + type: JOIN_TYPE_LEFT + __test: [ type: "STRUCT<string, i32, fp32?, boolean?>" ] diff --git a/tests/tests/relations/join/missing-expr.yaml b/tests/tests/relations/join/missing-expr.yaml new file mode 100644 index 00000000..9facf797 --- /dev/null +++ b/tests/tests/relations/join/missing-expr.yaml @@ -0,0 +1,32 @@ +name: join-missing-expr +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + type: JOIN_TYPE_INNER + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: expression*" } ] diff --git a/tests/tests/relations/join/missing-type.yaml b/tests/tests/relations/join/missing-type.yaml new file mode 100644 index 00000000..befa6c31 --- /dev/null +++ b/tests/tests/relations/join/missing-type.yaml @@ -0,0 +1,35 @@ +name: join-missing-type +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + type__test: [ diag: { level: e, code: 2, msg: "*this enum may not be left unspecified*" } ] diff --git 
a/tests/tests/relations/join/outer.yaml b/tests/tests/relations/join/outer.yaml new file mode 100644 index 00000000..bf720706 --- /dev/null +++ b/tests/tests/relations/join/outer.yaml @@ -0,0 +1,36 @@ +name: join-outer +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + type: JOIN_TYPE_OUTER + __test: [ type: "STRUCT<string?, i32?, fp32?, boolean?>" ] diff --git a/tests/tests/relations/join/right.yaml b/tests/tests/relations/join/right.yaml new file mode 100644 index 00000000..f73e2a90 --- /dev/null +++ b/tests/tests/relations/join/right.yaml @@ -0,0 +1,36 @@ +name: join-right +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + type: JOIN_TYPE_RIGHT + __test: [ type: "STRUCT<string?, i32?, fp32, boolean>" ] diff --git a/tests/tests/relations/join/semi.yaml b/tests/tests/relations/join/semi.yaml new file mode 100644 index 00000000..dcecf050 --- /dev/null +++ b/tests/tests/relations/join/semi.yaml @@ -0,0 +1,36 @@ +name: join-semi +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + type: JOIN_TYPE_SEMI + __test: [ type: "STRUCT<string, i32>" ] diff --git a/tests/tests/relations/join/single.yaml b/tests/tests/relations/join/single.yaml new file mode 100644 index 00000000..28b40286 --- /dev/null +++ b/tests/tests/relations/join/single.yaml @@ -0,0 +1,36 @@ +name: join-single +plan: + __test: [ level: i ] + relations: + - rel: + join: + left: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + right: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - fp32: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + expression: + selection: + rootReference: {} + directReference: { structField: { field: 3 } } + type: 
JOIN_TYPE_SINGLE + __test: [ type: "STRUCT<fp32?, boolean?>" ] diff --git a/tests/tests/relations/project/dependent.yaml b/tests/tests/relations/project/dependent.yaml new file mode 100644 index 00000000..eef74eaa --- /dev/null +++ b/tests/tests/relations/project/dependent.yaml @@ -0,0 +1,26 @@ +name: project-dependent +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + expressions: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + - selection: + rootReference: {} + directReference: { structField: { field: 2 } } + __test: [ type: "STRUCT<string, boolean, boolean, boolean>" ] diff --git a/tests/tests/relations/project/missing.yaml b/tests/tests/relations/project/missing.yaml new file mode 100644 index 00000000..d1312f3a --- /dev/null +++ b/tests/tests/relations/project/missing.yaml @@ -0,0 +1,18 @@ +name: project-missing +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: expressions*" } ] diff --git a/tests/tests/relations/project/multiple.yaml b/tests/tests/relations/project/multiple.yaml new file mode 100644 index 00000000..2ea75098 --- /dev/null +++ b/tests/tests/relations/project/multiple.yaml @@ -0,0 +1,26 @@ +name: project-multiple +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + expressions: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + - selection: + rootReference: {} + directReference: { structField: { field: 0 } } + __test: [ type: "STRUCT<string, boolean, boolean, string>" ] diff --git a/tests/tests/relations/project/single.yaml b/tests/tests/relations/project/single.yaml new file mode 100644 index 00000000..bfe1800f --- /dev/null +++ b/tests/tests/relations/project/single.yaml @@ -0,0 +1,23 @@ +name: project-single +plan: + __test: [ level: i ] + relations: + - rel: + project: + input: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + expressions: + - selection: + rootReference: {} + directReference: { structField: { field: 1 } } + __test: [ type: "STRUCT<string, boolean, boolean>" ] diff --git a/tests/tests/relations/read/extension-table/basic.yaml b/tests/tests/relations/read/extension-table/basic.yaml new file mode 100644 index 00000000..09b8fb2f --- /dev/null +++ b/tests/tests/relations/read/extension-table/basic.yaml @@ -0,0 +1,19 @@ +name: read-extension-basic +plan: + __test: [ level: w ] + relations: + - rel: + read: + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + extensionTable: + detail: + '@type': substrait.Plan + __test: [ type: "NSTRUCT<word: string, value: i32?>" ] + expectedTypeUrls: + - substrait.Plan diff --git 
a/tests/tests/relations/read/file-table/basic.yaml b/tests/tests/relations/read/file-table/basic.yaml new file mode 100644 index 00000000..9791216f --- /dev/null +++ b/tests/tests/relations/read/file-table/basic.yaml @@ -0,0 +1,16 @@ +name: read-files-basic +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + localFiles: + items: + - uriPath: "1/2/3" + format: FILE_FORMAT_PARQUET diff --git a/tests/tests/relations/read/file-table/extension-format.yaml b/tests/tests/relations/read/file-table/extension-format.yaml new file mode 100644 index 00000000..99e178fc --- /dev/null +++ b/tests/tests/relations/read/file-table/extension-format.yaml @@ -0,0 +1,20 @@ +name: read-files-extension-format +plan: + __test: [ level: w ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + localFiles: + items: + - uriPath: "1/2/3" + advancedExtension: + enhancement: + "@type": substrait.Plan + expectedTypeUrls: + - substrait.Plan diff --git a/tests/tests/relations/read/file-table/missing-format.yaml b/tests/tests/relations/read/file-table/missing-format.yaml new file mode 100644 index 00000000..6813c4d8 --- /dev/null +++ b/tests/tests/relations/read/file-table/missing-format.yaml @@ -0,0 +1,16 @@ +name: read-files-missing-format +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + localFiles: + items: + - uriPath: "1/2/3" + format__test: [ diag: { level: e, code: 2, msg: "*file format must be specified*" } ] diff --git a/tests/tests/relations/read/file-table/partial.yaml b/tests/tests/relations/read/file-table/partial.yaml new file mode 100644 index 00000000..91fed1c5 --- /dev/null +++ b/tests/tests/relations/read/file-table/partial.yaml @@ -0,0 +1,30 @@ +name: read-partial-files +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + localFiles: + items: + - uriFile: "a/b/c" + format: FILE_FORMAT_PARQUET + partitionIndex: 3 + - uriFolder: "a/b/c" + format: FILE_FORMAT_PARQUET + partitionIndex: 3 + - uriFile: "a/b/c" + format: FILE_FORMAT_PARQUET + start: 10 + length: 20 + - uriFolder: "a/b/c" + format: FILE_FORMAT_PARQUET + start: 10 + length: 20 + __test: + - diag: { level: e, code: 2, msg: "*file offsets are not allowed in conjunction with multiple files*" } diff --git a/tests/tests/relations/read/file-table/uri-validation.yaml b/tests/tests/relations/read/file-table/uri-validation.yaml new file mode 100644 index 00000000..7fa88e24 --- /dev/null +++ b/tests/tests/relations/read/file-table/uri-validation.yaml @@ -0,0 +1,50 @@ +name: read-files-uri-validation +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + localFiles: + items: + - uriPath: "not a valid URI" + uriPath__test: [ diag: { level: e, code: 4, msg: "*invalid path character*" } ] + format: FILE_FORMAT_PARQUET + - uriPath: "a%20valid%20relative%20URI" + format: FILE_FORMAT_PARQUET + - uriPath: "/path/to/local/file" + format: FILE_FORMAT_PARQUET + - uriPath: "file:///path/to/local/file" + 
format: FILE_FORMAT_PARQUET + - uriPath: "protocol://with.an.authority/path/goes/here" + format: FILE_FORMAT_PARQUET + - uriPath: "protocol:urn" + format: FILE_FORMAT_PARQUET + - uriPath: 'C:\windows\paths\are\not\uris' + uriPath__test: [ diag: { level: e, code: 4, msg: "*invalid path character*" } ] + format: FILE_FORMAT_PARQUET + - uriPath: 'file://C:/write/them/like/this' + format: FILE_FORMAT_PARQUET + - uriPath: 'C:/or/like/this' + format: FILE_FORMAT_PARQUET + - uriPathGlob: '/can/have/*/and/?/in/path/globs' + format: FILE_FORMAT_PARQUET + - uriPathGlob: 'file:///can/have/*/and/?/in/path/globs' + format: FILE_FORMAT_PARQUET + - uriPathGlob: '/character/classes/must/be/escaped/[cls]' + uriPathGlob__test: [ diag: { level: e, code: 4, msg: "*invalid path character*" } ] + format: FILE_FORMAT_PARQUET + - uriPathGlob: '/character/classes/must/be/escaped/%5Bcls%5D' + format: FILE_FORMAT_PARQUET + - uriPathGlob: '/invalid/glob/syntax/%5Dcls%5B' + uriPathGlob__test: [ diag: { level: e, code: 5, msg: "*invalid range pattern*" } ] + format: FILE_FORMAT_PARQUET + - uriFile: "/path/to/local/file" + format: FILE_FORMAT_PARQUET + - uriFolder: "/path/to/local/folder" + format: FILE_FORMAT_PARQUET diff --git a/tests/tests/relations/read/filter-project/both.yaml b/tests/tests/relations/read/filter-project/both.yaml new file mode 100644 index 00000000..ff56ad40 --- /dev/null +++ b/tests/tests/relations/read/filter-project/both.yaml @@ -0,0 +1,26 @@ +name: read-filter-project +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + filter: + selection: + rootReference: {} + directReference: { structField: { field: 1 } } + projection: + maintain_singular_struct: true + select: + structItems: + - field: 0 + __test: [ type: "STRUCT<string>" ] diff --git a/tests/tests/relations/read/filter-project/filter-not-bool.yaml b/tests/tests/relations/read/filter-project/filter-not-bool.yaml new file mode 100644 index 00000000..cf8861b2 --- /dev/null +++ b/tests/tests/relations/read/filter-project/filter-not-bool.yaml @@ -0,0 +1,21 @@ +name: read-filter-not-bool +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + filter: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + __test: [ diag: { level: e, code: 4005, msg: "*predicates must yield booleans*string*" } ] diff --git a/tests/tests/relations/read/filter-project/projection-multiple.yaml b/tests/tests/relations/read/filter-project/projection-multiple.yaml new file mode 100644 index 00000000..f4e26577 --- /dev/null +++ b/tests/tests/relations/read/filter-project/projection-multiple.yaml @@ -0,0 +1,22 @@ +name: read-projection-multiple +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + projection: + select: + structItems: + - field: 1 + - field: 0 + __test: [ type: "STRUCT<boolean, string>" ] diff --git a/tests/tests/relations/read/filter-project/projection-singular.yaml
b/tests/tests/relations/read/filter-project/projection-singular.yaml new file mode 100644 index 00000000..04d104bb --- /dev/null +++ b/tests/tests/relations/read/filter-project/projection-singular.yaml @@ -0,0 +1,21 @@ +name: read-projection-singular +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - bool: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + projection: + select: + structItems: + - field: 0 + maintainSingularStruct__test: [ diag: { level: e, code: 4006, msg: "*must be set*" } ] diff --git a/tests/tests/relations/read/named-table/basic.yaml b/tests/tests/relations/read/named-table/basic.yaml new file mode 100644 index 00000000..806cb34d --- /dev/null +++ b/tests/tests/relations/read/named-table/basic.yaml @@ -0,0 +1,15 @@ +name: read-named-basic +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test diff --git a/tests/tests/relations/read/named-table/missing.yaml b/tests/tests/relations/read/named-table/missing.yaml new file mode 100644 index 00000000..e2840df5 --- /dev/null +++ b/tests/tests/relations/read/named-table/missing.yaml @@ -0,0 +1,15 @@ +name: read-named-missing +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: [] + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: names*" } ] diff --git a/tests/tests/relations/read/named-table/multiple.yaml b/tests/tests/relations/read/named-table/multiple.yaml new file mode 100644 index 00000000..c5494b06 --- /dev/null +++ b/tests/tests/relations/read/named-table/multiple.yaml @@ -0,0 +1,17 @@ +name: read-named-multiple +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - a + - b + __test: [ diag: { level: w, msg: "*named tables with multiple names*" } ] diff --git a/tests/tests/relations/read/schema/missing-names.yaml b/tests/tests/relations/read/schema/missing-names.yaml new file mode 100644 index 00000000..cf4600ee --- /dev/null +++ b/tests/tests/relations/read/schema/missing-names.yaml @@ -0,0 +1,15 @@ +name: read-schema-missing-names +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4003, msg: "*1 too few*" } ] + namedTable: + names: + - test diff --git a/tests/tests/relations/read/schema/missing-schema.yaml b/tests/tests/relations/read/schema/missing-schema.yaml new file mode 100644 index 00000000..4d1b11cf --- /dev/null +++ b/tests/tests/relations/read/schema/missing-schema.yaml @@ -0,0 +1,10 @@ +name: read-schema-missing +plan: + __test: [ level: i ] + relations: + - rel: + read: + namedTable: + names: + - test + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: base_schema*" } ] diff --git a/tests/tests/relations/read/schema/nullable.yaml b/tests/tests/relations/read/schema/nullable.yaml new file mode 100644 index 00000000..5fadb8e1 --- 
/dev/null +++ b/tests/tests/relations/read/schema/nullable.yaml @@ -0,0 +1,16 @@ +name: read-schema-nullable +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a] + struct: + nullability: NULLABILITY_NULLABLE + types: + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ diag: { level: e, code: 4008, msg: "*the outer struct representing a schema must not be nullable*" } ] diff --git a/tests/tests/relations/read/schema/wrong-name-count.yaml b/tests/tests/relations/read/schema/wrong-name-count.yaml new file mode 100644 index 00000000..37003f9b --- /dev/null +++ b/tests/tests/relations/read/schema/wrong-name-count.yaml @@ -0,0 +1,16 @@ +name: read-schema-wrong-name-count +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + __test: [ diag: { level: e, code: 4003, msg: "*1 too many*" } ] + namedTable: + names: + - test diff --git a/tests/tests/relations/read/virtual-table/basic.yaml b/tests/tests/relations/read/virtual-table/basic.yaml new file mode 100644 index 00000000..5f84ea05 --- /dev/null +++ b/tests/tests/relations/read/virtual-table/basic.yaml @@ -0,0 +1,35 @@ +name: read-virtual-basic +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - string: one + nullable: false + - i32: 1 + nullable: true + - fields: + - string: two + nullable: false + - i32: 2 + nullable: true + - fields: + - string: three + nullable: false + - i32: 3 + nullable: true + - fields: + - string: banana + nullable: false + - "null": { i32: { nullability: NULLABILITY_NULLABLE } } + __test: [ type: "NSTRUCT<word: string, value: i32?>" ] diff --git a/tests/tests/relations/read/virtual-table/empty.yaml b/tests/tests/relations/read/virtual-table/empty.yaml new file mode 100644 index 00000000..277852d7 --- /dev/null +++ b/tests/tests/relations/read/virtual-table/empty.yaml @@ -0,0 +1,16 @@ +name: read-virtual-empty +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: [] + __test: [ type: "NSTRUCT<word: string, value: i32?>" ] diff --git a/tests/tests/relations/read/virtual-table/row-field-count-mismatch.yaml b/tests/tests/relations/read/virtual-table/row-field-count-mismatch.yaml new file mode 100644 index 00000000..71b33fb4 --- /dev/null +++ b/tests/tests/relations/read/virtual-table/row-field-count-mismatch.yaml @@ -0,0 +1,34 @@ +name: read-virtual-field-count-mismatch +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - string: one + nullable: false + - i32: 1 + nullable: true + - fields: + - string: one + nullable: false + __test: [ diag: { level: e, code: 4005, msg: "*1 parameter(s) vs.
2 parameter(s)*" } ] + - fields: + - string: three + nullable: false + - i32: 3 + nullable: true + - fields: + - string: banana + nullable: false + - "null": { i32: { nullability: NULLABILITY_NULLABLE } } + __test: [ type: "NSTRUCT<word: string, value: i32?>" ] diff --git a/tests/tests/relations/read/virtual-table/row-type-mismatch.yaml b/tests/tests/relations/read/virtual-table/row-type-mismatch.yaml new file mode 100644 index 00000000..0d9eaabd --- /dev/null +++ b/tests/tests/relations/read/virtual-table/row-type-mismatch.yaml @@ -0,0 +1,38 @@ +name: read-virtual-row-type-mismatch +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [word, value] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + virtualTable: + values: + - fields: + - string: one + nullable: false + - i32: 1 + nullable: true + - fields: + - varChar: + value: two + length: 25 + nullable: false + - i32: 2 + nullable: true + __test: [ diag: { level: e, code: 4005, msg: "*VARCHAR vs. string on parameter path 0*" } ] + - fields: + - string: three + nullable: false + - i32: 3 + nullable: true + - fields: + - string: banana + nullable: false + - "null": { i32: { nullability: NULLABILITY_NULLABLE } } + __test: [ type: "NSTRUCT<word: string, value: i32?>" ] diff --git a/tests/tests/relations/root/missing.yaml b/tests/tests/relations/root/missing.yaml new file mode 100644 index 00000000..1137baf5 --- /dev/null +++ b/tests/tests/relations/root/missing.yaml @@ -0,0 +1,5 @@ +name: rel-root-missing +plan: + __test: [ level: i ] + relations: [] + __test: [ diag: { level: e, code: 5001, msg: "*must have at least one relation*" } ] diff --git a/tests/tests/relations/root/multiple.yaml b/tests/tests/relations/root/multiple.yaml new file mode 100644 index 00000000..00941080 --- /dev/null +++ b/tests/tests/relations/root/multiple.yaml @@ -0,0 +1,28 @@ +name: rel-root-multiple +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + - rel: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test diff --git a/tests/tests/relations/root/with-names-nested.yaml b/tests/tests/relations/root/with-names-nested.yaml new file mode 100644 index 00000000..0c0bb20b --- /dev/null +++ b/tests/tests/relations/root/with-names-nested.yaml @@ -0,0 +1,30 @@ +name: rel-root-with-names-nested +plan: + __test: [ level: i ] + relations: + - root: + names: [x, y, a, b, z] + input: + read: + baseSchema: + names: [a, b, c, d, e] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_NULLABLE } + - list: + nullability: NULLABILITY_REQUIRED + type: + struct: + nullability: NULLABILITY_REQUIRED + types: + - map: + nullability: NULLABILITY_REQUIRED + key: { string: { nullability: NULLABILITY_NULLABLE } } + value: { string: { nullability: NULLABILITY_NULLABLE } } + - bool: { nullability: NULLABILITY_NULLABLE } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + __test: [ type: "NSTRUCT<x: string?, y: LIST<NSTRUCT<a: MAP<string?, string?>, b: boolean?>>, z: i32?>" ] diff --git a/tests/tests/relations/root/with-names.yaml b/tests/tests/relations/root/with-names.yaml new file mode 100644 index
00000000..5c1737a7 --- /dev/null +++ b/tests/tests/relations/root/with-names.yaml @@ -0,0 +1,21 @@ +name: rel-root-with-names +plan: + __test: [ level: i ] + relations: + - root: + names: + - a + - b + input: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + __test: [ type: "NSTRUCT<a: string, b: i32?>" ] diff --git a/tests/tests/relations/root/without-names-nested.yaml b/tests/tests/relations/root/without-names-nested.yaml new file mode 100644 index 00000000..14a8e530 --- /dev/null +++ b/tests/tests/relations/root/without-names-nested.yaml @@ -0,0 +1,28 @@ +name: rel-root-without-names-nested +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [a, b, c, d, e] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_NULLABLE } + - list: + nullability: NULLABILITY_REQUIRED + type: + struct: + nullability: NULLABILITY_REQUIRED + types: + - map: + nullability: NULLABILITY_REQUIRED + key: { string: { nullability: NULLABILITY_NULLABLE } } + value: { string: { nullability: NULLABILITY_NULLABLE } } + - bool: { nullability: NULLABILITY_NULLABLE } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + __test: [ type: "STRUCT<string?, LIST<STRUCT<MAP<string?, string?>, boolean?>>, i32?>" ] diff --git a/tests/tests/relations/root/without-names.yaml b/tests/tests/relations/root/without-names.yaml new file mode 100644 index 00000000..698cfc96 --- /dev/null +++ b/tests/tests/relations/root/without-names.yaml @@ -0,0 +1,17 @@ +name: rel-root-without-names +plan: + __test: [ level: i ] + relations: + - rel: + read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_NULLABLE } + namedTable: + names: + - test + __test: [ type: "STRUCT<string, i32?>" ] diff --git a/tests/tests/relations/set/different-inputs.yaml b/tests/tests/relations/set/different-inputs.yaml new file mode 100644 index 00000000..40e0e9d0 --- /dev/null +++ b/tests/tests/relations/set/different-inputs.yaml @@ -0,0 +1,33 @@ +name: set-different-inputs +plan: + __test: [ level: i ] + relations: + - rel: + set: + inputs: + - read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + - read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - i32: { nullability: NULLABILITY_REQUIRED } + - string: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + op: SET_OP_UNION_ALL + __test: + - diag: { level: e, code: 4005, msg: "*i32 vs. string on parameter path 0*" } + - diag: { level: e, code: 4005, msg: "*string vs.
i32 on parameter path 1*" } ] diff --git a/tests/tests/relations/set/insufficient-inputs.yaml b/tests/tests/relations/set/insufficient-inputs.yaml new file mode 100644 index 00000000..3d565c38 --- /dev/null +++ b/tests/tests/relations/set/insufficient-inputs.yaml @@ -0,0 +1,20 @@ +name: set-insufficient-inputs +plan: + __test: [ level: i ] + relations: + - rel: + set: + inputs: + - read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + op: SET_OP_UNION_ALL + __test: [ diag: { level: e, code: 5002, msg: "*set operations require at least two input relations*" } ] diff --git a/tests/tests/relations/set/missing-op.yaml b/tests/tests/relations/set/missing-op.yaml new file mode 100644 index 00000000..dbfbaf28 --- /dev/null +++ b/tests/tests/relations/set/missing-op.yaml @@ -0,0 +1,30 @@ +name: set-missing-op +plan: + __test: [ level: i ] + relations: + - rel: + set: + inputs: + - read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + - read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + op__test: [ diag: { level: e, code: 2, msg: "*this enum may not be left unspecified*" } ] diff --git a/tests/tests/relations/set/three-inputs.yaml b/tests/tests/relations/set/three-inputs.yaml new file mode 100644 index 00000000..321aeae2 --- /dev/null +++ b/tests/tests/relations/set/three-inputs.yaml @@ -0,0 +1,42 @@ +name: set-three-inputs +plan: + __test: [ level: i ] + relations: + - rel: + set: + inputs: + - read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + - read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + - read: + baseSchema: + names: [u, v] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test3 + op: SET_OP_UNION_ALL + __test: [ type: "STRUCT<string, i32>" ] diff --git a/tests/tests/relations/set/two-inputs.yaml b/tests/tests/relations/set/two-inputs.yaml new file mode 100644 index 00000000..d21c0902 --- /dev/null +++ b/tests/tests/relations/set/two-inputs.yaml @@ -0,0 +1,31 @@ +name: set-two-inputs +plan: + __test: [ level: i ] + relations: + - rel: + set: + inputs: + - read: + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + - read: + baseSchema: + names: [x, y] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test2 + op: SET_OP_UNION_ALL + __test: [ type: "STRUCT<string, i32>" ] diff --git a/tests/tests/relations/sort/coalesce.yaml b/tests/tests/relations/sort/coalesce.yaml new
file mode 100644 index 00000000..83bd7d1c --- /dev/null +++ b/tests/tests/relations/sort/coalesce.yaml @@ -0,0 +1,27 @@ +name: rel-sort-coalesce +plan: + __test: [ level: i ] + relations: + - rel: + sort: + input: + read: + common: + direct: {} + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + sorts: + - expr: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + direction: SORT_DIRECTION_CLUSTERED + __test: [ type: "NSTRUCT<a: string, b: i32>" ] diff --git a/tests/tests/relations/sort/key_cmp.yaml b/tests/tests/relations/sort/key_cmp.yaml new file mode 100644 index 00000000..6eb3046a --- /dev/null +++ b/tests/tests/relations/sort/key_cmp.yaml @@ -0,0 +1,42 @@ +name: rel-sort-key-cmp +plan: + __test: [ level: iw ] + extensionUris: + - extensionUriAnchor: 1 + uri__yaml: + scalar_functions: + - name: "cmp" + impls: + - args: + - value: i32 + - value: i32 + return: i32 + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: cmp:i32_i32 + relations: + - rel: + sort: + input: + read: + common: + direct: {} + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + sorts: + - expr: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + comparisonFunctionReference: 1 + __test: [ type: "NSTRUCT<a: string, b: i32>" ] diff --git a/tests/tests/relations/sort/key_lt.yaml b/tests/tests/relations/sort/key_lt.yaml new file mode 100644 index 00000000..9eae7b70 --- /dev/null +++ b/tests/tests/relations/sort/key_lt.yaml @@ -0,0 +1,42 @@ +name: rel-sort-key-lt +plan: + __test: [ level: iw ] + extensionUris: + - extensionUriAnchor: 1 + uri__yaml: + scalar_functions: + - name: "lt" + impls: + - args: + - value: i32 + - value: i32 + return: BOOLEAN + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: lt:i32_i32 + relations: + - rel: + sort: + input: + read: + common: + direct: {} + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + sorts: + - expr: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + comparisonFunctionReference: 1 + __test: [ type: "NSTRUCT<a: string, b: i32>" ] diff --git a/tests/tests/relations/sort/missing-expr.yaml b/tests/tests/relations/sort/missing-expr.yaml new file mode 100644 index 00000000..3f4291e8 --- /dev/null +++ b/tests/tests/relations/sort/missing-expr.yaml @@ -0,0 +1,23 @@ +name: rel-sort-missing-expr +plan: + __test: [ level: i ] + relations: + - rel: + sort: + input: + read: + common: + direct: {} + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + sorts: + - direction: SORT_DIRECTION_ASC_NULLS_LAST + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: expr*" } ] diff --git a/tests/tests/relations/sort/missing-sort-kind.yaml b/tests/tests/relations/sort/missing-sort-kind.yaml new file mode 100644 index 00000000..02f0dd00 --- /dev/null +++ b/tests/tests/relations/sort/missing-sort-kind.yaml @@ -0,0 +1,26
@@ +name: rel-sort-missing-sort-kind +plan: + __test: [ level: i ] + relations: + - rel: + sort: + input: + read: + common: + direct: {} + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + sorts: + - expr: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: sort_kind*" } ] diff --git a/tests/tests/relations/sort/missing.yaml b/tests/tests/relations/sort/missing.yaml new file mode 100644 index 00000000..e18f2dab --- /dev/null +++ b/tests/tests/relations/sort/missing.yaml @@ -0,0 +1,21 @@ +name: rel-sort-no-op +plan: + __test: [ level: i ] + relations: + - rel: + sort: + input: + read: + common: + direct: {} + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + __test: [ diag: { level: e, code: 1002, msg: "*missing required protobuf field: sorts*" } ] diff --git a/tests/tests/relations/sort/multiple.yaml b/tests/tests/relations/sort/multiple.yaml new file mode 100644 index 00000000..82b927a5 --- /dev/null +++ b/tests/tests/relations/sort/multiple.yaml @@ -0,0 +1,32 @@ +name: rel-sort-multiple +plan: + __test: [ level: i ] + relations: + - rel: + sort: + input: + read: + common: + direct: {} + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + sorts: + - expr: + selection: + rootReference: {} + directReference: { structField: { field: 1 } } + direction: SORT_DIRECTION_ASC_NULLS_LAST + - expr: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + direction: SORT_DIRECTION_DESC_NULLS_FIRST + __test: [ type: "NSTRUCT<a: string, b: i32>" ] diff --git a/tests/tests/relations/sort/single.yaml b/tests/tests/relations/sort/single.yaml new file mode 100644 index 00000000..5d6c4644 --- /dev/null +++ b/tests/tests/relations/sort/single.yaml @@ -0,0 +1,27 @@ +name: rel-sort-single +plan: + __test: [ level: i ] + relations: + - rel: + sort: + input: + read: + common: + direct: {} + baseSchema: + names: [a, b] + struct: + nullability: NULLABILITY_REQUIRED + types: + - string: { nullability: NULLABILITY_REQUIRED } + - i32: { nullability: NULLABILITY_REQUIRED } + namedTable: + names: + - test + sorts: + - expr: + selection: + rootReference: {} + directReference: { structField: { field: 0 } } + direction: SORT_DIRECTION_ASC_NULLS_LAST + __test: [ type: "NSTRUCT<a: string, b: i32>" ] diff --git a/tests/tests/tpc-h/README.md b/tests/tests/tpc-h/README.md new file mode 100644 index 00000000..e01542e6 --- /dev/null +++ b/tests/tests/tpc-h/README.md @@ -0,0 +1,28 @@ +This directory contains positive tests for (some of) the TPC-H queries. For the +most part, they were generated, either completely or partially, by Isthmus; +however, the following manual corrections were needed: + + - According to the spec, aggregations output an extra column indicating which + grouping set was used for a particular row; the Isthmus plans were not + accounting for this column. + - Aggregations with only measures were being emitted by Isthmus as + aggregations with an empty grouping set rather than no grouping sets. + - Isthmus was emitting duplicate grouping sets wherever there should only be + one grouping set.
+ - Decimal literals had too many bytes attached to them. + - Emitted function signatures use `any1` etc. where, according to the spec, a + plain `any` should be used. Same for `decimal` vs `dec`. + - Subqueries are not presently supported by Isthmus, so queries with + subqueries were merged manually. + +NOTE: these queries have not undergone *functional* testing, and they have been +manually modified. Slight functional differences between the SQL queries and +the plans due to bugs are therefore quite likely. + +NOTE: the plans are also not optimized; they typically start by forming the +cross product of all input tables and then applying a filter to the result. +Without predicate pushdown, they are unlikely to complete for any reasonable +table size. + +TODO: once function resolution is implemented in the validator, the diagnostic +overrides that currently suppress the function-related diagnostics should be removed. diff --git a/tests/tests/tpc-h/tpc-h01.yaml b/tests/tests/tpc-h/tpc-h01.yaml new file mode 100644 index 00000000..de6a273f --- /dev/null +++ b/tests/tests/tpc-h/tpc-h01.yaml @@ -0,0 +1,465 @@ +# select +# l_returnflag, +# l_linestatus, +# sum(l_quantity) as sum_qty, +# sum(l_extendedprice) as sum_base_price, +# sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, +# sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, +# avg(l_quantity) as avg_qty, +# avg(l_extendedprice) as avg_price, +# avg(l_discount) as avg_disc, +# count(*) as count_order +# from +# lineitem +# where +# l_shipdate <= date '1998-12-01' - interval '120' day (3) +# group by +# l_returnflag, +# l_linestatus +# order by +# l_returnflag, +# l_linestatus

name: TPC-H01 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_datetime.yaml + - extensionUriAnchor: 2 + uri: /functions_arithmetic_decimal.yaml + - extensionUriAnchor: 3 + uri: /functions_aggregate_generic.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: lte:date_date + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 2 + name: subtract:date_day + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 3 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 4 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 5 + name: add:opt_dec_dec + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 6 + name: sum:opt_dec + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 7 + name: avg:opt_dec + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 8 + name: count:opt + relations: + - root: + __test: + - type: "\ + NSTRUCT<\ + L_RETURNFLAG: FIXEDCHAR?<1>, \ + L_LINESTATUS: FIXEDCHAR?<1>, \ + SUM_QTY: DECIMAL?<19, 0>, \ + SUM_BASE_PRICE: DECIMAL?<19, 0>, \ + SUM_DISC_PRICE: DECIMAL?<19, 0>, \ + SUM_CHARGE: DECIMAL?<19, 0>, \ + AVG_QTY: DECIMAL?<19, 0>, \ + AVG_PRICE: DECIMAL?<19, 0>, \ + AVG_DISC: DECIMAL?<19, 0>, \ + COUNT_ORDER: i64\ + >" + input: + sort: + common: + direct: {} + input: + aggregate: + common: + emit: + outputMapping: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + - 7 + - 8 + - 9 + groupings: + -
groupingExpressions: + - selection: + directReference: + structField: + field: 0 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + input: + project: + common: + emit: + outputMapping: + - 16 + - 17 + - 18 + - 19 + - 20 + - 21 + - 22 + expressions: + - selection: + directReference: + structField: + field: 8 + rootReference: {} + - selection: + directReference: + structField: + field: 9 + rootReference: {} + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 6 + rootReference: {} + functionReference: 4 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 3 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 6 + rootReference: {} + functionReference: 4 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 3 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 7 + rootReference: {} + functionReference: 5 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 3 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 6 + rootReference: {} + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - selection: + directReference: + structField: + field: 10 + rootReference: {} + - scalarFunction: + args: + - literal: + date: 10561 + - literal: + intervalDayToSecond: + days: 120 + functionReference: 2 + outputType: + date: + nullability: NULLABILITY_REQUIRED + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + 
nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + measures: + - measure: + args: + - selection: + directReference: + structField: + field: 2 + rootReference: {} + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + args: + - selection: + directReference: + structField: + field: 3 + rootReference: {} + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + args: + - selection: + directReference: + structField: + field: 2 + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + args: + - selection: + directReference: + structField: + field: 3 + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + args: + - selection: + directReference: + structField: + field: 6 + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + functionReference: 8 + outputType: + i64: + nullability: NULLABILITY_REQUIRED + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + sorts: + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: {} + rootReference: {} + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 1 + rootReference: {} + names: + - L_RETURNFLAG + - L_LINESTATUS + - SUM_QTY + - SUM_BASE_PRICE + - SUM_DISC_PRICE + - SUM_CHARGE + - AVG_QTY + - AVG_PRICE + - AVG_DISC + - COUNT_ORDER diff --git a/tests/tests/tpc-h/tpc-h02.yaml b/tests/tests/tpc-h/tpc-h02.yaml new file mode 100644 index 00000000..f46139cc --- /dev/null +++ b/tests/tests/tpc-h/tpc-h02.yaml @@ -0,0 +1,780 @@ +# select +# s.s_acctbal, +# s.s_name, +# n.n_name, +# p.p_partkey, +# p.p_mfgr, +# s.s_address, +# s.s_phone, +# s.s_comment +# from +# "part" p, +# "supplier" s, +# "partsupp" ps, +# "nation" n, +# "region" r +# where +# p.p_partkey = ps.ps_partkey +# and s.s_suppkey = ps.ps_suppkey +# and p.p_size = 41 +# and p.p_type like '%NICKEL' +# and s.s_nationkey = n.n_nationkey +# and n.n_regionkey = r.r_regionkey +# and r.r_name = 'EUROPE' +# and ps.ps_supplycost = ( +# +# select +# min(ps.ps_supplycost) +# +# from +# "partsupp" ps, +# "supplier" s, +# "nation" n, +# "region" r +# where +# p.p_partkey = ps.ps_partkey +# and s.s_suppkey = ps.ps_suppkey +# and s.s_nationkey = 
n.n_nationkey +# and n.n_regionkey = r.r_regionkey +# and r.r_name = 'EUROPE' +# ) +# +# order by +# s.s_acctbal desc, +# n.n_name, +# s.s_name, +# p.p_partkey +# limit 100 + +name: TPC-H02 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_string.yaml + - extensionUriAnchor: 4 + uri: /functions_aggregate_generic.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: equal:any_any + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 3 + name: like:vchar_vchar + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 4 + name: min:any_any + relations: + - root: + input: + fetch: + common: + direct: {} + count: '100' + input: + sort: + common: + direct: {} + input: + project: + common: + emit: + outputMapping: + - 28 + - 29 + - 30 + - 31 + - 32 + - 33 + - 34 + - 35 + expressions: + - selection: + directReference: + structField: + field: 14 + rootReference: {} + - selection: + directReference: + structField: + field: 10 + rootReference: {} + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 2 + rootReference: {} + - selection: + directReference: + structField: + field: 11 + rootReference: {} + - selection: + directReference: + structField: + field: 13 + rootReference: {} + - selection: + directReference: + structField: + field: 15 + rootReference: {} + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 16 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 9 + rootReference: {} + - selection: + directReference: + structField: + field: 17 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - literal: + i32: 41 + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - cast: + input: + literal: + fixedChar: '%NICKEL' + type: + varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - selection: + directReference: + structField: + field: 21 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 23 + rootReference: {} + 
- selection: + directReference: + structField: + field: 25 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 26 + rootReference: {} + - cast: + input: + literal: + fixedChar: EUROPE + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - subquery: + scalar: + input: + aggregate: + common: + emit: + outputMapping: + - 0 + input: + project: + common: + emit: + outputMapping: + - 19 + expressions: + - selection: + directReference: + structField: + field: 3 + rootReference: {} + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: {} + outerReference: + stepsOut: 1 + - selection: + directReference: + structField: {} + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 8 + rootReference: {} + - selection: + directReference: + structField: + field: 12 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 14 + rootReference: {} + - selection: + directReference: + structField: + field: 16 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 17 + rootReference: {} + - cast: + input: + literal: + fixedChar: EUROPE + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - PS_PARTKEY + - PS_SUPPKEY + - PS_AVAILQTY + - PS_SUPPLYCOST + - PS_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 199 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - PARTSUPP + right: + read: + baseSchema: + names: + - S_SUPPKEY + - S_NAME + - S_ADDRESS + - S_NATIONKEY + - S_PHONE + - S_ACCTBAL + - S_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + 
precision: 19 + - varchar: + length: 101 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - SUPPLIER + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - R_REGIONKEY + - R_NAME + - R_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - REGION + type: JOIN_TYPE_INNER + measures: + - measure: + args: + - selection: + directReference: + structField: + field: 0 + rootReference: {} + functionReference: 4 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - selection: + directReference: + structField: + field: 19 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - P_PARTKEY + - P_NAME + - P_MFGR + - P_BRAND + - P_TYPE + - P_SIZE + - P_CONTAINER + - P_RETAILPRICE + - P_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 55 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 23 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - PART + right: + read: + baseSchema: + names: + - S_SUPPKEY + - S_NAME + - S_ADDRESS + - S_NATIONKEY + - S_PHONE + - S_ACCTBAL + - S_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 101 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - SUPPLIER + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - PS_PARTKEY + - PS_SUPPKEY + - PS_AVAILQTY + - PS_SUPPLYCOST + - PS_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: 
NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 199 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - PARTSUPP + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - R_REGIONKEY + - R_NAME + - R_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - REGION + type: JOIN_TYPE_INNER + sorts: + - direction: SORT_DIRECTION_DESC_NULLS_FIRST + expr: + selection: + directReference: + structField: {} + rootReference: {} + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 2 + rootReference: {} + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 1 + rootReference: {} + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 3 + rootReference: {} + names: + - S_ACCTBAL + - S_NAME + - N_NAME + - P_PARTKEY + - P_MFGR + - S_ADDRESS + - S_PHONE + - S_COMMENT diff --git a/tests/tests/tpc-h/tpc-h03.yaml b/tests/tests/tpc-h/tpc-h03.yaml new file mode 100644 index 00000000..d1a39140 --- /dev/null +++ b/tests/tests/tpc-h/tpc-h03.yaml @@ -0,0 +1,485 @@ +# select +# l.l_orderkey, +# sum(l.l_extendedprice * (1 - l.l_discount)) as revenue, +# o.o_orderdate, +# o.o_shippriority +# +# from +# "customer" c, +# "orders" o, +# "lineitem" l +# +# where +# c.c_mktsegment = 'HOUSEHOLD' +# and c.c_custkey = o.o_custkey +# and l.l_orderkey = o.o_orderkey +# and o.o_orderdate < date '1995-03-25' +# and l.l_shipdate > date '1995-03-25' +# +# group by +# l.l_orderkey, +# o.o_orderdate, +# o.o_shippriority +# order by +# revenue desc, +# o.o_orderdate +# limit 10 + +name: TPC-H03 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_datetime.yaml + - extensionUriAnchor: 4 + uri: /functions_arithmetic_decimal.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: equal:any_any + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 3 + name: lt:date_date + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 4 + name: gt:date_date + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 5 + name: multiply:opt_dec_dec + - extensionFunction: + 
extensionUriReference: 4 + functionAnchor: 6 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 7 + name: sum:opt_dec + relations: + - root: + __test: + - type: "NSTRUCT<L_ORDERKEY: i64, REVENUE: DECIMAL?<19, 0>, O_ORDERDATE: date?, O_SHIPPRIORITY: i32?>" + input: + fetch: + common: + direct: {} + count: '10' + input: + sort: + common: + direct: {} + input: + project: + common: + emit: + outputMapping: + - 4 + - 5 + - 6 + - 7 + expressions: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 3 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - selection: + directReference: + structField: + field: 2 + rootReference: {} + input: + aggregate: + common: + direct: {} + groupings: + - groupingExpressions: + - selection: + directReference: + structField: + field: 0 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - selection: + directReference: + structField: + field: 2 + rootReference: {} + input: + project: + common: + emit: + outputMapping: + - 33 + - 34 + - 35 + - 36 + expressions: + - selection: + directReference: + structField: + field: 17 + rootReference: {} + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - selection: + directReference: + structField: + field: 15 + rootReference: {} + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 23 + rootReference: {} + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 5 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 6 + rootReference: {} + - cast: + input: + literal: + fixedChar: HOUSEHOLD + type: + fixedChar: + length: 10 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 9 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 17 + rootReference: {} + - selection: + directReference: + structField: + field: 8 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - literal: + date: 9214 + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 27 + rootReference: {} + - literal: + date: 9214 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read:
baseSchema: + names: + - C_CUSTKEY + - C_NAME + - C_ADDRESS + - C_NATIONKEY + - C_PHONE + - C_ACCTBAL + - C_MKTSEGMENT + - C_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 117 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - CUSTOMER + right: + read: + baseSchema: + names: + - O_ORDERKEY + - O_CUSTKEY + - O_ORDERSTATUS + - O_TOTALPRICE + - O_ORDERDATE + - O_ORDERPRIORITY + - O_CLERK + - O_SHIPPRIORITY + - O_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - varchar: + length: 79 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - ORDERS + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + type: JOIN_TYPE_INNER + measures: + - measure: + args: + - selection: + directReference: + structField: + field: 3 + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + sorts: + - direction: SORT_DIRECTION_DESC_NULLS_FIRST + expr: + selection: + directReference: + structField: + field: 1 + rootReference: {} + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 2 + rootReference: {} + names: + - L_ORDERKEY + - REVENUE + - O_ORDERDATE + - O_SHIPPRIORITY diff --git a/tests/tests/tpc-h/tpc-h04.yaml b/tests/tests/tpc-h/tpc-h04.yaml new file mode 
100644 index 00000000..c99a0941 --- /dev/null +++ b/tests/tests/tpc-h/tpc-h04.yaml @@ -0,0 +1,318 @@ +# select +# o.o_orderpriority, +# count(*) as order_count +# from +# "orders" o +# +# where +# o.o_orderdate >= date '1996-10-01' +# and o.o_orderdate < date '1996-10-01' + interval '3' month +# and +# exists ( +# select +# * +# from +# "lineitem" l +# where +# l.l_orderkey = o.o_orderkey +# and l.l_commitdate < l.l_receiptdate +# ) +# group by +# o.o_orderpriority +# order by +# o.o_orderpriority + +name: TPC-H04 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_datetime.yaml + - extensionUriAnchor: 3 + uri: /functions_aggregate_generic.yaml + - extensionUriAnchor: 4 + uri: /functions_comparison.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: gte:date_date + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 3 + name: lt:date_date + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 4 + name: add:date_year + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 5 + name: count:opt + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 6 + name: equal:any_any + relations: + - root: + input: + sort: + common: + direct: {} + input: + aggregate: + common: + emit: + outputMapping: + - 0 + - 1 + groupings: + - groupingExpressions: + - selection: + directReference: + structField: {} + rootReference: {} + input: + project: + common: + emit: + outputMapping: + - 9 + expressions: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - literal: + date: 9770 + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - scalarFunction: + args: + - literal: + date: 9770 + - literal: + intervalYearToMonth: + months: 3 + functionReference: 4 + outputType: + date: + nullability: NULLABILITY_REQUIRED + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - subquery: + setPredicate: + predicateOp: PREDICATE_OP_EXISTS + tuples: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: {} + outerReference: + stepsOut: 1 + functionReference: 6 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 11 + rootReference: {} + - selection: + directReference: + structField: + field: 12 + rootReference: {} + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + read: + baseSchema: + names: + - L_ORDERKEY + - 
L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + read: + baseSchema: + names: + - O_ORDERKEY + - O_CUSTKEY + - O_ORDERSTATUS + - O_TOTALPRICE + - O_ORDERDATE + - O_ORDERPRIORITY + - O_CLERK + - O_SHIPPRIORITY + - O_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - varchar: + length: 79 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - ORDERS + measures: + - measure: + functionReference: 5 + outputType: + i64: + nullability: NULLABILITY_REQUIRED + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + sorts: + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: {} + rootReference: {} + names: + - O_ORDERPRIORITY + - ORDER_COUNT diff --git a/tests/tests/tpc-h/tpc-h05.yaml b/tests/tests/tpc-h/tpc-h05.yaml new file mode 100644 index 00000000..5e39be72 --- /dev/null +++ b/tests/tests/tpc-h/tpc-h05.yaml @@ -0,0 +1,612 @@ +# select +# n.n_name, +# sum(l.l_extendedprice * (1 - l.l_discount)) as revenue +# +# from +# "customer" c, +# "orders" o, +# "lineitem" l, +# "supplier" s, +# "nation" n, +# "region" r +# +# where +# c.c_custkey = o.o_custkey +# and l.l_orderkey = o.o_orderkey +# and l.l_suppkey = s.s_suppkey +# and c.c_nationkey = s.s_nationkey +# and s.s_nationkey = n.n_nationkey +# and n.n_regionkey = r.r_regionkey +# and r.r_name = 'EUROPE' +# and o.o_orderdate >= date '1997-01-01' +# and o.o_orderdate < date '1997-01-01' + interval '1' year +# group by +# n.n_name +# +# order by +# revenue desc + +name: TPC-H05 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + 
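# "__test" keys are test-harness assertions rather than plan content: "level: i" bounds the worst +# permitted diagnostic severity at info, and "type" entries check the derived output type. Date +# literals in these plans count days since 1970-01-01, so "date: 9862" below is 1997-01-01. +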
__test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_datetime.yaml + - extensionUriAnchor: 4 + uri: /functions_arithmetic_decimal.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: equal:any_any + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 3 + name: gte:date_date + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 4 + name: lt:date_date + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 5 + name: add:date_year + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 6 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 7 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 8 + name: sum:opt_dec + relations: + - root: + __test: + - type: "NSTRUCT<N_NAME: FIXEDCHAR?<25>, REVENUE: DECIMAL?<19, 0>>" + input: + sort: + common: + direct: {} + input: + aggregate: + common: + emit: + outputMapping: + - 0 + - 1 + groupings: + - groupingExpressions: + - selection: + directReference: + structField: + field: 0 + rootReference: {} + input: + project: + common: + emit: + outputMapping: + - 47 + - 48 + expressions: + - selection: + directReference: + structField: + field: 41 + rootReference: {} + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 23 + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 9 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 17 + rootReference: {} + - selection: + directReference: + structField: + field: 8 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 19 + rootReference: {} + - selection: + directReference: + structField: + field: 33 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 3 + rootReference: {} + - selection: + directReference: + structField: + field: 36 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 36 + rootReference: {} + - selection: + directReference: + structField: + field: 40 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 42 +
rootReference: {} + - selection: + directReference: + structField: + field: 44 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 45 + rootReference: {} + - cast: + input: + literal: + fixedChar: EUROPE + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - literal: + date: 9862 + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - scalarFunction: + args: + - literal: + date: 9862 + - literal: + intervalYearToMonth: + years: 1 + functionReference: 5 + outputType: + date: + nullability: NULLABILITY_REQUIRED + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - C_CUSTKEY + - C_NAME + - C_ADDRESS + - C_NATIONKEY + - C_PHONE + - C_ACCTBAL + - C_MKTSEGMENT + - C_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 117 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - CUSTOMER + right: + read: + baseSchema: + names: + - O_ORDERKEY + - O_CUSTKEY + - O_ORDERSTATUS + - O_TOTALPRICE + - O_ORDERDATE + - O_ORDERPRIORITY + - O_CLERK + - O_SHIPPRIORITY + - O_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - varchar: + length: 79 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - ORDERS + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + 
- i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - S_SUPPKEY + - S_NAME + - S_ADDRESS + - S_NATIONKEY + - S_PHONE + - S_ACCTBAL + - S_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 101 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - SUPPLIER + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - R_REGIONKEY + - R_NAME + - R_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - REGION + type: JOIN_TYPE_INNER + measures: + - measure: + args: + - selection: + directReference: + structField: + field: 1 + rootReference: {} + functionReference: 8 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + sorts: + - direction: SORT_DIRECTION_DESC_NULLS_FIRST + expr: + selection: + directReference: + structField: + field: 1 + rootReference: {} + names: + - N_NAME + - REVENUE diff --git a/tests/tests/tpc-h/tpc-h06.yaml b/tests/tests/tpc-h/tpc-h06.yaml new file mode 100644 index 00000000..ede9e0ca --- /dev/null +++ b/tests/tests/tpc-h/tpc-h06.yaml @@ -0,0 +1,314 @@ +# select +# sum(l_extendedprice * l_discount) as revenue +# from +# "lineitem" +# where +# l_shipdate >= date '1997-01-01' +# and l_shipdate < date '1997-01-01' + interval '1' year +# and +# l_discount between 0.03 - 0.01 and 0.03 + 0.01 +# and l_quantity < 24 + +name: TPC-H06 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition 
check warnings (function parsing isn't implemented yet) + plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_datetime.yaml + - extensionUriAnchor: 3 + uri: /functions_comparison.yaml + - extensionUriAnchor: 4 + uri: /functions_arithmetic_decimal.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: gte:date_date + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 3 + name: lt:date_date + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 4 + name: add:date_year + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 5 + name: gte:any_any + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 6 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 7 + name: lte:any_any + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 8 + name: add:opt_dec_dec + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 9 + name: lt:any_any + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 10 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 11 + name: sum:opt_dec + relations: + - root: + __test: + - type: "NSTRUCT<REVENUE: DECIMAL?<19, 0>>" + input: + aggregate: + common: + emit: + outputMapping: + - 0 + input: + project: + common: + emit: + outputMapping: + - 16 + expressions: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - selection: + directReference: + structField: + field: 6 + rootReference: {} + functionReference: 10 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 10 + rootReference: {} + - literal: + date: 9862 + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 10 + rootReference: {} + - scalarFunction: + args: + - literal: + date: 9862 + - literal: + intervalYearToMonth: + years: 1 + functionReference: 4 + outputType: + date: + nullability: NULLABILITY_REQUIRED + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 6 + rootReference: {} + - scalarFunction: + args: + - literal: + decimal: + precision: 3 + scale: 2 + value: AAAAAAAAAAAAAAAAAAAAAA== + - literal: + decimal: + precision: 3 + scale: 2 + value: AAAAAAAAAAAAAAAAAAAAAA== + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_REQUIRED + precision: 4 + scale: 2 + functionReference: 5 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 6 + rootReference: {} + - scalarFunction: + args: + - literal: + decimal: + precision: 3 + scale: 2 + value: AAAAAAAAAAAAAAAAAAAAAA== + - literal: + decimal: + precision: 3 + scale: 2 + value: AAAAAAAAAAAAAAAAAAAAAA== + functionReference: 8 + outputType: + decimal: + nullability: NULLABILITY_REQUIRED + precision: 4 + scale: 2 + functionReference: 7 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + -
selection: + directReference: + structField: + field: 4 + rootReference: {} + - cast: + input: + literal: + i32: 24 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 9 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + measures: + - measure: + args: + - selection: + directReference: + structField: {} + rootReference: {} + functionReference: 11 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + names: + - REVENUE diff --git a/tests/tests/tpc-h/tpc-h07.yaml b/tests/tests/tpc-h/tpc-h07.yaml new file mode 100644 index 00000000..7f585a93 --- /dev/null +++ b/tests/tests/tpc-h/tpc-h07.yaml @@ -0,0 +1,726 @@ +# select +# supp_nation, +# cust_nation, +# l_year, +# sum(volume) as revenue +# from +# ( +# select +# n1.n_name as supp_nation, +# n2.n_name as cust_nation, +# extract(year from l.l_shipdate) as l_year, +# l.l_extendedprice * (1 - l.l_discount) as volume +# from +# "supplier" s, +# "lineitem" l, +# "orders" o, +# "customer" c, +# "nation" n1, +# "nation" n2 +# where +# s.s_suppkey = l.l_suppkey +# and o.o_orderkey = l.l_orderkey +# and c.c_custkey = o.o_custkey +# and s.s_nationkey = n1.n_nationkey +# and c.c_nationkey = n2.n_nationkey +# and ( +# (n1.n_name = 'EGYPT' and n2.n_name = 'UNITED STATES') +# or (n1.n_name = 'UNITED STATES' and n2.n_name = 'EGYPT') +# ) +# and l.l_shipdate between date '1995-01-01' and date '1996-12-31' +# ) as shipping +# group by +# supp_nation, +# cust_nation, +# l_year +# order by +# supp_nation, +# cust_nation, +# l_year + +name: TPC-H07 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_datetime.yaml + - extensionUriAnchor: 4 + uri: 
/functions_arithmetic_decimal.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: equal:any_any + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 3 + name: or:bool + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 4 + name: gte:date_date + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 5 + name: lte:date_date + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 6 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 7 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 8 + name: extract:req_date + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 9 + name: sum:opt_dec + relations: + - root: + input: + sort: + common: + direct: {} + input: + aggregate: + common: + emit: + outputMapping: + - 0 + - 1 + - 2 + - 3 + groupings: + - groupingExpressions: + - selection: + directReference: + structField: + field: 0 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - selection: + directReference: + structField: + field: 2 + rootReference: {} + input: + project: + common: + emit: + outputMapping: + - 48 + - 49 + - 50 + - 51 + expressions: + - selection: + directReference: + structField: + field: 41 + rootReference: {} + - selection: + directReference: + structField: + field: 45 + rootReference: {} + - scalarFunction: + args: + - enum: + specified: YEAR + - selection: + directReference: + structField: + field: 17 + rootReference: {} + functionReference: 8 + outputType: + i64: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 13 + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 9 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 23 + rootReference: {} + - selection: + directReference: + structField: + field: 7 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 32 + rootReference: {} + - selection: + directReference: + structField: + field: 24 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 3 + rootReference: {} + - selection: + directReference: + structField: + field: 40 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 35 + 
rootReference: {} + - selection: + directReference: + structField: + field: 44 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 41 + rootReference: {} + - cast: + input: + literal: + fixedChar: EGYPT + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 45 + rootReference: {} + - cast: + input: + literal: + fixedChar: UNITED STATES + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 41 + rootReference: {} + - cast: + input: + literal: + fixedChar: UNITED STATES + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 45 + rootReference: {} + - cast: + input: + literal: + fixedChar: EGYPT + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 17 + rootReference: {} + - literal: + date: 9131 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 17 + rootReference: {} + - literal: + date: 9861 + functionReference: 5 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - S_SUPPKEY + - S_NAME + - S_ADDRESS + - S_NATIONKEY + - S_PHONE + - S_ACCTBAL + - S_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 101 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - SUPPLIER + right: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE 
+ - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - O_ORDERKEY + - O_CUSTKEY + - O_ORDERSTATUS + - O_TOTALPRICE + - O_ORDERDATE + - O_ORDERPRIORITY + - O_CLERK + - O_SHIPPRIORITY + - O_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - varchar: + length: 79 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - ORDERS + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - C_CUSTKEY + - C_NAME + - C_ADDRESS + - C_NATIONKEY + - C_PHONE + - C_ACCTBAL + - C_MKTSEGMENT + - C_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 117 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - CUSTOMER + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION 
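+# (this second NATION scan is n2, the customer-side nation, in the query comment above)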
+ type: JOIN_TYPE_INNER + measures: + - measure: + args: + - selection: + directReference: + structField: + field: 3 + rootReference: {} + functionReference: 9 + outputType: + decimal: + nullability: NULLABILITY_REQUIRED + precision: 38 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + sorts: + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: {} + rootReference: {} + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 1 + rootReference: {} + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 2 + rootReference: {} + names: + - SUPP_NATION + - CUST_NATION + - L_YEAR + - REVENUE diff --git a/tests/tests/tpc-h/tpc-h08.yaml b/tests/tests/tpc-h/tpc-h08.yaml new file mode 100644 index 00000000..e9bff2f1 --- /dev/null +++ b/tests/tests/tpc-h/tpc-h08.yaml @@ -0,0 +1,810 @@ +# select +# o_year, +# sum(case +# when nation = 'EGYPT' then volume +# else 0 +# end) / sum(volume) as mkt_share +# from +# ( +# select +# extract(year from o.o_orderdate) as o_year, +# l.l_extendedprice * (1 - l.l_discount) as volume, +# n2.n_name as nation +# from +# "part" p, +# "supplier" s, +# "lineitem" l, +# "orders" o, +# "customer" c, +# "nation" n1, +# "nation" n2, +# "region" r +# where +# p.p_partkey = l.l_partkey +# and s.s_suppkey = l.l_suppkey +# and l.l_orderkey = o.o_orderkey +# and o.o_custkey = c.c_custkey +# and c.c_nationkey = n1.n_nationkey +# and n1.n_regionkey = r.r_regionkey +# and r.r_name = 'MIDDLE EAST' +# and s.s_nationkey = n2.n_nationkey +# and o.o_orderdate between date '1995-01-01' and date '1996-12-31' +# and p.p_type = 'PROMO BRUSHED COPPER' +# ) as all_nations +# group by +# o_year +# order by +# o_year + +name: TPC-H08 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_datetime.yaml + - extensionUriAnchor: 4 + uri: /functions_arithmetic_decimal.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: equal:any_any + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 3 + name: gte:date_date + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 4 + name: lte:date_date + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 5 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 6 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 7 + name: extract:req_date + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 8 + name: sum:opt_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 9 + name: divide:opt_dec_dec + relations: + - root: + input: + project: + common: + emit: + outputMapping: + - 0 + - 3 + expressions: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - selection: + directReference: + structField: + field: 2 + rootReference: {} + 
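# fields 1 and 2 are the two sums defined below (EGYPT-only volume and total volume); their quotient is MKT_SHARE +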
functionReference: 9 + outputType: + decimal: + precision: 38 + scale: 6 + nullability: NULLABILITY_NULLABLE + input: + sort: + common: + direct: {} + input: + aggregate: + common: + emit: + outputMapping: + - 0 + - 1 + - 2 + groupings: + - groupingExpressions: + - selection: + directReference: + structField: + field: 0 + rootReference: {} + input: + project: + common: + emit: + outputMapping: + - 60 + - 61 + - 62 + expressions: + - scalarFunction: + args: + - enum: + specified: YEAR + - selection: + directReference: + structField: + field: 36 + rootReference: {} + functionReference: 7 + outputType: + i64: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 21 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 22 + rootReference: {} + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 5 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 54 + rootReference: {} + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 17 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 9 + rootReference: {} + - selection: + directReference: + structField: + field: 18 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 16 + rootReference: {} + - selection: + directReference: + structField: + field: 32 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 33 + rootReference: {} + - selection: + directReference: + structField: + field: 41 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 44 + rootReference: {} + - selection: + directReference: + structField: + field: 49 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 51 + rootReference: {} + - selection: + directReference: + structField: + field: 57 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 58 + rootReference: {} + - cast: + input: + literal: + fixedChar: MIDDLE EAST + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - selection: + directReference: + structField: + field: 53 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - 
scalarFunction: + args: + - selection: + directReference: + structField: + field: 36 + rootReference: {} + - literal: + date: 9131 + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 36 + rootReference: {} + - literal: + date: 9861 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - literal: + varChar: + length: 25 + value: PROMO BRUSHED COPPER + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - P_PARTKEY + - P_NAME + - P_MFGR + - P_BRAND + - P_TYPE + - P_SIZE + - P_CONTAINER + - P_RETAILPRICE + - P_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 55 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 23 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - PART + right: + read: + baseSchema: + names: + - S_SUPPKEY + - S_NAME + - S_ADDRESS + - S_NATIONKEY + - S_PHONE + - S_ACCTBAL + - S_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 101 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - SUPPLIER + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + 
nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - O_ORDERKEY + - O_CUSTKEY + - O_ORDERSTATUS + - O_TOTALPRICE + - O_ORDERDATE + - O_ORDERPRIORITY + - O_CLERK + - O_SHIPPRIORITY + - O_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - varchar: + length: 79 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - ORDERS + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - C_CUSTKEY + - C_NAME + - C_ADDRESS + - C_NATIONKEY + - C_PHONE + - C_ACCTBAL + - C_MKTSEGMENT + - C_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 117 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - CUSTOMER + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - R_REGIONKEY + - R_NAME + - R_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - REGION + type: JOIN_TYPE_INNER + measures: + - measure: + args: + - ifThen: + ifs: + - if: + scalarFunction: + args: + - 
selection: + directReference: + structField: + field: 2 + rootReference: {} + - literal: + fixedChar: "EGYPT " + nullable: true + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + then: + selection: + directReference: + structField: + field: 1 + rootReference: {} + else: + literal: + decimal: + value: AAAAAAAAAAAAAAAAAAAAAA== + precision: 19 + functionReference: 8 + outputType: + decimal: + nullability: NULLABILITY_REQUIRED + precision: 38 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + args: + - selection: + directReference: + structField: + field: 1 + rootReference: {} + functionReference: 8 + outputType: + decimal: + nullability: NULLABILITY_REQUIRED + precision: 38 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + sorts: + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: {} + rootReference: {} + names: + - O_YEAR + - MKT_SHARE diff --git a/tests/tests/tpc-h/tpc-h09.yaml b/tests/tests/tpc-h/tpc-h09.yaml new file mode 100644 index 00000000..1c6ae21b --- /dev/null +++ b/tests/tests/tpc-h/tpc-h09.yaml @@ -0,0 +1,640 @@ +# select +# nation, +# o_year, +# sum(amount) as sum_profit +# from +# ( +# select +# n.n_name as nation, +# extract(year from o.o_orderdate) as o_year, +# l.l_extendedprice * (1 - l.l_discount) - ps.ps_supplycost * l.l_quantity as amount +# from +# "part" p, +# "supplier" s, +# "lineitem" l, +# "partsupp" ps, +# "orders" o, +# "nation" n +# where +# s.s_suppkey = l.l_suppkey +# and ps.ps_suppkey = l.l_suppkey +# and ps.ps_partkey = l.l_partkey +# and p.p_partkey = l.l_partkey +# and o.o_orderkey = l.l_orderkey +# and s.s_nationkey = n.n_nationkey +# and p.p_name like '%yellow%' +# ) as profit +# group by +# nation, +# o_year +# order by +# nation, +# o_year desc + +name: TPC-H09 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_string.yaml + - extensionUriAnchor: 4 + uri: /functions_arithmetic_decimal.yaml + - extensionUriAnchor: 5 + uri: /functions_datetime.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: equal:any_any + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 3 + name: like:vchar_vchar + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 4 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 5 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 5 + functionAnchor: 6 + name: extract:req_date + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 7 + name: sum:opt_dec + relations: + - root: + input: + sort: + common: + direct: {} + input: + aggregate: + common: + emit: + outputMapping: + - 0 + - 1 + - 2 + groupings: + - groupingExpressions: + - selection: + directReference: + structField: + field: 0 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + input: + project: + common: + emit: + outputMapping: + - 50 + - 51 + - 52 + 
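# columns 50-52 hold the three expressions below: NATION, O_YEAR, and AMOUNT from the "profit" subquery +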
expressions: + - selection: + directReference: + structField: + field: 47 + rootReference: {} + - scalarFunction: + args: + - enum: + specified: YEAR + - selection: + directReference: + structField: + field: 41 + rootReference: {} + functionReference: 6 + outputType: + i64: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 21 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 22 + rootReference: {} + functionReference: 4 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 5 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 35 + rootReference: {} + - selection: + directReference: + structField: + field: 20 + rootReference: {} + functionReference: 5 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 4 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 9 + rootReference: {} + - selection: + directReference: + structField: + field: 18 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 33 + rootReference: {} + - selection: + directReference: + structField: + field: 18 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 32 + rootReference: {} + - selection: + directReference: + structField: + field: 17 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 17 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 37 + rootReference: {} + - selection: + directReference: + structField: + field: 16 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - selection: + directReference: + structField: + field: 46 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - cast: + input: + literal: + fixedChar: '%yellow%' + type: + varchar: + length: 55 + nullability: NULLABILITY_NULLABLE + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + 
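# the join condition is the constant "true"; the actual comma-join predicates live in the filter above +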
expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - P_PARTKEY + - P_NAME + - P_MFGR + - P_BRAND + - P_TYPE + - P_SIZE + - P_CONTAINER + - P_RETAILPRICE + - P_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 55 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 23 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - PART + right: + read: + baseSchema: + names: + - S_SUPPKEY + - S_NAME + - S_ADDRESS + - S_NATIONKEY + - S_PHONE + - S_ACCTBAL + - S_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 101 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - SUPPLIER + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - PS_PARTKEY + - PS_SUPPKEY + - PS_AVAILQTY + - PS_SUPPLYCOST + - PS_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 199 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - PARTSUPP + type: JOIN_TYPE_INNER + right: + read: + baseSchema: 
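+ # Fifth join input: the ORDERS table, joined to lineitem via
+ # o_orderkey = l_orderkey in the filter above.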
+ names: + - O_ORDERKEY + - O_CUSTKEY + - O_ORDERSTATUS + - O_TOTALPRICE + - O_ORDERDATE + - O_ORDERPRIORITY + - O_CLERK + - O_SHIPPRIORITY + - O_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - varchar: + length: 79 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - ORDERS + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION + type: JOIN_TYPE_INNER + measures: + - measure: + args: + - selection: + directReference: + structField: + field: 2 + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_REQUIRED + precision: 38 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + sorts: + - direction: SORT_DIRECTION_ASC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 0 + rootReference: {} + - direction: SORT_DIRECTION_DESC_NULLS_LAST + expr: + selection: + directReference: + structField: + field: 1 + rootReference: {} + names: + - NATION + - O_YEAR + - SUM_PROFIT diff --git a/tests/tests/tpc-h/tpc-h10.yaml b/tests/tests/tpc-h/tpc-h10.yaml new file mode 100644 index 00000000..67e993c1 --- /dev/null +++ b/tests/tests/tpc-h/tpc-h10.yaml @@ -0,0 +1,622 @@ +# select +# c.c_custkey, +# c.c_name, +# sum(l.l_extendedprice * (1 - l.l_discount)) as revenue, +# c.c_acctbal, +# n.n_name, +# c.c_address, +# c.c_phone, +# c.c_comment +# from +# "customer" c, +# "orders" o, +# "lineitem" l, +# "nation" n +# where +# c.c_custkey = o.o_custkey +# and l.l_orderkey = o.o_orderkey +# and o.o_orderdate >= date '1994-03-01' +# and o.o_orderdate < date '1994-03-01' + interval '3' month +# and l.l_returnflag = 'R' +# and c.c_nationkey = n.n_nationkey +# group by +# c.c_custkey, +# c.c_name, +# c.c_acctbal, +# c.c_phone, +# n.n_name, +# c.c_address, +# c.c_comment +# order by +# revenue desc +# limit 20 + +name: TPC-H10 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_datetime.yaml + - extensionUriAnchor: 4 + uri: /functions_arithmetic_decimal.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 2 + name: equal:any_any + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 3 + name: gte:date_date + - 
extensionFunction: + extensionUriReference: 3 + functionAnchor: 4 + name: lt:date_date + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 5 + name: add:date_year + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 6 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 7 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 8 + name: sum:opt_dec + relations: + - root: + __test: + - type: "\ + NSTRUCT<\ + C_CUSTKEY: i32, \ + C_NAME: i64, \ + REVENUE: VARCHAR?<25>, \ + C_ACCTBAL: DECIMAL?<19, 0>, \ + N_NAME: DECIMAL?<19, 0>, \ + C_ADDRESS: FIXEDCHAR?<25>, \ + C_PHONE: VARCHAR?<40>, \ + C_COMMENT: FIXEDCHAR?<15>\ + >" + input: + fetch: + common: + direct: {} + count: '20' + input: + sort: + common: + direct: {} + input: + project: + common: + emit: + outputMapping: + - 8 + - 9 + - 10 + - 11 + - 12 + - 13 + - 14 + - 15 + expressions: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - selection: + directReference: + structField: + field: 7 + rootReference: {} + - selection: + directReference: + structField: + field: 2 + rootReference: {} + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - selection: + directReference: + structField: + field: 3 + rootReference: {} + - selection: + directReference: + structField: + field: 6 + rootReference: {} + input: + aggregate: + common: + direct: {} + groupings: + - groupingExpressions: + - selection: + directReference: + structField: + field: 0 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - selection: + directReference: + structField: + field: 2 + rootReference: {} + - selection: + directReference: + structField: + field: 3 + rootReference: {} + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - selection: + directReference: + structField: + field: 6 + rootReference: {} + input: + project: + common: + emit: + outputMapping: + - 37 + - 38 + - 39 + - 40 + - 41 + - 42 + - 43 + - 44 + expressions: + - selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - selection: + directReference: + structField: + field: 34 + rootReference: {} + - selection: + directReference: + structField: + field: 2 + rootReference: {} + - selection: + directReference: + structField: + field: 7 + rootReference: {} + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 23 + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 6 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + 
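+ # The AND'ed arguments below encode the WHERE clause: the two join keys
+ # (c_custkey = o_custkey, l_orderkey = o_orderkey), the o_orderdate
+ # range check, l_returnflag = 'R', and c_nationkey = n_nationkey.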
- selection: + directReference: + structField: {} + rootReference: {} + - selection: + directReference: + structField: + field: 9 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 17 + rootReference: {} + - selection: + directReference: + structField: + field: 8 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - literal: + date: 8825 + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 12 + rootReference: {} + - scalarFunction: + args: + - literal: + date: 8825 + - literal: + intervalYearToMonth: + months: 3 + functionReference: 5 + outputType: + date: + nullability: NULLABILITY_REQUIRED + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 25 + rootReference: {} + - literal: + fixedChar: R + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 3 + rootReference: {} + - selection: + directReference: + structField: + field: 33 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - C_CUSTKEY + - C_NAME + - C_ADDRESS + - C_NATIONKEY + - C_PHONE + - C_ACCTBAL + - C_MKTSEGMENT + - C_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 40 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 117 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - CUSTOMER + right: + read: + baseSchema: + names: + - O_ORDERKEY + - O_CUSTKEY + - O_ORDERSTATUS + - O_TOTALPRICE + - O_ORDERDATE + - O_ORDERPRIORITY + - O_CLERK + - O_SHIPPRIORITY + - O_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 15 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - varchar: + length: 79 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - ORDERS + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER 
+ - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + type: JOIN_TYPE_INNER + right: + read: + baseSchema: + names: + - N_NATIONKEY + - N_NAME + - N_REGIONKEY + - N_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 152 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - NATION + type: JOIN_TYPE_INNER + measures: + - measure: + args: + - selection: + directReference: + structField: + field: 7 + rootReference: {} + functionReference: 8 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + sorts: + - direction: SORT_DIRECTION_DESC_NULLS_FIRST + expr: + selection: + directReference: + structField: + field: 2 + rootReference: {} + names: + - C_CUSTKEY + - C_NAME + - REVENUE + - C_ACCTBAL + - N_NAME + - C_ADDRESS + - C_PHONE + - C_COMMENT diff --git a/tests/tests/tpc-h/tpc-h14.yaml b/tests/tests/tpc-h/tpc-h14.yaml new file mode 100644 index 00000000..38d1afeb --- /dev/null +++ b/tests/tests/tpc-h/tpc-h14.yaml @@ -0,0 +1,432 @@ +# select +# 100.00 * sum(case +# when p.p_type like 'PROMO%' +# then l.l_extendedprice * (1 - l.l_discount) +# else 0 +# end) / sum(l.l_extendedprice * (1 - l.l_discount)) as promo_revenue +# from +# "lineitem" l, +# "part" p +# where +# l.l_partkey = p.p_partkey +# and l.l_shipdate >= date '1994-08-01' +# and l.l_shipdate < date '1994-08-01' + interval '1' month + +name: TPC-H14 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_datetime.yaml + - extensionUriAnchor: 4 + uri: /functions_string.yaml + - extensionUriAnchor: 5 + uri: /functions_arithmetic_decimal.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: and:bool + - extensionFunction: + extensionUriReference: 
2 + functionAnchor: 2 + name: equal:any_any + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 3 + name: gte:date_date + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 4 + name: lt:date_date + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 5 + name: add:date_year + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 6 + name: like:vchar_vchar + - extensionFunction: + extensionUriReference: 5 + functionAnchor: 7 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 5 + functionAnchor: 8 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 5 + functionAnchor: 9 + name: sum:opt_dec + - extensionFunction: + extensionUriReference: 5 + functionAnchor: 10 + name: divide:opt_dec_dec + relations: + - root: + __test: + - type: "NSTRUCT" + input: + project: + common: + emit: + outputMapping: + - 2 + expressions: + - scalarFunction: + args: + - scalarFunction: + args: + - literal: + decimal: + precision: 5 + scale: 2 + value: AAAAAAAAAAAAAAAAAAAAAA== + - selection: + directReference: + structField: {} + rootReference: {} + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + scale: 2 + - selection: + directReference: + structField: + field: 1 + rootReference: {} + functionReference: 10 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + scale: 2 + input: + aggregate: + common: + direct: {} + input: + project: + common: + emit: + outputMapping: + - 25 + - 26 + expressions: + - ifThen: + else: + literal: + decimal: + precision: 19 + value: AAAAAAAAAAAAAAAAAAAAAA== + ifs: + - if: + scalarFunction: + args: + - selection: + directReference: + structField: + field: 20 + rootReference: {} + - cast: + input: + literal: + fixedChar: PROMO% + type: + varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + functionReference: 6 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + then: + scalarFunction: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 6 + rootReference: {} + functionReference: 8 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 6 + rootReference: {} + functionReference: 8 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 1 + rootReference: {} + - selection: + directReference: + structField: + field: 16 + rootReference: {} + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 10 + rootReference: {} + - literal: + date: 8978 + functionReference: 3 + 
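+ # Date literals count days since 1970-01-01, so 8978 = 1994-08-01;
+ # this gte and the lt below bracket the one-month window from the query.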
outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 10 + rootReference: {} + - scalarFunction: + args: + - literal: + date: 8978 + - literal: + intervalYearToMonth: + months: 1 + functionReference: 5 + outputType: + date: + nullability: NULLABILITY_REQUIRED + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + right: + read: + baseSchema: + names: + - P_PARTKEY + - P_NAME + - P_MFGR + - P_BRAND + - P_TYPE + - P_SIZE + - P_CONTAINER + - P_RETAILPRICE + - P_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 55 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 23 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - PART + type: JOIN_TYPE_INNER + measures: + - measure: + args: + - selection: + directReference: + structField: {} + rootReference: {} + functionReference: 9 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + - measure: + args: + - selection: + directReference: + structField: + field: 1 + rootReference: {} + functionReference: 9 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + names: + - PROMO_REVENUE diff --git a/tests/tests/tpc-h/tpc-h19.yaml b/tests/tests/tpc-h/tpc-h19.yaml new file mode 100644 index 00000000..c3d4396c --- /dev/null +++ b/tests/tests/tpc-h/tpc-h19.yaml @@ -0,0 +1,954 @@ +# select +# sum(l.l_extendedprice* (1 - l.l_discount)) as revenue +# from +# "lineitem" l, +# "part" p +# where +# ( +# 
p.p_partkey = l.l_partkey +# and p.p_brand = 'Brand#41' +# and p.p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') +# and l.l_quantity >= 2 and l.l_quantity <= 2 + 10 +# and p.p_size between 1 and 5 +# and l.l_shipmode in ('AIR', 'AIR REG') +# and l.l_shipinstruct = 'DELIVER IN PERSON' +# ) +# or +# ( +# p.p_partkey = l.l_partkey +# and p.p_brand = 'Brand#13' +# and p.p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') +# and l.l_quantity >= 14 and l.l_quantity <= 14 + 10 +# and p.p_size between 1 and 10 +# and l.l_shipmode in ('AIR', 'AIR REG') +# and l.l_shipinstruct = 'DELIVER IN PERSON' +# ) +# or +# ( +# p.p_partkey = l.l_partkey +# and p.p_brand = 'Brand#55' +# and p.p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') +# and l.l_quantity >= 23 and l.l_quantity <= 23 + 10 +# and p.p_size between 1 and 15 +# and l.l_shipmode in ('AIR', 'AIR REG') +# and l.l_shipinstruct = 'DELIVER IN PERSON' +# ) + +name: TPC-H19 +diags: +- { code: 0001, max: i } # Suppress "not yet implemented" warnings +- { code: 3002, max: i } # Suppress function name resolution errors (function parsing isn't implemented yet) +- { code: 6003, max: i } # Suppress function definition check warnings (function parsing isn't implemented yet) +plan: + __test: + - level: i + extensionUris: + - extensionUriAnchor: 1 + uri: /functions_boolean.yaml + - extensionUriAnchor: 2 + uri: /functions_comparison.yaml + - extensionUriAnchor: 3 + uri: /functions_arithmetic.yaml + - extensionUriAnchor: 4 + uri: /functions_arithmetic_decimal.yaml + extensions: + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 1 + name: or:bool + - extensionFunction: + extensionUriReference: 1 + functionAnchor: 2 + name: and:bool + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 3 + name: equal:any_any + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 4 + name: gte:any_any + - extensionFunction: + extensionUriReference: 2 + functionAnchor: 5 + name: lte:any_any + - extensionFunction: + extensionUriReference: 3 + functionAnchor: 6 + name: add:opt_i32_i32 + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 7 + name: multiply:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 8 + name: subtract:opt_dec_dec + - extensionFunction: + extensionUriReference: 4 + functionAnchor: 9 + name: sum:opt_dec + relations: + - root: + __test: + - type: "NSTRUCT>" + input: + aggregate: + common: + emit: + outputMapping: + - 0 + input: + project: + common: + emit: + outputMapping: + - 25 + expressions: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 5 + rootReference: {} + - scalarFunction: + args: + - cast: + input: + literal: + i32: 1 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - selection: + directReference: + structField: + field: 6 + rootReference: {} + functionReference: 8 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 7 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + input: + filter: + common: + direct: {} + condition: + scalarFunction: + args: + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 16 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + 
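+ # First OR branch: p_brand = 'Brand#41' with SM containers; part columns
+ # follow the 16 lineitem columns in the join output, so field 19 is P_BRAND.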
structField: + field: 19 + rootReference: {} + - cast: + input: + literal: + fixedChar: Brand#41 + type: + fixedChar: + length: 10 + nullability: NULLABILITY_REQUIRED + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: SM CASE + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: SM BOX + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: SM PACK + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: SM PKG + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - cast: + input: + literal: + i32: 2 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - cast: + input: + scalarFunction: + args: + - literal: + i32: 2 + - literal: + i32: 10 + functionReference: 6 + outputType: + i32: + nullability: NULLABILITY_REQUIRED + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 5 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 21 + rootReference: {} + - literal: + i32: 1 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 21 + rootReference: {} + - literal: + i32: 5 + functionReference: 5 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 14 + rootReference: {} + - literal: + fixedChar: AIR + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 14 + rootReference: {} + - literal: + fixedChar: AIR REG + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 13 + rootReference: {} + - cast: + input: + literal: + fixedChar: DELIVER IN PERSON + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 16 + rootReference: {} + - selection: + directReference: + structField: + field: 
1 + rootReference: {} + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 19 + rootReference: {} + - cast: + input: + literal: + fixedChar: Brand#13 + type: + fixedChar: + length: 10 + nullability: NULLABILITY_REQUIRED + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: MED BAG + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: MED BOX + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: MED PKG + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: MED PACK + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - cast: + input: + literal: + i32: 14 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - cast: + input: + scalarFunction: + args: + - literal: + i32: 14 + - literal: + i32: 10 + functionReference: 6 + outputType: + i32: + nullability: NULLABILITY_REQUIRED + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 5 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 21 + rootReference: {} + - literal: + i32: 1 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 21 + rootReference: {} + - literal: + i32: 10 + functionReference: 5 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 14 + rootReference: {} + - literal: + fixedChar: AIR + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 14 + rootReference: {} + - literal: + fixedChar: AIR REG + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 13 + rootReference: {} + - cast: + input: + literal: + fixedChar: DELIVER IN PERSON + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + 
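+ # Third OR branch: p_brand = 'Brand#55', LG containers, l_quantity
+ # between 23 and 33, and p_size between 1 and 15.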
args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 16 + rootReference: {} + - selection: + directReference: + structField: + field: 1 + rootReference: {} + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_REQUIRED + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 19 + rootReference: {} + - cast: + input: + literal: + fixedChar: Brand#55 + type: + fixedChar: + length: 10 + nullability: NULLABILITY_REQUIRED + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: LG CASE + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: LG BOX + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: LG PACK + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 22 + rootReference: {} + - literal: + fixedChar: LG PKG + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - cast: + input: + literal: + i32: 23 + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 4 + rootReference: {} + - cast: + input: + scalarFunction: + args: + - literal: + i32: 23 + - literal: + i32: 10 + functionReference: 6 + outputType: + i32: + nullability: NULLABILITY_REQUIRED + type: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + functionReference: 5 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 21 + rootReference: {} + - literal: + i32: 1 + functionReference: 4 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 21 + rootReference: {} + - literal: + i32: 15 + functionReference: 5 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 14 + rootReference: {} + - literal: + fixedChar: AIR + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 14 + rootReference: {} + - literal: + fixedChar: AIR REG + functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + - scalarFunction: + args: + - selection: + directReference: + structField: + field: 13 + rootReference: {} + - cast: + input: + literal: + fixedChar: DELIVER IN PERSON + type: + fixedChar: + length: 25 + nullability: NULLABILITY_REQUIRED + 
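+ # l_shipinstruct = 'DELIVER IN PERSON'; the literal is cast to
+ # FIXEDCHAR<25> to match the L_SHIPINSTRUCT column type.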
functionReference: 3 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 2 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + functionReference: 1 + outputType: + bool: + nullability: NULLABILITY_NULLABLE + input: + join: + common: + direct: {} + expression: + literal: + boolean: true + left: + read: + baseSchema: + names: + - L_ORDERKEY + - L_PARTKEY + - L_SUPPKEY + - L_LINENUMBER + - L_QUANTITY + - L_EXTENDEDPRICE + - L_DISCOUNT + - L_TAX + - L_RETURNFLAG + - L_LINESTATUS + - L_SHIPDATE + - L_COMMITDATE + - L_RECEIPTDATE + - L_SHIPINSTRUCT + - L_SHIPMODE + - L_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i64: + nullability: NULLABILITY_REQUIRED + - i32: + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 1 + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - date: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 44 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - LINEITEM + right: + read: + baseSchema: + names: + - P_PARTKEY + - P_NAME + - P_MFGR + - P_BRAND + - P_TYPE + - P_SIZE + - P_CONTAINER + - P_RETAILPRICE + - P_COMMENT + struct: + nullability: NULLABILITY_REQUIRED + types: + - i64: + nullability: NULLABILITY_REQUIRED + - varchar: + length: 55 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 25 + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - varchar: + length: 25 + nullability: NULLABILITY_NULLABLE + - i32: + nullability: NULLABILITY_NULLABLE + - fixedChar: + length: 10 + nullability: NULLABILITY_NULLABLE + - decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + - varchar: + length: 23 + nullability: NULLABILITY_NULLABLE + common: + direct: {} + namedTable: + names: + - PART + type: JOIN_TYPE_INNER + measures: + - measure: + args: + - selection: + directReference: + structField: {} + rootReference: {} + functionReference: 9 + outputType: + decimal: + nullability: NULLABILITY_NULLABLE + precision: 19 + phase: AGGREGATION_PHASE_INITIAL_TO_RESULT + names: + - REVENUE
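A quick decoding note for the literal encodings used throughout these test plans, since they are easy to misread: date literals are day counts since 1970-01-01, decimal literal values are base64-encoded 16-byte little-endian two's-complement integers of the unscaled value, and compound function names such as multiply:opt_dec_dec are the simple function name plus an argument-signature suffix. The following minimal Python sketch (illustrative only, not part of the diff; decode_date and decode_decimal are hypothetical helper names) decodes the first two:

import base64
from datetime import date, timedelta

def decode_date(days: int) -> date:
    # Substrait date literals count days since the UNIX epoch.
    return date(1970, 1, 1) + timedelta(days=days)

def decode_decimal(value_b64: str, scale: int = 0) -> float:
    # Decimal literal values are 16-byte little-endian two's-complement
    # integers holding the unscaled value.
    unscaled = int.from_bytes(base64.b64decode(value_b64), "little", signed=True)
    return unscaled / (10 ** scale)

print(decode_date(8825))   # 1994-03-01, the start of TPC-H10's date window
print(decode_date(8978))   # 1994-08-01, the start of TPC-H14's date window
print(decode_decimal("AAAAAAAAAAAAAAAAAAAAAA==", scale=2))  # 0.0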