diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml index 8a8b31418e8c..cf56bc31620a 100644 --- a/python/.pre-commit-config.yaml +++ b/python/.pre-commit-config.yaml @@ -54,4 +54,4 @@ repos: rev: '4.0.1' hooks: - id: flake8 - args: [ "--ignore=E501,W503" ] + args: [ "--ignore=E501,W503,E203" ] diff --git a/python/LICENSE b/python/LICENSE index 7868b9b44048..5823c32b65e0 100644 --- a/python/LICENSE +++ b/python/LICENSE @@ -202,3 +202,14 @@ limitations under the License. -------------------------------------------------------------------------------- + +This product includes code from Apache Avro. + +* Code for initializing the Avro (de)compression codecs +* The Binary decoder for reading in an Avro byte stream + +Copyright: 2014-2022 The Apache Software Foundation. +Home page: https://avro.apache.org/ +License: https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- diff --git a/python/poetry.lock b/python/poetry.lock index 0feeb1f76f28..91dfbc691400 100644 --- a/python/poetry.lock +++ b/python/poetry.lock @@ -20,6 +20,17 @@ docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "cloudpickle"] tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "cloudpickle"] +[[package]] +name = "cffi" +version = "1.15.0" +description = "Foreign Function Interface for Python calling C code." +category = "main" +optional = true +python-versions = "*" + +[package.dependencies] +pycparser = "*" + [[package]] name = "cfgv" version = "3.3.1" @@ -38,14 +49,14 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "coverage" -version = "6.4" +version = "6.4.1" description = "Code coverage measurement for Python" category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -tomli = {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"toml\""} +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} [package.extras] toml = ["tomli"] @@ -66,6 +77,20 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "fastavro" +version = "1.5.1" +description = "Fast read/write of AVRO files" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +codecs = ["python-snappy", "zstandard", "lz4"] +lz4 = ["lz4"] +snappy = ["python-snappy"] +zstandard = ["zstandard"] + [[package]] name = "filelock" version = "3.7.1" @@ -218,6 +243,14 @@ python-versions = ">=3.7" [package.dependencies] numpy = ">=1.16.6" +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +category = "main" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + [[package]] name = "pyparsing" version = "3.0.9" @@ -267,6 +300,14 @@ pep517 = "*" docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"] testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "types-docutils", "pytest-black (>=0.3.7)", "pytest-mypy"] +[[package]] +name = "python-snappy" +version = "0.6.1" +description = "Python library for the snappy compression library from Google" +category = "main" +optional = 
true +python-versions = "*" + [[package]] name = "pyyaml" version = "6.0" @@ -329,13 +370,29 @@ python-versions = ">=3.7" docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] +[[package]] +name = "zstandard" +version = "0.17.0" +description = "Zstandard bindings for Python" +category = "main" +optional = true +python-versions = ">=3.6" + +[package.dependencies] +cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""} + +[package.extras] +cffi = ["cffi (>=1.11)"] + [extras] pyarrow = ["pyarrow"] +snappy = ["python-snappy"] +zstandard = ["zstandard"] [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "3e07f5fb1c8f204b0a004541776b7d90cad2afac926c2c8554df599b977429e9" +content-hash = "439429e65911b8e768bd4617a4883e8a4ec2652df3a43b28755346da8ed17e19" [metadata.files] atomicwrites = [ @@ -346,6 +403,58 @@ attrs = [ {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, ] +cffi = [ + {file = "cffi-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962"}, + {file = "cffi-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0"}, + {file = "cffi-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14"}, + {file = "cffi-1.15.0-cp27-cp27m-win32.whl", hash = "sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474"}, + {file = "cffi-1.15.0-cp27-cp27m-win_amd64.whl", hash = "sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6"}, + {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27"}, + {file = "cffi-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023"}, + {file = "cffi-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2"}, + {file = "cffi-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962"}, + {file = "cffi-1.15.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382"}, + {file = "cffi-1.15.0-cp310-cp310-win32.whl", hash = 
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55"}, + {file = "cffi-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0"}, + {file = "cffi-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8"}, + {file = "cffi-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605"}, + {file = "cffi-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e"}, + {file = "cffi-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc"}, + {file = "cffi-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2"}, + {file = "cffi-1.15.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7"}, + {file = "cffi-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66"}, + {file = "cffi-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029"}, + {file = "cffi-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728"}, + {file = "cffi-1.15.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6"}, + {file = "cffi-1.15.0-cp38-cp38-win32.whl", hash = "sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c"}, + {file = "cffi-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443"}, + {file = "cffi-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a"}, + {file = "cffi-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df"}, + {file = "cffi-1.15.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8"}, + {file = "cffi-1.15.0-cp39-cp39-win32.whl", hash = "sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a"}, + {file = "cffi-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139"}, + {file = "cffi-1.15.0.tar.gz", hash = "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954"}, +] cfgv = [ {file = "cfgv-3.3.1-py2.py3-none-any.whl", hash = "sha256:c6a0883f3917a037485059700b9e75da2464e6c27051014ad85ba6aaa5884426"}, {file = "cfgv-3.3.1.tar.gz", hash = "sha256:f5a830efb9ce7a445376bb66ec94c638a9787422f96264c98edc6bdeed8ab736"}, @@ -355,47 +464,47 @@ colorama = [ {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, ] coverage = [ - {file = "coverage-6.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:50ed480b798febce113709846b11f5d5ed1e529c88d8ae92f707806c50297abf"}, - {file = "coverage-6.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:26f8f92699756cb7af2b30720de0c5bb8d028e923a95b6d0c891088025a1ac8f"}, - {file = "coverage-6.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60c2147921da7f4d2d04f570e1838db32b95c5509d248f3fe6417e91437eaf41"}, - {file = "coverage-6.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:750e13834b597eeb8ae6e72aa58d1d831b96beec5ad1d04479ae3772373a8088"}, - {file = "coverage-6.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af5b9ee0fc146e907aa0f5fb858c3b3da9199d78b7bb2c9973d95550bd40f701"}, - {file = "coverage-6.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a022394996419142b33a0cf7274cb444c01d2bb123727c4bb0b9acabcb515dea"}, - {file = "coverage-6.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5a78cf2c43b13aa6b56003707c5203f28585944c277c1f3f109c7b041b16bd39"}, - {file = "coverage-6.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9229d074e097f21dfe0643d9d0140ee7433814b3f0fc3706b4abffd1e3038632"}, - {file = 
"coverage-6.4-cp310-cp310-win32.whl", hash = "sha256:fb45fe08e1abc64eb836d187b20a59172053999823f7f6ef4f18a819c44ba16f"}, - {file = "coverage-6.4-cp310-cp310-win_amd64.whl", hash = "sha256:3cfd07c5889ddb96a401449109a8b97a165be9d67077df6802f59708bfb07720"}, - {file = "coverage-6.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:03014a74023abaf5a591eeeaf1ac66a73d54eba178ff4cb1fa0c0a44aae70383"}, - {file = "coverage-6.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c82f2cd69c71698152e943f4a5a6b83a3ab1db73b88f6e769fabc86074c3b08"}, - {file = "coverage-6.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b546cf2b1974ddc2cb222a109b37c6ed1778b9be7e6b0c0bc0cf0438d9e45a6"}, - {file = "coverage-6.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc173f1ce9ffb16b299f51c9ce53f66a62f4d975abe5640e976904066f3c835d"}, - {file = "coverage-6.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c53ad261dfc8695062fc8811ac7c162bd6096a05a19f26097f411bdf5747aee7"}, - {file = "coverage-6.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:eef5292b60b6de753d6e7f2d128d5841c7915fb1e3321c3a1fe6acfe76c38052"}, - {file = "coverage-6.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:543e172ce4c0de533fa892034cce260467b213c0ea8e39da2f65f9a477425211"}, - {file = "coverage-6.4-cp37-cp37m-win32.whl", hash = "sha256:00c8544510f3c98476bbd58201ac2b150ffbcce46a8c3e4fb89ebf01998f806a"}, - {file = "coverage-6.4-cp37-cp37m-win_amd64.whl", hash = "sha256:b84ab65444dcc68d761e95d4d70f3cfd347ceca5a029f2ffec37d4f124f61311"}, - {file = "coverage-6.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d548edacbf16a8276af13063a2b0669d58bbcfca7c55a255f84aac2870786a61"}, - {file = "coverage-6.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:033ebec282793bd9eb988d0271c211e58442c31077976c19c442e24d827d356f"}, - {file = "coverage-6.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:742fb8b43835078dd7496c3c25a1ec8d15351df49fb0037bffb4754291ef30ce"}, - {file = "coverage-6.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d55fae115ef9f67934e9f1103c9ba826b4c690e4c5bcf94482b8b2398311bf9c"}, - {file = "coverage-6.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5cd698341626f3c77784858427bad0cdd54a713115b423d22ac83a28303d1d95"}, - {file = "coverage-6.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:62d382f7d77eeeaff14b30516b17bcbe80f645f5cf02bb755baac376591c653c"}, - {file = "coverage-6.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:016d7f5cf1c8c84f533a3c1f8f36126fbe00b2ec0ccca47cc5731c3723d327c6"}, - {file = "coverage-6.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:69432946f154c6add0e9ede03cc43b96e2ef2733110a77444823c053b1ff5166"}, - {file = "coverage-6.4-cp38-cp38-win32.whl", hash = "sha256:83bd142cdec5e4a5c4ca1d4ff6fa807d28460f9db919f9f6a31babaaa8b88426"}, - {file = "coverage-6.4-cp38-cp38-win_amd64.whl", hash = "sha256:4002f9e8c1f286e986fe96ec58742b93484195defc01d5cc7809b8f7acb5ece3"}, - {file = "coverage-6.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e4f52c272fdc82e7c65ff3f17a7179bc5f710ebc8ce8a5cadac81215e8326740"}, - {file = "coverage-6.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b5578efe4038be02d76c344007b13119b2b20acd009a88dde8adec2de4f630b5"}, - {file = 
"coverage-6.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8099ea680201c2221f8468c372198ceba9338a5fec0e940111962b03b3f716a"}, - {file = "coverage-6.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a00441f5ea4504f5abbc047589d09e0dc33eb447dc45a1a527c8b74bfdd32c65"}, - {file = "coverage-6.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e76bd16f0e31bc2b07e0fb1379551fcd40daf8cdf7e24f31a29e442878a827c"}, - {file = "coverage-6.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:8d2e80dd3438e93b19e1223a9850fa65425e77f2607a364b6fd134fcd52dc9df"}, - {file = "coverage-6.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:341e9c2008c481c5c72d0e0dbf64980a4b2238631a7f9780b0fe2e95755fb018"}, - {file = "coverage-6.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:21e6686a95025927775ac501e74f5940cdf6fe052292f3a3f7349b0abae6d00f"}, - {file = "coverage-6.4-cp39-cp39-win32.whl", hash = "sha256:968ed5407f9460bd5a591cefd1388cc00a8f5099de9e76234655ae48cfdbe2c3"}, - {file = "coverage-6.4-cp39-cp39-win_amd64.whl", hash = "sha256:e35217031e4b534b09f9b9a5841b9344a30a6357627761d4218818b865d45055"}, - {file = "coverage-6.4-pp36.pp37.pp38-none-any.whl", hash = "sha256:e637ae0b7b481905358624ef2e81d7fb0b1af55f5ff99f9ba05442a444b11e45"}, - {file = "coverage-6.4.tar.gz", hash = "sha256:727dafd7f67a6e1cad808dc884bd9c5a2f6ef1f8f6d2f22b37b96cb0080d4f49"}, + {file = "coverage-6.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f1d5aa2703e1dab4ae6cf416eb0095304f49d004c39e9db1d86f57924f43006b"}, + {file = "coverage-6.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4ce1b258493cbf8aec43e9b50d89982346b98e9ffdfaae8ae5793bc112fb0068"}, + {file = "coverage-6.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c4e737f60c6936460c5be330d296dd5b48b3963f48634c53b3f7deb0f34ec4"}, + {file = "coverage-6.4.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84e65ef149028516c6d64461b95a8dbcfce95cfd5b9eb634320596173332ea84"}, + {file = "coverage-6.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f69718750eaae75efe506406c490d6fc5a6161d047206cc63ce25527e8a3adad"}, + {file = "coverage-6.4.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e57816f8ffe46b1df8f12e1b348f06d164fd5219beba7d9433ba79608ef011cc"}, + {file = "coverage-6.4.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:01c5615d13f3dd3aa8543afc069e5319cfa0c7d712f6e04b920431e5c564a749"}, + {file = "coverage-6.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:75ab269400706fab15981fd4bd5080c56bd5cc07c3bccb86aab5e1d5a88dc8f4"}, + {file = "coverage-6.4.1-cp310-cp310-win32.whl", hash = "sha256:a7f3049243783df2e6cc6deafc49ea123522b59f464831476d3d1448e30d72df"}, + {file = "coverage-6.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:ee2ddcac99b2d2aec413e36d7a429ae9ebcadf912946b13ffa88e7d4c9b712d6"}, + {file = "coverage-6.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fb73e0011b8793c053bfa85e53129ba5f0250fdc0392c1591fd35d915ec75c46"}, + {file = "coverage-6.4.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106c16dfe494de3193ec55cac9640dd039b66e196e4641fa8ac396181578b982"}, + {file = "coverage-6.4.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:87f4f3df85aa39da00fd3ec4b5abeb7407e82b68c7c5ad181308b0e2526da5d4"}, + {file = "coverage-6.4.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:961e2fb0680b4f5ad63234e0bf55dfb90d302740ae9c7ed0120677a94a1590cb"}, + {file = "coverage-6.4.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:cec3a0f75c8f1031825e19cd86ee787e87cf03e4fd2865c79c057092e69e3a3b"}, + {file = "coverage-6.4.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:129cd05ba6f0d08a766d942a9ed4b29283aff7b2cccf5b7ce279d50796860bb3"}, + {file = "coverage-6.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:bf5601c33213d3cb19d17a796f8a14a9eaa5e87629a53979a5981e3e3ae166f6"}, + {file = "coverage-6.4.1-cp37-cp37m-win32.whl", hash = "sha256:269eaa2c20a13a5bf17558d4dc91a8d078c4fa1872f25303dddcbba3a813085e"}, + {file = "coverage-6.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:f02cbbf8119db68455b9d763f2f8737bb7db7e43720afa07d8eb1604e5c5ae28"}, + {file = "coverage-6.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ffa9297c3a453fba4717d06df579af42ab9a28022444cae7fa605af4df612d54"}, + {file = "coverage-6.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:145f296d00441ca703a659e8f3eb48ae39fb083baba2d7ce4482fb2723e050d9"}, + {file = "coverage-6.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d44996140af8b84284e5e7d398e589574b376fb4de8ccd28d82ad8e3bea13"}, + {file = "coverage-6.4.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2bd9a6fc18aab8d2e18f89b7ff91c0f34ff4d5e0ba0b33e989b3cd4194c81fd9"}, + {file = "coverage-6.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3384f2a3652cef289e38100f2d037956194a837221edd520a7ee5b42d00cc605"}, + {file = "coverage-6.4.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9b3e07152b4563722be523e8cd0b209e0d1a373022cfbde395ebb6575bf6790d"}, + {file = "coverage-6.4.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1480ff858b4113db2718848d7b2d1b75bc79895a9c22e76a221b9d8d62496428"}, + {file = "coverage-6.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:865d69ae811a392f4d06bde506d531f6a28a00af36f5c8649684a9e5e4a85c83"}, + {file = "coverage-6.4.1-cp38-cp38-win32.whl", hash = "sha256:664a47ce62fe4bef9e2d2c430306e1428ecea207ffd68649e3b942fa8ea83b0b"}, + {file = "coverage-6.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:26dff09fb0d82693ba9e6231248641d60ba606150d02ed45110f9ec26404ed1c"}, + {file = "coverage-6.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d9c80df769f5ec05ad21ea34be7458d1dc51ff1fb4b2219e77fe24edf462d6df"}, + {file = "coverage-6.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:39ee53946bf009788108b4dd2894bf1349b4e0ca18c2016ffa7d26ce46b8f10d"}, + {file = "coverage-6.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5b66caa62922531059bc5ac04f836860412f7f88d38a476eda0a6f11d4724f4"}, + {file = "coverage-6.4.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd180ed867e289964404051a958f7cccabdeed423f91a899829264bb7974d3d3"}, + {file = "coverage-6.4.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84631e81dd053e8a0d4967cedab6db94345f1c36107c71698f746cb2636c63e3"}, + {file = "coverage-6.4.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:8c08da0bd238f2970230c2a0d28ff0e99961598cb2e810245d7fc5afcf1254e8"}, + {file = "coverage-6.4.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d42c549a8f41dc103a8004b9f0c433e2086add8a719da00e246e17cbe4056f72"}, + {file = "coverage-6.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:309ce4a522ed5fca432af4ebe0f32b21d6d7ccbb0f5fcc99290e71feba67c264"}, + {file = "coverage-6.4.1-cp39-cp39-win32.whl", hash = "sha256:fdb6f7bd51c2d1714cea40718f6149ad9be6a2ee7d93b19e9f00934c0f2a74d9"}, + {file = "coverage-6.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:342d4aefd1c3e7f620a13f4fe563154d808b69cccef415415aece4c786665397"}, + {file = "coverage-6.4.1-pp36.pp37.pp38-none-any.whl", hash = "sha256:4803e7ccf93230accb928f3a68f00ffa80a88213af98ed338a57ad021ef06815"}, + {file = "coverage-6.4.1.tar.gz", hash = "sha256:4321f075095a096e70aff1d002030ee612b65a205a0a0f5b815280d5dc58100c"}, ] distlib = [ {file = "distlib-0.3.4-py2.py3-none-any.whl", hash = "sha256:6564fe0a8f51e734df6333d08b8b94d4ea8ee6b99b5ed50613f731fd4089f34b"}, @@ -405,6 +514,21 @@ docutils = [ {file = "docutils-0.18.1-py2.py3-none-any.whl", hash = "sha256:23010f129180089fbcd3bc08cfefccb3b890b0050e1ca00c867036e9d161b98c"}, {file = "docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06"}, ] +fastavro = [ + {file = "fastavro-1.5.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:920d170560198741fa196a62a97c220173339766e6c14369c5c68bfe8cdafa25"}, + {file = "fastavro-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b00b1711511981c4e2dd4a27ba5ae20897fe41ec7ab52eda868626d445081e5"}, + {file = "fastavro-1.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:04438b592980633ccf5d1de7798480a634ca581ae7575ab7671ba16773b6b428"}, + {file = "fastavro-1.5.1-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:0ab92ab744f9172da0791bfad0495d785c7c4f5a68924e3c6c6b39b78b044b11"}, + {file = "fastavro-1.5.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84ca1a60cecd710ead076585b56b954ab3e6e001d8e7384cb4ed20019b29e7a9"}, + {file = "fastavro-1.5.1-cp37-cp37m-win_amd64.whl", hash = "sha256:b5ff657c0d48553492d8356a30b6112fcc6db69adce6bba31135272bc9d87d82"}, + {file = "fastavro-1.5.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:0f1ed38042a2a90a7a5da170006459e73134f4c14f4fda9ebba99017adb1b14c"}, + {file = "fastavro-1.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df83ebdd7b67b52a37bc84c6e25f7056f756fb216c5c8e5c95ae1673fcbb6015"}, + {file = "fastavro-1.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:0053347a92add6f448837ff00099b0a7200ec5dd58e173743d856d65d0574ddb"}, + {file = "fastavro-1.5.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:6b4f8551ccbe0c9b19867b8c93029e8cfe8fa3757245caae6228f35ef0656371"}, + {file = "fastavro-1.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff241b5ce36df7af7461d001ca98fec6eacd56c4754c8ac7718e2d4b7b690a82"}, + {file = "fastavro-1.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:fb3491c88e7962a6b820548ddd12b9c0f6296ebd2385a3021296f14bfe35189a"}, + {file = "fastavro-1.5.1.tar.gz", hash = "sha256:0815da740ced2261f90b0ddbb5bbe645e9c893c8f00e5dc8d30b8ec20f3c7fa9"}, +] filelock = [ {file = "filelock-3.7.1-py3-none-any.whl", hash = "sha256:37def7b658813cda163b56fc564cdc75e86d338246458c4c28ae84cabefa2404"}, {file = "filelock-3.7.1.tar.gz", hash = "sha256:3a0fd85166ad9dbab54c9aec96737b744106dc5f15c0b09a6744a445299fcf04"}, @@ -533,6 +657,10 @@ pyarrow = [ {file = 
"pyarrow-8.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:cb06cacc19f3b426681f2f6803cc06ff481e7fe5b3a533b406bc5b2138843d4f"}, {file = "pyarrow-8.0.0.tar.gz", hash = "sha256:4a18a211ed888f1ac0b0ebcb99e2d9a3e913a481120ee9b1fe33d3fedb945d4e"}, ] +pycparser = [ + {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, + {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, +] pyparsing = [ {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, @@ -545,6 +673,56 @@ pytest-checkdocs = [ {file = "pytest-checkdocs-2.7.1.tar.gz", hash = "sha256:2b33b85eddfe5846a69bea4a759303e2d5a3be11d03bc7149f5ba1ef47e6c1ae"}, {file = "pytest_checkdocs-2.7.1-py3-none-any.whl", hash = "sha256:294898c64c9ce1a178edc6660e48da23c7543bfd5a1cea7f0ca4c167745d8461"}, ] +python-snappy = [ + {file = "python-snappy-0.6.1.tar.gz", hash = "sha256:b6a107ab06206acc5359d4c5632bd9b22d448702a79b3169b0c62e0fb808bb2a"}, + {file = "python_snappy-0.6.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b7f920eaf46ebf41bd26f9df51c160d40f9e00b7b48471c3438cb8d027f7fb9b"}, + {file = "python_snappy-0.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4ec533a8c1f8df797bded662ec3e494d225b37855bb63eb0d75464a07947477c"}, + {file = "python_snappy-0.6.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6f8bf4708a11b47517baf962f9a02196478bbb10fdb9582add4aa1459fa82380"}, + {file = "python_snappy-0.6.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8d0c019ee7dcf2c60e240877107cddbd95a5b1081787579bf179938392d66480"}, + {file = "python_snappy-0.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb18d9cd7b3f35a2f5af47bb8ed6a5bdbf4f3ddee37f3daade4ab7864c292f5b"}, + {file = "python_snappy-0.6.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b265cde49774752aec9ca7f5d272e3f98718164afc85521622a8a5394158a2b5"}, + {file = "python_snappy-0.6.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d017775851a778ec9cc32651c4464079d06d927303c2dde9ae9830ccf6fe94e1"}, + {file = "python_snappy-0.6.1-cp310-cp310-win32.whl", hash = "sha256:8277d1f6282463c40761f802b742f833f9f2449fcdbb20a96579aa05c8feb614"}, + {file = "python_snappy-0.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:2aaaf618c68d8c9daebc23a20436bd01b09ee70d7fbf7072b7f38b06d2fab539"}, + {file = "python_snappy-0.6.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:277757d5dad4e239dc1417438a0871b65b1b155beb108888e7438c27ffc6a8cc"}, + {file = "python_snappy-0.6.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e066a0586833d610c4bbddba0be5ba0e3e4f8e0bc5bb6d82103d8f8fc47bb59a"}, + {file = "python_snappy-0.6.1-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:0d489b50f49433494160c45048fe806de6b3aeab0586e497ebd22a0bab56e427"}, + {file = "python_snappy-0.6.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:463fd340a499d47b26ca42d2f36a639188738f6e2098c6dbf80aef0e60f461e1"}, + {file = "python_snappy-0.6.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9837ac1650cc68d22a3cf5f15fb62c6964747d16cecc8b22431f113d6e39555d"}, + {file = 
"python_snappy-0.6.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5e973e637112391f05581f427659c05b30b6843bc522a65be35ac7b18ce3dedd"}, + {file = "python_snappy-0.6.1-cp36-cp36m-win32.whl", hash = "sha256:c20498bd712b6e31a4402e1d027a1cd64f6a4a0066a3fe3c7344475886d07fdf"}, + {file = "python_snappy-0.6.1-cp36-cp36m-win_amd64.whl", hash = "sha256:59e975be4206cc54d0a112ef72fa3970a57c2b1bcc2c97ed41d6df0ebe518228"}, + {file = "python_snappy-0.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2a7e528ab6e09c0d67dcb61a1730a292683e5ff9bb088950638d3170cf2a0a54"}, + {file = "python_snappy-0.6.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:39692bedbe0b717001a99915ac0eb2d9d0bad546440d392a2042b96d813eede1"}, + {file = "python_snappy-0.6.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6a7620404da966f637b9ce8d4d3d543d363223f7a12452a575189c5355fc2d25"}, + {file = "python_snappy-0.6.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7778c224efc38a40d274da4eb82a04cac27aae20012372a7db3c4bbd8926c4d4"}, + {file = "python_snappy-0.6.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d029f7051ec1bbeaa3e03030b6d8ed47ceb69cae9016f493c802a08af54e026"}, + {file = "python_snappy-0.6.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a0ad38bc98d0b0497a0b0dbc29409bcabfcecff4511ed7063403c86de16927bc"}, + {file = "python_snappy-0.6.1-cp37-cp37m-win32.whl", hash = "sha256:5a453c45178d7864c1bdd6bfe0ee3ed2883f63b9ba2c9bb967c6b586bf763f96"}, + {file = "python_snappy-0.6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9f0c0d88b84259f93c3aa46398680646f2c23e43394779758d9f739c34e15295"}, + {file = "python_snappy-0.6.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5bb05c28298803a74add08ba496879242ef159c75bc86a5406fac0ffc7dd021b"}, + {file = "python_snappy-0.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9eac51307c6a1a38d5f86ebabc26a889fddf20cbba7a116ccb54ba1446601d5b"}, + {file = "python_snappy-0.6.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:88b6ea78b83d2796f330b0af1b70cdd3965dbdab02d8ac293260ec2c8fe340ee"}, + {file = "python_snappy-0.6.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8c07220408d3268e8268c9351c5c08041bc6f8c6172e59d398b71020df108541"}, + {file = "python_snappy-0.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4038019b1bcaadde726a57430718394076c5a21545ebc5badad2c045a09546cf"}, + {file = "python_snappy-0.6.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc96668d9c7cc656609764275c5f8da58ef56d89bdd6810f6923d36497468ff7"}, + {file = "python_snappy-0.6.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cf5bb9254e1c38aacf253d510d3d9be631bba21f3d068b17672b38b5cbf2fff5"}, + {file = "python_snappy-0.6.1-cp38-cp38-win32.whl", hash = "sha256:eaf905a580f2747c4a474040a5063cd5e0cc3d1d2d6edb65f28196186493ad4a"}, + {file = "python_snappy-0.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:546c1a7470ecbf6239101e9aff0f709b68ca0f0268b34d9023019a55baa1f7c6"}, + {file = "python_snappy-0.6.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e3a013895c64352b49d0d8e107a84f99631b16dbab156ded33ebf0becf56c8b2"}, + {file = "python_snappy-0.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3fb9a88a4dd6336488f3de67ce75816d0d796dce53c2c6e4d70e0b565633c7fd"}, + {file = 
"python_snappy-0.6.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:735cd4528c55dbe4516d6d2b403331a99fc304f8feded8ae887cf97b67d589bb"}, + {file = "python_snappy-0.6.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:90b0186516b7a101c14764b0c25931b741fb0102f21253eff67847b4742dfc72"}, + {file = "python_snappy-0.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a993dc8aadd901915a510fe6af5f20ae4256f527040066c22a154db8946751f"}, + {file = "python_snappy-0.6.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:530bfb9efebcc1aab8bb4ebcbd92b54477eed11f6cf499355e882970a6d3aa7d"}, + {file = "python_snappy-0.6.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5843feb914796b1f0405ccf31ea0fb51034ceb65a7588edfd5a8250cb369e3b2"}, + {file = "python_snappy-0.6.1-cp39-cp39-win32.whl", hash = "sha256:66c80e9b366012dbee262bb1869e4fc5ba8786cda85928481528bc4a72ec2ee8"}, + {file = "python_snappy-0.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:4d3cafdf454354a621c8ab7408e45aa4e9d5c0b943b61ff4815f71ca6bdf0130"}, + {file = "python_snappy-0.6.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:586724a0276d7a6083a17259d0b51622e492289a9998848a1b01b6441ca12b2f"}, + {file = "python_snappy-0.6.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2be4f4550acd484912441f5f1209ba611ac399aac9355fee73611b9a0d4f949c"}, + {file = "python_snappy-0.6.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bdb6942180660bda7f7d01f4c0def3cfc72b1c6d99aad964801775a3e379aba"}, + {file = "python_snappy-0.6.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:03bb511380fca2a13325b6f16fe8234c8e12da9660f0258cd45d9a02ffc916af"}, +] pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, @@ -600,3 +778,49 @@ zipp = [ {file = "zipp-3.8.0-py3-none-any.whl", hash = "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"}, {file = "zipp-3.8.0.tar.gz", hash = "sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad"}, ] +zstandard = [ + {file = "zstandard-0.17.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1991cdf2e81e643b53fb8d272931d2bdf5f4e70d56a457e1ef95bde147ae627"}, + {file = "zstandard-0.17.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4768449d8d1b0785309ace288e017cc5fa42e11a52bf08c90d9c3eb3a7a73cc6"}, + {file = "zstandard-0.17.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b1ad6d2952b41d9a0ea702a474cc08c05210c6289e29dd496935c9ca3c7fb45c"}, + {file = "zstandard-0.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90a9ba3a9c16b86afcb785b3c9418af39ccfb238fd5f6e429166e3ca8542b01f"}, + {file = "zstandard-0.17.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:9cf18c156b3a108197a8bf90b37d03c31c8ef35a7c18807b321d96b74e12c301"}, + {file = "zstandard-0.17.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c81fd9386449df0ebf1ab3e01187bb30d61122c74df53ba4880a2454d866e55d"}, + {file = 
"zstandard-0.17.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:787efc741e61e00ffe5e65dac99b0dc5c88b9421012a207a91b869a8b1164921"}, + {file = "zstandard-0.17.0-cp310-cp310-win32.whl", hash = "sha256:49cd09ccbd1e3c0e2690dd62ebf95064d84aa42b9db381867e0b138631f969f2"}, + {file = "zstandard-0.17.0-cp310-cp310-win_amd64.whl", hash = "sha256:d78aac2ffc4e88ab1cbcad844669924c24e24c7c255de9628a18f14d832007c5"}, + {file = "zstandard-0.17.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:c19d1e06569c277dcc872d80cbadf14a29e8199e013ff2a176d169f461439a40"}, + {file = "zstandard-0.17.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d916018289d2f9a882e90d2e3bd41652861ce11b5ecd8515fa07ad31d97d56e5"}, + {file = "zstandard-0.17.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0c87f097d6867833a839b086eb8d03676bb87c2efa067a131099f04aa790683"}, + {file = "zstandard-0.17.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:60943f71e3117583655a1eb76188a7cc78a25267ef09cc74be4d25a0b0c8b947"}, + {file = "zstandard-0.17.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:208fa6bead577b2607205640078ee452e81fe20fe96321623c632bad9ebd7148"}, + {file = "zstandard-0.17.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:42f3c02c7021073cafbc6cd152b288c56a25e585518861589bb08b063b6d2ad2"}, + {file = "zstandard-0.17.0-cp36-cp36m-win32.whl", hash = "sha256:2a2ac752162ba5cbc869c60c4a4e54e890b2ee2ffb57d3ff159feab1ae4518db"}, + {file = "zstandard-0.17.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d1405caa964ba11b2396bd9fd19940440217345752e192c936d084ba5fe67dcb"}, + {file = "zstandard-0.17.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ef62eb3bcfd6d786f439828bb544ebd3936432db669403e0b8f48e424f1d55f1"}, + {file = "zstandard-0.17.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:477f172807a9fa83467b30d7c58876af1410d20177c554c27525211edf535bae"}, + {file = "zstandard-0.17.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de1aa618306a741e0497878b7f845fd6c397e52dd096fb76ed791e7268887176"}, + {file = "zstandard-0.17.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a827b9c464ee966524f8e82ec1aabb4a77ff9514cae041667fa81ae2ec8bd3e9"}, + {file = "zstandard-0.17.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cf96ace804945e53bc3e5294097e5fa32a2d43bc52416c632b414b870ee0a21"}, + {file = "zstandard-0.17.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:802109f67328c5b822d4fdac28e1cf65a24de2e2e99d76cdbeee9121cedb1b6c"}, + {file = "zstandard-0.17.0-cp37-cp37m-win32.whl", hash = "sha256:a628f20d019feb0f3a171c7a55cc4f75681f3b8c1bd7a5009165a487314887cd"}, + {file = "zstandard-0.17.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7d2e7abac41d2b4b18f03575aca860d2cb647c343e13c23d6c769106a3db2f6f"}, + {file = "zstandard-0.17.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f502fe79757434292174b04db114f9e25c767b2d5ca9e759d118b22a66f445f8"}, + {file = "zstandard-0.17.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e37c4e21f696d6bcdbbc7caf98dffa505d04c0053909b9db0a6e8ca3b935eb07"}, + {file = 
"zstandard-0.17.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8fd386d0ec1f9343f1776391d9e60d4eedced0a0b0e625bb89b91f6d05f70e83"}, + {file = "zstandard-0.17.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91a228a077fc7cd8486c273788d4a006a37d060cb4293f471eb0325c3113af68"}, + {file = "zstandard-0.17.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:59eadb9f347d40e8f7ef77caffd0c04a31e82c1df82fe2d2a688032429d750ac"}, + {file = "zstandard-0.17.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a71809ec062c5b7acf286ba6d4484e6fe8130fc2b93c25e596bb34e7810c79b2"}, + {file = "zstandard-0.17.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8aedd38d357f6d5e2facd88ce62b4976afdc29db57216a23f14a0cd0ca05a8a3"}, + {file = "zstandard-0.17.0-cp38-cp38-win32.whl", hash = "sha256:bd842ae3dbb7cba88beb022161c819fa80ca7d0c5a4ddd209e7daae85d904e49"}, + {file = "zstandard-0.17.0-cp38-cp38-win_amd64.whl", hash = "sha256:d0e9fec68e304fb35c559c44530213adbc7d5918bdab906a45a0f40cd56c4de2"}, + {file = "zstandard-0.17.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9ec62a4c2dbb0a86ee5138c16ef133e59a23ac108f8d7ac97aeb61d410ce6857"}, + {file = "zstandard-0.17.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d5373a56b90052f171c8634fedc53a6ac371e6c742606e9825772a394bdbd4b0"}, + {file = "zstandard-0.17.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2e3ea5e4d5ecf3faefd4a5294acb6af1f0578b0cdd75d6b4529c45deaa54d6f"}, + {file = "zstandard-0.17.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a3a1aa9528087f6f4c47f4ece2d5e6a160527821263fb8174ff36429233e093"}, + {file = "zstandard-0.17.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:bdf691a205bc492956e6daef7a06fb38f8cbe8b2c1cb0386f35f4412c360c9e9"}, + {file = "zstandard-0.17.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:db993a56e21d903893933887984ca9b0d274f2b1db7b3cf21ba129783953864f"}, + {file = "zstandard-0.17.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a7756a9446f83c81101f6c0a48c3bfd8d387a249933c57b0d095ca8b20541337"}, + {file = "zstandard-0.17.0-cp39-cp39-win32.whl", hash = "sha256:37e50501baaa935f13a1820ab2114f74313b5cb4cfff8146acb8c5b18cdced2a"}, + {file = "zstandard-0.17.0-cp39-cp39-win_amd64.whl", hash = "sha256:b4e671c4c0804cdf752be26f260058bb858fbdaaef1340af170635913ecca01e"}, + {file = "zstandard-0.17.0.tar.gz", hash = "sha256:fa9194cb91441df7242aa3ddc4cb184be38876cb10dd973674887f334bafbfb6"}, +] diff --git a/python/pyproject.toml b/python/pyproject.toml index b845f8b01704..afeee351ce52 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -43,10 +43,15 @@ mmh3 = "^3.0.0" pyarrow = { version = "^8.0.0", optional = true } +zstandard = { version = "^0.17.0", optional = true } + +python-snappy = { version = "^0.6.1", optional = true } + [tool.poetry.dev-dependencies] pytest = "^7.0.0" pytest-checkdocs = "^2.0.0" pre-commit = "^2.0.0" +fastavro = "^1.5.1" coverage = { version = "^6.0.0", extras = ["toml"] } [build-system] @@ -55,6 +60,8 @@ build-backend = "poetry.core.masonry.api" [tool.poetry.extras] pyarrow = ["pyarrow"] +snappy = ["python-snappy"] +python-snappy = 
["zstandard"] [tool.black] line-length = 130 @@ -79,5 +86,13 @@ warn_unreachable = true module = "mypy-pyarrow.*" ignore_missing_imports = true +[[tool.mypy.overrides]] +module = "mypy-snappy.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "mypy-zstandard.*" +ignore_missing_imports = true + [tool.coverage.run] source = ['src/'] diff --git a/python/spellcheck-dictionary.txt b/python/spellcheck-dictionary.txt index 41da147758d2..8bb0b868c388 100644 --- a/python/spellcheck-dictionary.txt +++ b/python/spellcheck-dictionary.txt @@ -57,3 +57,9 @@ UnboundPredicate BoundPredicate BooleanExpression BooleanExpressionVisitor +zigzag +unix +zlib +Codecs +codecs +uri diff --git a/python/src/iceberg/avro/__init__.py b/python/src/iceberg/avro/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/python/src/iceberg/avro/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/src/iceberg/avro/codecs/__init__.py b/python/src/iceberg/avro/codecs/__init__.py new file mode 100644 index 000000000000..28dd23f83f9e --- /dev/null +++ b/python/src/iceberg/avro/codecs/__init__.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Contains Codecs for Python Avro. + +Note that the word "codecs" means "compression/decompression algorithms" in the +Avro world (https://avro.apache.org/docs/current/spec.html#Object+Container+Files), +so don't confuse it with the Python's "codecs", which is a package mainly for +converting character sets (https://docs.python.org/3/library/codecs.html). 
+""" +from __future__ import annotations + +from iceberg.avro.codecs.bzip2 import BZip2Codec +from iceberg.avro.codecs.codec import Codec +from iceberg.avro.codecs.deflate import DeflateCodec +from iceberg.avro.codecs.snappy_codec import SnappyCodec +from iceberg.avro.codecs.zstandard_codec import ZStandardCodec + +KNOWN_CODECS: dict[str, type[Codec] | None] = { + "null": None, + "bzip2": BZip2Codec, + "snappy": SnappyCodec, + "zstandard": ZStandardCodec, + "deflate": DeflateCodec, +} diff --git a/python/src/iceberg/avro/codecs/bzip2.py b/python/src/iceberg/avro/codecs/bzip2.py new file mode 100644 index 000000000000..b92c248de272 --- /dev/null +++ b/python/src/iceberg/avro/codecs/bzip2.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +from iceberg.avro.codecs.codec import Codec + +try: + import bz2 + + class BZip2Codec(Codec): + @staticmethod + def compress(data: bytes) -> tuple[bytes, int]: + compressed_data = bz2.compress(data) + return compressed_data, len(compressed_data) + + @staticmethod + def decompress(data: bytes) -> bytes: + return bz2.decompress(data) + +except ImportError: + + class BZip2Codec(Codec): # type: ignore + @staticmethod + def compress(data: bytes) -> tuple[bytes, int]: + raise ImportError("Python bzip2 support not installed, please install the extension") + + @staticmethod + def decompress(data: bytes) -> bytes: + raise ImportError("Python bzip2 support not installed, please install the extension") diff --git a/python/src/iceberg/avro/codecs/codec.py b/python/src/iceberg/avro/codecs/codec.py new file mode 100644 index 000000000000..1c04f0db3e20 --- /dev/null +++ b/python/src/iceberg/avro/codecs/codec.py @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +from abc import ABC, abstractmethod + + +class Codec(ABC): + """Abstract base class for all Avro codec classes.""" + + @staticmethod + @abstractmethod + def compress(data: bytes) -> tuple[bytes, int]: + ... 
+ + @staticmethod + @abstractmethod + def decompress(data: bytes) -> bytes: + ... diff --git a/python/src/iceberg/avro/codecs/deflate.py b/python/src/iceberg/avro/codecs/deflate.py new file mode 100644 index 000000000000..c1f8bf30b720 --- /dev/null +++ b/python/src/iceberg/avro/codecs/deflate.py @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import annotations + +import zlib + +from iceberg.avro.codecs.codec import Codec + + +class DeflateCodec(Codec): + @staticmethod + def compress(data: bytes) -> tuple[bytes, int]: + # The first two characters and last character are zlib + # wrappers around deflate data. + compressed_data = zlib.compress(data)[2:-1] + return compressed_data, len(compressed_data) + + @staticmethod + def decompress(data: bytes) -> bytes: + # -15 is the log of the window size; negative indicates + # "raw" (no zlib headers) decompression. See zlib.h. + return zlib.decompress(data, -15) diff --git a/python/src/iceberg/avro/codecs/snappy_codec.py b/python/src/iceberg/avro/codecs/snappy_codec.py new file mode 100644 index 000000000000..92b599cdface --- /dev/null +++ b/python/src/iceberg/avro/codecs/snappy_codec.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
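+# Per the Avro spec, the "snappy" codec appends a 4-byte, big-endian CRC32 +# checksum of the *uncompressed* data to each compressed block. decompress() +# below therefore splits off the trailing checksum, decompresses the rest, and +# verifies the checksum against the decompressed bytes.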
+from __future__ import annotations + +import binascii +import struct + +from iceberg.avro.codecs.codec import Codec + +STRUCT_CRC32 = struct.Struct(">I") # big-endian unsigned int + +try: + import snappy + + class SnappyCodec(Codec): + @staticmethod + def _check_crc32(bytes_: bytes, checksum: bytes) -> None: + """Compute the CRC-32 of bytes and compare it to a checksum + + Args: + bytes_ (bytes): The bytes to check against `checksum` + checksum (bytes): Byte representation of a checksum + + Raises: + ValueError: If the computed CRC-32 does not match the checksum + """ + if binascii.crc32(bytes_) & 0xFFFFFFFF != STRUCT_CRC32.unpack(checksum)[0]: + raise ValueError("Checksum failure") + + @staticmethod + def compress(data: bytes) -> tuple[bytes, int]: + compressed_data = snappy.compress(data) + # A 4-byte, big-endian CRC32 checksum + compressed_data += STRUCT_CRC32.pack(binascii.crc32(data) & 0xFFFFFFFF) + return compressed_data, len(compressed_data) + + @staticmethod + def decompress(data: bytes) -> bytes: + # Compressed data ends with a 4-byte CRC32 checksum, so split that off + # before decompressing the remainder + checksum = data[-4:] + data = data[0:-4] + uncompressed = snappy.decompress(data) + SnappyCodec._check_crc32(uncompressed, checksum) + return uncompressed + +except ImportError: + + class SnappyCodec(Codec): # type: ignore + @staticmethod + def compress(data: bytes) -> tuple[bytes, int]: + raise ImportError("Snappy support not installed, please install using `pip install pyiceberg[snappy]`") + + @staticmethod + def decompress(data: bytes) -> bytes: + raise ImportError("Snappy support not installed, please install using `pip install pyiceberg[snappy]`") diff --git a/python/src/iceberg/avro/codecs/zstandard_codec.py b/python/src/iceberg/avro/codecs/zstandard_codec.py new file mode 100644 index 000000000000..8144628b06c6 --- /dev/null +++ b/python/src/iceberg/avro/codecs/zstandard_codec.py @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
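+# Note: a zstd frame header may omit the decompressed size, in which case a +# one-shot ZstdDecompressor.decompress() cannot size an output buffer up +# front, so decompress() below streams the frame through stream_reader() in +# fixed-size chunks instead.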
diff --git a/python/src/iceberg/avro/codecs/zstandard_codec.py b/python/src/iceberg/avro/codecs/zstandard_codec.py
new file mode 100644
index 000000000000..8144628b06c6
--- /dev/null
+++ b/python/src/iceberg/avro/codecs/zstandard_codec.py
@@ -0,0 +1,53 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from __future__ import annotations
+
+from io import BytesIO
+
+from iceberg.avro.codecs.codec import Codec
+
+try:
+    from zstandard import ZstdCompressor, ZstdDecompressor
+
+    class ZStandardCodec(Codec):
+        @staticmethod
+        def compress(data: bytes) -> tuple[bytes, int]:
+            compressed_data = ZstdCompressor().compress(data)
+            return compressed_data, len(compressed_data)
+
+        @staticmethod
+        def decompress(data: bytes) -> bytes:
+            uncompressed = bytearray()
+            dctx = ZstdDecompressor()
+            with dctx.stream_reader(BytesIO(data)) as reader:
+                while True:
+                    chunk = reader.read(16384)
+                    if not chunk:
+                        break
+                    uncompressed.extend(chunk)
+            return bytes(uncompressed)
+
+except ImportError:
+
+    class ZStandardCodec(Codec):  # type: ignore
+        @staticmethod
+        def compress(data: bytes) -> tuple[bytes, int]:
+            raise ImportError("Zstandard support not installed, please install using `pip install pyiceberg[zstandard]`")
+
+        @staticmethod
+        def decompress(data: bytes) -> bytes:
+            raise ImportError("Zstandard support not installed, please install using `pip install pyiceberg[zstandard]`")
diff --git a/python/src/iceberg/avro/decoder.py b/python/src/iceberg/avro/decoder.py
new file mode 100644
index 000000000000..24312cdd250f
--- /dev/null
+++ b/python/src/iceberg/avro/decoder.py
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import decimal
+import struct
+from datetime import date, datetime, time
+
+from iceberg.io.base import InputStream
+from iceberg.utils.datetime import (
+    days_to_date,
+    micros_to_time,
+    micros_to_timestamp,
+    micros_to_timestamptz,
+)
+from iceberg.utils.decimal import unscaled_to_decimal
+
+STRUCT_FLOAT = struct.Struct("<f")  # little-endian float
+STRUCT_DOUBLE = struct.Struct("<d")  # little-endian double
+STRUCT_SIGNED_SHORT = struct.Struct(">h")  # big-endian signed short
+STRUCT_SIGNED_INT = struct.Struct(">i")  # big-endian signed int
+STRUCT_SIGNED_LONG = struct.Struct(">q")  # big-endian signed long
+
+
+class BinaryDecoder:
+    """Read leaf values."""
+
+    _input_stream: InputStream
+
+    def __init__(self, input_stream: InputStream) -> None:
+        """
+        input_stream is a Python object on which we can call read, seek, and tell.
+        """
+        self._input_stream = input_stream
+
+    def read(self, n: int) -> bytes:
+        """
+        Read n bytes.
+        """
+        if n < 0:
+            raise ValueError(f"Requested {n} bytes to read, expected positive integer.")
+        read_bytes = self._input_stream.read(n)
+        if len(read_bytes) != n:
+            raise ValueError(f"Read {len(read_bytes)} bytes, expected {n} bytes")
+        return read_bytes
+
+    def read_boolean(self) -> bool:
+        """
+        A boolean is written as a single byte
+        whose value is either 0 (false) or 1 (true).
+ """ + return ord(self.read(1)) == 1 + + def read_int(self) -> int: + """int values are written using variable-length, zigzag coding.""" + return self.read_long() + + def read_long(self) -> int: + """long values are written using variable-length, zigzag coding.""" + b = ord(self.read(1)) + n = b & 0x7F + shift = 7 + while (b & 0x80) != 0: + b = ord(self.read(1)) + n |= (b & 0x7F) << shift + shift += 7 + datum = (n >> 1) ^ -(n & 1) + return datum + + def read_float(self) -> float: + """ + A float is written as 4 bytes. + The float is converted into a 32-bit integer using a method equivalent to + Java's floatToIntBits and then encoded in little-endian format. + """ + return float(STRUCT_FLOAT.unpack(self.read(4))[0]) + + def read_double(self) -> float: + """ + A double is written as 8 bytes. + The double is converted into a 64-bit integer using a method equivalent to + Java's doubleToLongBits and then encoded in little-endian format. + """ + return float(STRUCT_DOUBLE.unpack(self.read(8))[0]) + + def read_decimal_from_bytes(self, precision: int, scale: int) -> decimal.Decimal: + """ + Decimal bytes are decoded as signed short, int or long depending on the + size of bytes. + """ + size = self.read_long() + return self.read_decimal_from_fixed(precision, scale, size) + + def read_decimal_from_fixed(self, precision: int, scale: int, size: int) -> decimal.Decimal: + """ + Decimal is encoded as fixed. Fixed instances are encoded using the + number of bytes declared in the schema. + """ + data = self.read(size) + unscaled_datum = int.from_bytes(data, byteorder="big", signed=True) + return unscaled_to_decimal(unscaled_datum, scale) + + def read_bytes(self) -> bytes: + """ + Bytes are encoded as a long followed by that many bytes of data. + """ + return self.read(self.read_long()) + + def read_utf8(self) -> str: + """ + A string is encoded as a long followed by + that many bytes of UTF-8 encoded character data. + """ + return self.read_bytes().decode("utf-8") + + def read_date_from_int(self) -> date: + """ + int is decoded as python date object. + int stores the number of days from + the unix epoch, 1 January 1970 (ISO calendar). + """ + return days_to_date(self.read_int()) + + def read_time_millis(self) -> time: + """ + int is decoded as python time object which represents + the number of milliseconds after midnight, 00:00:00.000. + """ + millis = self.read_int() + return micros_to_time(millis * 1000) + + def read_time_micros(self) -> time: + """ + long is decoded as python time object which represents + the number of microseconds after midnight, 00:00:00.000000. + """ + return micros_to_time(self.read_long()) + + def read_timestamp_micros(self) -> datetime: + """ + long is decoded as python datetime object which represents + the number of microseconds from the unix epoch, 1 January 1970. + """ + return micros_to_timestamp(self.read_long()) + + def read_timestamptz_micros(self): + """ + long is decoded as python datetime object which represents + the number of microseconds from the unix epoch, 1 January 1970. + + Adjusted to UTC + """ + return micros_to_timestamptz(self.read_long()) diff --git a/python/src/iceberg/avro/file.py b/python/src/iceberg/avro/file.py new file mode 100644 index 000000000000..0eec227e9543 --- /dev/null +++ b/python/src/iceberg/avro/file.py @@ -0,0 +1,181 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=W0621 +""" +Avro reader for reading Avro files +""" +from __future__ import annotations + +import json +from dataclasses import dataclass +from io import SEEK_SET + +from iceberg.avro.codecs import KNOWN_CODECS, Codec +from iceberg.avro.decoder import BinaryDecoder +from iceberg.avro.reader import AvroStruct, ConstructReader, StructReader +from iceberg.io.base import InputFile, InputStream +from iceberg.io.memory import MemoryInputStream +from iceberg.schema import Schema, visit +from iceberg.types import ( + FixedType, + MapType, + NestedField, + StringType, + StructType, +) +from iceberg.utils.schema_conversion import AvroSchemaConversion + +VERSION = 1 +MAGIC = bytes(b"Obj" + bytearray([VERSION])) +MAGIC_SIZE = len(MAGIC) +SYNC_SIZE = 16 +META_SCHEMA = StructType( + NestedField(name="magic", field_id=100, field_type=FixedType(length=MAGIC_SIZE), required=True), + NestedField( + field_id=200, + name="meta", + field_type=MapType(key_id=201, key_type=StringType(), value_id=202, value_type=StringType(), value_required=True), + required=True, + ), + NestedField(field_id=300, name="sync", field_type=FixedType(length=SYNC_SIZE), required=True), +) + +_CODEC_KEY = "avro.codec" +_SCHEMA_KEY = "avro.schema" + + +@dataclass(frozen=True) +class AvroFileHeader: + magic: bytes + meta: dict[str, str] + sync: bytes + + def compression_codec(self) -> type[Codec] | None: + """Get the file's compression codec algorithm from the file's metadata. 
+
+        In the case of a null codec, we return None, indicating that
+        the blocks don't need to be decompressed
+        """
+        codec_name = self.meta.get(_CODEC_KEY, "null")
+        if codec_name not in KNOWN_CODECS:
+            raise ValueError(f"Unsupported codec: {codec_name}")
+
+        return KNOWN_CODECS[codec_name]
+
+    def get_schema(self) -> Schema:
+        if _SCHEMA_KEY in self.meta:
+            avro_schema_string = self.meta[_SCHEMA_KEY]
+            avro_schema = json.loads(avro_schema_string)
+            return AvroSchemaConversion().avro_to_iceberg(avro_schema)
+        else:
+            raise ValueError("No schema found in Avro file headers")
+
+
+@dataclass
+class Block:
+    reader: StructReader
+    block_records: int
+    block_decoder: BinaryDecoder
+    position: int = 0
+
+    def __iter__(self):
+        return self
+
+    def has_next(self) -> bool:
+        return self.position < self.block_records
+
+    def __next__(self) -> AvroStruct:
+        if self.has_next():
+            self.position += 1
+            return self.reader.read(self.block_decoder)
+        raise StopIteration
+
+
+class AvroFile:
+    input_file: InputFile
+    input_stream: InputStream
+    header: AvroFileHeader
+    schema: Schema
+    file_length: int
+    reader: StructReader
+
+    decoder: BinaryDecoder
+    block: Block | None = None
+
+    def __init__(self, input_file: InputFile) -> None:
+        self.input_file = input_file
+
+    def __enter__(self):
+        """
+        Opens the file, reads the header, and generates
+        a reader tree to start reading the payload
+
+        Returns:
+            An iterator of the AvroStructs in the file
+        """
+        self.input_stream = self.input_file.open()
+        self.decoder = BinaryDecoder(self.input_stream)
+        self.header = self._read_header()
+        self.schema = self.header.get_schema()
+        self.file_length = len(self.input_file)
+        self.reader = visit(self.schema, ConstructReader())
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.input_stream.close()
+
+    def __iter__(self) -> AvroFile:
+        return self
+
+    def _read_block(self) -> int:
+        # If there is already a block, we'll have the sync bytes
+        if self.block:
+            sync_marker = self.decoder.read(SYNC_SIZE)
+            if sync_marker != self.header.sync:
+                raise ValueError(f"Expected sync bytes {self.header.sync!r}, but got {sync_marker!r}")
+        if self.is_EOF():
+            raise StopIteration
+        block_records = self.decoder.read_long()
+
+        block_bytes_len = self.decoder.read_long()
+        block_bytes = self.decoder.read(block_bytes_len)
+        if codec := self.header.compression_codec():
+            block_bytes = codec.decompress(block_bytes)
+
+        self.block = Block(
+            reader=self.reader, block_records=block_records, block_decoder=BinaryDecoder(MemoryInputStream(block_bytes))
+        )
+        return block_records
+
+    def __next__(self) -> AvroStruct:
+        if self.block and self.block.has_next():
+            return next(self.block)
+
+        new_block = self._read_block()
+
+        if new_block > 0:
+            return self.__next__()
+        raise StopIteration
+
+    def _read_header(self) -> AvroFileHeader:
+        self.input_stream.seek(0, SEEK_SET)
+        reader = visit(META_SCHEMA, ConstructReader())
+        _header = reader.read(self.decoder)
+        return AvroFileHeader(magic=_header.get(0), meta=_header.get(1), sync=_header.get(2))
+
+    def is_EOF(self) -> bool:
+        return self.input_stream.tell() == self.file_length
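Taken together, AvroFile is both a context manager and an iterator, so consuming a manifest file reads as below. LocalInputFile is the test helper from tests/io/test_io_base.py and the path is a placeholder; any InputFile implementation would do:

```python
from iceberg.avro.file import AvroFile
from tests.io.test_io_base import LocalInputFile

with AvroFile(LocalInputFile("/tmp/manifest.avro")) as avro_file:
    for record in avro_file:   # one AvroStruct per record
        print(record.get(0))   # positional access, as in the tests below
```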
diff --git a/python/src/iceberg/avro/reader.py b/python/src/iceberg/avro/reader.py
new file mode 100644
index 000000000000..bd8f1b4e8087
--- /dev/null
+++ b/python/src/iceberg/avro/reader.py
@@ -0,0 +1,316 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Classes for building the Reader tree
+
+Constructing a reader tree from the schema makes it easy
+to decouple the reader implementation from the schema.
+
+The reader tree can be changed in such a way that the
+read schema is different, while still respecting the
+write schema of the file
+"""
+from __future__ import annotations
+
+from abc import abstractmethod
+from dataclasses import dataclass, field
+from datetime import date, datetime, time
+from decimal import Decimal
+from functools import singledispatch
+from typing import Any
+from uuid import UUID
+
+from iceberg.avro.decoder import BinaryDecoder
+from iceberg.files import StructProtocol
+from iceberg.schema import Schema, SchemaVisitor
+from iceberg.types import (
+    BinaryType,
+    BooleanType,
+    DateType,
+    DecimalType,
+    DoubleType,
+    FixedType,
+    FloatType,
+    IntegerType,
+    ListType,
+    LongType,
+    MapType,
+    NestedField,
+    PrimitiveType,
+    StringType,
+    StructType,
+    TimestampType,
+    TimestamptzType,
+    TimeType,
+)
+from iceberg.utils.singleton import Singleton
+
+
+@dataclass(frozen=True)
+class AvroStruct(StructProtocol):
+    _data: list[Any | StructProtocol] = field()
+
+    def set(self, pos: int, value: Any) -> None:
+        self._data[pos] = value
+
+    def get(self, pos: int) -> Any:
+        return self._data[pos]
+
+
+class Reader(Singleton):
+    @abstractmethod
+    def read(self, decoder: BinaryDecoder) -> Any:
+        ...
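Every reader below ultimately bottoms out in read_int/read_long, i.e. Avro's variable-length zigzag integers. A self-contained sketch of both directions of that encoding (the decode loop mirrors read_long above; the byte strings match the decoder tests later in this diff):

```python
def zigzag_encode(value: int) -> bytes:
    # Zigzag interleaves the sign: 0,-1,1,-2,2,... -> 0,1,2,3,4,...
    # then emits 7 bits per byte, high bit set on continuation bytes.
    n = (value << 1) ^ (value >> 63)
    out = bytearray()
    while n & ~0x7F:
        out.append((n & 0x7F) | 0x80)
        n >>= 7
    out.append(n)
    return bytes(out)

def zigzag_decode(buf: bytes) -> int:
    n = shift = 0
    for b in buf:
        n |= (b & 0x7F) << shift
        shift += 7
        if not b & 0x80:
            break
    return (n >> 1) ^ -(n & 1)

assert zigzag_encode(12) == b"\x18"             # cf. test_read_long below
assert zigzag_decode(b"\xbc\x7d") == 8030       # the date/time test payload
assert zigzag_decode(zigzag_encode(-3)) == -3   # negatives round-trip too
```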
+ + +class NoneReader(Reader): + def read(self, _: BinaryDecoder) -> None: + return None + + +class BooleanReader(Reader): + def read(self, decoder: BinaryDecoder) -> bool: + return decoder.read_boolean() + + +class IntegerReader(Reader): + def read(self, decoder: BinaryDecoder) -> int: + return decoder.read_int() + + +class LongReader(Reader): + def read(self, decoder: BinaryDecoder) -> int: + return decoder.read_long() + + +class FloatReader(Reader): + def read(self, decoder: BinaryDecoder) -> float: + return decoder.read_float() + + +class DoubleReader(Reader): + def read(self, decoder: BinaryDecoder) -> float: + return decoder.read_double() + + +class DateReader(Reader): + def read(self, decoder: BinaryDecoder) -> date: + return decoder.read_date_from_int() + + +class TimeReader(Reader): + def read(self, decoder: BinaryDecoder) -> time: + return decoder.read_time_micros() + + +class TimestampReader(Reader): + def read(self, decoder: BinaryDecoder) -> datetime: + return decoder.read_timestamp_micros() + + +class TimestamptzReader(Reader): + def read(self, decoder: BinaryDecoder) -> datetime: + return decoder.read_timestamptz_micros() + + +class StringReader(Reader): + def read(self, decoder: BinaryDecoder) -> str: + return decoder.read_utf8() + + +class UUIDReader(Reader): + def read(self, decoder: BinaryDecoder) -> UUID: + return UUID(decoder.read_utf8()) + + +@dataclass(frozen=True) +class FixedReader(Reader): + length: int = field() + + def read(self, decoder: BinaryDecoder) -> bytes: + return decoder.read(self.length) + + +class BinaryReader(Reader): + def read(self, decoder: BinaryDecoder) -> bytes: + return decoder.read_bytes() + + +@dataclass(frozen=True) +class DecimalReader(Reader): + precision: int = field() + scale: int = field() + + def read(self, decoder: BinaryDecoder) -> Decimal: + return decoder.read_decimal_from_bytes(self.precision, self.scale) + + +@dataclass(frozen=True) +class OptionReader(Reader): + option: Reader = field() + + def read(self, decoder: BinaryDecoder) -> Any | None: + # For the Iceberg spec it is required to set the default value to null + # From https://iceberg.apache.org/spec/#avro + # Optional fields must always set the Avro field default value to null. + # + # This means that null has to come first: + # https://avro.apache.org/docs/current/spec.html + # type of the default value must match the first element of the union. + # This is enforced in the schema conversion, which happens prior + # to building the reader tree + if decoder.read_int() > 0: + return self.option.read(decoder) + return None + + +@dataclass(frozen=True) +class StructReader(Reader): + fields: tuple[Reader, ...] 
= field() + + def read(self, decoder: BinaryDecoder) -> AvroStruct: + return AvroStruct([field.read(decoder) for field in self.fields]) + + +@dataclass(frozen=True) +class ListReader(Reader): + element: Reader + + def read(self, decoder: BinaryDecoder) -> list: + read_items = [] + block_count = decoder.read_long() + while block_count != 0: + if block_count < 0: + block_count = -block_count + # We ignore the block size for now + _ = decoder.read_long() + for _ in range(block_count): + read_items.append(self.element.read(decoder)) + block_count = decoder.read_long() + return read_items + + +@dataclass(frozen=True) +class MapReader(Reader): + key: Reader + value: Reader + + def read(self, decoder: BinaryDecoder) -> dict: + read_items = {} + block_count = decoder.read_long() + while block_count != 0: + if block_count < 0: + block_count = -block_count + # We ignore the block size for now + _ = decoder.read_long() + for _ in range(block_count): + key = self.key.read(decoder) + read_items[key] = self.value.read(decoder) + block_count = decoder.read_long() + + return read_items + + +class ConstructReader(SchemaVisitor[Reader]): + def schema(self, schema: Schema, struct_result: Reader) -> Reader: + return struct_result + + def struct(self, struct: StructType, field_results: list[Reader]) -> Reader: + return StructReader(tuple(field_results)) + + def field(self, field: NestedField, field_result: Reader) -> Reader: + return field_result if field.required else OptionReader(field_result) + + def list(self, list_type: ListType, element_result: Reader) -> Reader: + element_reader = element_result if list_type.element_required else OptionReader(element_result) + return ListReader(element_reader) + + def map(self, map_type: MapType, key_result: Reader, value_result: Reader) -> Reader: + value_reader = value_result if map_type.value_required else OptionReader(value_result) + return MapReader(key_result, value_reader) + + def primitive(self, primitive: PrimitiveType) -> Reader: + return primitive_reader(primitive) + + +@singledispatch +def primitive_reader(primitive: PrimitiveType) -> Reader: + raise ValueError(f"Unknown type: {primitive}") + + +@primitive_reader.register(FixedType) +def _(primitive: FixedType) -> Reader: + return FixedReader(primitive.length) + + +@primitive_reader.register(DecimalType) +def _(primitive: DecimalType) -> Reader: + return DecimalReader(primitive.precision, primitive.scale) + + +@primitive_reader.register(BooleanType) +def _(_: BooleanType) -> Reader: + return BooleanReader() + + +@primitive_reader.register(IntegerType) +def _(_: IntegerType) -> Reader: + return IntegerReader() + + +@primitive_reader.register(LongType) +def _(_: LongType) -> Reader: + return LongReader() + + +@primitive_reader.register(FloatType) +def _(_: FloatType) -> Reader: + return FloatReader() + + +@primitive_reader.register(DoubleType) +def _(_: DoubleType) -> Reader: + return DoubleReader() + + +@primitive_reader.register(DateType) +def _(_: DateType) -> Reader: + return DateReader() + + +@primitive_reader.register(TimeType) +def _(_: TimeType) -> Reader: + return TimeReader() + + +@primitive_reader.register(TimestampType) +def _(_: TimestampType) -> Reader: + return TimestampReader() + + +@primitive_reader.register(TimestamptzType) +def _(_: TimestamptzType) -> Reader: + return TimestamptzReader() + + +@primitive_reader.register(StringType) +def _(_: StringType) -> Reader: + return StringReader() + + +@primitive_reader.register(BinaryType) +def _(_: StringType) -> Reader: + return BinaryReader() diff 
--git a/python/src/iceberg/files.py b/python/src/iceberg/files.py index 586aab498c08..e56009235797 100644 --- a/python/src/iceberg/files.py +++ b/python/src/iceberg/files.py @@ -45,5 +45,5 @@ def get(self, pos: int) -> Any: ... @abstractmethod - def set(self, pos: int, value) -> None: + def set(self, pos: int, value: Any) -> None: ... diff --git a/python/src/iceberg/io/base.py b/python/src/iceberg/io/base.py index 4e4ff30cdb26..458a3d591e3d 100644 --- a/python/src/iceberg/io/base.py +++ b/python/src/iceberg/io/base.py @@ -24,29 +24,35 @@ """ from abc import ABC, abstractmethod +from io import SEEK_SET from typing import Protocol, Union, runtime_checkable @runtime_checkable -class InputStream(Protocol): # pragma: no cover +class InputStream(Protocol): """A protocol for the file-like object returned by InputFile.open(...) This outlines the minimally required methods for a seekable input stream returned from an InputFile implementation's `open(...)` method. These methods are a subset of IOBase/RawIOBase. """ - def read(self, size: int) -> bytes: + @abstractmethod + def read(self, size: int = 0) -> bytes: ... - def seek(self, offset: int, whence: int) -> None: + @abstractmethod + def seek(self, offset: int, whence: int = SEEK_SET) -> None: ... + @abstractmethod def tell(self) -> int: ... + @abstractmethod def closed(self) -> bool: ... + @abstractmethod def close(self) -> None: ... @@ -59,12 +65,15 @@ class OutputStream(Protocol): # pragma: no cover implementation's `create(...)` method. These methods are a subset of IOBase/RawIOBase. """ + @abstractmethod def write(self, b: bytes) -> None: ... + @abstractmethod def closed(self) -> bool: ... + @abstractmethod def close(self) -> None: ... diff --git a/python/src/iceberg/io/memory.py b/python/src/iceberg/io/memory.py new file mode 100644 index 000000000000..0e9dcb9c99e4 --- /dev/null +++ b/python/src/iceberg/io/memory.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
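The io/base.py change above makes InputStream an explicitly abstract, runtime-checkable protocol. Conformance is structural, so any class with the right methods passes isinstance, which is what lets MemoryInputStream below (and the fake streams in the tests) slot in. A standalone sketch, with the protocol re-declared locally so the snippet runs on its own; note isinstance only checks method presence, not signatures:

```python
from io import SEEK_SET
from typing import Protocol, runtime_checkable

@runtime_checkable
class InputStream(Protocol):  # same shape as iceberg.io.base.InputStream
    def read(self, size: int = 0) -> bytes: ...
    def seek(self, offset: int, whence: int = SEEK_SET) -> None: ...
    def tell(self) -> int: ...
    def closed(self) -> bool: ...
    def close(self) -> None: ...

class EmptyStream:  # hypothetical: satisfies the protocol structurally
    def read(self, size: int = 0) -> bytes: return b""
    def seek(self, offset: int, whence: int = SEEK_SET) -> None: pass
    def tell(self) -> int: return 0
    def closed(self) -> bool: return False
    def close(self) -> None: pass

assert isinstance(EmptyStream(), InputStream)
assert not isinstance(object(), InputStream)
```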
+
+from io import SEEK_CUR, SEEK_END, SEEK_SET
+
+from iceberg.io.base import InputStream
+
+
+class MemoryInputStream(InputStream):
+    """
+    Simple in-memory stream used to read back decompressed blocks
+
+    Examples:
+        >>> stream = MemoryInputStream(b'22memory1925')
+        >>> stream.tell()
+        0
+        >>> stream.read(2)
+        b'22'
+        >>> stream.tell()
+        2
+        >>> stream.seek(8)
+        >>> stream.read(4)
+        b'1925'
+        >>> stream.close()
+        >>> stream.closed()
+        True
+    """
+
+    buffer: bytes
+    len: int
+    pos: int
+
+    def __init__(self, buffer: bytes):
+        self.buffer = buffer
+        self.len = len(buffer)
+        self.pos = 0
+
+    def read(self, size: int = 0) -> bytes:
+        b = self.buffer[self.pos : self.pos + size]
+        self.pos += size
+        return b
+
+    def seek(self, offset: int, whence: int = SEEK_SET) -> None:
+        if whence == SEEK_SET:
+            self.pos = offset
+        elif whence == SEEK_CUR:
+            self.pos += offset
+        elif whence == SEEK_END:
+            self.pos = self.len + offset
+        else:
+            raise ValueError(f"Unknown whence {whence}")
+
+    def tell(self) -> int:
+        return self.pos
+
+    def closed(self) -> bool:
+        return not hasattr(self, "buffer")
+
+    def close(self) -> None:
+        del self.buffer
+        self.pos = 0
diff --git a/python/src/iceberg/types.py b/python/src/iceberg/types.py
index d665dbcae607..d3ae67237380 100644
--- a/python/src/iceberg/types.py
+++ b/python/src/iceberg/types.py
@@ -165,7 +165,7 @@ class StructType(IcebergType):
     def __init__(self, *fields: NestedField, **kwargs):  # pylint: disable=super-init-not-called
         if not fields and "fields" in kwargs:
             fields = kwargs["fields"]
-        object.__setattr__(self, "fields", fields)
+        object.__setattr__(self, "fields", tuple(fields))
 
     @cached_property
     def string_type(self) -> str:
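The one-line types.py change above normalizes fields to a tuple. Presumably this is because StructType is a frozen dataclass: tuple-valued fields keep instances hashable and safely comparable even when fields arrives as a list through kwargs. A toy illustration, not the real StructType:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class Toy:
    fields: tuple  # a list here would break hashing

assert Toy(fields=tuple(["a", "b"])) == Toy(fields=("a", "b"))
assert hash(Toy(fields=("a", "b"))) == hash(Toy(fields=("a", "b")))

try:
    hash(Toy(fields=["a", "b"]))  # type: ignore[arg-type]
except TypeError as err:
    print(err)  # unhashable type: 'list'
```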
"""Helper methods for working with date/time representations """ +from __future__ import annotations + import re from datetime import ( date, @@ -36,24 +38,31 @@ def micros_to_days(timestamp: int) -> int: return (datetime.fromtimestamp(timestamp / 1_000_000) - EPOCH_TIMESTAMP).days +def micros_to_time(micros: int) -> time: + """Converts a timestamp in microseconds to a time""" + micros, microseconds = divmod(micros, 1000000) + micros, seconds = divmod(micros, 60) + micros, minutes = divmod(micros, 60) + hours = micros + return time(hour=hours, minute=minutes, second=seconds, microsecond=microseconds) + + def date_to_days(date_str: str) -> int: """Converts an ISO-8601 formatted date to days from 1970-01-01""" return (date.fromisoformat(date_str) - EPOCH_DATE).days +def days_to_date(days: int) -> date: + """Creates a date from the number of days from 1970-01-01""" + return EPOCH_DATE + timedelta(days) + + def time_to_micros(time_str: str) -> int: """Converts an ISO-8601 formatted time to microseconds from midnight""" t = time.fromisoformat(time_str) return (((t.hour * 60 + t.minute) * 60) + t.second) * 1_000_000 + t.microsecond -def time_from_micros(micros: int) -> time: - seconds = micros // 1_000_000 - minutes = seconds // 60 - hours = minutes // 60 - return time(hour=hours, minute=minutes % 60, second=seconds % 60, microsecond=micros % 1_000_000) - - def datetime_to_micros(dt: datetime) -> int: """Converts a datetime to microseconds from 1970-01-01T00:00:00.000000""" if dt.tzinfo: @@ -77,6 +86,18 @@ def timestamptz_to_micros(timestamptz_str: str) -> int: raise ValueError(f"Invalid timestamp with zone: {timestamptz_str} (must be ISO-8601)") +def micros_to_timestamp(micros: int): + """Converts microseconds from epoch to a timestamp""" + dt = timedelta(microseconds=micros) + return EPOCH_TIMESTAMP + dt + + +def micros_to_timestamptz(micros: int): + """Converts microseconds from epoch to an utc timestamp""" + dt = timedelta(microseconds=micros) + return EPOCH_TIMESTAMPTZ + dt + + def to_human_day(day_ordinal: int) -> str: """Converts a DateType value to human string""" return (EPOCH_DATE + timedelta(days=day_ordinal)).isoformat() @@ -84,7 +105,7 @@ def to_human_day(day_ordinal: int) -> str: def to_human_time(micros_from_midnight: int) -> str: """Converts a TimeType value to human string""" - return time_from_micros(micros_from_midnight).isoformat() + return micros_to_time(micros_from_midnight).isoformat() def to_human_timestamptz(timestamp_micros: int) -> str: diff --git a/python/src/iceberg/utils/schema_conversion.py b/python/src/iceberg/utils/schema_conversion.py index 4e57daa0314e..f5b70354b9aa 100644 --- a/python/src/iceberg/utils/schema_conversion.py +++ b/python/src/iceberg/utils/schema_conversion.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-"""Utility class for converting between Avro and Iceberg schemas - -""" +"""Utility class for converting between Avro and Iceberg schemas""" from __future__ import annotations import logging @@ -116,23 +114,23 @@ def avro_to_iceberg(self, avro_schema: dict[str, Any]) -> Schema: def _resolve_union(self, type_union: dict | list | str) -> tuple[str | dict[str, Any], bool]: """ - Converts Unions into their type and resolves if the field is optional + Converts Unions into their type and resolves if the field is required Examples: >>> AvroSchemaConversion()._resolve_union('str') - ('str', False) - >>> AvroSchemaConversion()._resolve_union(['null', 'str']) ('str', True) + >>> AvroSchemaConversion()._resolve_union(['null', 'str']) + ('str', False) >>> AvroSchemaConversion()._resolve_union([{'type': 'str'}]) - ({'type': 'str'}, False) - >>> AvroSchemaConversion()._resolve_union(['null', {'type': 'str'}]) ({'type': 'str'}, True) + >>> AvroSchemaConversion()._resolve_union(['null', {'type': 'str'}]) + ({'type': 'str'}, False) Args: type_union: The field, can be a string 'str', list ['null', 'str'], or dict {"type": 'str'} Returns: - A tuple containing the type and nullability + A tuple containing the type and if required Raises: TypeError: In the case non-optional union types are encountered @@ -140,20 +138,28 @@ def _resolve_union(self, type_union: dict | list | str) -> tuple[str | dict[str, avro_types: dict | list if isinstance(type_union, str): # It is a primitive and required - return type_union, False + return type_union, True elif isinstance(type_union, dict): # It is a context and required - return type_union, False + return type_union, True else: avro_types = type_union - is_optional = "null" in avro_types - if len(avro_types) > 2: - raise TypeError("Non-optional types aren't part of the Iceberg specification") + raise TypeError(f"Non-optional types aren't part of the Iceberg specification: {avro_types}") + + # For the Iceberg spec it is required to set the default value to null + # From https://iceberg.apache.org/spec/#avro + # Optional fields must always set the Avro field default value to null. + # + # This means that null has to come first: + # https://avro.apache.org/docs/current/spec.html + # type of the default value must match the first element of the union. 
+ if "null" != avro_types[0]: + raise TypeError("Only null-unions are supported") # Filter the null value and return the type - return list(filter(lambda t: t != "null", avro_types))[0], is_optional + return list(filter(lambda t: t != "null", avro_types))[0], False def _convert_schema(self, avro_type: str | dict[str, Any]) -> IcebergType: """ @@ -205,13 +211,13 @@ def _convert_field(self, field: dict[str, Any]) -> NestedField: if "field-id" not in field: raise ValueError(f"Cannot convert field, missing field-id: {field}") - plain_type, is_optional = self._resolve_union(field["type"]) + plain_type, required = self._resolve_union(field["type"]) return NestedField( field_id=field["field-id"], name=field["name"], field_type=self._convert_schema(plain_type), - required=is_optional, + required=required, doc=field.get("doc"), ) @@ -273,12 +279,12 @@ def _convert_array_type(self, array_type: dict[str, Any]) -> ListType: if "element-id" not in array_type: raise ValueError(f"Cannot convert array-type, missing element-id: {array_type}") - plain_type, element_is_optional = self._resolve_union(array_type["items"]) + plain_type, element_required = self._resolve_union(array_type["items"]) return ListType( element_id=array_type["element-id"], element_type=self._convert_schema(plain_type), - element_required=element_is_optional, + element_required=element_required, ) def _convert_map_type(self, map_type: dict[str, Any]) -> MapType: @@ -290,7 +296,7 @@ def _convert_map_type(self, map_type: dict[str, Any]) -> MapType: >>> from iceberg.utils.schema_conversion import AvroSchemaConversion >>> avro_field = { ... "type": "map", - ... "values": ["long", "null"], + ... "values": ["null", "long"], ... "key-id": 101, ... "value-id": 102, ... } @@ -307,14 +313,14 @@ def _convert_map_type(self, map_type: dict[str, Any]) -> MapType: Returns: A MapType """ - value_type, value_is_optional = self._resolve_union(map_type["values"]) + value_type, value_required = self._resolve_union(map_type["values"]) return MapType( key_id=map_type["key-id"], # Avro only supports string keys key_type=StringType(), value_id=map_type["value-id"], value_type=self._convert_schema(value_type), - value_required=value_is_optional, + value_required=value_required, ) def _convert_logical_type(self, avro_logical_type: dict[str, Any]) -> IcebergType: diff --git a/python/src/iceberg/utils/singleton.py b/python/src/iceberg/utils/singleton.py index c36155de14e7..5643cdd1728f 100644 --- a/python/src/iceberg/utils/singleton.py +++ b/python/src/iceberg/utils/singleton.py @@ -14,7 +14,20 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +""" +This is a singleton metaclass that can be used to cache and re-use existing objects +In the Iceberg codebase we have a lot of objects that are stateless (for example Types such as StringType, +BooleanType etc). FixedTypes have arguments (eg. Fixed[22]) that we also make part of the key when caching +the newly created object. + +The Singleton uses a metaclass which essentially defines a new type. When the Type gets created, it will first +evaluate the `__call__` method with all the arguments. If we already initialized a class earlier, we'll just +return it. 
+ +More information on metaclasses: https://docs.python.org/3/reference/datamodel.html#metaclasses + +""" from typing import ClassVar, Dict diff --git a/python/tests/avro/__init__.py b/python/tests/avro/__init__.py new file mode 100644 index 000000000000..a67d5ea255b2 --- /dev/null +++ b/python/tests/avro/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/tests/avro/test_decoder.py b/python/tests/avro/test_decoder.py new file mode 100644 index 000000000000..295715115183 --- /dev/null +++ b/python/tests/avro/test_decoder.py @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
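Looping back to the utils/singleton.py docstring above: a minimal sketch of the metaclass-caching pattern it describes. The names here are hypothetical and this is not necessarily the exact implementation; it only shows why a stateless type constructed twice hands back the same object:

```python
from typing import Any, ClassVar, Dict

class SingletonMeta(type):  # hypothetical name
    _instances: ClassVar[Dict[Any, Any]] = {}

    def __call__(cls, *args: Any, **kwargs: Any) -> Any:
        # Key on the class and its constructor arguments, so that
        # parameterized types (e.g. Fixed[22]) cache per argument.
        key = (cls, args, tuple(sorted(kwargs.items())))
        if key not in SingletonMeta._instances:
            SingletonMeta._instances[key] = super().__call__(*args, **kwargs)
        return SingletonMeta._instances[key]

class ToyType(metaclass=SingletonMeta):
    pass

assert ToyType() is ToyType()  # the second call returns the cached instance
```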
+from datetime import date, datetime, timezone +from decimal import Decimal +from io import SEEK_SET + +import pytest + +from iceberg.avro.decoder import BinaryDecoder +from iceberg.io.base import InputStream +from iceberg.io.memory import MemoryInputStream + + +def test_read_decimal_from_fixed(): + mis = MemoryInputStream(b"\x00\x00\x00\x05\x6A\x48\x1C\xFB\x2C\x7C\x50\x00") + decoder = BinaryDecoder(mis) + actual = decoder.read_decimal_from_fixed(28, 15, 12) + expected = Decimal("99892.123400000000000") + assert actual == expected + + +def test_read_long(): + mis = MemoryInputStream(b"\x18") + decoder = BinaryDecoder(mis) + assert decoder.read_long() == 12 + + +def test_read_decimal(): + mis = MemoryInputStream(b"\x18\x00\x00\x00\x05\x6A\x48\x1C\xFB\x2C\x7C\x50\x00") + decoder = BinaryDecoder(mis) + actual = decoder.read_decimal_from_bytes(28, 15) + expected = Decimal("99892.123400000000000") + assert actual == expected + + +def test_decimal_from_fixed_big(): + mis = MemoryInputStream(b"\x0E\xC2\x02\xE9\x06\x16\x33\x49\x77\x67\xA8\x00") + decoder = BinaryDecoder(mis) + actual = decoder.read_decimal_from_fixed(28, 15, 12) + expected = Decimal("4567335489766.998340000000000") + assert actual == expected + + +def test_read_negative_bytes(): + mis = MemoryInputStream(b"") + decoder = BinaryDecoder(mis) + + with pytest.raises(ValueError) as exc_info: + decoder.read(-1) + + assert "Requested -1 bytes to read, expected positive integer." in str(exc_info.value) + + +class OneByteAtATimeInputStream(InputStream): + """ + Fake input stream that just returns a single byte at the time + """ + + pos = 0 + + def read(self, size: int = 0) -> bytes: + self.pos += 1 + return int.to_bytes(1, self.pos, byteorder="little") + + def seek(self, offset: int, whence: int = SEEK_SET) -> None: + pass + + def tell(self) -> int: + pass + + def closed(self) -> bool: + pass + + def close(self) -> None: + pass + + +def test_read_single_byte_at_the_time(): + decoder = BinaryDecoder(OneByteAtATimeInputStream()) + + with pytest.raises(ValueError) as exc_info: + decoder.read(2) + + assert "Read 1 bytes, expected 2 bytes" in str(exc_info.value) + + +def test_read_float(): + mis = MemoryInputStream(b"\x00\x00\x9A\x41") + decoder = BinaryDecoder(mis) + assert decoder.read_float() == 19.25 + + +def test_read_double(): + mis = MemoryInputStream(b"\x00\x00\x00\x00\x00\x40\x33\x40") + decoder = BinaryDecoder(mis) + assert decoder.read_double() == 19.25 + + +def test_read_date(): + mis = MemoryInputStream(b"\xBC\x7D") + decoder = BinaryDecoder(mis) + assert decoder.read_date_from_int() == date(1991, 12, 27) + + +def test_read_time_millis(): + mis = MemoryInputStream(b"\xBC\x7D") + decoder = BinaryDecoder(mis) + assert decoder.read_time_millis().microsecond == 30000 + + +def test_read_time_micros(): + mis = MemoryInputStream(b"\xBC\x7D") + decoder = BinaryDecoder(mis) + assert decoder.read_time_micros().microsecond == 8030 + + +def test_read_timestamp_micros(): + mis = MemoryInputStream(b"\xBC\x7D") + decoder = BinaryDecoder(mis) + assert decoder.read_timestamp_micros() == datetime(1970, 1, 1, 0, 0, 0, 8030) + + +def test_read_timestamptz_micros(): + mis = MemoryInputStream(b"\xBC\x7D") + decoder = BinaryDecoder(mis) + assert decoder.read_timestamptz_micros() == datetime(1970, 1, 1, 0, 0, 0, 8030, tzinfo=timezone.utc) diff --git a/python/tests/avro/test_file.py b/python/tests/avro/test_file.py new file mode 100644 index 000000000000..d345865d118a --- /dev/null +++ b/python/tests/avro/test_file.py @@ -0,0 +1,48 @@ +# Licensed to the 
Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+from iceberg.avro.codecs import DeflateCodec
+from iceberg.avro.file import AvroFileHeader
+
+
+def test_get_deflate_compressor():
+    header = AvroFileHeader(bytes(0), {"avro.codec": "deflate"}, bytes(16))
+    assert header.compression_codec() == DeflateCodec
+
+
+def test_get_null_compressor():
+    header = AvroFileHeader(bytes(0), {"avro.codec": "null"}, bytes(16))
+    assert header.compression_codec() is None
+
+
+def test_unknown_codec():
+    header = AvroFileHeader(bytes(0), {"avro.codec": "unknown"}, bytes(16))
+
+    with pytest.raises(ValueError) as exc_info:
+        header.compression_codec()
+
+    assert "Unsupported codec: unknown" in str(exc_info.value)
+
+
+def test_missing_schema():
+    header = AvroFileHeader(bytes(0), {}, bytes(16))
+
+    with pytest.raises(ValueError) as exc_info:
+        header.get_schema()
+
+    assert "No schema found in Avro file headers" in str(exc_info.value)
diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py
new file mode 100644
index 000000000000..c310e69204cc
--- /dev/null
+++ b/python/tests/avro/test_reader.py
@@ -0,0 +1,455 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
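The reader tests below consume the generated_manifest_entry_file and generated_manifest_file_file fixtures from conftest.py; presumably those serialize the record dictionaries further down with fastavro, which this change adds as a dev dependency. A hedged sketch of producing such a file (trimmed-down schema, made-up record):

```python
from tempfile import TemporaryDirectory

from fastavro import parse_schema, writer

schema = {
    "type": "record",
    "name": "manifest_file",
    "fields": [{"name": "manifest_path", "type": "string", "field-id": 500}],
}
records = [{"manifest_path": "s3://bucket/metadata/m0.avro"}]

with TemporaryDirectory() as tmpdir:
    with open(f"{tmpdir}/manifest.avro", "wb") as out:
        # codec="deflate" exercises DeflateCodec on the read side
        writer(out, parse_schema(schema), records, codec="deflate")
```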
+import json + +import pytest + +from iceberg.avro.file import AvroFile +from iceberg.avro.reader import ( + AvroStruct, + BinaryReader, + BooleanReader, + DateReader, + DecimalReader, + DoubleReader, + FixedReader, + FloatReader, + IntegerReader, + LongReader, + StringReader, + TimeReader, + TimestampReader, + TimestamptzReader, + primitive_reader, +) +from iceberg.schema import Schema +from iceberg.types import ( + BinaryType, + BooleanType, + DateType, + DecimalType, + DoubleType, + FixedType, + FloatType, + IntegerType, + LongType, + PrimitiveType, + StringType, + TimestampType, + TimestamptzType, + TimeType, +) +from tests.io.test_io_base import LocalInputFile + + +def test_read_header(generated_manifest_entry_file: str, iceberg_manifest_entry_schema: Schema): + with AvroFile(LocalInputFile(generated_manifest_entry_file)) as reader: + header = reader._read_header() + + assert header.magic == b"Obj\x01" + assert json.loads(header.meta["avro.schema"]) == { + "type": "record", + "name": "manifest_entry", + "fields": [ + {"field-id": 0, "name": "status", "type": "int"}, + {"field-id": 1, "default": "null", "name": "snapshot_id", "type": ["null", "long"]}, + { + "field-id": 2, + "name": "data_file", + "type": { + "type": "record", + "name": "r2", + "fields": [ + {"field-id": 100, "doc": "Location URI with FS scheme", "name": "file_path", "type": "string"}, + { + "field-id": 101, + "doc": "File format name: avro, orc, or parquet", + "name": "file_format", + "type": "string", + }, + { + "field-id": 102, + "name": "partition", + "type": { + "type": "record", + "name": "r102", + "fields": [{"field-id": 1000, "default": "null", "name": "VendorID", "type": ["null", "int"]}], + }, + }, + {"field-id": 103, "doc": "Number of records in the file", "name": "record_count", "type": "long"}, + {"field-id": 104, "doc": "Total file size in bytes", "name": "file_size_in_bytes", "type": "long"}, + {"field-id": 105, "name": "block_size_in_bytes", "type": "long"}, + { + "field-id": 108, + "doc": "Map of column id to total size on disk", + "default": "null", + "name": "column_sizes", + "type": [ + "null", + { + "logicalType": "map", + "type": "array", + "items": { + "type": "record", + "name": "k117_v118", + "fields": [ + {"field-id": 117, "name": "key", "type": "int"}, + {"field-id": 118, "name": "value", "type": "long"}, + ], + }, + }, + ], + }, + { + "field-id": 109, + "doc": "Map of column id to total count, including null and NaN", + "default": "null", + "name": "value_counts", + "type": [ + "null", + { + "logicalType": "map", + "type": "array", + "items": { + "type": "record", + "name": "k119_v120", + "fields": [ + {"field-id": 119, "name": "key", "type": "int"}, + {"field-id": 120, "name": "value", "type": "long"}, + ], + }, + }, + ], + }, + { + "field-id": 110, + "doc": "Map of column id to null value count", + "default": "null", + "name": "null_value_counts", + "type": [ + "null", + { + "logicalType": "map", + "type": "array", + "items": { + "type": "record", + "name": "k121_v122", + "fields": [ + {"field-id": 121, "name": "key", "type": "int"}, + {"field-id": 122, "name": "value", "type": "long"}, + ], + }, + }, + ], + }, + { + "field-id": 137, + "doc": "Map of column id to number of NaN values in the column", + "default": "null", + "name": "nan_value_counts", + "type": [ + "null", + { + "logicalType": "map", + "type": "array", + "items": { + "type": "record", + "name": "k138_v139", + "fields": [ + {"field-id": 138, "name": "key", "type": "int"}, + {"field-id": 139, "name": "value", "type": "long"}, 
+ ], + }, + }, + ], + }, + { + "field-id": 125, + "doc": "Map of column id to lower bound", + "default": "null", + "name": "lower_bounds", + "type": [ + "null", + { + "logicalType": "map", + "type": "array", + "items": { + "type": "record", + "name": "k126_v127", + "fields": [ + {"field-id": 126, "name": "key", "type": "int"}, + {"field-id": 127, "name": "value", "type": "bytes"}, + ], + }, + }, + ], + }, + { + "field-id": 128, + "doc": "Map of column id to upper bound", + "default": "null", + "name": "upper_bounds", + "type": [ + "null", + { + "logicalType": "map", + "type": "array", + "items": { + "type": "record", + "name": "k129_v130", + "fields": [ + {"field-id": 129, "name": "key", "type": "int"}, + {"field-id": 130, "name": "value", "type": "bytes"}, + ], + }, + }, + ], + }, + { + "field-id": 131, + "doc": "Encryption key metadata blob", + "default": "null", + "name": "key_metadata", + "type": ["null", "bytes"], + }, + { + "field-id": 132, + "doc": "Splittable offsets", + "default": "null", + "name": "split_offsets", + "type": ["null", {"element-id": 133, "type": "array", "items": "long"}], + }, + { + "field-id": 140, + "doc": "Sort order ID", + "default": "null", + "name": "sort_order_id", + "type": ["null", "int"], + }, + ], + }, + }, + ], + } + + assert header.get_schema() == iceberg_manifest_entry_schema + + +def test_read_manifest_entry_file(generated_manifest_entry_file: str): + with AvroFile(LocalInputFile(generated_manifest_entry_file)) as reader: + # Consume the generator + records = list(reader) + + assert len(records) == 2, f"Expected 2 records, got {len(records)}" + assert records[0] == AvroStruct( + _data=[ + 1, + 8744736658442914487, + AvroStruct( + _data=[ + "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", + "PARQUET", + AvroStruct(_data=[None]), + 19513, + 388872, + 67108864, + { + 1: 53, + 2: 98153, + 3: 98693, + 4: 53, + 5: 53, + 6: 53, + 7: 17425, + 8: 18528, + 9: 53, + 10: 44788, + 11: 35571, + 12: 53, + 13: 1243, + 14: 2355, + 15: 12750, + 16: 4029, + 17: 110, + 18: 47194, + 19: 2948, + }, + { + 1: 19513, + 2: 19513, + 3: 19513, + 4: 19513, + 5: 19513, + 6: 19513, + 7: 19513, + 8: 19513, + 9: 19513, + 10: 19513, + 11: 19513, + 12: 19513, + 13: 19513, + 14: 19513, + 15: 19513, + 16: 19513, + 17: 19513, + 18: 19513, + 19: 19513, + }, + { + 1: 19513, + 2: 0, + 3: 0, + 4: 19513, + 5: 19513, + 6: 19513, + 7: 0, + 8: 0, + 9: 19513, + 10: 0, + 11: 0, + 12: 19513, + 13: 0, + 14: 0, + 15: 0, + 16: 0, + 17: 0, + 18: 0, + 19: 0, + }, + {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}, + { + 2: b"2020-04-01 00:00", + 3: b"2020-04-01 00:12", + 7: b"\x03\x00\x00\x00", + 8: b"\x01\x00\x00\x00", + 10: b"\xf6(\\\x8f\xc2\x05S\xc0", + 11: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 13: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 14: b"\x00\x00\x00\x00\x00\x00\xe0\xbf", + 15: b")\\\x8f\xc2\xf5(\x08\xc0", + 16: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 17: b"\x00\x00\x00\x00\x00\x00\x00\x00", + 18: b"\xf6(\\\x8f\xc2\xc5S\xc0", + 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", + }, + { + 2: b"2020-04-30 23:5:", + 3: b"2020-05-01 00:41", + 7: b"\t\x01\x00\x00", + 8: b"\t\x01\x00\x00", + 10: b"\xcd\xcc\xcc\xcc\xcc,_@", + 11: b"\x1f\x85\xebQ\\\xe2\xfe@", + 13: b"\x00\x00\x00\x00\x00\x00\x12@", + 14: b"\x00\x00\x00\x00\x00\x00\xe0?", + 15: b"q=\n\xd7\xa3\xf01@", + 16: b"\x00\x00\x00\x00\x00`B@", + 17: b"333333\xd3?", + 18: b"\x00\x00\x00\x00\x00\x18b@", + 19: b"\x00\x00\x00\x00\x00\x00\x04@", + }, + None, + 
[4], + 0, + ] + ), + ] + ) + + +def test_read_manifest_file_file(generated_manifest_file_file: str): + with AvroFile(LocalInputFile(generated_manifest_file_file)) as reader: + # Consume the generator + records = list(reader) + + assert len(records) == 1, f"Expected 1 records, got {len(records)}" + assert records[0] == AvroStruct( + _data=[ + "/home/iceberg/warehouse/nyc/taxis_partitioned/metadata/0125c686-8aa6-4502-bdcc-b6d17ca41a3b-m0.avro", + 7989, + 0, + 9182715666859759686, + 3, + 0, + 0, + [AvroStruct(_data=[True, False, b"\x01\x00\x00\x00", b"\x02\x00\x00\x00"])], + 237993, + 0, + 0, + ] + ) + + +def test_fixed_reader(): + assert primitive_reader(FixedType(22)) == FixedReader(22) + + +def test_decimal_reader(): + assert primitive_reader(DecimalType(19, 25)) == DecimalReader(19, 25) + + +def test_boolean_reader(): + assert primitive_reader(BooleanType()) == BooleanReader() + + +def test_integer_reader(): + assert primitive_reader(IntegerType()) == IntegerReader() + + +def test_long_reader(): + assert primitive_reader(LongType()) == LongReader() + + +def test_float_reader(): + assert primitive_reader(FloatType()) == FloatReader() + + +def test_double_reader(): + assert primitive_reader(DoubleType()) == DoubleReader() + + +def test_date_reader(): + assert primitive_reader(DateType()) == DateReader() + + +def test_time_reader(): + assert primitive_reader(TimeType()) == TimeReader() + + +def test_timestamp_reader(): + assert primitive_reader(TimestampType()) == TimestampReader() + + +def test_timestamptz_reader(): + assert primitive_reader(TimestamptzType()) == TimestamptzReader() + + +def test_string_reader(): + assert primitive_reader(StringType()) == StringReader() + + +def test_binary_reader(): + assert primitive_reader(BinaryType()) == BinaryReader() + + +def test_unknown_type(): + class UnknownType(PrimitiveType): + ... + + with pytest.raises(ValueError) as exc_info: + primitive_reader(UnknownType()) + + assert "Unknown type:" in str(exc_info.value) diff --git a/python/tests/conftest.py b/python/tests/conftest.py index b5dd18b8d80a..48f3bf02390b 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -15,16 +15,20 @@ # specific language governing permissions and limitations # under the License. 
+from tempfile import TemporaryDirectory from typing import Any, Dict import pytest from iceberg import schema +from iceberg.schema import Schema from iceberg.types import ( + BinaryType, BooleanType, FloatType, IntegerType, ListType, + LongType, MapType, NestedField, StringType, @@ -114,7 +118,367 @@ def foo_struct(): @pytest.fixture(scope="session") -def manifest_schema() -> Dict[str, Any]: +def all_avro_types() -> Dict[str, Any]: + return { + "type": "record", + "name": "all_avro_types", + "fields": [ + {"name": "primitive_string", "type": "string", "field-id": 100}, + {"name": "primitive_int", "type": "int", "field-id": 200}, + {"name": "primitive_long", "type": "long", "field-id": 300}, + {"name": "primitive_float", "type": "float", "field-id": 400}, + {"name": "primitive_double", "type": "double", "field-id": 500}, + {"name": "primitive_bytes", "type": "bytes", "field-id": 600}, + { + "type": "record", + "name": "Person", + "fields": [ + {"name": "name", "type": "string", "field-id": 701}, + {"name": "age", "type": "long", "field-id": 702}, + {"name": "gender", "type": ["string", "null"], "field-id": 703}, + ], + "field-id": 700, + }, + { + "name": "array_with_string", + "type": { + "type": "array", + "items": "string", + "default": [], + "element-id": 801, + }, + "field-id": 800, + }, + { + "name": "array_with_optional_string", + "type": [ + "null", + { + "type": "array", + "items": ["string", "null"], + "default": [], + "element-id": 901, + }, + ], + "field-id": 900, + }, + { + "name": "array_with_optional_record", + "type": [ + "null", + { + "type": "array", + "items": [ + "null", + { + "type": "record", + "name": "person", + "fields": [ + {"name": "name", "type": "string", "field-id": 1002}, + {"name": "age", "type": "long", "field-id": 1003}, + {"name": "gender", "type": ["string", "null"], "field-id": 1004}, + ], + }, + ], + "element-id": 1001, + }, + ], + "field-id": 1000, + }, + { + "name": "map_with_longs", + "type": { + "type": "map", + "values": "long", + "default": {}, + "key-id": 1101, + "value-id": 1102, + }, + "field-id": 1000, + }, + ], + } + + +@pytest.fixture +def catalog() -> InMemoryCatalog: + return InMemoryCatalog("test.in.memory.catalog", {"test.key": "test.value"}) + + +manifest_entry_records = [ + { + "status": 1, + "snapshot_id": 8744736658442914487, + "data_file": { + "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=null/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00001.parquet", + "file_format": "PARQUET", + "partition": {"VendorID": None}, + "record_count": 19513, + "file_size_in_bytes": 388872, + "block_size_in_bytes": 67108864, + "column_sizes": [ + {"key": 1, "value": 53}, + {"key": 2, "value": 98153}, + {"key": 3, "value": 98693}, + {"key": 4, "value": 53}, + {"key": 5, "value": 53}, + {"key": 6, "value": 53}, + {"key": 7, "value": 17425}, + {"key": 8, "value": 18528}, + {"key": 9, "value": 53}, + {"key": 10, "value": 44788}, + {"key": 11, "value": 35571}, + {"key": 12, "value": 53}, + {"key": 13, "value": 1243}, + {"key": 14, "value": 2355}, + {"key": 15, "value": 12750}, + {"key": 16, "value": 4029}, + {"key": 17, "value": 110}, + {"key": 18, "value": 47194}, + {"key": 19, "value": 2948}, + ], + "value_counts": [ + {"key": 1, "value": 19513}, + {"key": 2, "value": 19513}, + {"key": 3, "value": 19513}, + {"key": 4, "value": 19513}, + {"key": 5, "value": 19513}, + {"key": 6, "value": 19513}, + {"key": 7, "value": 19513}, + {"key": 8, "value": 19513}, + {"key": 9, "value": 19513}, + {"key": 10, "value": 19513}, + {"key": 11, 
"value": 19513}, + {"key": 12, "value": 19513}, + {"key": 13, "value": 19513}, + {"key": 14, "value": 19513}, + {"key": 15, "value": 19513}, + {"key": 16, "value": 19513}, + {"key": 17, "value": 19513}, + {"key": 18, "value": 19513}, + {"key": 19, "value": 19513}, + ], + "null_value_counts": [ + {"key": 1, "value": 19513}, + {"key": 2, "value": 0}, + {"key": 3, "value": 0}, + {"key": 4, "value": 19513}, + {"key": 5, "value": 19513}, + {"key": 6, "value": 19513}, + {"key": 7, "value": 0}, + {"key": 8, "value": 0}, + {"key": 9, "value": 19513}, + {"key": 10, "value": 0}, + {"key": 11, "value": 0}, + {"key": 12, "value": 19513}, + {"key": 13, "value": 0}, + {"key": 14, "value": 0}, + {"key": 15, "value": 0}, + {"key": 16, "value": 0}, + {"key": 17, "value": 0}, + {"key": 18, "value": 0}, + {"key": 19, "value": 0}, + ], + "nan_value_counts": [ + {"key": 16, "value": 0}, + {"key": 17, "value": 0}, + {"key": 18, "value": 0}, + {"key": 19, "value": 0}, + {"key": 10, "value": 0}, + {"key": 11, "value": 0}, + {"key": 12, "value": 0}, + {"key": 13, "value": 0}, + {"key": 14, "value": 0}, + {"key": 15, "value": 0}, + ], + "lower_bounds": [ + {"key": 2, "value": b"2020-04-01 00:00"}, + {"key": 3, "value": b"2020-04-01 00:12"}, + {"key": 7, "value": b"\x03\x00\x00\x00"}, + {"key": 8, "value": b"\x01\x00\x00\x00"}, + {"key": 10, "value": b"\xf6(\\\x8f\xc2\x05S\xc0"}, + {"key": 11, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 13, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 14, "value": b"\x00\x00\x00\x00\x00\x00\xe0\xbf"}, + {"key": 15, "value": b")\\\x8f\xc2\xf5(\x08\xc0"}, + {"key": 16, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 17, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 18, "value": b"\xf6(\\\x8f\xc2\xc5S\xc0"}, + {"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x04\xc0"}, + ], + "upper_bounds": [ + {"key": 2, "value": b"2020-04-30 23:5:"}, + {"key": 3, "value": b"2020-05-01 00:41"}, + {"key": 7, "value": b"\t\x01\x00\x00"}, + {"key": 8, "value": b"\t\x01\x00\x00"}, + {"key": 10, "value": b"\xcd\xcc\xcc\xcc\xcc,_@"}, + {"key": 11, "value": b"\x1f\x85\xebQ\\\xe2\xfe@"}, + {"key": 13, "value": b"\x00\x00\x00\x00\x00\x00\x12@"}, + {"key": 14, "value": b"\x00\x00\x00\x00\x00\x00\xe0?"}, + {"key": 15, "value": b"q=\n\xd7\xa3\xf01@"}, + {"key": 16, "value": b"\x00\x00\x00\x00\x00`B@"}, + {"key": 17, "value": b"333333\xd3?"}, + {"key": 18, "value": b"\x00\x00\x00\x00\x00\x18b@"}, + {"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x04@"}, + ], + "key_metadata": None, + "split_offsets": [4], + "sort_order_id": 0, + }, + }, + { + "status": 1, + "snapshot_id": 8744736658442914487, + "data_file": { + "file_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/data/VendorID=1/00000-633-d8a4223e-dc97-45a1-86e1-adaba6e8abd7-00002.parquet", + "file_format": "PARQUET", + "partition": {"VendorID": 1}, + "record_count": 95050, + "file_size_in_bytes": 1265950, + "block_size_in_bytes": 67108864, + "column_sizes": [ + {"key": 1, "value": 318}, + {"key": 2, "value": 329806}, + {"key": 3, "value": 331632}, + {"key": 4, "value": 15343}, + {"key": 5, "value": 2351}, + {"key": 6, "value": 3389}, + {"key": 7, "value": 71269}, + {"key": 8, "value": 76429}, + {"key": 9, "value": 16383}, + {"key": 10, "value": 86992}, + {"key": 11, "value": 89608}, + {"key": 12, "value": 265}, + {"key": 13, "value": 19377}, + {"key": 14, "value": 1692}, + {"key": 15, "value": 76162}, + {"key": 16, "value": 4354}, + {"key": 17, "value": 759}, + {"key": 18, "value": 120650}, + {"key": 19, 
"value": 11804}, + ], + "value_counts": [ + {"key": 1, "value": 95050}, + {"key": 2, "value": 95050}, + {"key": 3, "value": 95050}, + {"key": 4, "value": 95050}, + {"key": 5, "value": 95050}, + {"key": 6, "value": 95050}, + {"key": 7, "value": 95050}, + {"key": 8, "value": 95050}, + {"key": 9, "value": 95050}, + {"key": 10, "value": 95050}, + {"key": 11, "value": 95050}, + {"key": 12, "value": 95050}, + {"key": 13, "value": 95050}, + {"key": 14, "value": 95050}, + {"key": 15, "value": 95050}, + {"key": 16, "value": 95050}, + {"key": 17, "value": 95050}, + {"key": 18, "value": 95050}, + {"key": 19, "value": 95050}, + ], + "null_value_counts": [ + {"key": 1, "value": 0}, + {"key": 2, "value": 0}, + {"key": 3, "value": 0}, + {"key": 4, "value": 0}, + {"key": 5, "value": 0}, + {"key": 6, "value": 0}, + {"key": 7, "value": 0}, + {"key": 8, "value": 0}, + {"key": 9, "value": 0}, + {"key": 10, "value": 0}, + {"key": 11, "value": 0}, + {"key": 12, "value": 95050}, + {"key": 13, "value": 0}, + {"key": 14, "value": 0}, + {"key": 15, "value": 0}, + {"key": 16, "value": 0}, + {"key": 17, "value": 0}, + {"key": 18, "value": 0}, + {"key": 19, "value": 0}, + ], + "nan_value_counts": [ + {"key": 16, "value": 0}, + {"key": 17, "value": 0}, + {"key": 18, "value": 0}, + {"key": 19, "value": 0}, + {"key": 10, "value": 0}, + {"key": 11, "value": 0}, + {"key": 12, "value": 0}, + {"key": 13, "value": 0}, + {"key": 14, "value": 0}, + {"key": 15, "value": 0}, + ], + "lower_bounds": [ + {"key": 1, "value": b"\x01\x00\x00\x00"}, + {"key": 2, "value": b"2020-04-01 00:00"}, + {"key": 3, "value": b"2020-04-01 00:03"}, + {"key": 4, "value": b"\x00\x00\x00\x00"}, + {"key": 5, "value": b"\x01\x00\x00\x00"}, + {"key": 6, "value": b"N"}, + {"key": 7, "value": b"\x01\x00\x00\x00"}, + {"key": 8, "value": b"\x01\x00\x00\x00"}, + {"key": 9, "value": b"\x01\x00\x00\x00"}, + {"key": 10, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 11, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 13, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 14, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 15, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 16, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 17, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 18, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x00\x00"}, + ], + "upper_bounds": [ + {"key": 1, "value": b"\x01\x00\x00\x00"}, + {"key": 2, "value": b"2020-04-30 23:5:"}, + {"key": 3, "value": b"2020-05-01 00:1:"}, + {"key": 4, "value": b"\x06\x00\x00\x00"}, + {"key": 5, "value": b"c\x00\x00\x00"}, + {"key": 6, "value": b"Y"}, + {"key": 7, "value": b"\t\x01\x00\x00"}, + {"key": 8, "value": b"\t\x01\x00\x00"}, + {"key": 9, "value": b"\x04\x00\x00\x00"}, + {"key": 10, "value": b"\\\x8f\xc2\xf5(8\x8c@"}, + {"key": 11, "value": b"\xcd\xcc\xcc\xcc\xcc,f@"}, + {"key": 13, "value": b"\x00\x00\x00\x00\x00\x00\x1c@"}, + {"key": 14, "value": b"\x9a\x99\x99\x99\x99\x99\xf1?"}, + {"key": 15, "value": b"\x00\x00\x00\x00\x00\x00Y@"}, + {"key": 16, "value": b"\x00\x00\x00\x00\x00\xb0X@"}, + {"key": 17, "value": b"333333\xd3?"}, + {"key": 18, "value": b"\xc3\xf5(\\\x8f:\x8c@"}, + {"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x04@"}, + ], + "key_metadata": None, + "split_offsets": [4], + "sort_order_id": 0, + }, + }, +] + +manifest_file_records = [ + { + "manifest_path": "/home/iceberg/warehouse/nyc/taxis_partitioned/metadata/0125c686-8aa6-4502-bdcc-b6d17ca41a3b-m0.avro", + "manifest_length": 
7989, + "partition_spec_id": 0, + "added_snapshot_id": 9182715666859759686, + "added_data_files_count": 3, + "existing_data_files_count": 0, + "deleted_data_files_count": 0, + "partitions": [ + {"contains_null": True, "contains_nan": False, "lower_bound": b"\x01\x00\x00\x00", "upper_bound": b"\x02\x00\x00\x00"} + ], + "added_rows_count": 237993, + "existing_rows_count": 0, + "deleted_rows_count": 0, + } +] + + +@pytest.fixture(scope="session") +def avro_schema_manifest_file() -> Dict[str, Any]: return { "type": "record", "name": "manifest_file", @@ -126,28 +490,28 @@ def manifest_schema() -> Dict[str, Any]: "name": "added_snapshot_id", "type": ["null", "long"], "doc": "Snapshot ID that added the manifest", - "default": None, + "default": "null", "field-id": 503, }, { "name": "added_data_files_count", "type": ["null", "int"], "doc": "Added entry count", - "default": None, + "default": "null", "field-id": 504, }, { "name": "existing_data_files_count", "type": ["null", "int"], "doc": "Existing entry count", - "default": None, + "default": "null", "field-id": 505, }, { "name": "deleted_data_files_count", "type": ["null", "int"], "doc": "Deleted entry count", - "default": None, + "default": "null", "field-id": 506, }, { @@ -170,21 +534,21 @@ def manifest_schema() -> Dict[str, Any]: "name": "contains_nan", "type": ["null", "boolean"], "doc": "True if any file has a nan partition value", - "default": None, + "default": "null", "field-id": 518, }, { "name": "lower_bound", "type": ["null", "bytes"], "doc": "Partition lower bound for all files", - "default": None, + "default": "null", "field-id": 510, }, { "name": "upper_bound", "type": ["null", "bytes"], "doc": "Partition upper bound for all files", - "default": None, + "default": "null", "field-id": 511, }, ], @@ -193,22 +557,22 @@ def manifest_schema() -> Dict[str, Any]: }, ], "doc": "Summary for each partition", - "default": None, + "default": "null", "field-id": 507, }, - {"name": "added_rows_count", "type": ["null", "long"], "doc": "Added rows count", "default": None, "field-id": 512}, + {"name": "added_rows_count", "type": ["null", "long"], "doc": "Added rows count", "default": "null", "field-id": 512}, { "name": "existing_rows_count", "type": ["null", "long"], "doc": "Existing rows count", - "default": None, + "default": "null", "field-id": 513, }, { "name": "deleted_rows_count", "type": ["null", "long"], "doc": "Deleted rows count", - "default": None, + "default": "null", "field-id": 514, }, ], @@ -216,88 +580,381 @@ def manifest_schema() -> Dict[str, Any]: @pytest.fixture(scope="session") -def all_avro_types() -> Dict[str, Any]: +def avro_schema_manifest_entry() -> Dict[str, Any]: return { "type": "record", - "name": "all_avro_types", + "name": "manifest_entry", "fields": [ - {"name": "primitive_string", "type": "string", "field-id": 100}, - {"name": "primitive_int", "type": "int", "field-id": 200}, - {"name": "primitive_long", "type": "long", "field-id": 300}, - {"name": "primitive_float", "type": "float", "field-id": 400}, - {"name": "primitive_double", "type": "double", "field-id": 500}, - {"name": "primitive_bytes", "type": "bytes", "field-id": 600}, - { - "type": "record", - "name": "Person", - "fields": [ - {"name": "name", "type": "string", "field-id": 701}, - {"name": "age", "type": "long", "field-id": 702}, - {"name": "gender", "type": ["string", "null"], "field-id": 703}, - ], - "field-id": 700, - }, + {"name": "status", "type": "int", "field-id": 0}, + {"name": "snapshot_id", "type": ["null", "long"], "default": "null", 
"field-id": 1}, { - "name": "array_with_string", + "name": "data_file", "type": { - "type": "array", - "items": "string", - "default": [], - "element-id": 801, - }, - "field-id": 800, - }, - { - "name": "array_with_optional_string", - "type": [ - "null", - { - "type": "array", - "items": ["string", "null"], - "default": [], - "element-id": 901, - }, - ], - "field-id": 900, - }, - { - "name": "array_with_optional_record", - "type": [ - "null", - { - "type": "array", - "items": [ - "null", - { + "type": "record", + "name": "r2", + "fields": [ + {"name": "file_path", "type": "string", "doc": "Location URI with FS scheme", "field-id": 100}, + { + "name": "file_format", + "type": "string", + "doc": "File format name: avro, orc, or parquet", + "field-id": 101, + }, + { + "name": "partition", + "type": { "type": "record", - "name": "person", - "fields": [ - {"name": "name", "type": "string", "field-id": 1002}, - {"name": "age", "type": "long", "field-id": 1003}, - {"name": "gender", "type": ["string", "null"], "field-id": 1004}, - ], + "name": "r102", + "fields": [{"name": "VendorID", "type": ["null", "int"], "default": "null", "field-id": 1000}], }, - ], - "element-id": 1001, - }, - ], - "field-id": 1000, - }, - { - "name": "map_with_longs", - "type": { - "type": "map", - "values": "long", - "default": {}, - "key-id": 1101, - "value-id": 1102, + "field-id": 102, + }, + {"name": "record_count", "type": "long", "doc": "Number of records in the file", "field-id": 103}, + {"name": "file_size_in_bytes", "type": "long", "doc": "Total file size in bytes", "field-id": 104}, + {"name": "block_size_in_bytes", "type": "long", "field-id": 105}, + { + "name": "column_sizes", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "k117_v118", + "fields": [ + {"name": "key", "type": "int", "field-id": 117}, + {"name": "value", "type": "long", "field-id": 118}, + ], + }, + "logicalType": "map", + }, + ], + "doc": "Map of column id to total size on disk", + "default": "null", + "field-id": 108, + }, + { + "name": "value_counts", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "k119_v120", + "fields": [ + {"name": "key", "type": "int", "field-id": 119}, + {"name": "value", "type": "long", "field-id": 120}, + ], + }, + "logicalType": "map", + }, + ], + "doc": "Map of column id to total count, including null and NaN", + "default": "null", + "field-id": 109, + }, + { + "name": "null_value_counts", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "k121_v122", + "fields": [ + {"name": "key", "type": "int", "field-id": 121}, + {"name": "value", "type": "long", "field-id": 122}, + ], + }, + "logicalType": "map", + }, + ], + "doc": "Map of column id to null value count", + "default": "null", + "field-id": 110, + }, + { + "name": "nan_value_counts", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "k138_v139", + "fields": [ + {"name": "key", "type": "int", "field-id": 138}, + {"name": "value", "type": "long", "field-id": 139}, + ], + }, + "logicalType": "map", + }, + ], + "doc": "Map of column id to number of NaN values in the column", + "default": "null", + "field-id": 137, + }, + { + "name": "lower_bounds", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "k126_v127", + "fields": [ + {"name": "key", "type": "int", "field-id": 126}, + {"name": "value", "type": "bytes", "field-id": 127}, + ], + }, + "logicalType": "map", + }, + ], + 
"doc": "Map of column id to lower bound", + "default": "null", + "field-id": 125, + }, + { + "name": "upper_bounds", + "type": [ + "null", + { + "type": "array", + "items": { + "type": "record", + "name": "k129_v130", + "fields": [ + {"name": "key", "type": "int", "field-id": 129}, + {"name": "value", "type": "bytes", "field-id": 130}, + ], + }, + "logicalType": "map", + }, + ], + "doc": "Map of column id to upper bound", + "default": "null", + "field-id": 128, + }, + { + "name": "key_metadata", + "type": ["null", "bytes"], + "doc": "Encryption key metadata blob", + "default": "null", + "field-id": 131, + }, + { + "name": "split_offsets", + "type": ["null", {"type": "array", "items": "long", "element-id": 133}], + "doc": "Splittable offsets", + "default": "null", + "field-id": 132, + }, + { + "name": "sort_order_id", + "type": ["null", "int"], + "doc": "Sort order ID", + "default": "null", + "field-id": 140, + }, + ], }, - "field-id": 1000, + "field-id": 2, }, ], } -@pytest.fixture -def catalog() -> InMemoryCatalog: - return InMemoryCatalog("test.in.memory.catalog", {"test.key": "test.value"}) +@pytest.fixture(scope="session") +def generated_manifest_entry_file(avro_schema_manifest_entry): + from fastavro import parse_schema, writer + + parsed_schema = parse_schema(avro_schema_manifest_entry) + + with TemporaryDirectory() as tmpdir: + tmp_avro_file = tmpdir + "/manifest.avro" + with open(tmp_avro_file, "wb") as out: + writer(out, parsed_schema, manifest_entry_records) + yield tmp_avro_file + + +@pytest.fixture(scope="session") +def generated_manifest_file_file(avro_schema_manifest_file): + from fastavro import parse_schema, writer + + parsed_schema = parse_schema(avro_schema_manifest_file) + + with TemporaryDirectory() as tmpdir: + tmp_avro_file = tmpdir + "/manifest.avro" + with open(tmp_avro_file, "wb") as out: + writer(out, parsed_schema, manifest_file_records) + yield tmp_avro_file + + +@pytest.fixture(scope="session") +def iceberg_manifest_entry_schema() -> Schema: + return Schema( + NestedField(field_id=0, name="status", field_type=IntegerType(), required=True), + NestedField(field_id=1, name="snapshot_id", field_type=LongType(), required=False), + NestedField( + field_id=2, + name="data_file", + field_type=StructType( + NestedField( + field_id=100, + name="file_path", + field_type=StringType(), + doc="Location URI with FS scheme", + required=True, + ), + NestedField( + field_id=101, + name="file_format", + field_type=StringType(), + doc="File format name: avro, orc, or parquet", + required=True, + ), + NestedField( + field_id=102, + name="partition", + field_type=StructType( + NestedField( + field_id=1000, + name="VendorID", + field_type=IntegerType(), + required=False, + ), + ), + required=True, + ), + NestedField( + field_id=103, + name="record_count", + field_type=LongType(), + doc="Number of records in the file", + required=True, + ), + NestedField( + field_id=104, + name="file_size_in_bytes", + field_type=LongType(), + doc="Total file size in bytes", + required=True, + ), + NestedField( + field_id=105, + name="block_size_in_bytes", + field_type=LongType(), + required=True, + ), + NestedField( + field_id=108, + name="column_sizes", + field_type=MapType( + key_id=117, + key_type=IntegerType(), + value_id=118, + value_type=LongType(), + value_required=True, + ), + doc="Map of column id to total size on disk", + required=False, + ), + NestedField( + field_id=109, + name="value_counts", + field_type=MapType( + key_id=119, + key_type=IntegerType(), + value_id=120, + 
value_type=LongType(), + value_required=True, + ), + doc="Map of column id to total count, including null and NaN", + required=False, + ), + NestedField( + field_id=110, + name="null_value_counts", + field_type=MapType( + key_id=121, + key_type=IntegerType(), + value_id=122, + value_type=LongType(), + value_required=True, + ), + doc="Map of column id to null value count", + required=False, + ), + NestedField( + field_id=137, + name="nan_value_counts", + field_type=MapType( + key_id=138, + key_type=IntegerType(), + value_id=139, + value_type=LongType(), + value_required=True, + ), + doc="Map of column id to number of NaN values in the column", + required=False, + ), + NestedField( + field_id=125, + name="lower_bounds", + field_type=MapType( + key_id=126, + key_type=IntegerType(), + value_id=127, + value_type=BinaryType(), + value_required=True, + ), + doc="Map of column id to lower bound", + required=False, + ), + NestedField( + field_id=128, + name="upper_bounds", + field_type=MapType( + key_id=129, + key_type=IntegerType(), + value_id=130, + value_type=BinaryType(), + value_required=True, + ), + doc="Map of column id to upper bound", + required=False, + ), + NestedField( + field_id=131, + name="key_metadata", + field_type=BinaryType(), + doc="Encryption key metadata blob", + required=False, + ), + NestedField( + field_id=132, + name="split_offsets", + field_type=ListType( + element_id=133, + element_type=LongType(), + element_required=True, + ), + doc="Splittable offsets", + required=False, + ), + NestedField( + field_id=140, + name="sort_order_id", + field_type=IntegerType(), + doc="Sort order ID", + required=False, + ), + ), + required=True, + ), + schema_id=1, + identifier_field_ids=[], + ) diff --git a/python/tests/io/__init__.py b/python/tests/io/__init__.py index 00eaa2ffe46c..13a83393a912 100644 --- a/python/tests/io/__init__.py +++ b/python/tests/io/__init__.py @@ -1,11 +1,16 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
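Note: the conftest.py fixtures above (`generated_manifest_entry_file`, `generated_manifest_file_file`) write Avro files with fastavro into a temporary directory. As a minimal sketch of how a test could read one of those files back — assuming nothing beyond fastavro itself, which the fixtures already import; `read_avro_records` is an illustrative helper, not part of this patch:

from typing import Any, Dict, List

from fastavro import reader


def read_avro_records(path: str) -> List[Dict[str, Any]]:
    # Read every record from an Avro file, e.g. one produced by the fixtures above
    with open(path, "rb") as fo:
        return list(reader(fo))


# e.g. records = read_avro_records(generated_manifest_file_file)
#      assert records[0]["manifest_length"] == 7989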
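The test_io_base.py hunk just below adds explicit exception chaining. For illustration only (a standalone sketch, not repository code): `raise ... from e` records the original exception as `__cause__`, so the traceback shows the real cause rather than the generic "During handling of the above exception, another exception occurred" message:

import os


def delete(path: str) -> None:
    try:
        os.remove(path)
    except FileNotFoundError as e:
        # "from e" attaches the original error as __cause__ in the traceback
        raise FileNotFoundError(f"Cannot delete file, does not exist: {path}") from e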
diff --git a/python/tests/io/test_io_base.py b/python/tests/io/test_io_base.py index 72495e19b6fe..9008c453a2ab 100644 --- a/python/tests/io/test_io_base.py +++ b/python/tests/io/test_io_base.py @@ -124,7 +124,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None: try: os.remove(parsed_location.path) except FileNotFoundError as e: - raise FileNotFoundError(f"Cannot delete file, does not exist: {parsed_location.path} - Caused by: {e}") + raise FileNotFoundError(f"Cannot delete file, does not exist: {parsed_location.path} - Caused by: {e}") from e @pytest.mark.parametrize("CustomInputFile", [LocalInputFile, PyArrowFile]) diff --git a/python/tests/utils/test_schema_conversion.py b/python/tests/utils/test_schema_conversion.py index 8776e1a14ffc..234ec1573f46 100644 --- a/python/tests/utils/test_schema_conversion.py +++ b/python/tests/utils/test_schema_conversion.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from typing import Any, Dict + import pytest from iceberg.schema import Schema @@ -34,31 +36,29 @@ from iceberg.utils.schema_conversion import AvroSchemaConversion -def test_iceberg_to_avro(manifest_schema): - iceberg_schema = AvroSchemaConversion().avro_to_iceberg(manifest_schema) +def test_iceberg_to_avro(avro_schema_manifest_file: Dict[str, Any]): + iceberg_schema = AvroSchemaConversion().avro_to_iceberg(avro_schema_manifest_file) expected_iceberg_schema = Schema( NestedField( - field_id=500, name="manifest_path", field_type=StringType(), required=False, doc="Location URI with FS scheme" - ), - NestedField(field_id=501, name="manifest_length", field_type=LongType(), required=False, doc="Total file size in bytes"), - NestedField( - field_id=502, name="partition_spec_id", field_type=IntegerType(), required=False, doc="Spec ID used to write" + field_id=500, name="manifest_path", field_type=StringType(), required=True, doc="Location URI with FS scheme" ), + NestedField(field_id=501, name="manifest_length", field_type=LongType(), required=True, doc="Total file size in bytes"), + NestedField(field_id=502, name="partition_spec_id", field_type=IntegerType(), required=True, doc="Spec ID used to write"), NestedField( field_id=503, name="added_snapshot_id", field_type=LongType(), - required=True, + required=False, doc="Snapshot ID that added the manifest", ), NestedField( - field_id=504, name="added_data_files_count", field_type=IntegerType(), required=True, doc="Added entry count" + field_id=504, name="added_data_files_count", field_type=IntegerType(), required=False, doc="Added entry count" ), NestedField( - field_id=505, name="existing_data_files_count", field_type=IntegerType(), required=True, doc="Existing entry count" + field_id=505, name="existing_data_files_count", field_type=IntegerType(), required=False, doc="Existing entry count" ), NestedField( - field_id=506, name="deleted_data_files_count", field_type=IntegerType(), required=True, doc="Deleted entry count" + field_id=506, name="deleted_data_files_count", field_type=IntegerType(), required=False, doc="Deleted entry count" ), NestedField( field_id=507, @@ -66,45 +66,43 @@ def test_iceberg_to_avro(manifest_schema): field_type=ListType( element_id=508, element_type=StructType( - fields=( - NestedField( - field_id=509, - name="contains_null", - field_type=BooleanType(), - required=False, - doc="True if any file has a null partition value", - ), - NestedField( - field_id=518, - name="contains_nan", - 
field_type=BooleanType(), - required=True, - doc="True if any file has a nan partition value", - ), - NestedField( - field_id=510, - name="lower_bound", - field_type=BinaryType(), - required=True, - doc="Partition lower bound for all files", - ), - NestedField( - field_id=511, - name="upper_bound", - field_type=BinaryType(), - required=True, - doc="Partition upper bound for all files", - ), - ) + NestedField( + field_id=509, + name="contains_null", + field_type=BooleanType(), + required=True, + doc="True if any file has a null partition value", + ), + NestedField( + field_id=518, + name="contains_nan", + field_type=BooleanType(), + required=False, + doc="True if any file has a nan partition value", + ), + NestedField( + field_id=510, + name="lower_bound", + field_type=BinaryType(), + required=False, + doc="Partition lower bound for all files", + ), + NestedField( + field_id=511, + name="upper_bound", + field_type=BinaryType(), + required=False, + doc="Partition upper bound for all files", + ), ), - element_required=False, + element_required=True, ), - required=True, + required=False, doc="Summary for each partition", ), - NestedField(field_id=512, name="added_rows_count", field_type=LongType(), required=True, doc="Added rows count"), - NestedField(field_id=513, name="existing_rows_count", field_type=LongType(), required=True, doc="Existing rows count"), - NestedField(field_id=514, name="deleted_rows_count", field_type=LongType(), required=True, doc="Deleted rows count"), + NestedField(field_id=512, name="added_rows_count", field_type=LongType(), required=False, doc="Added rows count"), + NestedField(field_id=513, name="existing_rows_count", field_type=LongType(), required=False, doc="Existing rows count"), + NestedField(field_id=514, name="deleted_rows_count", field_type=LongType(), required=False, doc="Deleted rows count"), schema_id=1, identifier_field_ids=[], ) @@ -133,8 +131,8 @@ def test_avro_list_required_primitive(): NestedField( field_id=100, name="array_with_string", - field_type=ListType(element_id=101, element_type=StringType(), element_required=False), - required=False, + field_type=ListType(element_id=101, element_type=StringType(), element_required=True), + required=True, ), schema_id=1, ) @@ -166,8 +164,8 @@ def test_avro_list_wrapped_primitive(): NestedField( field_id=100, name="array_with_string", - field_type=ListType(element_id=101, element_type=StringType(), element_required=False), - required=False, + field_type=ListType(element_id=101, element_type=StringType(), element_required=True), + required=True, ), schema_id=1, ) @@ -217,13 +215,13 @@ def test_avro_list_required_record(): element_id=101, element_type=StructType( fields=( - NestedField(field_id=102, name="contains_null", field_type=BooleanType(), required=False), - NestedField(field_id=103, name="contains_nan", field_type=BooleanType(), required=True), + NestedField(field_id=102, name="contains_null", field_type=BooleanType(), required=True), + NestedField(field_id=103, name="contains_nan", field_type=BooleanType(), required=False), ) ), - element_required=False, + element_required=True, ), - required=False, + required=True, ), schema_id=1, identifier_field_ids=[], @@ -249,12 +247,12 @@ def test_nested_type(): def test_map_type(): avro_type = { "type": "map", - "values": ["long", "null"], + "values": ["null", "long"], "key-id": 101, "value-id": 102, } actual = AvroSchemaConversion()._convert_schema(avro_type) - expected = MapType(key_id=101, key_type=StringType(), value_id=102, value_type=LongType(), 
value_required=True) + expected = MapType(key_id=101, key_type=StringType(), value_id=102, value_type=LongType(), value_required=False) assert actual == expected diff --git a/python/tests/utils/test_singleton.py b/python/tests/utils/test_singleton.py new file mode 100644 index 000000000000..92b923dd820f --- /dev/null +++ b/python/tests/utils/test_singleton.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from iceberg.avro.reader import BooleanReader, FixedReader + + +def test_singleton(): + """We want to reuse the readers to avoid creating a gazillion of them""" + assert id(BooleanReader()) == id(BooleanReader()) + assert id(FixedReader(22)) == id(FixedReader(22)) + assert id(FixedReader(19)) != id(FixedReader(25))
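The test_singleton.py test above asserts that readers are cached per class and per constructor argument, but the Singleton implementation in iceberg.avro.reader is not part of this diff. One way such behavior might be implemented — purely an illustrative sketch, not the repository's actual code — is a metaclass keyed on the class and its positional arguments:

from typing import Any, Dict, Tuple


class _SingletonMeta(type):
    # Illustrative per-argument instance cache; not the repo's implementation
    _instances: Dict[Tuple[Any, ...], Any] = {}

    def __call__(cls, *args: Any) -> Any:
        key = (cls, args)  # FixedReader(22) and FixedReader(19) cache separately
        if key not in _SingletonMeta._instances:
            _SingletonMeta._instances[key] = super().__call__(*args)
        return _SingletonMeta._instances[key]


class FixedReader(metaclass=_SingletonMeta):
    def __init__(self, length: int) -> None:
        self.length = length


assert FixedReader(22) is FixedReader(22)
assert FixedReader(19) is not FixedReader(25)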