From 859777b9662053bfb9bdb241cfcabfa6608ed238 Mon Sep 17 00:00:00 2001 From: Chris Riccomini Date: Tue, 16 Jan 2024 11:15:45 -0800 Subject: [PATCH] Allow StringType and BytesType to have undefined byte lengths Prior to this commit, Recap StringType and BytesType both required `bytes_` to be set. If unset, a default of 65536 was used. This configuration allowed developers to skip the `bytes` field in the concrete syntax tree (CST), but still required `bytes_` to be set in the abstract syntax tree (AST). @adrianisk and I converged on this design in https://github.com/recap-build/recap/discussions/284. Recently, @mjperrone pointed out that requiring `bytes_` in the AST is still awkward for JSON. In the absence of an undefined byte_ option, you have to set *something* for JSON--either a magic number or INT/LONG_MAX. Thus far, we defaulted to LONG_MAX. But a defined LONG_MAX and an undefined length are actually two separate states. Allowing a converter to specify an undefined string/byte length permits other converters to decide how to deal with the undefined byte length. I made the change and it fit quite naturally, so I think we're on the right track with this. JSON and Hive metastore both benefited; both support undefined lengths. I've left the other converters alone for now. This means they'll continue to barf if you go from JSON/HMS to Recap to Avro/Proto. I think that's fine for now. We can revisit this later if we want to. --- recap/converters/bigquery.py | 4 +- recap/converters/hive_metastore.py | 3 +- recap/converters/json_schema.py | 25 ++------ recap/types.py | 28 ++++++--- .../clients/test_hive_metastore.py | 6 +- ...ed_no_length.json => fixed_no_length.json} | 8 +++ tests/spec/invalid/illegal_lengths.json | 59 +++++++++++++++++++ tests/spec/test_json_schema_spec.py | 2 +- tests/spec/valid/bytes_field.json | 7 +++ tests/spec/valid/string_field.json | 7 +++ tests/spec/valid/uuid_logical_field.json | 5 -- tests/unit/clients/test_hive_metastore.py | 4 +- tests/unit/converters/test_bigquery.py | 6 +- tests/unit/converters/test_hive_metastore.py | 8 +-- tests/unit/converters/test_json_schema.py | 20 +++---- tests/unit/test_types.py | 6 +- 16 files changed, 134 insertions(+), 64 deletions(-) rename tests/spec/invalid/{list_field_fixed_no_length.json => fixed_no_length.json} (52%) create mode 100644 tests/spec/invalid/illegal_lengths.json diff --git a/recap/converters/bigquery.py b/recap/converters/bigquery.py index 0fd5fc7..7532d13 100644 --- a/recap/converters/bigquery.py +++ b/recap/converters/bigquery.py @@ -23,9 +23,9 @@ def to_recap(self, fields: list[bigquery.SchemaField]) -> StructType: # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types # Says "2 logical bytes + the UTF-8 encoded string size", so I'm assuming # the 2 logical bytes are a uint16 length header, which is 65_536. - field_type = StringType(bytes_=field.max_length or 65_536) + field_type = StringType(bytes_=field.max_length) case "BYTES": - field_type = BytesType(bytes_=field.max_length or 65_536) + field_type = BytesType(bytes_=field.max_length) case "INT64" | "INTEGER" | "INT" | "SMALLINT" | "TINYINT" | "BYTEINT": field_type = IntType(bits=64) case "FLOAT" | "FLOAT64": diff --git a/recap/converters/hive_metastore.py b/recap/converters/hive_metastore.py index 09e65cb..996eb7f 100644 --- a/recap/converters/hive_metastore.py +++ b/recap/converters/hive_metastore.py @@ -67,8 +67,7 @@ def _parse_schema(self, htype: HType | HColumn) -> RecapType: recap_type = NullType(**extra_attrs) case HPrimitiveType(primitive_type=PrimitiveCategory.STRING): # TODO: Should handle multi-byte encodings - # Using 2^63-1 as the max length because Hive has no defined max. - recap_type = StringType(bytes_=9_223_372_036_854_775_807, **extra_attrs) + recap_type = StringType(**extra_attrs) case HPrimitiveType(primitive_type=PrimitiveCategory.BINARY): recap_type = BytesType(bytes_=2_147_483_647, **extra_attrs) case HDecimalType(precision=int(precision), scale=int(scale)): diff --git a/recap/converters/json_schema.py b/recap/converters/json_schema.py index 7072e13..d413e15 100644 --- a/recap/converters/json_schema.py +++ b/recap/converters/json_schema.py @@ -79,30 +79,15 @@ def _parse( values = self._parse(items, alias_strategy) return ListType(values, **extra_attrs) case {"type": "string", "format": "bytes"}: - return BytesType( - bytes_=9_223_372_036_854_775_807, - **extra_attrs, - ) + return BytesType(**extra_attrs) case {"type": "string", "format": "date"}: - return StringType( - bytes_=9_223_372_036_854_775_807, - logical="org.iso.8601.Date", - **extra_attrs, - ) + return StringType(logical="org.iso.8601.Date", **extra_attrs) case {"type": "string", "format": "date-time"}: - return StringType( - bytes_=9_223_372_036_854_775_807, - logical="org.iso.8601.DateTime", - **extra_attrs, - ) + return StringType(logical="org.iso.8601.DateTime", **extra_attrs) case {"type": "string", "format": "time"}: - return StringType( - bytes_=9_223_372_036_854_775_807, - logical="org.iso.8601.Time", - **extra_attrs, - ) + return StringType(logical="org.iso.8601.Time", **extra_attrs) case {"type": "string"}: - return StringType(bytes_=9_223_372_036_854_775_807, **extra_attrs) + return StringType(**extra_attrs) case {"type": "number"}: return FloatType(bits=64, **extra_attrs) case {"type": "integer"}: diff --git a/recap/types.py b/recap/types.py index 4d2bdc1..fdbe679 100644 --- a/recap/types.py +++ b/recap/types.py @@ -106,7 +106,7 @@ def __eq__(self, other): def validate(self) -> None: if self.bits < 1 or self.bits > 2_147_483_647: - raise ValueError("bits must be between 1 and 2,147,483,647") + raise ValueError("Bits must be between 1 and 2,147,483,647") class FloatType(RecapType): @@ -121,13 +121,13 @@ def __eq__(self, other): def validate(self) -> None: if self.bits < 1 or self.bits > 2_147_483_647: - raise ValueError("bits must be between 1 and 2,147,483,647") + raise ValueError("Bits must be between 1 and 2,147,483,647") class StringType(RecapType): """Represents a string Recap type.""" - def __init__(self, bytes_: int = 65_536, variable: bool = True, **extra_attrs): + def __init__(self, bytes_: int | None = None, variable: bool = True, **extra_attrs): super().__init__("string", **extra_attrs) self.bytes_ = bytes_ self.variable = variable @@ -139,14 +139,18 @@ def __eq__(self, other): ) def validate(self) -> None: - if self.bytes_ < 1 or self.bytes_ > 9_223_372_036_854_775_807: - raise ValueError("bytes must be between 1 and 9,223,372,036,854,775,807") + if not self.variable and self.bytes_ is None: + raise ValueError("Fixed length bytes must have a length set") + if self.bytes_ is not None and ( + self.bytes_ < 1 or self.bytes_ > 9_223_372_036_854_775_807 + ): + raise ValueError("Bytes must be between 1 and 9,223,372,036,854,775,807") class BytesType(RecapType): """Represents a bytes Recap type.""" - def __init__(self, bytes_: int = 65_536, variable: bool = True, **extra_attrs): + def __init__(self, bytes_: int | None = None, variable: bool = True, **extra_attrs): super().__init__("bytes", **extra_attrs) self.bytes_ = bytes_ self.variable = variable @@ -158,8 +162,12 @@ def __eq__(self, other): ) def validate(self) -> None: - if self.bytes_ < 1 or self.bytes_ > 9_223_372_036_854_775_807: - raise ValueError("bytes must be between 1 and 9,223,372,036,854,775,807") + if not self.variable and self.bytes_ is None: + raise ValueError("Fixed length bytes must have a length set") + if self.bytes_ is not None and ( + self.bytes_ < 1 or self.bytes_ > 9_223_372_036_854_775_807 + ): + raise ValueError("Bytes must be between 1 and 9,223,372,036,854,775,807") class ListType(RecapType): @@ -187,7 +195,9 @@ def __eq__(self, other): def validate(self) -> None: if not self.variable and self.length is None: raise ValueError("Fixed length lists must have a length set") - if self.length and (self.length < 0 or self.length > 9_223_372_036_854_775_807): + if self.length is not None and ( + self.length < 1 or self.length > 9_223_372_036_854_775_807 + ): raise ValueError( "List length must be between 0 and 9,223,372,036,854,775,807" ) diff --git a/tests/integration/clients/test_hive_metastore.py b/tests/integration/clients/test_hive_metastore.py index ec7b828..62b60b2 100644 --- a/tests/integration/clients/test_hive_metastore.py +++ b/tests/integration/clients/test_hive_metastore.py @@ -312,7 +312,7 @@ def test_parameterized_types(hive_client): assert isinstance(fields[4].types[1].values, UnionType) assert isinstance(fields[4].types[1].values.types[0], NullType) assert isinstance(fields[4].types[1].values.types[1], StringType) - assert fields[4].types[1].values.types[1].bytes_ == 9_223_372_036_854_775_807 + assert fields[4].types[1].values.types[1].bytes_ is None assert fields[4].doc == "c20" assert fields[5].extra_attrs["name"] == "col21" @@ -330,7 +330,7 @@ def test_parameterized_types(hive_client): assert isinstance(fields[5].types[1].fields[1], UnionType) assert isinstance(fields[5].types[1].fields[1].types[0], NullType) assert isinstance(fields[5].types[1].fields[1].types[1], StringType) - assert fields[5].types[1].fields[1].types[1].bytes_ == 9_223_372_036_854_775_807 + assert fields[5].types[1].fields[1].types[1].bytes_ is None assert fields[5].doc == "c21" assert fields[6].extra_attrs["name"] == "col22" @@ -341,7 +341,7 @@ def test_parameterized_types(hive_client): assert fields[6].types[1].bits == 32 assert fields[6].types[1].signed assert isinstance(fields[6].types[2], StringType) - assert fields[6].types[2].bytes_ == 9_223_372_036_854_775_807 + assert fields[6].types[2].bytes_ is None assert fields[6].doc == "c22" diff --git a/tests/spec/invalid/list_field_fixed_no_length.json b/tests/spec/invalid/fixed_no_length.json similarity index 52% rename from tests/spec/invalid/list_field_fixed_no_length.json rename to tests/spec/invalid/fixed_no_length.json index b1f780f..1ede701 100644 --- a/tests/spec/invalid/list_field_fixed_no_length.json +++ b/tests/spec/invalid/fixed_no_length.json @@ -6,5 +6,13 @@ "type": "float", "bits": 16 } + }, + { + "type": "string", + "variable": false + }, + { + "type": "bytes", + "variable": false } ] \ No newline at end of file diff --git a/tests/spec/invalid/illegal_lengths.json b/tests/spec/invalid/illegal_lengths.json new file mode 100644 index 0000000..691f4da --- /dev/null +++ b/tests/spec/invalid/illegal_lengths.json @@ -0,0 +1,59 @@ +[ + { + "type": "list", + "variable": false, + "values": { + "type": "float", + "bits": 16 + }, + "length": -1 + }, + { + "type": "list", + "variable": false, + "values": { + "type": "float", + "bits": 16 + }, + "length": 0 + }, + { + "type": "list", + "variable": false, + "values": { + "type": "float", + "bits": 16 + }, + "length": 9223372036854775808 + }, + { + "type": "string", + "variable": false, + "bytes": -1 + }, + { + "type": "string", + "variable": false, + "bytes": 0 + }, + { + "type": "string", + "variable": false, + "bytes": 9223372036854775808 + }, + { + "type": "bytes", + "variable": false, + "bytes": -1 + }, + { + "type": "bytes", + "variable": false, + "bytes": 0 + }, + { + "type": "bytes", + "variable": false, + "bytes": 9223372036854775808 + } + ] \ No newline at end of file diff --git a/tests/spec/test_json_schema_spec.py b/tests/spec/test_json_schema_spec.py index 8d32ef2..413dc1d 100644 --- a/tests/spec/test_json_schema_spec.py +++ b/tests/spec/test_json_schema_spec.py @@ -20,7 +20,7 @@ VALID_SCHEMA_DIR = "tests/spec/valid" INVALID_SCHEMA_DIR = "tests/spec/invalid" -RECAP_SPEC_JSON_HTTP = "https://recap.build/specs/type/0.2.0.json" +RECAP_SPEC_JSON_HTTP = "https://recap.build/specs/type/0.3.0.json" @pytest.fixture(scope="session") diff --git a/tests/spec/valid/bytes_field.json b/tests/spec/valid/bytes_field.json index f44c846..208b89b 100644 --- a/tests/spec/valid/bytes_field.json +++ b/tests/spec/valid/bytes_field.json @@ -14,5 +14,12 @@ "type": "bytes", "bytes": 9223372036854775807, "variable": true + }, + { + "type": "bytes" + }, + { + "type": "bytes", + "variable": true } ] diff --git a/tests/spec/valid/string_field.json b/tests/spec/valid/string_field.json index 9c10af7..fa90b35 100644 --- a/tests/spec/valid/string_field.json +++ b/tests/spec/valid/string_field.json @@ -14,5 +14,12 @@ "type": "string", "bytes": 9223372036854775807, "variable": true + }, + { + "type": "string" + }, + { + "type": "string", + "variable": true } ] diff --git a/tests/spec/valid/uuid_logical_field.json b/tests/spec/valid/uuid_logical_field.json index 22a8288..6294a4f 100644 --- a/tests/spec/valid/uuid_logical_field.json +++ b/tests/spec/valid/uuid_logical_field.json @@ -10,10 +10,5 @@ "logical": "build.recap.UUID", "variable": false, "bytes": 256 - }, - { - "type": "string", - "logical": "build.recap.UUID", - "variable": false } ] diff --git a/tests/unit/clients/test_hive_metastore.py b/tests/unit/clients/test_hive_metastore.py index ac43a8e..07924e7 100644 --- a/tests/unit/clients/test_hive_metastore.py +++ b/tests/unit/clients/test_hive_metastore.py @@ -106,7 +106,7 @@ class MockTable: assert isinstance(result.fields[7], UnionType) assert isinstance(result.fields[7].types[1], StringType) - assert result.fields[7].types[1].bytes_ == 9_223_372_036_854_775_807 + assert result.fields[7].types[1].bytes_ is None assert result.fields[7].extra_attrs["name"] == "col8" assert isinstance(result.fields[8], UnionType) @@ -301,7 +301,7 @@ class MockTable: # Validate sub_col2 assert isinstance(struct_field.fields[1], UnionType) assert isinstance(struct_field.fields[1].types[1], StringType) - assert struct_field.fields[1].types[1].bytes_ == 9_223_372_036_854_775_807 + assert struct_field.fields[1].types[1].bytes_ is None assert struct_field.fields[1].extra_attrs["name"] == "sub_col2" diff --git a/tests/unit/converters/test_bigquery.py b/tests/unit/converters/test_bigquery.py index c6f4af0..e1e5ebb 100644 --- a/tests/unit/converters/test_bigquery.py +++ b/tests/unit/converters/test_bigquery.py @@ -8,8 +8,8 @@ @pytest.mark.parametrize( "field_type,expected", [ - ("STRING", StringType(bytes_=65_536, name="test_field")), - ("BYTES", BytesType(bytes_=65_536, name="test_field")), + ("STRING", StringType(name="test_field")), + ("BYTES", BytesType(name="test_field")), ("INT64", IntType(bits=64, name="test_field")), ("FLOAT", FloatType(bits=64, name="test_field")), ("BOOLEAN", BoolType(name="test_field")), @@ -81,7 +81,7 @@ def test_record(): expected = StructType( [ IntType(bits=64, name="nested_int"), - StringType(bytes_=65_536, name="nested_string"), + StringType(bytes_=None, name="nested_string"), ], name="test_record", ) diff --git a/tests/unit/converters/test_hive_metastore.py b/tests/unit/converters/test_hive_metastore.py index 65569e7..fcef3c6 100644 --- a/tests/unit/converters/test_hive_metastore.py +++ b/tests/unit/converters/test_hive_metastore.py @@ -61,7 +61,7 @@ ( HPrimitiveType(primitive_type=PrimitiveCategory.STRING), UnionType( - [NullType(), StringType(bytes_=9_223_372_036_854_775_807)], + [NullType(), StringType()], default=None, name="test_col", ), @@ -199,7 +199,7 @@ NullType(), MapType( keys=UnionType( - [NullType(), StringType(bytes_=9_223_372_036_854_775_807)], + [NullType(), StringType()], default=None, ), values=UnionType( @@ -241,7 +241,7 @@ [ NullType(), IntType(bits=32), - StringType(bytes_=9_223_372_036_854_775_807), + StringType(), ], default=None, name="test_col", @@ -268,7 +268,7 @@ UnionType( [ NullType(), - StringType(bytes_=9_223_372_036_854_775_807), + StringType(), ], default=None, name="field2", diff --git a/tests/unit/converters/test_json_schema.py b/tests/unit/converters/test_json_schema.py index 9f728b3..b61401d 100644 --- a/tests/unit/converters/test_json_schema.py +++ b/tests/unit/converters/test_json_schema.py @@ -38,7 +38,7 @@ def test_all_basic_types(): assert isinstance(struct_type, StructType) assert struct_type.fields == [ UnionType( - [NullType(), StringType(bytes_=9_223_372_036_854_775_807)], + [NullType(), StringType()], name="a_string", default=None, ), @@ -89,7 +89,7 @@ def test_nested_objects(): StructType( [ UnionType( - [NullType(), StringType(bytes_=9_223_372_036_854_775_807)], + [NullType(), StringType()], name="a_string", default=None, ), @@ -132,7 +132,7 @@ def test_object_with_array_of_objects(): UnionType( [ NullType(), - StringType(bytes_=9_223_372_036_854_775_807), + StringType(), ], name="a_string", default=None, @@ -162,7 +162,7 @@ def test_required_properties(): struct_type = JSONSchemaConverter().to_recap(json_schema) assert isinstance(struct_type, StructType) assert struct_type.fields == [ - StringType(bytes_=9_223_372_036_854_775_807, name="required_property"), + StringType(name="required_property"), UnionType( [NullType(), FloatType(bits=64)], name="optional_property", @@ -188,7 +188,7 @@ def test_doc_attribute(): assert isinstance(struct_type, StructType) assert struct_type.fields == [ UnionType( - [NullType(), StringType(bytes_=9_223_372_036_854_775_807)], + [NullType(), StringType()], name="a_string", doc="This is a string", default=None, @@ -213,7 +213,7 @@ def test_name_attribute(): assert isinstance(struct_type, StructType) assert struct_type.fields == [ UnionType( - [NullType(), StringType(bytes_=9_223_372_036_854_775_807)], + [NullType(), StringType()], name="a_string", default=None, ), @@ -239,7 +239,7 @@ def test_default_attribute(): UnionType( [ NullType(), - StringType(bytes_=9_223_372_036_854_775_807), + StringType(), ], name="a_string", default="Default value", @@ -264,7 +264,7 @@ def test_convert_bytes(): assert result == StructType( [ UnionType( - [NullType(), BytesType(bytes_=9_223_372_036_854_775_807)], + [NullType(), BytesType()], name="img", default=None, ), @@ -353,7 +353,7 @@ def test_id_to_recap_alias(): assert recap_type == StructType( fields=[ UnionType( - [NullType(), StringType(bytes_=9_223_372_036_854_775_807)], + [NullType(), StringType()], name="field", default=None, ) @@ -379,7 +379,7 @@ def test_id_to_recap_alias_schema_default(): assert recap_type == StructType( fields=[ UnionType( - [NullType(), StringType(bytes_=9_223_372_036_854_775_807)], + [NullType(), StringType()], name="field", default=None, ) diff --git a/tests/unit/test_types.py b/tests/unit/test_types.py index 8a8fc04..4e551fc 100644 --- a/tests/unit/test_types.py +++ b/tests/unit/test_types.py @@ -478,7 +478,7 @@ def test_from_dict_struct_with_string_field_no_bytes_set(): result = from_dict(input_dict) assert isinstance(result, StructType) assert isinstance(result.fields[0], StringType) - assert result.fields[0].bytes_ == 65_536 + assert result.fields[0].bytes_ is None def test_from_dict_struct_with_bytes_field_no_bytes_set(): @@ -493,7 +493,7 @@ def test_from_dict_struct_with_bytes_field_no_bytes_set(): result = from_dict(input_dict) assert isinstance(result, StructType) assert isinstance(result.fields[0], BytesType) - assert result.fields[0].bytes_ == 65_536 + assert result.fields[0].bytes_ is None def test_from_dict_optional_field(): @@ -1578,7 +1578,7 @@ def test_list_type_validate(): def test_map_type_validate(): - invalid_map_keys = MapType(StringType(9_223_372_036_854_775_808), IntType(32)) + invalid_map_keys = MapType(StringType(bytes_=9_223_372_036_854_775_808), IntType(32)) invalid_map_values = MapType(StringType(65_536), IntType(2_147_483_648)) with pytest.raises(ValueError):