From f47a987429d1370e92eb36e374bb489d0b63ab76 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 22 Aug 2023 14:52:30 +0200 Subject: [PATCH 01/17] Test windows release --- .github/workflows/python-release.yml | 6 ++---- python/mkdocs/docs/how-to-release.md | 8 ++++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml index 33ddaa3ae7b1..c9957aa4fd6c 100644 --- a/.github/workflows/python-release.yml +++ b/.github/workflows/python-release.yml @@ -17,7 +17,7 @@ # under the License. # -name: "Python release" +name: "Python Release" on: workflow_dispatch: @@ -57,11 +57,9 @@ jobs: working-directory: ./python - name: Set version - run: python3 -m poetry version $PYTHON_VERSION + run: python3 -m poetry version "${{ inputs.version }}" working-directory: ./python if: "${{ github.event.inputs.version != 'master' }}" - env: - PYTHON_VERSION: ${{ inputs.version }} - name: Build wheels uses: pypa/cibuildwheel@v2.15.0 diff --git a/python/mkdocs/docs/how-to-release.md b/python/mkdocs/docs/how-to-release.md index 4d0100b3dc58..4169d59a1ac6 100644 --- a/python/mkdocs/docs/how-to-release.md +++ b/python/mkdocs/docs/how-to-release.md @@ -57,7 +57,7 @@ Before committing the files to the Apache SVN artifact distribution SVN hashes n Go to [Github Actions and run the `Python release` action](https://github.com/apache/iceberg/actions/workflows/python-release.yml). **Set the version to master, since we cannot modify the source**. 
Download the zip, and sign the files: ```bash -for name in $(ls release-master/pyiceberg-*.whl) +for name in $(ls release-master/pyiceberg-*.whl release-master/pyiceberg-*.tar.gz) do gpg --yes --armor --local-user fokko@apache.org --output "${name}.asc" --detach-sig "${name}" shasum -a 512 "${name}.asc" > "${name}.asc.sha512" @@ -72,7 +72,7 @@ svn checkout https://dist.apache.org/repos/dist/dev/iceberg $SVN_TMP_DIR export SVN_TMP_DIR_VERSIONED=${SVN_TMP_DIR}pyiceberg-$VERSION/ mkdir -p $SVN_TMP_DIR_VERSIONED -cp artifact/* $SVN_TMP_DIR_VERSIONED +cp release-master/* $SVN_TMP_DIR_VERSIONED svn add $SVN_TMP_DIR_VERSIONED svn ci -m "PyIceberg ${VERSION}" ${SVN_TMP_DIR_VERSIONED} ``` @@ -81,10 +81,10 @@ svn ci -m "PyIceberg ${VERSION}" ${SVN_TMP_DIR_VERSIONED} Go to Github Actions and run the `Python release` action. Set the version of the release candidate as the input: `0.1.0rc1`. Download the zip and unzip it locally. -Next step is to upload them to pypi. Please keep in mind that this **won't** bump the version for everyone that hasn't pinned their version, since it is set to a RC [pre-release and those are ignored](https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#pre-release-versioning). +Next step is to upload them to pypi. Please keep in mind that this **won't** bump the version for everyone that hasn't pinned their version, since it is set to an RC [pre-release and those are ignored](https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/#pre-release-versioning). 
```bash -twine upload -s release-0.1.0-rc1/* +twine upload -s release-0.5.0rc1/* ``` Final step is to generate the email to the dev mail list: From b0c0e1d540f4b58f3fceea7ebcf3d3160abccc9d Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 22 Aug 2023 20:53:37 +0200 Subject: [PATCH 02/17] Python: Follow up on update schema --- python/mkdocs/docs/api.md | 25 ++++------ python/pyiceberg/schema.py | 6 --- python/pyiceberg/table/__init__.py | 80 ++++++++++++++---------------- python/tests/table/test_init.py | 25 +++++----- 4 files changed, 59 insertions(+), 77 deletions(-) diff --git a/python/mkdocs/docs/api.md b/python/mkdocs/docs/api.md index f0b2873c038e..ca1d183fc691 100644 --- a/python/mkdocs/docs/api.md +++ b/python/mkdocs/docs/api.md @@ -150,32 +150,25 @@ catalog.create_table( Add new columns through the `Transaction` or `UpdateSchema` API: -Use the Transaction API: - ```python -with table.transaction() as transaction: - transaction.update_schema().add_column("x", IntegerType(), "doc").commit() +with table.update_schema() as update: + update.add_column("x", IntegerType(), "doc") ``` -Or, without a context manager: +Or, without a context manager by calling the `.commit()` explicitly: ```python -transaction = table.transaction() -transaction.update_schema().add_column("x", IntegerType(), "doc").commit() -transaction.commit_transaction() +table.update_schema().add_column("x", IntegerType(), "doc").commit() ``` -Or, use the UpdateSchema API directly: +Use the Transaction API: ```python -with table.update_schema() as update: - update.add_column("x", IntegerType(), "doc") -``` - -Or, without a context manager: +from datetime import datetime -```python -table.update_schema().add_column("x", IntegerType(), "doc").commit() +with table.transaction() as transaction: + transaction.update_schema().add_column("x", IntegerType(), "doc").commit() + transaction.set_properties(schema_updated_at=str(datetime.now())) ``` ### Update table properties diff --git 
a/python/pyiceberg/schema.py b/python/pyiceberg/schema.py index 5064d07174a6..3814586baeda 100644 --- a/python/pyiceberg/schema.py +++ b/python/pyiceberg/schema.py @@ -892,12 +892,6 @@ def __init__(self) -> None: self._field_names: List[str] = [] self._short_field_names: List[str] = [] - def before_map_key(self, key: NestedField) -> None: - self.before_field(key) - - def after_map_key(self, key: NestedField) -> None: - self.after_field(key) - def before_map_value(self, value: NestedField) -> None: if not isinstance(value.field_type, StructType): self._short_field_names.append(value.name) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index 3d4e5f7d2862..c356c879bb9c 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -120,10 +120,7 @@ def __enter__(self) -> Transaction: def __exit__(self, _: Any, value: Any, traceback: Any) -> None: """Closes and commits the transaction.""" - fresh_table = self.commit_transaction() - # Update the new data in place - self._table.metadata = fresh_table.metadata - self._table.metadata_location = fresh_table.metadata_location + self.commit_transaction() def _append_updates(self, *new_updates: TableUpdate) -> Transaction: """Appends updates to the set of staged updates. 
@@ -158,7 +155,7 @@ def _append_requirements(self, *new_requirements: TableRequirement) -> Transacti """ for requirement in new_requirements: type_new_requirement = type(requirement) - if any(type(update) == type_new_requirement for update in self._updates): + if any(type(requirement) == type_new_requirement for update in self._requirements): raise ValueError(f"Requirements in a single commit need to be unique, duplicate: {type_new_requirement}") self._requirements = self._requirements + new_requirements return self @@ -232,9 +229,7 @@ def commit_transaction(self) -> Table: updates=self._updates, ) ) - # Update the metadata with the new one - self._table.metadata = response.metadata - self._table.metadata_location = response.metadata_location + self._table._update_table(response) # pylint: disable=W0212 return self._table else: @@ -530,6 +525,10 @@ def history(self) -> List[SnapshotLogEntry]: def update_schema(self) -> UpdateSchema: return UpdateSchema(self.schema(), self) + def _update_table(self, response: CommitTableResponse) -> None: + self.metadata = response.metadata + self.metadata_location = response.metadata_location + def __eq__(self, other: Any) -> bool: """Returns the equality of two instances of the Table class.""" return ( @@ -901,7 +900,14 @@ class UpdateSchema: _case_sensitive: bool _transaction: Optional[Transaction] - def __init__(self, schema: Schema, table: Table, transaction: Optional[Transaction] = None): + def __init__( + self, + schema: Schema, + table: Table, + transaction: Optional[Transaction] = None, + allow_incompatible_changes: bool = False, + case_sensitive: bool = True, + ) -> None: self._table = table self._schema = schema self._last_column_id = itertools.count(schema.highest_field_id + 1) @@ -909,8 +915,8 @@ def __init__(self, schema: Schema, table: Table, transaction: Optional[Transacti self._adds = {} self._added_name_to_id = {} self._id_to_parent = {} - self._allow_incompatible_changes = False - self._case_sensitive = True + 
self._allow_incompatible_changes = allow_incompatible_changes + self._case_sensitive = case_sensitive self._transaction = transaction def __exit__(self, _: Any, value: Any, traceback: Any) -> None: @@ -934,7 +940,7 @@ def case_sensitive(self, case_sensitive: bool) -> UpdateSchema: return self def add_column( - self, name: str, type_var: IcebergType, doc: Optional[str] = None, parent: Optional[str] = None, required: bool = False + self, name: Union[str, Tuple[str, ...]], type_var: IcebergType, doc: Optional[str] = None, required: bool = False ) -> UpdateSchema: """Add a new column to a nested struct or Add a new top-level column. @@ -942,20 +948,19 @@ def add_column( name: Name for the new column. type_var: Type for the new column. doc: Documentation string for the new column. - parent: Name of the parent struct to the column will be added to. required: Whether the new column is required. Returns: This for method chaining """ - if "." in name: - raise ValueError(f"Cannot add column with ambiguous name: {name}") + if isinstance(name, str): + name = (name,) if required and not self._allow_incompatible_changes: # Table format version 1 and 2 cannot add required column because there is no initial value - raise ValueError(f"Incompatible change: cannot add required column: {name}") + raise ValueError(f'Incompatible change: cannot add required column: {".".join(name)}') - self._internal_add_column(parent, name, not required, type_var, doc) + self._internal_add_column(name, not required, type_var, doc) return self def allow_incompatible_changes(self) -> UpdateSchema: @@ -980,11 +985,10 @@ def commit(self) -> None: self._transaction._append_updates(*updates) # pylint: disable=W0212 self._transaction._append_requirements(*requirements) # pylint: disable=W0212 else: - table_update_response = self._table.catalog._commit_table( # pylint: disable=W0212 + response = self._table.catalog._commit_table( # pylint: disable=W0212 CommitTableRequest(identifier=self._table.identifier[1:], 
updates=updates, requirements=requirements) ) - self._table.metadata = table_update_response.metadata - self._table.metadata_location = table_update_response.metadata_location + self._table._update_table(response) # pylint: disable=W0212 def _apply(self) -> Schema: """Apply the pending changes to the original schema and returns the result. @@ -995,14 +999,16 @@ def _apply(self) -> Schema: return _apply_changes(self._schema, self._adds, self._identifier_field_names) def _internal_add_column( - self, parent: Optional[str], name: str, is_optional: bool, type_var: IcebergType, doc: Optional[str] + self, field_path: Tuple[str, ...], is_optional: bool, type_var: IcebergType, doc: Optional[str] ) -> None: - full_name: str = name + name = field_path[-1] + parent = field_path[:-1] + + full_name = ".".join(field_path) parent_id: int = TABLE_ROOT_ID - exist_field: Optional[NestedField] = None - if parent: - parent_field = self._schema.find_field(parent, self._case_sensitive) + if len(parent) > 0: + parent_field = self._schema.find_field(".".join(parent), self._case_sensitive) parent_type = parent_field.field_type if isinstance(parent_type, MapType): parent_field = parent_type.value_field @@ -1014,24 +1020,14 @@ def _internal_add_column( parent_id = parent_field.field_id - try: - exist_field = self._schema.find_field(parent + "." + name, self._case_sensitive) - except ValueError: - pass - - if exist_field: - raise ValueError(f"Cannot add column, name already exists: {parent}.{name}") - - full_name = parent_field.name + "." 
+ name - - else: - try: - exist_field = self._schema.find_field(name, self._case_sensitive) - except ValueError: - pass + exists = False + try: + exists = self._schema.find_field(full_name, self._case_sensitive) is not None + except ValueError: + pass - if exist_field: - raise ValueError(f"Cannot add column, name already exists: {name}") + if exists: + raise ValueError(f"Cannot add column, name already exists: {full_name}") # assign new IDs in order new_id = self.assign_new_column_id() diff --git a/python/tests/table/test_init.py b/python/tests/table/test_init.py index b25e445032fd..62840ff10d77 100644 --- a/python/tests/table/test_init.py +++ b/python/tests/table/test_init.py @@ -431,7 +431,7 @@ def test_add_primitive_type_column(table_schema_simple: Schema, table: Table) -> for name, type_ in primitive_type.items(): field_name = f"new_column_{name}" update = UpdateSchema(table_schema_simple, table) - update.add_column(parent=None, name=field_name, type_var=type_, doc=f"new_column_{name}") + update.add_column(name=field_name, type_var=type_, doc=f"new_column_{name}") new_schema = update._apply() # pylint: disable=W0212 field: NestedField = new_schema.find_field(field_name) @@ -447,7 +447,7 @@ def test_add_nested_type_column(table_schema_simple: Schema, table: Table) -> No NestedField(1, "lat", DoubleType()), NestedField(2, "long", DoubleType()), ) - update.add_column(parent=None, name=field_name, type_var=struct_) + update.add_column(name=field_name, type_var=struct_) schema_ = update._apply() # pylint: disable=W0212 field: NestedField = schema_.find_field(field_name) assert field.field_type == StructType( @@ -462,7 +462,7 @@ def test_add_nested_map_type_column(table_schema_simple: Schema, table: Table) - field_name = "new_column_map" update = UpdateSchema(table_schema_simple, table) map_ = MapType(1, StringType(), 2, IntegerType(), False) - update.add_column(parent=None, name=field_name, type_var=map_) + update.add_column(name=field_name, type_var=map_) 
new_schema = update._apply() # pylint: disable=W0212 field: NestedField = new_schema.find_field(field_name) assert field.field_type == MapType(5, StringType(), 6, IntegerType(), False) @@ -481,7 +481,7 @@ def test_add_nested_list_type_column(table_schema_simple: Schema, table: Table) ), element_required=False, ) - update.add_column(parent=None, name=field_name, type_var=list_) + update.add_column(name=field_name, type_var=list_) new_schema = update._apply() # pylint: disable=W0212 field: NestedField = new_schema.find_field(field_name) assert field.field_type == ListType( @@ -498,26 +498,26 @@ def test_add_nested_list_type_column(table_schema_simple: Schema, table: Table) def test_add_field_to_map_key(table_schema_nested_with_struct_key_map: Schema, table: Table) -> None: with pytest.raises(ValueError) as exc_info: update = UpdateSchema(table_schema_nested_with_struct_key_map, table) - update.add_column(name="b", type_var=IntegerType(), parent="location.key")._apply() # pylint: disable=W0212 + update.add_column(name=("location", "key", "b"), type_var=IntegerType())._apply() # pylint: disable=W0212 assert "Cannot add fields to map keys" in str(exc_info.value) def test_add_already_exists(table_schema_nested: Schema, table: Table) -> None: + update = UpdateSchema(table_schema_nested, table) + with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(table_schema_nested, table) update.add_column("foo", IntegerType()) assert "already exists: foo" in str(exc_info.value) with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(table_schema_nested, table) - update.add_column(name="latitude", type_var=IntegerType(), parent="location") - assert "already exists: location.lat" in str(exc_info.value) + update.add_column(name="location.latitude", type_var=IntegerType()) + assert "already exists: location.latitude" in str(exc_info.value) def test_add_to_non_struct_type(table_schema_simple: Schema, table: Table) -> None: + update = 
UpdateSchema(table_schema_simple, table) with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(table_schema_simple, table) - update.add_column(name="lat", type_var=IntegerType(), parent="foo") + update.add_column(name=("foo", "lat"), type_var=IntegerType()) assert "Cannot add column to non-struct type" in str(exc_info.value) @@ -525,9 +525,8 @@ def test_add_required_column(table: Table) -> None: schema_ = Schema( NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] ) - + update = UpdateSchema(schema_, table) with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(schema_, table) update.add_column(name="data", type_var=IntegerType(), required=True) assert "Incompatible change: cannot add required column: data" in str(exc_info.value) From ec5ed0be03d81f71e9f4179cf1959150eaa80a40 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 22 Aug 2023 21:18:32 +0200 Subject: [PATCH 03/17] Cleanup --- python/pyiceberg/table/__init__.py | 13 +++++++------ python/tests/table/test_init.py | 10 +++++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index c356c879bb9c..68a6a01e971e 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -222,15 +222,13 @@ def commit_transaction(self) -> Table: """ # Strip the catalog name if len(self._updates) > 0: - response = self._table.catalog._commit_table( # pylint: disable=W0212 + self._table._do_commit( # pylint: disable=W0212 CommitTableRequest( identifier=self._table.identifier[1:], requirements=self._requirements, updates=self._updates, ) ) - self._table._update_table(response) # pylint: disable=W0212 - return self._table else: return self._table @@ -525,7 +523,8 @@ def history(self) -> List[SnapshotLogEntry]: def update_schema(self) -> UpdateSchema: return UpdateSchema(self.schema(), self) - def _update_table(self, 
response: CommitTableResponse) -> None: + def _do_commit(self, request: CommitTableRequest) -> None: + response = self.catalog._commit_table(request) # pylint: disable=W0212 self.metadata = response.metadata self.metadata_location = response.metadata_location @@ -956,6 +955,9 @@ def add_column( if isinstance(name, str): name = (name,) + if "." in name[-1]: + raise ValueError(f"Cannot add column with ambiguous name: {name[-1]}, provide a tuple instead") + if required and not self._allow_incompatible_changes: # Table format version 1 and 2 cannot add required column because there is no initial value raise ValueError(f'Incompatible change: cannot add required column: {".".join(name)}') @@ -985,10 +987,9 @@ def commit(self) -> None: self._transaction._append_updates(*updates) # pylint: disable=W0212 self._transaction._append_requirements(*requirements) # pylint: disable=W0212 else: - response = self._table.catalog._commit_table( # pylint: disable=W0212 + self._table._do_commit( # pylint: disable=W0212 CommitTableRequest(identifier=self._table.identifier[1:], updates=updates, requirements=requirements) ) - self._table._update_table(response) # pylint: disable=W0212 def _apply(self) -> Schema: """Apply the pending changes to the original schema and returns the result. 
diff --git a/python/tests/table/test_init.py b/python/tests/table/test_init.py index 62840ff10d77..925087465668 100644 --- a/python/tests/table/test_init.py +++ b/python/tests/table/test_init.py @@ -510,10 +510,18 @@ def test_add_already_exists(table_schema_nested: Schema, table: Table) -> None: assert "already exists: foo" in str(exc_info.value) with pytest.raises(ValueError) as exc_info: - update.add_column(name="location.latitude", type_var=IntegerType()) + update.add_column(name=("location", "latitude"), type_var=IntegerType()) assert "already exists: location.latitude" in str(exc_info.value) +def test_ambigous_column(table_schema_nested: Schema, table: Table) -> None: + update = UpdateSchema(table_schema_nested, table) + + with pytest.raises(ValueError) as exc_info: + update.add_column(name="location.latitude", type_var=IntegerType()) + assert "Cannot add column with ambiguous name: location.latitude, provide a tuple instead" in str(exc_info.value) + + def test_add_to_non_struct_type(table_schema_simple: Schema, table: Table) -> None: update = UpdateSchema(table_schema_simple, table) with pytest.raises(ValueError) as exc_info: From 61019cbfe67c3fcd0dfec6eb614bbcc9108ab209 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 23 Aug 2023 07:43:25 +0200 Subject: [PATCH 04/17] Update python/mkdocs/docs/api.md --- python/mkdocs/docs/api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mkdocs/docs/api.md b/python/mkdocs/docs/api.md index ca1d183fc691..7af9053a030d 100644 --- a/python/mkdocs/docs/api.md +++ b/python/mkdocs/docs/api.md @@ -161,7 +161,7 @@ Or, without a context manager by calling the `.commit()` explicitly: table.update_schema().add_column("x", IntegerType(), "doc").commit() ``` -Use the Transaction API: +Alternatively, use the transaction API to combine changes from multiple operations: ```python from datetime import datetime From 48a205783352e5e983f4bb83266ab04bc9e2de63 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong 
Date: Wed, 23 Aug 2023 23:16:07 +0200 Subject: [PATCH 05/17] WIP --- python/pyiceberg/table/__init__.py | 101 ++++++++++-------------- python/tests/test_integration.py | 121 ++++++++++++++++++++++++++++- 2 files changed, 159 insertions(+), 63 deletions(-) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index 68a6a01e971e..ac1ba9cafe9d 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -997,7 +997,13 @@ def _apply(self) -> Schema: Returns: the result Schema when all pending updates are applied """ - return _apply_changes(self._schema, self._adds, self._identifier_field_names) + struct = visit(self._schema, _ApplySchemaChanges(self._adds)) + name_to_id: Dict[str, int] = index_by_name(struct) + for name in self._identifier_field_names: + if name not in name_to_id: + raise ValueError(f"Cannot add field {name} as an identifier field: not found in current schema or added columns") + + return Schema(*struct.fields) def _internal_add_column( self, field_path: Tuple[str, ...], is_optional: bool, type_var: IcebergType, doc: Optional[str] @@ -1017,7 +1023,7 @@ def _internal_add_column( parent_field = parent_type.element_field if not parent_field.field_type.is_struct: - raise ValueError(f"Cannot add column to non-struct type: {parent}") + raise ValueError(f"Cannot add column {name} to non-struct type: {parent}") parent_id = parent_field.field_id @@ -1039,77 +1045,50 @@ def _internal_add_column( new_type = assign_fresh_schema_ids(type_var, self.assign_new_column_id) field = NestedField(new_id, name, new_type, not is_optional, doc) - self._adds.setdefault(parent_id, []).append(field) + self._adds[parent_id] = self._adds.get(parent_id, []) + [field] def assign_new_column_id(self) -> int: return next(self._last_column_id) -def _apply_changes(schema_: Schema, adds: Dict[int, List[NestedField]], identifier_field_names: List[str]) -> Schema: - struct = visit(schema_, _ApplyChanges(adds)) - name_to_id: 
Dict[str, int] = index_by_name(struct) - for name in identifier_field_names: - if name not in name_to_id: - raise ValueError(f"Cannot add field {name} as an identifier field: not found in current schema or added columns") - - return Schema(*struct.fields) - +class _ApplySchemaChanges(SchemaVisitor[IcebergType]): + _adds: Dict[int, List[NestedField]] -class _ApplyChanges(SchemaVisitor[IcebergType]): - def __init__(self, adds: Dict[int, List[NestedField]]): - self.adds = adds + def __init__(self, adds: Dict[int, List[NestedField]]) -> None: + self._adds = adds def schema(self, schema: Schema, struct_result: IcebergType) -> IcebergType: - fields = _ApplyChanges.add_fields(schema.as_struct().fields, self.adds.get(TABLE_ROOT_ID)) - if len(fields) > 0: - return StructType(*fields) - - return struct_result - - def struct(self, struct: StructType, field_results: List[IcebergType]) -> IcebergType: - has_change = False - new_fields: List[NestedField] = [] - for i in range(len(field_results)): - type_: Optional[IcebergType] = field_results[i] - if type_ is None: - has_change = True - continue - - field: NestedField = struct.fields[i] - new_fields.append(field) - - if has_change: + if new_fields := _ApplySchemaChanges.add_fields(struct_result.fields, self._adds.get(TABLE_ROOT_ID, [])): return StructType(*new_fields) + else: + return struct_result - return struct + def struct(self, struct: StructType, field_results: List[IcebergType]) -> IcebergType: + return StructType(*field_results) def field(self, field: NestedField, field_result: IcebergType) -> IcebergType: - field_id: int = field.field_id - if field_id in self.adds: - new_fields = self.adds[field_id] - if len(new_fields) > 0: - fields = _ApplyChanges.add_fields(field_result.fields, new_fields) - if len(fields) > 0: - return StructType(*fields) + if field.field_id in self._adds and isinstance(field_result, StructType): + new_types = _ApplySchemaChanges.add_fields(field_result.fields, self._adds[field.field_id]) + 
field_result = StructType(*new_types) - return field_result + return NestedField( + field_id=field.field_id, name=field.name, field_type=field_result, required=field.required, doc=field.doc + ) def list(self, list_type: ListType, element_result: IcebergType) -> IcebergType: - element_field: NestedField = list_type.element_field - element_type = self.field(element_field, element_result) - if element_type is None: + element_field: NestedField = self.field(list_type.element_field, element_result) + element_type = element_field.field_type + if element_field is None: raise ValueError(f"Cannot delete element type from list: {element_field}") - is_element_optional = not list_type.element_required - - if is_element_optional == element_field.required and list_type.element_type == element_type: + if list_type.element_required == element_field.required and list_type.element_type == element_type: return list_type - return ListType(list_type.element_id, element_type, is_element_optional) + return ListType(element_id=list_type.element_id, element=element_type, element_required=element_field.required) def map(self, map_type: MapType, key_result: IcebergType, value_result: IcebergType) -> IcebergType: key_id: int = map_type.key_field.field_id - if key_id in self.adds: + if key_id in self._adds: raise ValueError(f"Cannot add fields to map keys: {map_type}") value_field: NestedField = map_type.value_field @@ -1117,20 +1096,20 @@ def map(self, map_type: MapType, key_result: IcebergType, value_result: IcebergT if value_type is None: raise ValueError(f"Cannot delete value type from map: {value_field}") - is_value_optional = not map_type.value_required - - if is_value_optional != value_field.required and map_type.value_type == value_type: + if map_type.value_required == value_field.required and map_type.value_type == value_type: return map_type - return MapType(map_type.key_id, map_type.key_field, map_type.value_id, value_type, not is_value_optional) + return MapType( + 
key_id=map_type.key_id, + key_type=map_type.key_type, + value_id=map_type.value_id, + value_type=value_type.field_type, + value_required=map_type.value_required + ) def primitive(self, primitive: PrimitiveType) -> IcebergType: return primitive @staticmethod - def add_fields(fields: Tuple[NestedField, ...], adds: Optional[List[NestedField]]) -> List[NestedField]: - new_fields: List[NestedField] = [] - new_fields.extend(fields) - if adds: - new_fields.extend(adds) - return new_fields + def add_fields(fields: Tuple[NestedField, ...], adds: List[NestedField]) -> Optional[List[NestedField]]: + return None if len(adds) == 0 else list(fields) + adds diff --git a/python/tests/test_integration.py b/python/tests/test_integration.py index acd694677463..044b7bba0d9a 100644 --- a/python/tests/test_integration.py +++ b/python/tests/test_integration.py @@ -42,10 +42,14 @@ BooleanType, DoubleType, FixedType, + FloatType, IntegerType, + ListType, LongType, + MapType, NestedField, StringType, + StructType, TimestampType, UUIDType, ) @@ -387,8 +391,9 @@ def test_schema_evolution(catalog: Catalog) -> None: @pytest.mark.integration def test_schema_evolution_via_transaction(catalog: Catalog) -> None: + tbl_name = "default.test_schema_evolution_via_transaction" try: - catalog.drop_table("default.test_schema_evolution") + catalog.drop_table(tbl_name) except NoSuchTableError: pass @@ -397,7 +402,7 @@ def test_schema_evolution_via_transaction(catalog: Catalog) -> None: NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), ) - tbl = catalog.create_table(identifier="default.test_schema_evolution", schema=schema) + tbl = catalog.create_table(identifier=tbl_name, schema=schema) assert tbl.schema() == schema @@ -442,3 +447,115 @@ def test_schema_evolution_via_transaction(catalog: Catalog) -> None: NestedField(field_id=5, name="col_long", field_type=LongType(), required=False), schema_id=1, ) + + +@pytest.mark.integration +def test_schema_evolution_nested(catalog: 
Catalog) -> None: + tbl_name = "default.test_schema_evolution_nested" + try: + catalog.drop_table(tbl_name) + except NoSuchTableError: + pass + + nested_schema = Schema( + NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + key_id=10, + key_type=StringType(), + value_id=11, + value_type=StructType( + NestedField(field_id=110, name="x", field_type=FloatType(), required=False), + NestedField(field_id=111, name="y", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + element_id=20, + element_type=StructType( + NestedField(field_id=200, name="x", field_type=FloatType(), required=False), + NestedField(field_id=201, name="y", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=3, + name="person", + field_type=StructType( + NestedField(field_id=30, name="name", field_type=StringType(), required=False), + NestedField(field_id=31, name="age", field_type=IntegerType(), required=True), + ), + required=False, + ), + schema_id=1, + ) + + tbl = catalog.create_table(identifier=tbl_name, schema=nested_schema) + + assert tbl.schema().highest_field_id == 12 + + with tbl.update_schema() as schema_update: + schema_update.add_column(("location_lookup", "z"), FloatType()) + schema_update.add_column(("locations", "z"), FloatType()) + schema_update.add_column(("person", "address"), StringType()) + + assert str(tbl.schema()) == str(Schema( + NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + type="map", + key_id=4, + key_type=StringType(), + value_id=5, + value_type=StructType( + fields=( + NestedField(field_id=6, name="x", field_type=FloatType(), required=False), + NestedField(field_id=7, name="y", field_type=FloatType(), required=False), + NestedField(field_id=13, name="z", field_type=FloatType(), required=False), + ) + ), + value_required=True, + 
), + required=True, + ), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + type="list", + element_id=8, + element_type=StructType( + fields=( + NestedField(field_id=9, name="x", field_type=FloatType(), required=False), + NestedField(field_id=10, name="y", field_type=FloatType(), required=False), + NestedField(field_id=14, name="z", field_type=FloatType(), required=False), + ) + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=3, + name="person", + field_type=StructType( + fields=( + NestedField(field_id=11, name="name", field_type=StringType(), required=False), + NestedField(field_id=12, name="age", field_type=IntegerType(), required=True), + NestedField(field_id=15, name="address", field_type=StringType(), required=False), + ) + ), + required=False, + ), + schema_id=1, + identifier_field_ids=[], + )) From 31b06e4b1838c99d4610b47ad66ad4d6c79cba52 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 23 Aug 2023 23:31:21 +0200 Subject: [PATCH 06/17] Fix nested fields --- python/pyiceberg/table/__init__.py | 48 +++++++++++--------- python/tests/table/test_init.py | 2 +- python/tests/test_integration.py | 72 ++++++++++++++---------------- 3 files changed, 61 insertions(+), 61 deletions(-) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index ac1ba9cafe9d..6a45ab5826c8 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -1023,7 +1023,7 @@ def _internal_add_column( parent_field = parent_type.element_field if not parent_field.field_type.is_struct: - raise ValueError(f"Cannot add column {name} to non-struct type: {parent}") + raise ValueError(f"Cannot add column '{name}' to non-struct type: {parent}") parent_id = parent_field.field_id @@ -1064,27 +1064,34 @@ def schema(self, schema: Schema, struct_result: IcebergType) -> IcebergType: return struct_result def struct(self, struct: StructType, field_results: List[IcebergType]) -> 
IcebergType: - return StructType(*field_results) + new_fields = [] + for idx, result_type in enumerate(field_results): + result_type = field_results[idx] - def field(self, field: NestedField, field_result: IcebergType) -> IcebergType: - if field.field_id in self._adds and isinstance(field_result, StructType): - new_types = _ApplySchemaChanges.add_fields(field_result.fields, self._adds[field.field_id]) - field_result = StructType(*new_types) + # Add delete and update logic later - return NestedField( - field_id=field.field_id, name=field.name, field_type=field_result, required=field.required, doc=field.doc - ) + field = struct.fields[idx] + new_fields.append( + NestedField( + field_id=field.field_id, name=field.name, field_type=result_type, required=field.required, doc=field.doc + ) + ) + return StructType(*new_fields) - def list(self, list_type: ListType, element_result: IcebergType) -> IcebergType: - element_field: NestedField = self.field(list_type.element_field, element_result) - element_type = element_field.field_type - if element_field is None: - raise ValueError(f"Cannot delete element type from list: {element_field}") + def field(self, field: NestedField, field_result: IcebergType) -> IcebergType: + if isinstance(field_result, StructType) and ( + new_types := _ApplySchemaChanges.add_fields(field_result.fields, self._adds.get(field.field_id, [])) + ): + return StructType(*new_types) + else: + return field_result - if list_type.element_required == element_field.required and list_type.element_type == element_type: - return list_type + def list(self, list_type: ListType, element_result: IcebergType) -> IcebergType: + element_type: NestedField = self.field(list_type.element_field, element_result) + if element_type is None: + raise ValueError(f"Cannot delete element type from list: {element_result}") - return ListType(element_id=list_type.element_id, element=element_type, element_required=element_field.required) + return ListType(element_id=list_type.element_id, 
element=element_type, element_required=list_type.element_required) def map(self, map_type: MapType, key_result: IcebergType, value_result: IcebergType) -> IcebergType: key_id: int = map_type.key_field.field_id @@ -1096,15 +1103,12 @@ def map(self, map_type: MapType, key_result: IcebergType, value_result: IcebergT if value_type is None: raise ValueError(f"Cannot delete value type from map: {value_field}") - if map_type.value_required == value_field.required and map_type.value_type == value_type: - return map_type - return MapType( key_id=map_type.key_id, key_type=map_type.key_type, value_id=map_type.value_id, - value_type=value_type.field_type, - value_required=map_type.value_required + value_type=value_type, + value_required=map_type.value_required, ) def primitive(self, primitive: PrimitiveType) -> IcebergType: diff --git a/python/tests/table/test_init.py b/python/tests/table/test_init.py index 925087465668..9efcd75b8398 100644 --- a/python/tests/table/test_init.py +++ b/python/tests/table/test_init.py @@ -526,7 +526,7 @@ def test_add_to_non_struct_type(table_schema_simple: Schema, table: Table) -> No update = UpdateSchema(table_schema_simple, table) with pytest.raises(ValueError) as exc_info: update.add_column(name=("foo", "lat"), type_var=IntegerType()) - assert "Cannot add column to non-struct type" in str(exc_info.value) + assert "Cannot add column 'lat' to non-struct type" in str(exc_info.value) def test_add_required_column(table: Table) -> None: diff --git a/python/tests/test_integration.py b/python/tests/test_integration.py index 044b7bba0d9a..4dcdff966c1a 100644 --- a/python/tests/test_integration.py +++ b/python/tests/test_integration.py @@ -507,55 +507,51 @@ def test_schema_evolution_nested(catalog: Catalog) -> None: schema_update.add_column(("locations", "z"), FloatType()) schema_update.add_column(("person", "address"), StringType()) - assert str(tbl.schema()) == str(Schema( - NestedField( - field_id=1, - name="location_lookup", - field_type=MapType( - 
type="map", - key_id=4, - key_type=StringType(), - value_id=5, - value_type=StructType( - fields=( + assert str(tbl.schema()) == str( + Schema( + NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + type="map", + key_id=4, + key_type=StringType(), + value_id=5, + value_type=StructType( NestedField(field_id=6, name="x", field_type=FloatType(), required=False), NestedField(field_id=7, name="y", field_type=FloatType(), required=False), NestedField(field_id=13, name="z", field_type=FloatType(), required=False), - ) + ), + value_required=True, ), - value_required=True, + required=True, ), - required=True, - ), - NestedField( - field_id=2, - name="locations", - field_type=ListType( - type="list", - element_id=8, - element_type=StructType( - fields=( + NestedField( + field_id=2, + name="locations", + field_type=ListType( + type="list", + element_id=8, + element_type=StructType( NestedField(field_id=9, name="x", field_type=FloatType(), required=False), NestedField(field_id=10, name="y", field_type=FloatType(), required=False), NestedField(field_id=14, name="z", field_type=FloatType(), required=False), - ) + ), + element_required=True, ), - element_required=True, + required=True, ), - required=True, - ), - NestedField( - field_id=3, - name="person", - field_type=StructType( - fields=( + NestedField( + field_id=3, + name="person", + field_type=StructType( NestedField(field_id=11, name="name", field_type=StringType(), required=False), NestedField(field_id=12, name="age", field_type=IntegerType(), required=True), NestedField(field_id=15, name="address", field_type=StringType(), required=False), - ) + ), + required=False, ), - required=False, - ), - schema_id=1, - identifier_field_ids=[], - )) + schema_id=1, + identifier_field_ids=[], + ) + ) From 1ff0bec84c8ed0add834ac48c0ec5c4b8719fa6a Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 24 Aug 2023 14:13:48 +0200 Subject: [PATCH 07/17] First the code, next the tests --- 
python/pyiceberg/table/__init__.py | 611 +++++++++++++++++++++++++---- python/tests/catalog/test_base.py | 8 +- python/tests/table/test_init.py | 28 +- 3 files changed, 542 insertions(+), 105 deletions(-) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index 6a45ab5826c8..2ddfcb6d28ff 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -18,6 +18,7 @@ import itertools from abc import ABC, abstractmethod +from copy import copy from dataclasses import dataclass from enum import Enum from functools import cached_property @@ -63,7 +64,6 @@ Schema, SchemaVisitor, assign_fresh_schema_ids, - index_by_name, visit, ) from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadata @@ -94,7 +94,6 @@ from pyiceberg.catalog import Catalog - ALWAYS_TRUE = AlwaysTrue() TABLE_ROOT_ID = -1 @@ -887,14 +886,32 @@ def to_ray(self) -> ray.data.dataset.Dataset: return ray.data.from_arrow(self.to_arrow()) +class MoveOperation(Enum): + First = 1 + Before = 2 + After = 3 + + +@dataclass +class Move: + field_id: int + op: MoveOperation + other_field_id: Optional[int] = None + + class UpdateSchema: _table: Table _schema: Schema _last_column_id: itertools.count[int] _identifier_field_names: List[str] - _adds: Dict[int, List[NestedField]] - _added_name_to_id: Dict[str, int] - _id_to_parent: Dict[int, str] + + _adds: Dict[int, List[NestedField]] = {} + _updates: Dict[int, NestedField] = {} + _deletes: Set[int] = set() + _moves: Dict[int, List[Move]] = {} + + _added_name_to_id: Dict[str, int] = {} + _id_to_parent: Dict[int, str] = {} _allow_incompatible_changes: bool _case_sensitive: bool _transaction: Optional[Transaction] @@ -911,9 +928,15 @@ def __init__( self._schema = schema self._last_column_id = itertools.count(schema.highest_field_id + 1) self._identifier_field_names = schema.column_names + self._adds = {} + self._updates = {} + self._deletes = set() + self._moves = {} + self._added_name_to_id = {} 
self._id_to_parent = {} + self._allow_incompatible_changes = allow_incompatible_changes self._case_sensitive = case_sensitive self._transaction = transaction @@ -939,30 +962,391 @@ def case_sensitive(self, case_sensitive: bool) -> UpdateSchema: return self def add_column( - self, name: Union[str, Tuple[str, ...]], type_var: IcebergType, doc: Optional[str] = None, required: bool = False + self, path: Union[str, Tuple[str, ...]], field_type: IcebergType, doc: Optional[str] = None, required: bool = False ) -> UpdateSchema: """Add a new column to a nested struct or Add a new top-level column. Args: - name: Name for the new column. - type_var: Type for the new column. + path: Name for the new column. + field_type: Type for the new column. doc: Documentation string for the new column. required: Whether the new column is required. Returns: - This for method chaining + This for method chaining. """ - if isinstance(name, str): - name = (name,) + path = (path,) if isinstance(path, str) else path - if "." in name[-1]: - raise ValueError(f"Cannot add column with ambiguous name: {name[-1]}, provide a tuple instead") + if "." 
in path[-1]: + raise ValueError(f"Cannot add column with ambiguous name: {path[-1]}, provide a tuple instead") if required and not self._allow_incompatible_changes: # Table format version 1 and 2 cannot add required column because there is no initial value - raise ValueError(f'Incompatible change: cannot add required column: {".".join(name)}') + raise ValueError(f'Incompatible change: cannot add required column: {".".join(path)}') + + name = path[-1] + parent = path[:-1] + + full_name = ".".join(path) + parent_id: int = TABLE_ROOT_ID + + if len(parent) > 0: + parent_field = self._schema.find_field(".".join(parent), self._case_sensitive) + parent_type = parent_field.field_type + if isinstance(parent_type, MapType): + parent_field = parent_type.value_field + elif isinstance(parent_type, ListType): + parent_field = parent_type.element_field + + if not parent_field.field_type.is_struct: + raise ValueError(f"Cannot add column '{name}' to non-struct type: {'.'.join(parent)}") + + parent_id = parent_field.field_id + + exists = False + try: + exists = self._schema.find_field(full_name, self._case_sensitive) is not None + except ValueError: + pass + + if exists: + raise ValueError(f"Cannot add column, name already exists: {full_name}") + + # assign new IDs in order + new_id = self.assign_new_column_id() + + # update tracking for moves + self._added_name_to_id[full_name] = new_id + + new_type = assign_fresh_schema_ids(field_type, self.assign_new_column_id) + field = NestedField(field_id=new_id, name=name, field_type=new_type, required=required, doc=doc) + + self._adds[parent_id] = self._adds.get(parent_id, []) + [field] + + return self + + def delete_column(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: + """Deletes a column from a table. + + Args: + path: The path to the column. + + Returns: + The UpdateSchema with the delete operation staged. 
+ """ + name = (path,) if isinstance(path, str) else path + full_name = ".".join(name) + + field = self._schema.find_field(full_name, self._case_sensitive) + + if field.field_id in self._adds: + raise ValueError(f"Cannot delete a column that has additions: {full_name}") + if field.field_id in self._updates: + raise ValueError(f"Cannot delete a column that has updates: {full_name}") + + self._deletes.add(field.field_id) + + return self + + def rename_column(self, path_from: Union[str, Tuple[str, ...]], path_to: Union[str, Tuple[str, ...]]) -> UpdateSchema: + """Updates the name of a column. + + Args: + path_from: The path to the column to be renamed. + path_to: The new path of the column. + + Returns: + The UpdateSchema with the rename operation staged. + """ + name_from = (path_from,) if isinstance(path_from, str) else path_from + name_to = (path_to,) if isinstance(path_to, str) else path_to + + full_name_from = ".".join(name_from) + full_name_to = ".".join(name_to) + + from_field = self._schema.find_field(full_name_from, self._case_sensitive) + + if from_field.field_id in self._deletes: + raise ValueError(f"Cannot rename a column that will be deleted: {full_name_from}") + + if updated := self._updates.get(from_field.field_id): + self._updates[from_field.field_id] = NestedField( + field_id=updated.field_id, + name=full_name_to, + field_type=updated.field_type, + doc=updated.doc, + required=updated.required, + ) + else: + self._updates[from_field.field_id] = NestedField( + field_id=from_field.field_id, + name=full_name_to, + field_type=from_field.field_type, + doc=from_field.doc, + required=from_field.required, + ) + + if path_from in self._identifier_field_names: + self._identifier_field_names.remove(full_name_from) + self._identifier_field_names.append(full_name_to) + + return self + + def require_column(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: + """Makes a column required. 
+ + This is a breaking change since writers have to make sure that + this value is not-null. + + Args: + path: The path to the field + + Returns: + The UpdateSchema with the requirement change staged. + """ + self._set_column_requirement(path, True) + return self + + def make_column_optional(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: + """Makes a column optional. + + Args: + path: The path to the field. + + Returns: + The UpdateSchema with the requirement change staged. + """ + self._set_column_requirement(path, False) + return self + + def _set_column_requirement(self, path: Union[str, Tuple[str, ...]], required: bool) -> None: + path = (path,) if isinstance(path, str) else path + name = ".".join(path) + + field = self._schema.find_field(name) + + if (field.required and required) or (field.optional and not required): + # if the change is a noop, allow it even if allowIncompatibleChanges is false + return + + if self._allow_incompatible_changes and not required: + raise ValueError(f"Cannot change column nullability: {name}: optional -> required") + + if field.field_id in self._deletes: + raise ValueError(f"Cannot update a column that will be deleted: {name}") + + if updated := self._updates.get(field.field_id): + self._updates[field.field_id] = NestedField( + field_id=updated.field_id, + name=updated.name, + field_type=updated.field_type, + doc=updated.doc, + required=required, + ) + else: + self._updates[field.field_id] = NestedField( + field_id=field.field_id, + name=field.name, + field_type=field.field_type, + doc=field.doc, + required=required, + ) + + def update_column(self, path: Union[str, Tuple[str, ...]], field_type: IcebergType) -> UpdateSchema: + """Update the type of column. + + Args: + path: The path to the field. + field_type: The new type + + Returns: + The UpdateSchema with the type update staged. 
+ """ + path = (path,) if isinstance(path, str) else path + full_name = ".".join(path) + + field = self._schema.find_field(full_name) + + if field.field_id in self._deletes: + raise ValueError(f"Cannot update a column that will be deleted: {full_name}") + + if field.field_type == field_type: + # Nothing changed + return self + + if updated := self._updates.get(field.field_id): + self._updates[field.field_id] = NestedField( + field_id=updated.field_id, + name=updated.name, + field_type=field_type, + doc=updated.doc, + required=updated.required, + ) + else: + self._updates[field.field_id] = NestedField( + field_id=field.field_id, + name=field.name, + field_type=field_type, + doc=field.doc, + required=field.required, + ) + + return self + + def update_column_doc(self, path: Union[str, Tuple[str, ...]], doc: str) -> UpdateSchema: + """Update the documentation of column. + + Args: + path: The path to the field. + doc: The new documentation of the column + + Returns: + The UpdateSchema with the doc update staged. 
+ """ + path = (path,) if isinstance(path, str) else path + full_name = ".".join(path) + + field = self._schema.find_field(full_name) + + if field.field_id in self._deletes: + raise ValueError(f"Cannot update a column that will be deleted: {full_name}") + + if field.doc == doc: + # Noop + return self + + if updated := self._updates.get(field.field_id): + self._updates[field.field_id] = NestedField( + field_id=updated.field_id, + name=updated.name, + field_type=updated.field_type, + doc=doc, + required=updated.required, + ) + else: + self._updates[field.field_id] = NestedField( + field_id=field.field_id, + name=field.name, + field_type=field.field_type, + doc=doc, + required=field.required, + ) + + return self + + def _find_for_move(self, name: str) -> Optional[int]: + try: + return self._schema.find_field(name, self._case_sensitive).field_id + except ValueError: + pass + + return self._added_name_to_id.get(name) + + def _move(self, full_name: str, move: Move) -> None: + if parent_name := self._id_to_parent.get(move.field_id): + parent_field = self._schema.find_field(parent_name) + if not parent_field.is_struct: + raise ValueError(f"Cannot move fields in non-struct type: {parent_field}") + + if move.op == MoveOperation.After or move.op == MoveOperation.Before: + if move.other_field_id is None: + raise ValueError("Expected other field when performing before/after move") + + if self._id_to_parent.get(move.field_id) != self._id_to_parent.get(move.other_field_id): + raise ValueError(f"Cannot move field {full_name} to a different struct") + + self._moves[parent_field.field_id] = self._moves.get(parent_field.field_id, []) + [move] + else: + if move.op == MoveOperation.After or move.op == MoveOperation.Before: + if move.other_field_id is None: + raise ValueError("Expected other field when performing before/after move") + + if self._id_to_parent.get(move.other_field_id) is not None: + raise ValueError(f"Cannot move field {full_name} to a different struct") + + 
self._moves[TABLE_ROOT_ID] = self._moves.get(TABLE_ROOT_ID, []) + [move] + + def move_first(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: + """Moves the field to the first position of the parent struct. + + Args: + path: The path to the field. + + Returns: + The UpdateSchema with the move operation staged. + """ + path = (path,) if isinstance(path, str) else path + full_name = ".".join(path) + + field_id = self._find_for_move(full_name) + + if field_id is None: + raise ValueError(f"Cannot move missing column: {full_name}") + + self._move(full_name, Move(field_id=field_id, op=MoveOperation.First)) + + return self + + def move_before(self, path: Union[str, Tuple[str, ...]], before_path: Union[str, Tuple[str, ...]]) -> UpdateSchema: + """Moves the field to before another field. + + Args: + path: The path to the field. + + Returns: + The UpdateSchema with the move operation staged. + """ + path = (path,) if isinstance(path, str) else path + full_name = ".".join(path) + + field_id = self._find_for_move(full_name) + + if field_id is None: + raise ValueError(f"Cannot move missing column: {full_name}") + + before_path = (before_path,) if isinstance(before_path, str) else before_path + before_full_name = ".".join(before_path) + before_field_id = self._find_for_move(before_full_name) + + if before_field_id is None: + raise ValueError(f"Cannot move before missing column: {before_full_name}") + + if field_id == before_field_id: + raise ValueError(f"Cannot move {full_name} before itself") + + self._move(full_name, Move(field_id=field_id, other_field_id=before_field_id, op=MoveOperation.Before)) + + return self + + def move_after(self, path: Union[str, Tuple[str, ...]], after_name: Union[str, Tuple[str, ...]]) -> UpdateSchema: + """Moves the field to after another field. + + Args: + path: The path to the field. + + Returns: + The UpdateSchema with the move operation staged. 
+ """ + path = (path,) if isinstance(path, str) else path + full_name = ".".join(path) + + field_id = self._find_for_move(full_name) + + if field_id is None: + raise ValueError(f"Cannot move missing column: {full_name}") + + after_path = (after_name,) if isinstance(after_name, str) else after_name + after_full_name = ".".join(after_path) + after_field_id = self._find_for_move(after_full_name) + + if after_field_id is None: + raise ValueError(f"Cannot move after missing column: {after_full_name}") + + if field_id == after_field_id: + raise ValueError(f"Cannot move {full_name} after itself") + + self._move(full_name, Move(field_id=field_id, other_field_id=after_field_id, op=MoveOperation.After)) - self._internal_add_column(name, not required, type_var, doc) return self def allow_incompatible_changes(self) -> UpdateSchema: @@ -997,103 +1381,116 @@ def _apply(self) -> Schema: Returns: the result Schema when all pending updates are applied """ - struct = visit(self._schema, _ApplySchemaChanges(self._adds)) - name_to_id: Dict[str, int] = index_by_name(struct) - for name in self._identifier_field_names: - if name not in name_to_id: - raise ValueError(f"Cannot add field {name} as an identifier field: not found in current schema or added columns") - - return Schema(*struct.fields) - - def _internal_add_column( - self, field_path: Tuple[str, ...], is_optional: bool, type_var: IcebergType, doc: Optional[str] - ) -> None: - name = field_path[-1] - parent = field_path[:-1] + struct = visit(self._schema, _ApplyChanges(self._adds, self._updates, self._deletes, self._moves)) + if struct is None: + # Should never happen + raise ValueError("Could not apply changes") - full_name = ".".join(field_path) - parent_id: int = TABLE_ROOT_ID - - if len(parent) > 0: - parent_field = self._schema.find_field(".".join(parent), self._case_sensitive) - parent_type = parent_field.field_type - if isinstance(parent_type, MapType): - parent_field = parent_type.value_field - elif 
isinstance(parent_type, ListType): - parent_field = parent_type.element_field - - if not parent_field.field_type.is_struct: - raise ValueError(f"Cannot add column '{name}' to non-struct type: {parent}") - - parent_id = parent_field.field_id - - exists = False - try: - exists = self._schema.find_field(full_name, self._case_sensitive) is not None - except ValueError: - pass - - if exists: - raise ValueError(f"Cannot add column, name already exists: {full_name}") - - # assign new IDs in order - new_id = self.assign_new_column_id() - - # update tracking for moves - self._added_name_to_id[full_name] = new_id - - new_type = assign_fresh_schema_ids(type_var, self.assign_new_column_id) - field = NestedField(new_id, name, new_type, not is_optional, doc) + schema = Schema(*struct.fields) + for name in self._identifier_field_names: + try: + _ = schema.find_field(name) + except ValueError as e: + raise ValueError( + f"Cannot add field {name} as an identifier field: not found in current schema or added columns" + ) from e - self._adds[parent_id] = self._adds.get(parent_id, []) + [field] + return schema def assign_new_column_id(self) -> int: return next(self._last_column_id) -class _ApplySchemaChanges(SchemaVisitor[IcebergType]): +class _ApplyChanges(SchemaVisitor[Optional[IcebergType]]): _adds: Dict[int, List[NestedField]] + _updates: Dict[int, NestedField] + _deletes: Set[int] + _moves: Dict[int, List[Move]] - def __init__(self, adds: Dict[int, List[NestedField]]) -> None: + def __init__( + self, adds: Dict[int, List[NestedField]], updates: Dict[int, NestedField], deletes: Set[int], moves: Dict[int, List[Move]] + ) -> None: self._adds = adds + self._updates = updates + self._deletes = deletes + self._moves = moves - def schema(self, schema: Schema, struct_result: IcebergType) -> IcebergType: - if new_fields := _ApplySchemaChanges.add_fields(struct_result.fields, self._adds.get(TABLE_ROOT_ID, [])): + def schema(self, schema: Schema, struct_result: Optional[IcebergType]) -> 
Optional[IcebergType]: + if new_fields := _add_fields(struct_result.fields if struct_result else [], self._adds.get(TABLE_ROOT_ID)): return StructType(*new_fields) else: return struct_result - def struct(self, struct: StructType, field_results: List[IcebergType]) -> IcebergType: + def struct(self, struct: StructType, field_results: List[Optional[IcebergType]]) -> Optional[IcebergType]: + has_changes = False new_fields = [] + for idx, result_type in enumerate(field_results): result_type = field_results[idx] - # Add delete and update logic later + # Has been deleted + if result_type is None: + has_changes = True + continue field = struct.fields[idx] - new_fields.append( - NestedField( - field_id=field.field_id, name=field.name, field_type=result_type, required=field.required, doc=field.doc + + name = field.name + doc = field.doc + required = field.required + + # There is an update + if update := self._updates.get(field): + name = update.name + doc = update.doc + required = update.required + + if field.name == name and field.field_type == result_type and field.required == required and field.doc == doc: + new_fields.append(field) + else: + has_changes = True + new_fields.append( + NestedField( + field_id=field.field_id, name=field.name, field_type=result_type, required=field.required, doc=field.doc + ) ) - ) - return StructType(*new_fields) - def field(self, field: NestedField, field_result: IcebergType) -> IcebergType: - if isinstance(field_result, StructType) and ( - new_types := _ApplySchemaChanges.add_fields(field_result.fields, self._adds.get(field.field_id, [])) - ): - return StructType(*new_types) - else: - return field_result + if has_changes: + return StructType(*new_fields) + + return struct + + def field(self, field: NestedField, field_result: Optional[IcebergType]) -> Optional[IcebergType]: + # the API validates deletes, updates, and additions don't conflict handle deletes + if field.field_id in self._deletes: + return None + + # handle updates + if (update 
:= self._updates.get(field.field_id)) and field.field_type != update.field_type: + return update.field_type + + # handle add & moves + added = self._adds.get(field.field_id) + moves = self._moves.get(field.field_id) + if added is not None or moves is not None: + if not isinstance(field.field_type, StructType): + raise ValueError(f"Cannot add fields to non-struct: {field}") + + if new_fields := _add_and_move_fields(field.field_type.fields, added or [], moves or []): + return StructType(*new_fields) - def list(self, list_type: ListType, element_result: IcebergType) -> IcebergType: - element_type: NestedField = self.field(list_type.element_field, element_result) + return field_result + + def list(self, list_type: ListType, element_result: Optional[IcebergType]) -> Optional[IcebergType]: + element_type = self.field(list_type.element_field, element_result) if element_type is None: raise ValueError(f"Cannot delete element type from list: {element_result}") return ListType(element_id=list_type.element_id, element=element_type, element_required=list_type.element_required) - def map(self, map_type: MapType, key_result: IcebergType, value_result: IcebergType) -> IcebergType: + def map( + self, map_type: MapType, key_result: Optional[IcebergType], value_result: Optional[IcebergType] + ) -> Optional[IcebergType]: key_id: int = map_type.key_field.field_id if key_id in self._adds: raise ValueError(f"Cannot add fields to map keys: {map_type}") @@ -1111,9 +1508,49 @@ def map(self, map_type: MapType, key_result: IcebergType, value_result: IcebergT value_required=map_type.value_required, ) - def primitive(self, primitive: PrimitiveType) -> IcebergType: + def primitive(self, primitive: PrimitiveType) -> Optional[IcebergType]: return primitive - @staticmethod - def add_fields(fields: Tuple[NestedField, ...], adds: List[NestedField]) -> Optional[List[NestedField]]: - return None if len(adds) == 0 else list(fields) + adds + +def _add_fields(fields: Tuple[NestedField, ...], adds: 
Optional[List[NestedField]]) -> Optional[Tuple[NestedField, ...]]: + adds = adds or [] + return None if len(adds) == 0 else tuple(*fields, *adds) + + +def _move_fields(fields: Tuple[NestedField, ...], moves: List[Move]) -> Tuple[NestedField, ...]: + reordered = list(copy(fields)) + for move in moves: + # Find the field that we're about to move + field = next(field for field in reordered if field.field_id == move.field_id) + # Remove the field that we're about to move from the list + reordered = [field for field in reordered if field.field_id != move.field_id] + + if move.op == MoveOperation.First: + reordered = [field] + reordered + elif move.op == MoveOperation.Before or move.op == MoveOperation.After: + other_field_id = move.other_field_id + other_field_pos = next(i for i, field in enumerate(reordered) if field.field_id == other_field_id) + if move.op == MoveOperation.Before: + reordered.insert(other_field_pos, field) + else: + reordered.insert(other_field_pos + 1, field) + else: + raise ValueError(f"Unknown operation: {move.op}") + + return tuple(reordered) + + +def _add_and_move_fields( + fields: Tuple[NestedField, ...], adds: List[NestedField], moves: List[Move] +) -> Optional[Tuple[NestedField, ...]]: + if adds: + # always apply adds first so that added fields can be moved + added = _add_fields(fields, adds) + if moves: + return _move_fields(added, moves) # type: ignore + else: + return added + # add fields + elif moves: + return _move_fields(fields, moves) + return None if len(adds) == 0 else tuple(*fields, *adds) diff --git a/python/tests/catalog/test_base.py b/python/tests/catalog/test_base.py index 29e93d0c9d05..e4da808014f6 100644 --- a/python/tests/catalog/test_base.py +++ b/python/tests/catalog/test_base.py @@ -542,7 +542,7 @@ def test_commit_table(catalog: InMemoryCatalog) -> None: def test_add_column(catalog: InMemoryCatalog) -> None: given_table = given_catalog_has_a_table(catalog) - given_table.update_schema().add_column(name="new_column1", 
type_var=IntegerType()).commit() + given_table.update_schema().add_column(path="new_column1", field_type=IntegerType()).commit() assert given_table.schema() == Schema( NestedField(field_id=1, name="x", field_type=LongType(), required=True), @@ -554,7 +554,7 @@ def test_add_column(catalog: InMemoryCatalog) -> None: ) transaction = given_table.transaction() - transaction.update_schema().add_column(name="new_column2", type_var=IntegerType(), doc="doc").commit() + transaction.update_schema().add_column(path="new_column2", field_type=IntegerType(), doc="doc").commit() transaction.commit_transaction() assert given_table.schema() == Schema( @@ -572,7 +572,7 @@ def test_add_column_with_statement(catalog: InMemoryCatalog) -> None: given_table = given_catalog_has_a_table(catalog) with given_table.update_schema() as tx: - tx.add_column(name="new_column1", type_var=IntegerType()) + tx.add_column(path="new_column1", field_type=IntegerType()) assert given_table.schema() == Schema( NestedField(field_id=1, name="x", field_type=LongType(), required=True), @@ -584,7 +584,7 @@ def test_add_column_with_statement(catalog: InMemoryCatalog) -> None: ) with given_table.transaction() as tx: - tx.update_schema().add_column(name="new_column2", type_var=IntegerType(), doc="doc").commit() + tx.update_schema().add_column(path="new_column2", field_type=IntegerType(), doc="doc").commit() assert given_table.schema() == Schema( NestedField(field_id=1, name="x", field_type=LongType(), required=True), diff --git a/python/tests/table/test_init.py b/python/tests/table/test_init.py index 9efcd75b8398..b90fd2d8773f 100644 --- a/python/tests/table/test_init.py +++ b/python/tests/table/test_init.py @@ -398,7 +398,7 @@ def test_serialize_set_properties_updates() -> None: def test_add_column(table_schema_simple: Schema, table: Table) -> None: update = UpdateSchema(table_schema_simple, table) - update.add_column(name="b", type_var=IntegerType()) + update.add_column(path="b", field_type=IntegerType()) 
apply_schema: Schema = update._apply() # pylint: disable=W0212 assert len(apply_schema.fields) == 4 @@ -431,7 +431,7 @@ def test_add_primitive_type_column(table_schema_simple: Schema, table: Table) -> for name, type_ in primitive_type.items(): field_name = f"new_column_{name}" update = UpdateSchema(table_schema_simple, table) - update.add_column(name=field_name, type_var=type_, doc=f"new_column_{name}") + update.add_column(path=field_name, field_type=type_, doc=f"new_column_{name}") new_schema = update._apply() # pylint: disable=W0212 field: NestedField = new_schema.find_field(field_name) @@ -447,7 +447,7 @@ def test_add_nested_type_column(table_schema_simple: Schema, table: Table) -> No NestedField(1, "lat", DoubleType()), NestedField(2, "long", DoubleType()), ) - update.add_column(name=field_name, type_var=struct_) + update.add_column(path=field_name, field_type=struct_) schema_ = update._apply() # pylint: disable=W0212 field: NestedField = schema_.find_field(field_name) assert field.field_type == StructType( @@ -462,7 +462,7 @@ def test_add_nested_map_type_column(table_schema_simple: Schema, table: Table) - field_name = "new_column_map" update = UpdateSchema(table_schema_simple, table) map_ = MapType(1, StringType(), 2, IntegerType(), False) - update.add_column(name=field_name, type_var=map_) + update.add_column(path=field_name, field_type=map_) new_schema = update._apply() # pylint: disable=W0212 field: NestedField = new_schema.find_field(field_name) assert field.field_type == MapType(5, StringType(), 6, IntegerType(), False) @@ -481,7 +481,7 @@ def test_add_nested_list_type_column(table_schema_simple: Schema, table: Table) ), element_required=False, ) - update.add_column(name=field_name, type_var=list_) + update.add_column(path=field_name, field_type=list_) new_schema = update._apply() # pylint: disable=W0212 field: NestedField = new_schema.find_field(field_name) assert field.field_type == ListType( @@ -498,7 +498,7 @@ def 
test_add_nested_list_type_column(table_schema_simple: Schema, table: Table) def test_add_field_to_map_key(table_schema_nested_with_struct_key_map: Schema, table: Table) -> None: with pytest.raises(ValueError) as exc_info: update = UpdateSchema(table_schema_nested_with_struct_key_map, table) - update.add_column(name=("location", "key", "b"), type_var=IntegerType())._apply() # pylint: disable=W0212 + update.add_column(path=("location", "key", "b"), field_type=IntegerType())._apply() # pylint: disable=W0212 assert "Cannot add fields to map keys" in str(exc_info.value) @@ -510,7 +510,7 @@ def test_add_already_exists(table_schema_nested: Schema, table: Table) -> None: assert "already exists: foo" in str(exc_info.value) with pytest.raises(ValueError) as exc_info: - update.add_column(name=("location", "latitude"), type_var=IntegerType()) + update.add_column(path=("location", "latitude"), field_type=IntegerType()) assert "already exists: location.latitude" in str(exc_info.value) @@ -518,14 +518,14 @@ def test_ambigous_column(table_schema_nested: Schema, table: Table) -> None: update = UpdateSchema(table_schema_nested, table) with pytest.raises(ValueError) as exc_info: - update.add_column(name="location.latitude", type_var=IntegerType()) + update.add_column(path="location.latitude", field_type=IntegerType()) assert "Cannot add column with ambiguous name: location.latitude, provide a tuple instead" in str(exc_info.value) def test_add_to_non_struct_type(table_schema_simple: Schema, table: Table) -> None: update = UpdateSchema(table_schema_simple, table) with pytest.raises(ValueError) as exc_info: - update.add_column(name=("foo", "lat"), type_var=IntegerType()) + update.add_column(path=("foo", "lat"), field_type=IntegerType()) assert "Cannot add column 'lat' to non-struct type" in str(exc_info.value) @@ -535,13 +535,13 @@ def test_add_required_column(table: Table) -> None: ) update = UpdateSchema(schema_, table) with pytest.raises(ValueError) as exc_info: - 
update.add_column(name="data", type_var=IntegerType(), required=True) + update.add_column(path="data", field_type=IntegerType(), required=True) assert "Incompatible change: cannot add required column: data" in str(exc_info.value) new_schema = ( UpdateSchema(schema_, table) # pylint: disable=W0212 .allow_incompatible_changes() - .add_column(name="data", type_var=IntegerType(), required=True) + .add_column(path="data", field_type=IntegerType(), required=True) ._apply() ) assert new_schema == Schema( @@ -559,17 +559,17 @@ def test_add_required_column_case_insensitive(table: Table) -> None: with pytest.raises(ValueError) as exc_info: update = UpdateSchema(schema_, table) - update.allow_incompatible_changes().case_sensitive(False).add_column(name="ID", type_var=IntegerType(), required=True) + update.allow_incompatible_changes().case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) assert "already exists: ID" in str(exc_info.value) new_schema = ( UpdateSchema(schema_, table) # pylint: disable=W0212 .allow_incompatible_changes() - .add_column(name="ID", type_var=IntegerType(), required=True) + .add_column(path="ID", field_type=IntegerType(), required=True) ._apply() ) assert new_schema == Schema( - NestedField(field_id=1, name="id", field_type=BooleanType(), required=False), + NestedField(field_id=1, path="id", field_type=BooleanType(), required=False), NestedField(field_id=2, name="ID", field_type=IntegerType(), required=True), schema_id=0, identifier_field_ids=[], From f9fc3053d65a62887218445d693ea25e3feed9a8 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 24 Aug 2023 18:54:28 +0200 Subject: [PATCH 08/17] Step into the right direction --- python/pyiceberg/schema.py | 25 ++ python/pyiceberg/table/__init__.py | 80 ++-- python/tests/conftest.py | 2 +- python/tests/table/test_init.py | 202 +-------- python/tests/test_integration.py | 205 +-------- python/tests/test_integration_schema.py | 569 ++++++++++++++++++++++++ 6 files 
changed, 643 insertions(+), 440 deletions(-) create mode 100644 python/tests/test_integration_schema.py diff --git a/python/pyiceberg/schema.py b/python/pyiceberg/schema.py index 3814586baeda..57708076992c 100644 --- a/python/pyiceberg/schema.py +++ b/python/pyiceberg/schema.py @@ -245,6 +245,21 @@ def accessor_for_field(self, field_id: int) -> Accessor: return self._lazy_id_to_accessor[field_id] + def identifier_field_names(self) -> Set[str]: + """Returns the names of the identifier fields. + + Returns: + Set of names of the identifier fields + """ + ids = set() + for field_id in self.identifier_field_ids: + column_name = self.find_column_name(field_id) + if column_name is None: + raise ValueError(f"Could not find identifier column id: {field_id}") + ids.add(column_name) + + return ids + def select(self, *names: str, case_sensitive: bool = True) -> Schema: """Return a new schema instance pruned to a subset of columns. @@ -1155,6 +1170,16 @@ def primitive(self, primitive: PrimitiveType) -> PrimitiveType: def prune_columns(schema: Schema, selected: Set[int], select_full_types: bool = True) -> Schema: + """Prunes a column by only selecting a set of field-ids. + + Args: + schema: The schema to be pruned. + selected: The field-ids to be included. + select_full_types: Return the full struct when a subset is recorded + + Returns: + The pruned schema. + """ result = visit(schema.as_struct(), _PruneColumnsVisitor(selected, select_full_types)) return Schema( *(result or StructType()).fields, diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index 2ddfcb6d28ff..199761c9da57 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -189,7 +189,7 @@ def update_schema(self) -> UpdateSchema: Returns: A new UpdateSchema. """ - return UpdateSchema(self._table.schema(), self._table, self) + return UpdateSchema(self._table, self) def remove_properties(self, *removals: str) -> Transaction: """Removes properties. 
@@ -519,8 +519,8 @@ def history(self) -> List[SnapshotLogEntry]: """Get the snapshot history of this table.""" return self.metadata.snapshot_log - def update_schema(self) -> UpdateSchema: - return UpdateSchema(self.schema(), self) + def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema: + return UpdateSchema(self, allow_incompatible_changes=allow_incompatible_changes, case_sensitive=case_sensitive) def _do_commit(self, request: CommitTableRequest) -> None: response = self.catalog._commit_table(request) # pylint: disable=W0212 @@ -903,7 +903,7 @@ class UpdateSchema: _table: Table _schema: Schema _last_column_id: itertools.count[int] - _identifier_field_names: List[str] + _identifier_field_names: Set[str] _adds: Dict[int, List[NestedField]] = {} _updates: Dict[int, NestedField] = {} @@ -918,16 +918,15 @@ class UpdateSchema: def __init__( self, - schema: Schema, table: Table, transaction: Optional[Transaction] = None, allow_incompatible_changes: bool = False, case_sensitive: bool = True, ) -> None: self._table = table - self._schema = schema - self._last_column_id = itertools.count(schema.highest_field_id + 1) - self._identifier_field_names = schema.column_names + self._schema = table.schema() + self._last_column_id = itertools.count(self._schema.highest_field_id + 1) + self._identifier_field_names = self._schema.identifier_field_names() self._adds = {} self._updates = {} @@ -1088,7 +1087,7 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], path_to: Union[s if path_from in self._identifier_field_names: self._identifier_field_names.remove(full_name_from) - self._identifier_field_names.append(full_name_to) + self._identifier_field_names.add(full_name_to) return self @@ -1123,7 +1122,7 @@ def _set_column_requirement(self, path: Union[str, Tuple[str, ...]], required: b path = (path,) if isinstance(path, str) else path name = ".".join(path) - field = self._schema.find_field(name) + field = 
self._schema.find_field(name, self._case_sensitive) if (field.required and required) or (field.optional and not required): # if the change is a noop, allow it even if allowIncompatibleChanges is false @@ -1165,7 +1164,7 @@ def update_column(self, path: Union[str, Tuple[str, ...]], field_type: IcebergTy path = (path,) if isinstance(path, str) else path full_name = ".".join(path) - field = self._schema.find_field(full_name) + field = self._schema.find_field(full_name, self._case_sensitive) if field.field_id in self._deletes: raise ValueError(f"Cannot update a column that will be deleted: {full_name}") @@ -1206,7 +1205,7 @@ def update_column_doc(self, path: Union[str, Tuple[str, ...]], doc: str) -> Upda path = (path,) if isinstance(path, str) else path full_name = ".".join(path) - field = self._schema.find_field(full_name) + field = self._schema.find_field(full_name, self._case_sensitive) if field.field_id in self._deletes: raise ValueError(f"Cannot update a column that will be deleted: {full_name}") @@ -1244,7 +1243,7 @@ def _find_for_move(self, name: str) -> Optional[int]: def _move(self, full_name: str, move: Move) -> None: if parent_name := self._id_to_parent.get(move.field_id): - parent_field = self._schema.find_field(parent_name) + parent_field = self._schema.find_field(parent_name, self._case_sensitive) if not parent_field.is_struct: raise ValueError(f"Cannot move fields in non-struct type: {parent_field}") @@ -1361,19 +1360,22 @@ def allow_incompatible_changes(self) -> UpdateSchema: def commit(self) -> None: """Apply the pending changes and commit.""" new_schema = self._apply() - updates = [ - AddSchemaUpdate(schema=new_schema, last_column_id=new_schema.highest_field_id), - SetCurrentSchemaUpdate(schema_id=-1), - ] - requirements = [AssertCurrentSchemaId(current_schema_id=self._schema.schema_id)] - if self._transaction is not None: - self._transaction._append_updates(*updates) # pylint: disable=W0212 - self._transaction._append_requirements(*requirements) # 
pylint: disable=W0212 - else: - self._table._do_commit( # pylint: disable=W0212 - CommitTableRequest(identifier=self._table.identifier[1:], updates=updates, requirements=requirements) - ) + if new_schema != self._schema: + last_column_id = max(self._schema.highest_field_id, new_schema.highest_field_id) + updates = [ + AddSchemaUpdate(schema=new_schema, last_column_id=last_column_id), + SetCurrentSchemaUpdate(schema_id=-1), + ] + requirements = [AssertCurrentSchemaId(current_schema_id=self._schema.schema_id)] + + if self._transaction is not None: + self._transaction._append_updates(*updates) # pylint: disable=W0212 + self._transaction._append_requirements(*requirements) # pylint: disable=W0212 + else: + self._table._do_commit( # pylint: disable=W0212 + CommitTableRequest(identifier=self._table.identifier[1:], updates=updates, requirements=requirements) + ) def _apply(self) -> Schema: """Apply the pending changes to the original schema and returns the result. @@ -1386,16 +1388,26 @@ def _apply(self) -> Schema: # Should never happen raise ValueError("Could not apply changes") - schema = Schema(*struct.fields) + # @TODO: This differs still a bit from the Java side, + # The validate identifier field is missing + field_ids = set() for name in self._identifier_field_names: - try: - _ = schema.find_field(name) - except ValueError as e: - raise ValueError( - f"Cannot add field {name} as an identifier field: not found in current schema or added columns" - ) from e + field = self._schema.find_field(name, self._case_sensitive) + field_ids.add(field.field_id) + if field.field_id in self._deletes: + raise ValueError(f"Cannot delete identifier field {name}. 
To force deletion, update the identifier fields first.") + + # If it nested, also check if the parents aren't deleted + column_name = self._id_to_parent.get(field.field_id) + while column_name is not None: + parent = self._schema.find_field(column_name) + if parent.field_id in self._deletes: + raise ValueError( + f"Cannot delete field {parent.field_id} as it will delete nested identifier field {name}", + ) + column_name = self._id_to_parent.get(parent.field_id) - return schema + return Schema(*struct.fields, identifier_field_ids=field_ids) def assign_new_column_id(self) -> int: return next(self._last_column_id) @@ -1514,7 +1526,7 @@ def primitive(self, primitive: PrimitiveType) -> Optional[IcebergType]: def _add_fields(fields: Tuple[NestedField, ...], adds: Optional[List[NestedField]]) -> Optional[Tuple[NestedField, ...]]: adds = adds or [] - return None if len(adds) == 0 else tuple(*fields, *adds) + return None if len(adds) == 0 else fields + tuple(adds) def _move_fields(fields: Tuple[NestedField, ...], moves: List[Move]) -> Tuple[NestedField, ...]: diff --git a/python/tests/conftest.py b/python/tests/conftest.py index 67fc9927809e..870996c235be 100644 --- a/python/tests/conftest.py +++ b/python/tests/conftest.py @@ -192,7 +192,7 @@ def table_schema_nested() -> Schema: required=False, ), schema_id=1, - identifier_field_ids=[1], + identifier_field_ids=[2], ) diff --git a/python/tests/table/test_init.py b/python/tests/table/test_init.py index b90fd2d8773f..a7b14a8e27ce 100644 --- a/python/tests/table/test_init.py +++ b/python/tests/table/test_init.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. 
# pylint:disable=redefined-outer-name -from typing import Dict import pytest from sortedcontainers import SortedList @@ -40,7 +39,6 @@ SetPropertiesUpdate, StaticTable, Table, - UpdateSchema, _match_deletes_to_datafile, ) from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER @@ -57,25 +55,7 @@ SortOrder, ) from pyiceberg.transforms import BucketTransform, IdentityTransform -from pyiceberg.types import ( - BinaryType, - BooleanType, - DateType, - DoubleType, - FloatType, - IntegerType, - ListType, - LongType, - MapType, - NestedField, - PrimitiveType, - StringType, - StructType, - TimestampType, - TimestamptzType, - TimeType, - UUIDType, -) +from pyiceberg.types import LongType, NestedField def test_schema(table: Table) -> None: @@ -394,183 +374,3 @@ def test_match_deletes_to_datafile_duplicate_number() -> None: def test_serialize_set_properties_updates() -> None: assert SetPropertiesUpdate(updates={"abc": "🤪"}).model_dump_json() == """{"action":"set-properties","updates":{"abc":"🤪"}}""" - - -def test_add_column(table_schema_simple: Schema, table: Table) -> None: - update = UpdateSchema(table_schema_simple, table) - update.add_column(path="b", field_type=IntegerType()) - apply_schema: Schema = update._apply() # pylint: disable=W0212 - assert len(apply_schema.fields) == 4 - - assert apply_schema == Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=False), - NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), - NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), - NestedField(field_id=4, name="b", field_type=IntegerType(), required=False), - ) - assert apply_schema.schema_id == 0 - assert apply_schema.highest_field_id == 4 - - -def test_add_primitive_type_column(table_schema_simple: Schema, table: Table) -> None: - primitive_type: Dict[str, PrimitiveType] = { - "boolean": BooleanType(), - "int": IntegerType(), - "long": LongType(), - "float": FloatType(), - "double": 
DoubleType(), - "date": DateType(), - "time": TimeType(), - "timestamp": TimestampType(), - "timestamptz": TimestamptzType(), - "string": StringType(), - "uuid": UUIDType(), - "binary": BinaryType(), - } - - for name, type_ in primitive_type.items(): - field_name = f"new_column_{name}" - update = UpdateSchema(table_schema_simple, table) - update.add_column(path=field_name, field_type=type_, doc=f"new_column_{name}") - new_schema = update._apply() # pylint: disable=W0212 - - field: NestedField = new_schema.find_field(field_name) - assert field.field_type == type_ - assert field.doc == f"new_column_{name}" - - -def test_add_nested_type_column(table_schema_simple: Schema, table: Table) -> None: - # add struct type column - field_name = "new_column_struct" - update = UpdateSchema(table_schema_simple, table) - struct_ = StructType( - NestedField(1, "lat", DoubleType()), - NestedField(2, "long", DoubleType()), - ) - update.add_column(path=field_name, field_type=struct_) - schema_ = update._apply() # pylint: disable=W0212 - field: NestedField = schema_.find_field(field_name) - assert field.field_type == StructType( - NestedField(5, "lat", DoubleType()), - NestedField(6, "long", DoubleType()), - ) - assert schema_.highest_field_id == 6 - - -def test_add_nested_map_type_column(table_schema_simple: Schema, table: Table) -> None: - # add map type column - field_name = "new_column_map" - update = UpdateSchema(table_schema_simple, table) - map_ = MapType(1, StringType(), 2, IntegerType(), False) - update.add_column(path=field_name, field_type=map_) - new_schema = update._apply() # pylint: disable=W0212 - field: NestedField = new_schema.find_field(field_name) - assert field.field_type == MapType(5, StringType(), 6, IntegerType(), False) - assert new_schema.highest_field_id == 6 - - -def test_add_nested_list_type_column(table_schema_simple: Schema, table: Table) -> None: - # add list type column - field_name = "new_column_list" - update = UpdateSchema(table_schema_simple, table) 
- list_ = ListType( - element_id=101, - element_type=StructType( - NestedField(102, "lat", DoubleType()), - NestedField(103, "long", DoubleType()), - ), - element_required=False, - ) - update.add_column(path=field_name, field_type=list_) - new_schema = update._apply() # pylint: disable=W0212 - field: NestedField = new_schema.find_field(field_name) - assert field.field_type == ListType( - element_id=5, - element_type=StructType( - NestedField(6, "lat", DoubleType()), - NestedField(7, "long", DoubleType()), - ), - element_required=False, - ) - assert new_schema.highest_field_id == 7 - - -def test_add_field_to_map_key(table_schema_nested_with_struct_key_map: Schema, table: Table) -> None: - with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(table_schema_nested_with_struct_key_map, table) - update.add_column(path=("location", "key", "b"), field_type=IntegerType())._apply() # pylint: disable=W0212 - assert "Cannot add fields to map keys" in str(exc_info.value) - - -def test_add_already_exists(table_schema_nested: Schema, table: Table) -> None: - update = UpdateSchema(table_schema_nested, table) - - with pytest.raises(ValueError) as exc_info: - update.add_column("foo", IntegerType()) - assert "already exists: foo" in str(exc_info.value) - - with pytest.raises(ValueError) as exc_info: - update.add_column(path=("location", "latitude"), field_type=IntegerType()) - assert "already exists: location.latitude" in str(exc_info.value) - - -def test_ambigous_column(table_schema_nested: Schema, table: Table) -> None: - update = UpdateSchema(table_schema_nested, table) - - with pytest.raises(ValueError) as exc_info: - update.add_column(path="location.latitude", field_type=IntegerType()) - assert "Cannot add column with ambiguous name: location.latitude, provide a tuple instead" in str(exc_info.value) - - -def test_add_to_non_struct_type(table_schema_simple: Schema, table: Table) -> None: - update = UpdateSchema(table_schema_simple, table) - with 
pytest.raises(ValueError) as exc_info: - update.add_column(path=("foo", "lat"), field_type=IntegerType()) - assert "Cannot add column 'lat' to non-struct type" in str(exc_info.value) - - -def test_add_required_column(table: Table) -> None: - schema_ = Schema( - NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] - ) - update = UpdateSchema(schema_, table) - with pytest.raises(ValueError) as exc_info: - update.add_column(path="data", field_type=IntegerType(), required=True) - assert "Incompatible change: cannot add required column: data" in str(exc_info.value) - - new_schema = ( - UpdateSchema(schema_, table) # pylint: disable=W0212 - .allow_incompatible_changes() - .add_column(path="data", field_type=IntegerType(), required=True) - ._apply() - ) - assert new_schema == Schema( - NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), - NestedField(field_id=2, name="data", field_type=IntegerType(), required=True), - schema_id=0, - identifier_field_ids=[], - ) - - -def test_add_required_column_case_insensitive(table: Table) -> None: - schema_ = Schema( - NestedField(field_id=1, name="id", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] - ) - - with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(schema_, table) - update.allow_incompatible_changes().case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) - assert "already exists: ID" in str(exc_info.value) - - new_schema = ( - UpdateSchema(schema_, table) # pylint: disable=W0212 - .allow_incompatible_changes() - .add_column(path="ID", field_type=IntegerType(), required=True) - ._apply() - ) - assert new_schema == Schema( - NestedField(field_id=1, path="id", field_type=BooleanType(), required=False), - NestedField(field_id=2, name="ID", field_type=IntegerType(), required=True), - schema_id=0, - identifier_field_ids=[], - ) diff --git 
a/python/tests/test_integration.py b/python/tests/test_integration.py index 4dcdff966c1a..a63436bdaead 100644 --- a/python/tests/test_integration.py +++ b/python/tests/test_integration.py @@ -25,7 +25,7 @@ from pyarrow.fs import S3FileSystem from pyiceberg.catalog import Catalog, load_catalog -from pyiceberg.exceptions import CommitFailedException, NoSuchTableError +from pyiceberg.exceptions import NoSuchTableError from pyiceberg.expressions import ( And, EqualTo, @@ -40,18 +40,10 @@ from pyiceberg.table import Table from pyiceberg.types import ( BooleanType, - DoubleType, - FixedType, - FloatType, IntegerType, - ListType, - LongType, - MapType, NestedField, StringType, - StructType, TimestampType, - UUIDType, ) @@ -360,198 +352,3 @@ def test_unpartitioned_fixed_table(catalog: Catalog) -> None: b"12345678901234567ass12345", b"qweeqwwqq1231231231231111", ] - - -@pytest.mark.integration -def test_schema_evolution(catalog: Catalog) -> None: - try: - catalog.drop_table("default.test_schema_evolution") - except NoSuchTableError: - pass - - schema = Schema( - NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), - NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), - ) - - t = catalog.create_table(identifier="default.test_schema_evolution", schema=schema) - - assert t.schema() == schema - - with t.update_schema() as tx: - tx.add_column("col_string", StringType()) - - assert t.schema() == Schema( - NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), - NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), - NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), - schema_id=1, - ) - - -@pytest.mark.integration -def test_schema_evolution_via_transaction(catalog: Catalog) -> None: - tbl_name = "default.test_schema_evolution_via_transaction" - try: - catalog.drop_table(tbl_name) - except NoSuchTableError: - pass - - schema = 
Schema( - NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), - NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), - ) - - tbl = catalog.create_table(identifier=tbl_name, schema=schema) - - assert tbl.schema() == schema - - with tbl.transaction() as tx: - tx.update_schema().add_column("col_string", StringType()).commit() - - assert tbl.schema() == Schema( - NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), - NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), - NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), - schema_id=1, - ) - - tbl.update_schema().add_column("col_integer", IntegerType()).commit() - - assert tbl.schema() == Schema( - NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), - NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), - NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), - NestedField(field_id=4, name="col_integer", field_type=IntegerType(), required=False), - schema_id=1, - ) - - with pytest.raises(CommitFailedException) as exc_info: - with tbl.transaction() as tx: - # Start a new update - schema_update = tx.update_schema() - - # Do a concurrent update - tbl.update_schema().add_column("col_long", LongType()).commit() - - # stage another update in the transaction - schema_update.add_column("col_double", DoubleType()).commit() - - assert "Requirement failed: current schema changed: expected id 2 != 3" in str(exc_info.value) - - assert tbl.schema() == Schema( - NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), - NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), - NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), - NestedField(field_id=4, name="col_integer", field_type=IntegerType(), 
required=False), - NestedField(field_id=5, name="col_long", field_type=LongType(), required=False), - schema_id=1, - ) - - -@pytest.mark.integration -def test_schema_evolution_nested(catalog: Catalog) -> None: - tbl_name = "default.test_schema_evolution_nested" - try: - catalog.drop_table(tbl_name) - except NoSuchTableError: - pass - - nested_schema = Schema( - NestedField( - field_id=1, - name="location_lookup", - field_type=MapType( - key_id=10, - key_type=StringType(), - value_id=11, - value_type=StructType( - NestedField(field_id=110, name="x", field_type=FloatType(), required=False), - NestedField(field_id=111, name="y", field_type=FloatType(), required=False), - ), - element_required=True, - ), - required=True, - ), - NestedField( - field_id=2, - name="locations", - field_type=ListType( - element_id=20, - element_type=StructType( - NestedField(field_id=200, name="x", field_type=FloatType(), required=False), - NestedField(field_id=201, name="y", field_type=FloatType(), required=False), - ), - element_required=True, - ), - required=True, - ), - NestedField( - field_id=3, - name="person", - field_type=StructType( - NestedField(field_id=30, name="name", field_type=StringType(), required=False), - NestedField(field_id=31, name="age", field_type=IntegerType(), required=True), - ), - required=False, - ), - schema_id=1, - ) - - tbl = catalog.create_table(identifier=tbl_name, schema=nested_schema) - - assert tbl.schema().highest_field_id == 12 - - with tbl.update_schema() as schema_update: - schema_update.add_column(("location_lookup", "z"), FloatType()) - schema_update.add_column(("locations", "z"), FloatType()) - schema_update.add_column(("person", "address"), StringType()) - - assert str(tbl.schema()) == str( - Schema( - NestedField( - field_id=1, - name="location_lookup", - field_type=MapType( - type="map", - key_id=4, - key_type=StringType(), - value_id=5, - value_type=StructType( - NestedField(field_id=6, name="x", field_type=FloatType(), required=False), - 
NestedField(field_id=7, name="y", field_type=FloatType(), required=False), - NestedField(field_id=13, name="z", field_type=FloatType(), required=False), - ), - value_required=True, - ), - required=True, - ), - NestedField( - field_id=2, - name="locations", - field_type=ListType( - type="list", - element_id=8, - element_type=StructType( - NestedField(field_id=9, name="x", field_type=FloatType(), required=False), - NestedField(field_id=10, name="y", field_type=FloatType(), required=False), - NestedField(field_id=14, name="z", field_type=FloatType(), required=False), - ), - element_required=True, - ), - required=True, - ), - NestedField( - field_id=3, - name="person", - field_type=StructType( - NestedField(field_id=11, name="name", field_type=StringType(), required=False), - NestedField(field_id=12, name="age", field_type=IntegerType(), required=True), - NestedField(field_id=15, name="address", field_type=StringType(), required=False), - ), - required=False, - ), - schema_id=1, - identifier_field_ids=[], - ) - ) diff --git a/python/tests/test_integration_schema.py b/python/tests/test_integration_schema.py new file mode 100644 index 000000000000..342a899a3afb --- /dev/null +++ b/python/tests/test_integration_schema.py @@ -0,0 +1,569 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint:disable=redefined-outer-name +from typing import Dict + +import pytest + +from pyiceberg.catalog import Catalog, load_catalog +from pyiceberg.exceptions import CommitFailedException, NoSuchTableError +from pyiceberg.schema import Schema, prune_columns +from pyiceberg.table import Table, UpdateSchema +from pyiceberg.types import ( + BooleanType, + DoubleType, + FixedType, + FloatType, + IntegerType, + ListType, + LongType, + MapType, + NestedField, + StringType, + StructType, + UUIDType, PrimitiveType, DateType, TimeType, TimestampType, TimestamptzType, BinaryType, +) + + +@pytest.fixture() +def catalog() -> Catalog: + return load_catalog( + "local", + **{ + "type": "rest", + "uri": "http://localhost:8181", + "s3.endpoint": "http://localhost:9000", + "s3.access-key-id": "admin", + "s3.secret-access-key": "password", + }, + ) + + + +@pytest.fixture() +def simple_table(catalog: Catalog, table_schema_simple: Schema) -> Table: + return _create_table_with_schema(catalog, table_schema_simple) + + + +def test_add_column(simple_table: Table) -> None: + update = UpdateSchema(simple_table) + update.add_column(path="b", field_type=IntegerType()) + apply_schema: Schema = update._apply() # pylint: disable=W0212 + assert len(apply_schema.fields) == 4 + + assert apply_schema == Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), + NestedField(field_id=4, name="b", field_type=IntegerType(), required=False), + ) + assert apply_schema.schema_id == 0 + assert apply_schema.highest_field_id == 4 + + +def test_add_primitive_type_column(simple_table: Table) -> None: + primitive_type: Dict[str, PrimitiveType] = { + "boolean": BooleanType(), + "int": IntegerType(), + "long": 
LongType(), + "float": FloatType(), + "double": DoubleType(), + "date": DateType(), + "time": TimeType(), + "timestamp": TimestampType(), + "timestamptz": TimestamptzType(), + "string": StringType(), + "uuid": UUIDType(), + "binary": BinaryType(), + } + + for name, type_ in primitive_type.items(): + field_name = f"new_column_{name}" + update = UpdateSchema(simple_table) + update.add_column(path=field_name, field_type=type_, doc=f"new_column_{name}") + new_schema = update._apply() # pylint: disable=W0212 + + field: NestedField = new_schema.find_field(field_name) + assert field.field_type == type_ + assert field.doc == f"new_column_{name}" + + +def test_add_nested_type_column(simple_table: Table) -> None: + # add struct type column + field_name = "new_column_struct" + update = UpdateSchema(simple_table) + struct_ = StructType( + NestedField(1, "lat", DoubleType()), + NestedField(2, "long", DoubleType()), + ) + update.add_column(path=field_name, field_type=struct_) + schema_ = update._apply() # pylint: disable=W0212 + field: NestedField = schema_.find_field(field_name) + assert field.field_type == StructType( + NestedField(5, "lat", DoubleType()), + NestedField(6, "long", DoubleType()), + ) + assert schema_.highest_field_id == 6 + + +def test_add_nested_map_type_column(simple_table: Table) -> None: + # add map type column + field_name = "new_column_map" + update = UpdateSchema(simple_table) + map_ = MapType(1, StringType(), 2, IntegerType(), False) + update.add_column(path=field_name, field_type=map_) + new_schema = update._apply() # pylint: disable=W0212 + field: NestedField = new_schema.find_field(field_name) + assert field.field_type == MapType(5, StringType(), 6, IntegerType(), False) + assert new_schema.highest_field_id == 6 + + +def test_add_nested_list_type_column(simple_table: Table) -> None: + # add list type column + field_name = "new_column_list" + update = UpdateSchema(simple_table) + list_ = ListType( + element_id=101, + element_type=StructType( + 
NestedField(102, "lat", DoubleType()), + NestedField(103, "long", DoubleType()), + ), + element_required=False, + ) + update.add_column(path=field_name, field_type=list_) + new_schema = update._apply() # pylint: disable=W0212 + field: NestedField = new_schema.find_field(field_name) + assert field.field_type == ListType( + element_id=5, + element_type=StructType( + NestedField(6, "lat", DoubleType()), + NestedField(7, "long", DoubleType()), + ), + element_required=False, + ) + assert new_schema.highest_field_id == 7 + + +def test_add_field_to_map_key(catalog: Catalog, table_schema_nested_with_struct_key_map: Schema, table: Table) -> None: + table = _create_table_with_schema(catalog, table_schema_nested_with_struct_key_map) + with pytest.raises(ValueError) as exc_info: + update = UpdateSchema(table) + update.add_column(path=("location", "key", "b"), field_type=IntegerType())._apply() # pylint: disable=W0212 + assert "Cannot add fields to map keys" in str(exc_info.value) + + +def _create_table_with_schema(catalog: Catalog, schema: Schema) -> Table: + tbl_name = "default.test_schema_evolution" + try: + catalog.drop_table(tbl_name) + except NoSuchTableError: + pass + return catalog.create_table(identifier=tbl_name, schema=schema) + + +def test_add_already_exists(catalog: Catalog, table_schema_nested: Schema) -> None: + table = _create_table_with_schema(catalog, table_schema_nested) + update = UpdateSchema(table) + + with pytest.raises(ValueError) as exc_info: + update.add_column("foo", IntegerType()) + assert "already exists: foo" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + update.add_column(path=("location", "latitude"), field_type=IntegerType()) + assert "already exists: location.latitude" in str(exc_info.value) + + +def test_ambiguous_column(catalog: Catalog, table_schema_nested: Schema) -> None: + table = _create_table_with_schema(catalog, table_schema_nested) + update = UpdateSchema(table) + + with pytest.raises(ValueError) as exc_info: 
+ update.add_column(path="location.latitude", field_type=IntegerType()) + assert "Cannot add column with ambiguous name: location.latitude, provide a tuple instead" in str(exc_info.value) + + +def test_add_to_non_struct_type(catalog: Catalog, table_schema_simple: Schema) -> None: + table = _create_table_with_schema(catalog, table_schema_simple) + update = UpdateSchema(table) + with pytest.raises(ValueError) as exc_info: + update.add_column(path=("foo", "lat"), field_type=IntegerType()) + assert "Cannot add column 'lat' to non-struct type" in str(exc_info.value) + + +def test_add_required_column(catalog: Catalog) -> None: + schema_ = Schema( + NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] + ) + table = _create_table_with_schema(catalog, schema_) + update = UpdateSchema(table) + with pytest.raises(ValueError) as exc_info: + update.add_column(path="data", field_type=IntegerType(), required=True) + assert "Incompatible change: cannot add required column: data" in str(exc_info.value) + + new_schema = ( + UpdateSchema(table) # pylint: disable=W0212 + .allow_incompatible_changes() + .add_column(path="data", field_type=IntegerType(), required=True) + ._apply() + ) + assert new_schema == Schema( + NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), + NestedField(field_id=2, name="data", field_type=IntegerType(), required=True), + schema_id=0, + identifier_field_ids=[], + ) + + +def test_add_required_column_case_insensitive(catalog: Catalog) -> None: + schema_ = Schema( + NestedField(field_id=1, name="id", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] + ) + table = _create_table_with_schema(catalog, schema_) + + with pytest.raises(ValueError) as exc_info: + update = UpdateSchema(table) + update.allow_incompatible_changes().case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) + assert "already exists: ID" in 
str(exc_info.value) + + new_schema = ( + UpdateSchema(table) # pylint: disable=W0212 + .allow_incompatible_changes() + .add_column(path="ID", field_type=IntegerType(), required=True) + ._apply() + ) + assert new_schema == Schema( + NestedField(field_id=1, path="id", field_type=BooleanType(), required=False), + NestedField(field_id=2, name="ID", field_type=IntegerType(), required=True), + schema_id=0, + identifier_field_ids=[], + ) + + +@pytest.mark.integration +def test_schema_evolution_via_transaction(catalog: Catalog) -> None: + schema = Schema( + NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), + NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), + ) + tbl = _create_table_with_schema(catalog, schema) + + assert tbl.schema() == schema + + with tbl.transaction() as tx: + tx.update_schema().add_column("col_string", StringType()).commit() + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), + NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), + NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), + schema_id=1, + ) + + tbl.update_schema().add_column("col_integer", IntegerType()).commit() + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), + NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), + NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), + NestedField(field_id=4, name="col_integer", field_type=IntegerType(), required=False), + schema_id=1, + ) + + with pytest.raises(CommitFailedException) as exc_info: + with tbl.transaction() as tx: + # Start a new update + schema_update = tx.update_schema() + + # Do a concurrent update + tbl.update_schema().add_column("col_long", LongType()).commit() + + # stage another update in the transaction + 
schema_update.add_column("col_double", DoubleType()).commit() + + assert "Requirement failed: current schema changed: expected id 2 != 3" in str(exc_info.value) + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), + NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), + NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), + NestedField(field_id=4, name="col_integer", field_type=IntegerType(), required=False), + NestedField(field_id=5, name="col_long", field_type=LongType(), required=False), + schema_id=1, + ) + + +@pytest.mark.integration +def test_schema_evolution_nested(catalog: Catalog) -> None: + nested_schema = Schema( + NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + key_id=10, + key_type=StringType(), + value_id=11, + value_type=StructType( + NestedField(field_id=110, name="x", field_type=FloatType(), required=False), + NestedField(field_id=111, name="y", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + element_id=20, + element_type=StructType( + NestedField(field_id=200, name="x", field_type=FloatType(), required=False), + NestedField(field_id=201, name="y", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=3, + name="person", + field_type=StructType( + NestedField(field_id=30, name="name", field_type=StringType(), required=False), + NestedField(field_id=31, name="age", field_type=IntegerType(), required=True), + ), + required=False, + ), + schema_id=1, + ) + + tbl = _create_table_with_schema(catalog, nested_schema) + + assert tbl.schema().highest_field_id == 12 + + with tbl.update_schema() as schema_update: + schema_update.add_column(("location_lookup", "z"), FloatType()) + 
schema_update.add_column(("locations", "z"), FloatType()) + schema_update.add_column(("person", "address"), StringType()) + + assert str(tbl.schema()) == str( + Schema( + NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + type="map", + key_id=4, + key_type=StringType(), + value_id=5, + value_type=StructType( + NestedField(field_id=6, name="x", field_type=FloatType(), required=False), + NestedField(field_id=7, name="y", field_type=FloatType(), required=False), + NestedField(field_id=13, name="z", field_type=FloatType(), required=False), + ), + value_required=True, + ), + required=True, + ), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + type="list", + element_id=8, + element_type=StructType( + NestedField(field_id=9, name="x", field_type=FloatType(), required=False), + NestedField(field_id=10, name="y", field_type=FloatType(), required=False), + NestedField(field_id=14, name="z", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=3, + name="person", + field_type=StructType( + NestedField(field_id=11, name="name", field_type=StringType(), required=False), + NestedField(field_id=12, name="age", field_type=IntegerType(), required=True), + NestedField(field_id=15, name="address", field_type=StringType(), required=False), + ), + required=False, + ), + schema_id=1, + identifier_field_ids=[], + ) + ) + +schema_nested = Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), + NestedField( + field_id=4, + name="qux", + field_type=ListType(type="list", element_id=8, element_type=StringType(), element_required=True), + required=True, + ), + NestedField( + field_id=5, + name="quux", + field_type=MapType( + type="map", + key_id=9, + key_type=StringType(), + 
value_id=10, + value_type=MapType( + type="map", key_id=11, key_type=StringType(), value_id=12, value_type=IntegerType(), value_required=True + ), + value_required=True, + ), + required=True, + ), + NestedField( + field_id=6, + name="location", + field_type=ListType( + type="list", + element_id=13, + element_type=StructType( + fields=( + NestedField(field_id=14, name="latitude", field_type=FloatType(), required=False), + NestedField(field_id=15, name="longitude", field_type=FloatType(), required=False), + ) + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=7, + name="person", + field_type=StructType( + fields=( + NestedField(field_id=16, name="name", field_type=StringType(), required=False), + NestedField(field_id=17, name="age", field_type=IntegerType(), required=True), + ) + ), + required=False, + ), + schema_id=0, + identifier_field_ids=[2], +) + + +@pytest.fixture() +def nested_table(catalog: Catalog) -> Table: + return _create_table_with_schema(catalog, schema_nested) + + +def test_no_changes(simple_table: Table, table_schema_simple: Schema) -> None: + with simple_table.update_schema() as _: + pass + + assert simple_table.schema() == table_schema_simple + + +def test_delete_field(simple_table: Table) -> None: + with simple_table.update_schema() as schema_update: + schema_update.delete_column("foo") + + assert simple_table.schema() == Schema( + # foo is missing 👍 + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), + schema_id=1, + identifier_field_ids=[2], + ) + + +def test_delete_field_case_insensitive(simple_table: Table) -> None: + with simple_table.update_schema(case_sensitive=False) as schema_update: + schema_update.delete_column("FOO") + + assert simple_table.schema() == Schema( + # foo is missing 👍 + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", 
field_type=BooleanType(), required=False), + schema_id=1, + identifier_field_ids=[2], + ) + + +def test_delete_identifier_fields(simple_table: Table) -> None: + with pytest.raises(ValueError) as exc_info: + with simple_table.update_schema() as schema_update: + schema_update.delete_column("bar") + + assert str(exc_info) == "Cannot delete identifier field bar. To force deletion, update the identifier fields first." + + +@pytest.mark.skip(reason="REST Catalog gives an error") +def test_delete_identifier_fields_nested(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), + NestedField( + field_id=2, + name="person", + field_type=StructType( + NestedField(field_id=3, name="name", field_type=StringType(), required=True), + NestedField(field_id=4, name="age", field_type=IntegerType(), required=True), + ), + required=True, + ), + schema_id=1, + identifier_field_ids=[], + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.delete_column("person") + + assert str(exc_info) == "Cannot delete field person as it will delete nested identifier field name." 
+ + +@pytest.mark.parametrize( + "field", + [ + "foo", + "baz", + "qux", + "quux", + "location", + "location.element.latitude", + "location.element.longitude", + "person", + "person.name", + "person.age", + ], +) +def test_deletes(field: str, nested_table: Table) -> None: + with nested_table.update_schema() as schema_update: + schema_update.delete_column(field) + + selected_ids = { + field_id + for field_id in schema_nested.field_ids + if not isinstance(schema_nested.find_field(field_id).field_type, (MapType, ListType)) + and not schema_nested.find_column_name(field_id).startswith(field) + } + expected_schema = prune_columns(schema_nested, selected_ids, select_full_types=False) + + assert expected_schema == nested_table.schema() From 0c4405d86296b4a3a0e7075a955962802463b298 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 24 Aug 2023 22:08:20 +0200 Subject: [PATCH 09/17] MOAR Tests --- python/pyiceberg/table/__init__.py | 7 + python/tests/test_integration_schema.py | 268 ++++++++++++++++++++++-- 2 files changed, 262 insertions(+), 13 deletions(-) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index 199761c9da57..a44c68e8db4f 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -41,6 +41,7 @@ from pydantic import Field, SerializeAsAny from sortedcontainers import SortedList +from pyiceberg.exceptions import ResolveError, ValidationError from pyiceberg.expressions import ( AlwaysTrue, And, @@ -64,6 +65,7 @@ Schema, SchemaVisitor, assign_fresh_schema_ids, + promote, visit, ) from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadata @@ -1173,6 +1175,11 @@ def update_column(self, path: Union[str, Tuple[str, ...]], field_type: IcebergTy # Nothing changed return self + try: + promote(field.field_type, field_type) + except ResolveError as e: + raise ValidationError(f"Cannot change column type: {full_name}: {field.field_type} -> {field_type}") from e + if updated := 
self._updates.get(field.field_id): self._updates[field.field_id] = NestedField( field_id=updated.field_id, diff --git a/python/tests/test_integration_schema.py b/python/tests/test_integration_schema.py index 342a899a3afb..f7a37d968fca 100644 --- a/python/tests/test_integration_schema.py +++ b/python/tests/test_integration_schema.py @@ -20,11 +20,14 @@ import pytest from pyiceberg.catalog import Catalog, load_catalog -from pyiceberg.exceptions import CommitFailedException, NoSuchTableError +from pyiceberg.exceptions import CommitFailedException, NoSuchTableError, ValidationError from pyiceberg.schema import Schema, prune_columns from pyiceberg.table import Table, UpdateSchema from pyiceberg.types import ( + BinaryType, BooleanType, + DateType, + DecimalType, DoubleType, FixedType, FloatType, @@ -33,9 +36,13 @@ LongType, MapType, NestedField, + PrimitiveType, StringType, StructType, - UUIDType, PrimitiveType, DateType, TimeType, TimestampType, TimestamptzType, BinaryType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, ) @@ -53,13 +60,11 @@ def catalog() -> Catalog: ) - @pytest.fixture() def simple_table(catalog: Catalog, table_schema_simple: Schema) -> Table: return _create_table_with_schema(catalog, table_schema_simple) - def test_add_column(simple_table: Table) -> None: update = UpdateSchema(simple_table) update.add_column(path="b", field_type=IntegerType()) @@ -409,6 +414,7 @@ def test_schema_evolution_nested(catalog: Catalog) -> None: ) ) + schema_nested = Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=False), NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), @@ -441,10 +447,8 @@ def test_schema_evolution_nested(catalog: Catalog) -> None: type="list", element_id=13, element_type=StructType( - fields=( - NestedField(field_id=14, name="latitude", field_type=FloatType(), required=False), - NestedField(field_id=15, name="longitude", field_type=FloatType(), required=False), - ) + 
NestedField(field_id=14, name="latitude", field_type=FloatType(), required=False), + NestedField(field_id=15, name="longitude", field_type=FloatType(), required=False), ), element_required=True, ), @@ -454,10 +458,8 @@ def test_schema_evolution_nested(catalog: Catalog) -> None: field_id=7, name="person", field_type=StructType( - fields=( - NestedField(field_id=16, name="name", field_type=StringType(), required=False), - NestedField(field_id=17, name="age", field_type=IntegerType(), required=True), - ) + NestedField(field_id=16, name="name", field_type=StringType(), required=False), + NestedField(field_id=17, name="age", field_type=IntegerType(), required=True), ), required=False, ), @@ -562,8 +564,248 @@ def test_deletes(field: str, nested_table: Table) -> None: field_id for field_id in schema_nested.field_ids if not isinstance(schema_nested.find_field(field_id).field_type, (MapType, ListType)) - and not schema_nested.find_column_name(field_id).startswith(field) + and not schema_nested.find_column_name(field_id).startswith(field) # type: ignore + } + expected_schema = prune_columns(schema_nested, selected_ids, select_full_types=False) + + assert expected_schema == nested_table.schema() + + +@pytest.mark.parametrize( + "field", + [ + "Foo", + "Baz", + "Qux", + "Quux", + "Location", + "Location.element.latitude", + "Location.element.longitude", + "Person", + "Person.name", + "Person.age", + ], +) +def test_deletes_case_insensitive(field: str, nested_table: Table) -> None: + with nested_table.update_schema(case_sensitive=False) as schema_update: + schema_update.delete_column(field) + + selected_ids = { + field_id + for field_id in schema_nested.field_ids + if not isinstance(schema_nested.find_field(field_id).field_type, (MapType, ListType)) + and not schema_nested.find_column_name(field_id).startswith(field.lower()) # type: ignore } expected_schema = prune_columns(schema_nested, selected_ids, select_full_types=False) assert expected_schema == nested_table.schema() + + 
+def test_update_types(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="bar", field_type=IntegerType(), required=True), + NestedField( + field_id=2, + name="location", + field_type=ListType( + type="list", + element_id=3, + element_type=StructType( + NestedField(field_id=4, name="latitude", field_type=FloatType(), required=False), + NestedField(field_id=5, name="longitude", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.update_column("bar", LongType()) + schema_update.update_column("location.latitude", DoubleType()) + schema_update.update_column("location.longitude", DoubleType()) + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="bar", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="location", + field_type=ListType( + type="list", + element_id=3, + element_type=StructType( + NestedField(field_id=4, name="latitude", field_type=DoubleType(), required=False), + NestedField(field_id=5, name="longitude", field_type=DoubleType(), required=False), + ), + element_required=True, + ), + required=True, + ), + ) + + +def test_update_types_case_insensitive(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="bar", field_type=IntegerType(), required=True), + NestedField( + field_id=2, + name="location", + field_type=ListType( + type="list", + element_id=3, + element_type=StructType( + NestedField(field_id=4, name="latitude", field_type=FloatType(), required=False), + NestedField(field_id=5, name="longitude", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + ), + ) + + with tbl.update_schema(case_sensitive=False) as schema_update: + schema_update.update_column("baR", LongType()) + schema_update.update_column("Location.Latitude", 
DoubleType()) + schema_update.update_column("Location.Longitude", DoubleType()) + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="bar", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="location", + field_type=ListType( + type="list", + element_id=3, + element_type=StructType( + NestedField(field_id=4, name="latitude", field_type=DoubleType(), required=False), + NestedField(field_id=5, name="longitude", field_type=DoubleType(), required=False), + ), + element_required=True, + ), + required=True, + ), + ) + + +allowed_promotions = [ + (StringType(), BinaryType()), + (BinaryType(), StringType()), + (IntegerType(), LongType()), + (FloatType(), DoubleType()), + (DecimalType(9, 2), DecimalType(18, 2)), +] + + +@pytest.mark.parametrize("from_type, to_type", allowed_promotions) +def test_allowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="bar", field_type=from_type, required=True), + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.update_column("bar", to_type) + + assert tbl.schema() == Schema(NestedField(field_id=1, name="bar", field_type=to_type, required=True)) + + +disallowed_promotions_types = [ + BooleanType(), + IntegerType(), + LongType(), + FloatType(), + DoubleType(), + DateType(), + TimeType(), + TimestampType(), + TimestamptzType(), + StringType(), + UUIDType(), + BinaryType(), + FixedType(3), + FixedType(4), + # We'll just allow Decimal promotions right now + # https://github.com/apache/iceberg/issues/8389 + # DecimalType(9, 2), + # DecimalType(9, 3), + # DecimalType(18, 2) +] + + +@pytest.mark.parametrize("from_type", disallowed_promotions_types) +@pytest.mark.parametrize("to_type", disallowed_promotions_types) +def test_disallowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + 
Schema( + NestedField(field_id=1, name="bar", field_type=from_type, required=True), + ), + ) + + if from_type != to_type and (from_type, to_type) not in allowed_promotions: + with pytest.raises(ValidationError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.update_column("bar", to_type) + + assert str(exc_info.value).startswith("Cannot change column type: bar:") + else: + with tbl.update_schema() as schema_update: + schema_update.update_column("bar", to_type) + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="bar", field_type=to_type, required=True), + ) + + +# def test_rename(): +# tbl = _create_table_with_schema( +# catalog, +# Schema( +# NestedField( +# field_id=1, +# name="location_lookup", +# field_type=MapType( +# key_id=10, +# key_type=StringType(), +# value_id=11, +# value_type=StructType( +# NestedField(field_id=110, name="x", field_type=FloatType(), required=False), +# NestedField(field_id=111, name="y", field_type=FloatType(), required=False), +# ), +# element_required=True, +# ), +# required=True, +# ), +# NestedField( +# field_id=2, +# name="locations", +# field_type=ListType( +# element_id=20, +# element_type=StructType( +# NestedField(field_id=200, name="x", field_type=FloatType(), required=False), +# NestedField(field_id=201, name="y", field_type=FloatType(), required=False), +# ), +# element_required=True, +# ), +# required=True, +# ), +# NestedField( +# field_id=3, +# name="person", +# field_type=StructType( +# NestedField(field_id=30, name="name", field_type=StringType(), required=False), +# NestedField(field_id=31, name="age", field_type=IntegerType(), required=True), +# ), +# required=False, +# ), +# schema_id=1, +# ), +# ) From 641d61e9e9012bf511d970ae3927fef1e58f1bb9 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 24 Aug 2023 22:46:45 +0200 Subject: [PATCH 10/17] MOAR tests for renaming of columns --- python/pyiceberg/table/__init__.py | 22 ++-- python/tests/test_integration_schema.py | 
143 ++++++++++++++++-------- 2 files changed, 109 insertions(+), 56 deletions(-) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index a44c68e8db4f..475daeda021a 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -1049,21 +1049,21 @@ def delete_column(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: return self - def rename_column(self, path_from: Union[str, Tuple[str, ...]], path_to: Union[str, Tuple[str, ...]]) -> UpdateSchema: + def rename_column(self, path_from: Union[str, Tuple[str, ...]], new_name: str) -> UpdateSchema: """Updates the name of a column. Args: path_from: The path to the column to be renamed. - path_to: The new path of the column. + new_name: The new path of the column. Returns: The UpdateSchema with the rename operation staged. """ - name_from = (path_from,) if isinstance(path_from, str) else path_from - name_to = (path_to,) if isinstance(path_to, str) else path_to + name_from = tuple(path_from.split(".")) if isinstance(path_from, str) else path_from + new_name = new_name.split(".")[-1] + parent_path = name_from[:-1] full_name_from = ".".join(name_from) - full_name_to = ".".join(name_to) from_field = self._schema.find_field(full_name_from, self._case_sensitive) @@ -1073,7 +1073,7 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], path_to: Union[s if updated := self._updates.get(from_field.field_id): self._updates[from_field.field_id] = NestedField( field_id=updated.field_id, - name=full_name_to, + name=new_name, field_type=updated.field_type, doc=updated.doc, required=updated.required, @@ -1081,7 +1081,7 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], path_to: Union[s else: self._updates[from_field.field_id] = NestedField( field_id=from_field.field_id, - name=full_name_to, + name=new_name, field_type=from_field.field_type, doc=from_field.doc, required=from_field.required, @@ -1089,7 +1089,7 @@ def rename_column(self, 
path_from: Union[str, Tuple[str, ...]], path_to: Union[s if path_from in self._identifier_field_names: self._identifier_field_names.remove(full_name_from) - self._identifier_field_names.add(full_name_to) + self._identifier_field_names.add(f"{'.'.join(parent_path)}.{new_name}") return self @@ -1459,7 +1459,7 @@ def struct(self, struct: StructType, field_results: List[Optional[IcebergType]]) required = field.required # There is an update - if update := self._updates.get(field): + if update := self._updates.get(field.field_id): name = update.name doc = update.doc required = update.required @@ -1469,9 +1469,7 @@ def struct(self, struct: StructType, field_results: List[Optional[IcebergType]]) else: has_changes = True new_fields.append( - NestedField( - field_id=field.field_id, name=field.name, field_type=result_type, required=field.required, doc=field.doc - ) + NestedField(field_id=field.field_id, name=name, field_type=result_type, required=required, doc=doc) ) if has_changes: diff --git a/python/tests/test_integration_schema.py b/python/tests/test_integration_schema.py index f7a37d968fca..861f67550076 100644 --- a/python/tests/test_integration_schema.py +++ b/python/tests/test_integration_schema.py @@ -765,47 +765,102 @@ def test_disallowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, ca ) -# def test_rename(): -# tbl = _create_table_with_schema( -# catalog, -# Schema( -# NestedField( -# field_id=1, -# name="location_lookup", -# field_type=MapType( -# key_id=10, -# key_type=StringType(), -# value_id=11, -# value_type=StructType( -# NestedField(field_id=110, name="x", field_type=FloatType(), required=False), -# NestedField(field_id=111, name="y", field_type=FloatType(), required=False), -# ), -# element_required=True, -# ), -# required=True, -# ), -# NestedField( -# field_id=2, -# name="locations", -# field_type=ListType( -# element_id=20, -# element_type=StructType( -# NestedField(field_id=200, name="x", field_type=FloatType(), required=False), -# 
NestedField(field_id=201, name="y", field_type=FloatType(), required=False), -# ), -# element_required=True, -# ), -# required=True, -# ), -# NestedField( -# field_id=3, -# name="person", -# field_type=StructType( -# NestedField(field_id=30, name="name", field_type=StringType(), required=False), -# NestedField(field_id=31, name="age", field_type=IntegerType(), required=True), -# ), -# required=False, -# ), -# schema_id=1, -# ), -# ) +def test_rename(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + type="map", + key_id=5, + key_type=StringType(), + value_id=6, + value_type=StructType( + NestedField(field_id=7, name="x", field_type=FloatType(), required=False), + NestedField(field_id=8, name="y", field_type=FloatType(), required=False), + ), + value_required=True, + ), + required=True, + ), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + type="list", + element_id=9, + element_type=StructType( + NestedField(field_id=10, name="x", field_type=FloatType(), required=False), + NestedField(field_id=11, name="y", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=3, + name="person", + field_type=StructType( + NestedField(field_id=12, name="name", field_type=StringType(), required=False), + NestedField(field_id=13, name="leeftijd", field_type=IntegerType(), required=True), + ), + required=False, + ), + NestedField(field_id=4, name="foo", field_type=StringType(), required=True), + schema_id=0, + identifier_field_ids=[], + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.rename_column("foo", "bar") + schema_update.rename_column("location_lookup.x", "location_lookup.latitude") + schema_update.rename_column("locations.x", "locations.latitude") + schema_update.rename_column("person.leeftijd", "person.age") + + assert tbl.schema() == Schema( + 
NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + type="map", + key_id=5, + key_type=StringType(), + value_id=6, + value_type=StructType( + NestedField(field_id=7, name="latitude", field_type=FloatType(), required=False), + NestedField(field_id=8, name="y", field_type=FloatType(), required=False), + ), + value_required=True, + ), + required=True, + ), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + type="list", + element_id=9, + element_type=StructType( + NestedField(field_id=10, name="latitude", field_type=FloatType(), required=False), + NestedField(field_id=11, name="y", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=3, + name="person", + field_type=StructType( + NestedField(field_id=12, name="name", field_type=StringType(), required=False), + NestedField(field_id=13, name="age", field_type=IntegerType(), required=True), + ), + required=False, + ), + NestedField(field_id=4, name="bar", field_type=StringType(), required=True), + schema_id=0, + identifier_field_ids=[], + ) From 1ae5be211aaf50bb7bdf455c6804a3a1f3017c4d Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Sun, 27 Aug 2023 20:21:37 +0200 Subject: [PATCH 11/17] MOAR tests --- python/pyiceberg/table/__init__.py | 92 +- python/tests/test_integration_schema.py | 1756 ++++++++++++++++++++++- 2 files changed, 1749 insertions(+), 99 deletions(-) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index 475daeda021a..e631a6cd41ee 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -913,6 +913,7 @@ class UpdateSchema: _moves: Dict[int, List[Move]] = {} _added_name_to_id: Dict[str, int] = {} + # Part of https://github.com/apache/iceberg/pull/8393 _id_to_parent: Dict[int, str] = {} _allow_incompatible_changes: bool _case_sensitive: bool @@ -976,10 +977,10 @@ def add_column( Returns: This for method chaining. 
""" - path = (path,) if isinstance(path, str) else path - - if "." in path[-1]: - raise ValueError(f"Cannot add column with ambiguous name: {path[-1]}, provide a tuple instead") + if isinstance(path, str): + if "." in path: + raise ValueError(f"Cannot add column with ambiguous name: {path}, provide a tuple instead") + path = (path,) if required and not self._allow_incompatible_changes: # Table format version 1 and 2 cannot add required column because there is no initial value @@ -1004,13 +1005,13 @@ def add_column( parent_id = parent_field.field_id - exists = False + existing_field = None try: - exists = self._schema.find_field(full_name, self._case_sensitive) is not None + existing_field = self._schema.find_field(full_name, self._case_sensitive) except ValueError: pass - if exists: + if existing_field is not None and existing_field.field_id not in self._deletes: raise ValueError(f"Cannot add column, name already exists: {full_name}") # assign new IDs in order @@ -1060,7 +1061,6 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], new_name: str) - The UpdateSchema with the rename operation staged. """ name_from = tuple(path_from.split(".")) if isinstance(path_from, str) else path_from - new_name = new_name.split(".")[-1] parent_path = name_from[:-1] full_name_from = ".".join(name_from) @@ -1089,7 +1089,7 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], new_name: str) - if path_from in self._identifier_field_names: self._identifier_field_names.remove(full_name_from) - self._identifier_field_names.add(f"{'.'.join(parent_path)}.{new_name}") + self._identifier_field_names.add(f"{'.'.join(parent_path)}{'.' 
if len(parent_path) > 0 else ''}{new_name}") return self @@ -1120,6 +1120,9 @@ def make_column_optional(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchem self._set_column_requirement(path, False) return self + def set_identifier_fields(self, *fields: str) -> None: + self._identifier_field_names = set(fields) + def _set_column_requirement(self, path: Union[str, Tuple[str, ...]], required: bool) -> None: path = (path,) if isinstance(path, str) else path name = ".".join(path) @@ -1130,7 +1133,7 @@ def _set_column_requirement(self, path: Union[str, Tuple[str, ...]], required: b # if the change is a noop, allow it even if allowIncompatibleChanges is false return - if self._allow_incompatible_changes and not required: + if not self._allow_incompatible_changes and not required: raise ValueError(f"Cannot change column nullability: {name}: optional -> required") if field.field_id in self._deletes: @@ -1153,7 +1156,9 @@ def _set_column_requirement(self, path: Union[str, Tuple[str, ...]], required: b required=required, ) - def update_column(self, path: Union[str, Tuple[str, ...]], field_type: IcebergType) -> UpdateSchema: + def update_column( + self, path: Union[str, Tuple[str, ...]], field_type: IcebergType, doc: Optional[str] = None + ) -> UpdateSchema: """Update the type of column. 
Args: @@ -1175,17 +1180,18 @@ def update_column(self, path: Union[str, Tuple[str, ...]], field_type: IcebergTy # Nothing changed return self - try: - promote(field.field_type, field_type) - except ResolveError as e: - raise ValidationError(f"Cannot change column type: {full_name}: {field.field_type} -> {field_type}") from e + if not self._allow_incompatible_changes: + try: + promote(field.field_type, field_type) + except ResolveError as e: + raise ValidationError(f"Cannot change column type: {full_name}: {field.field_type} -> {field_type}") from e if updated := self._updates.get(field.field_id): self._updates[field.field_id] = NestedField( field_id=updated.field_id, name=updated.name, field_type=field_type, - doc=updated.doc, + doc=doc or updated.doc, required=updated.required, ) else: @@ -1193,7 +1199,7 @@ def update_column(self, path: Union[str, Tuple[str, ...]], field_type: IcebergTy field_id=field.field_id, name=field.name, field_type=field_type, - doc=field.doc, + doc=doc or field.doc, required=field.required, ) @@ -1315,7 +1321,7 @@ def move_before(self, path: Union[str, Tuple[str, ...]], before_path: Union[str, before_field_id = self._find_for_move(before_full_name) if before_field_id is None: - raise ValueError(f"Cannot move before missing column: {before_full_name}") + raise ValueError(f"Cannot move {full_name} before missing column: {before_full_name}") if field_id == before_field_id: raise ValueError(f"Cannot move {full_name} before itself") @@ -1346,7 +1352,7 @@ def move_after(self, path: Union[str, Tuple[str, ...]], after_name: Union[str, T after_field_id = self._find_for_move(after_full_name) if after_field_id is None: - raise ValueError(f"Cannot move after missing column: {after_full_name}") + raise ValueError(f"Cannot move {full_name} after missing column: {after_full_name}") if field_id == after_field_id: raise ValueError(f"Cannot move {full_name} after itself") @@ -1395,11 +1401,13 @@ def _apply(self) -> Schema: # Should never happen raise 
ValueError("Could not apply changes") + new_schema = Schema(*struct.fields) + # @TODO: This differs still a bit from the Java side, # The validate identifier field is missing field_ids = set() for name in self._identifier_field_names: - field = self._schema.find_field(name, self._case_sensitive) + field = new_schema.find_field(name, self._case_sensitive) field_ids.add(field.field_id) if field.field_id in self._deletes: raise ValueError(f"Cannot delete identifier field {name}. To force deletion, update the identifier fields first.") @@ -1407,7 +1415,7 @@ def _apply(self) -> Schema: # If it nested, also check if the parents aren't deleted column_name = self._id_to_parent.get(field.field_id) while column_name is not None: - parent = self._schema.find_field(column_name) + parent = new_schema.find_field(column_name) if parent.field_id in self._deletes: raise ValueError( f"Cannot delete field {parent.field_id} as it will delete nested identifier field {name}", @@ -1435,10 +1443,17 @@ def __init__( self._moves = moves def schema(self, schema: Schema, struct_result: Optional[IcebergType]) -> Optional[IcebergType]: - if new_fields := _add_fields(struct_result.fields if struct_result else [], self._adds.get(TABLE_ROOT_ID)): - return StructType(*new_fields) - else: - return struct_result + added = self._adds.get(TABLE_ROOT_ID) + moves = self._moves.get(TABLE_ROOT_ID) + + if added is not None or moves is not None: + if not isinstance(struct_result, StructType): + raise ValueError(f"Cannot add fields to non-struct: {struct_result}") + + if new_fields := _add_and_move_fields(struct_result.fields, added or [], moves or []): + return StructType(*new_fields) + + return struct_result def struct(self, struct: StructType, field_results: List[Optional[IcebergType]]) -> Optional[IcebergType]: has_changes = False @@ -1486,15 +1501,16 @@ def field(self, field: NestedField, field_result: Optional[IcebergType]) -> Opti if (update := self._updates.get(field.field_id)) and field.field_type != 
update.field_type: return update.field_type - # handle add & moves - added = self._adds.get(field.field_id) - moves = self._moves.get(field.field_id) - if added is not None or moves is not None: - if not isinstance(field.field_type, StructType): - raise ValueError(f"Cannot add fields to non-struct: {field}") + if isinstance(field_result, StructType): + # handle add & moves + added = self._adds.get(field.field_id) + moves = self._moves.get(field.field_id) + if added is not None or moves is not None: + if not isinstance(field.field_type, StructType): + raise ValueError(f"Cannot add fields to non-struct: {field}") - if new_fields := _add_and_move_fields(field.field_type.fields, added or [], moves or []): - return StructType(*new_fields) + if new_fields := _add_and_move_fields(field_result.fields, added or [], moves or []): + return StructType(*new_fields) return field_result @@ -1509,9 +1525,19 @@ def map( self, map_type: MapType, key_result: Optional[IcebergType], value_result: Optional[IcebergType] ) -> Optional[IcebergType]: key_id: int = map_type.key_field.field_id + + if key_id in self._deletes: + raise ValueError(f"Cannot delete map keys: {map_type}") + + if key_id in self._updates: + raise ValueError(f"Cannot update map keys: {map_type}") + if key_id in self._adds: raise ValueError(f"Cannot add fields to map keys: {map_type}") + if map_type.key_type != key_result: + raise ValueError(f"Cannot alter map keys: {map_type}") + value_field: NestedField = map_type.value_field value_type = self.field(value_field, value_result) if value_type is None: diff --git a/python/tests/test_integration_schema.py b/python/tests/test_integration_schema.py index 861f67550076..c495169db90d 100644 --- a/python/tests/test_integration_schema.py +++ b/python/tests/test_integration_schema.py @@ -164,14 +164,6 @@ def test_add_nested_list_type_column(simple_table: Table) -> None: assert new_schema.highest_field_id == 7 -def test_add_field_to_map_key(catalog: Catalog, 
table_schema_nested_with_struct_key_map: Schema, table: Table) -> None: - table = _create_table_with_schema(catalog, table_schema_nested_with_struct_key_map) - with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(table) - update.add_column(path=("location", "key", "b"), field_type=IntegerType())._apply() # pylint: disable=W0212 - assert "Cannot add fields to map keys" in str(exc_info.value) - - def _create_table_with_schema(catalog: Catalog, schema: Schema) -> Table: tbl_name = "default.test_schema_evolution" try: @@ -194,15 +186,6 @@ def test_add_already_exists(catalog: Catalog, table_schema_nested: Schema) -> No assert "already exists: location.latitude" in str(exc_info.value) -def test_ambiguous_column(catalog: Catalog, table_schema_nested: Schema) -> None: - table = _create_table_with_schema(catalog, table_schema_nested) - update = UpdateSchema(table) - - with pytest.raises(ValueError) as exc_info: - update.add_column(path="location.latitude", field_type=IntegerType()) - assert "Cannot add column with ambiguous name: location.latitude, provide a tuple instead" in str(exc_info.value) - - def test_add_to_non_struct_type(catalog: Catalog, table_schema_simple: Schema) -> None: table = _create_table_with_schema(catalog, table_schema_simple) update = UpdateSchema(table) @@ -211,55 +194,6 @@ def test_add_to_non_struct_type(catalog: Catalog, table_schema_simple: Schema) - assert "Cannot add column 'lat' to non-struct type" in str(exc_info.value) -def test_add_required_column(catalog: Catalog) -> None: - schema_ = Schema( - NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] - ) - table = _create_table_with_schema(catalog, schema_) - update = UpdateSchema(table) - with pytest.raises(ValueError) as exc_info: - update.add_column(path="data", field_type=IntegerType(), required=True) - assert "Incompatible change: cannot add required column: data" in str(exc_info.value) - - new_schema = ( - 
UpdateSchema(table) # pylint: disable=W0212 - .allow_incompatible_changes() - .add_column(path="data", field_type=IntegerType(), required=True) - ._apply() - ) - assert new_schema == Schema( - NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), - NestedField(field_id=2, name="data", field_type=IntegerType(), required=True), - schema_id=0, - identifier_field_ids=[], - ) - - -def test_add_required_column_case_insensitive(catalog: Catalog) -> None: - schema_ = Schema( - NestedField(field_id=1, name="id", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] - ) - table = _create_table_with_schema(catalog, schema_) - - with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(table) - update.allow_incompatible_changes().case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) - assert "already exists: ID" in str(exc_info.value) - - new_schema = ( - UpdateSchema(table) # pylint: disable=W0212 - .allow_incompatible_changes() - .add_column(path="ID", field_type=IntegerType(), required=True) - ._apply() - ) - assert new_schema == Schema( - NestedField(field_id=1, path="id", field_type=BooleanType(), required=False), - NestedField(field_id=2, name="ID", field_type=IntegerType(), required=True), - schema_id=0, - identifier_field_ids=[], - ) - - @pytest.mark.integration def test_schema_evolution_via_transaction(catalog: Catalog) -> None: schema = Schema( @@ -864,3 +798,1693 @@ def test_rename(catalog: Catalog) -> None: schema_id=0, identifier_field_ids=[], ) + + +def test_rename_case_insensitive(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + type="map", + key_id=5, + key_type=StringType(), + value_id=6, + value_type=StructType( + NestedField(field_id=7, name="x", field_type=FloatType(), required=False), + NestedField(field_id=8, name="y", field_type=FloatType(), 
required=False), + ), + value_required=True, + ), + required=True, + ), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + type="list", + element_id=9, + element_type=StructType( + NestedField(field_id=10, name="x", field_type=FloatType(), required=False), + NestedField(field_id=11, name="y", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=3, + name="person", + field_type=StructType( + NestedField(field_id=12, name="name", field_type=StringType(), required=False), + NestedField(field_id=13, name="leeftijd", field_type=IntegerType(), required=True), + ), + required=False, + ), + NestedField(field_id=4, name="foo", field_type=StringType(), required=True), + schema_id=0, + identifier_field_ids=[], + ), + ) + + with tbl.update_schema(case_sensitive=False) as schema_update: + schema_update.rename_column("Foo", "bar") + schema_update.rename_column("Location_lookup.X", "location_lookup.latitude") + schema_update.rename_column("Locations.X", "locations.latitude") + schema_update.rename_column("Person.Leeftijd", "person.age") + + assert tbl.schema() == Schema( + NestedField( + field_id=1, + name="location_lookup", + field_type=MapType( + type="map", + key_id=5, + key_type=StringType(), + value_id=6, + value_type=StructType( + NestedField(field_id=7, name="latitude", field_type=FloatType(), required=False), + NestedField(field_id=8, name="y", field_type=FloatType(), required=False), + ), + value_required=True, + ), + required=True, + ), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + type="list", + element_id=9, + element_type=StructType( + NestedField(field_id=10, name="latitude", field_type=FloatType(), required=False), + NestedField(field_id=11, name="y", field_type=FloatType(), required=False), + ), + element_required=True, + ), + required=True, + ), + NestedField( + field_id=3, + name="person", + field_type=StructType( + NestedField(field_id=12, 
name="name", field_type=StringType(), required=False), + NestedField(field_id=13, name="age", field_type=IntegerType(), required=True), + ), + required=False, + ), + NestedField(field_id=4, name="bar", field_type=StringType(), required=True), + schema_id=0, + identifier_field_ids=[], + ) + + +def test_add_struct(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + schema_id=1, + ), + ) + + struct = StructType( + NestedField(field_id=3, name="x", field_type=DoubleType(), required=False), + NestedField(field_id=4, name="y", field_type=DoubleType(), required=False), + ) + + with tbl.update_schema() as schema_update: + schema_update.add_column("location", struct) + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + NestedField(field_id=2, name="location", field_type=struct, required=False), + schema_id=1, + ) + + +def test_add_nested_map_of_structs(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + schema_id=1, + ), + ) + + map_type_example = MapType( + key_id=1, + value_id=2, + key_type=StructType( + NestedField(field_id=20, name="address", field_type=StringType(), required=True), + NestedField(field_id=21, name="city", field_type=StringType(), required=True), + NestedField(field_id=22, name="state", field_type=StringType(), required=True), + NestedField(field_id=23, name="zip", field_type=IntegerType(), required=True), + ), + value_type=StructType( + NestedField(field_id=9, name="lat", field_type=DoubleType(), required=True), + NestedField(field_id=8, name="long", field_type=DoubleType(), required=False), + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.add_column("locations", map_type_example) + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), 
+ NestedField( + field_id=2, + name="locations", + field_type=MapType( + type="map", + key_id=3, + key_type=StructType( + NestedField(field_id=5, name="address", field_type=StringType(), required=True), + NestedField(field_id=6, name="city", field_type=StringType(), required=True), + NestedField(field_id=7, name="state", field_type=StringType(), required=True), + NestedField(field_id=8, name="zip", field_type=IntegerType(), required=True), + ), + value_id=4, + value_type=StructType( + NestedField(field_id=9, name="lat", field_type=DoubleType(), required=True), + NestedField(field_id=10, name="long", field_type=DoubleType(), required=False), + ), + value_required=True, + ), + required=False, + ), + schema_id=1, + identifier_field_ids=[], + ) + + +def test_add_nested_list_of_structs(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + schema_id=1, + ), + ) + + list_type_examples = ListType( + element_id=1, + element_type=StructType( + NestedField(field_id=9, name="lat", field_type=DoubleType(), required=True), + NestedField(field_id=10, name="long", field_type=DoubleType(), required=False), + ), + element_required=False, + ) + + with tbl.update_schema() as schema_update: + schema_update.add_column("locations", list_type_examples) + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + NestedField( + field_id=2, + name="locations", + field_type=ListType( + type="list", + element_id=3, + element_type=StructType( + NestedField(field_id=4, name="lat", field_type=DoubleType(), required=True), + NestedField(field_id=5, name="long", field_type=DoubleType(), required=False), + ), + element_required=False, + ), + required=False, + ), + schema_id=1, + identifier_field_ids=[], + ) + + +def test_add_required_column(catalog: Catalog) -> None: + schema_ = Schema( + NestedField(field_id=1, name="a", field_type=BooleanType(), 
required=False), schema_id=1, identifier_field_ids=[] + ) + table = _create_table_with_schema(catalog, schema_) + update = UpdateSchema(table) + with pytest.raises(ValueError) as exc_info: + update.add_column(path="data", field_type=IntegerType(), required=True) + assert "Incompatible change: cannot add required column: data" in str(exc_info.value) + + new_schema = ( + UpdateSchema(table) # pylint: disable=W0212 + .allow_incompatible_changes() + .add_column(path="data", field_type=IntegerType(), required=True) + ._apply() + ) + assert new_schema == Schema( + NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), + NestedField(field_id=2, name="data", field_type=IntegerType(), required=True), + schema_id=0, + identifier_field_ids=[], + ) + + +def test_add_required_column_case_insensitive(catalog: Catalog) -> None: + schema_ = Schema( + NestedField(field_id=1, name="id", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] + ) + table = _create_table_with_schema(catalog, schema_) + + with pytest.raises(ValueError) as exc_info: + update = UpdateSchema(table) + update.allow_incompatible_changes().case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) + assert "already exists: ID" in str(exc_info.value) + + new_schema = ( + UpdateSchema(table) # pylint: disable=W0212 + .allow_incompatible_changes() + .add_column(path="ID", field_type=IntegerType(), required=True) + ._apply() + ) + assert new_schema == Schema( + NestedField(field_id=1, path="id", field_type=BooleanType(), required=False), + NestedField(field_id=2, name="ID", field_type=IntegerType(), required=True), + schema_id=0, + identifier_field_ids=[], + ) + + +def test_make_column_optional(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + 
schema_update.make_column_optional("foo") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=False), + schema_id=0, + identifier_field_ids=[], + ) + + +def test_mixed_changes(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=StringType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=False), + NestedField( + field_id=3, + name="preferences", + field_type=StructType( + NestedField(field_id=8, name="feature1", type=BooleanType(), required=True), + NestedField(field_id=9, name="feature2", type=BooleanType(), required=False), + ), + required=False, + ), + NestedField( + field_id=4, + name="locations", + field_type=MapType( + key_id=10, + value_id=11, + key_type=StructType( + NestedField(field_id=20, name="address", field_type=StringType(), required=True), + NestedField(field_id=21, name="city", field_type=StringType(), required=True), + NestedField(field_id=22, name="state", field_type=StringType(), required=True), + NestedField(field_id=23, name="zip", field_type=IntegerType(), required=True), + ), + value_type=StructType( + NestedField(field_id=12, name="lat", field_type=DoubleType(), required=True), + NestedField(field_id=13, name="long", field_type=DoubleType(), required=False), + ), + ), + required=True, + ), + NestedField( + field_id=5, + name="points", + field_type=ListType( + element_id=14, + element_type=StructType( + NestedField(field_id=15, name="x", field_type=LongType(), required=True), + NestedField(field_id=16, name="y", field_type=LongType(), required=True), + ), + ), + required=True, + doc="2-D cartesian points", + ), + NestedField(field_id=6, name="doubles", field_type=ListType(element_id=17, element_type=DoubleType()), required=True), + NestedField( + field_id=7, + name="properties", + field_type=MapType(key_id=18, value_id=19, key_type=StringType(), 
value_type=StringType()), + required=False, + ), + schema_id=1, + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as schema_update: + schema_update.add_column("toplevel", field_type=DecimalType(9, 2)) + schema_update.add_column(("locations", "alt"), field_type=FloatType()) + schema_update.add_column(("points", "z"), field_type=LongType()) + schema_update.add_column(("points", "t.t"), field_type=LongType(), doc="name with '.'") + schema_update.rename_column("data", "json") + schema_update.rename_column("preferences", "options") + schema_update.rename_column("preferences.feature2", "newfeature") + schema_update.rename_column("locations.lat", "latitude") + schema_update.rename_column("points.x", "X") + schema_update.rename_column("points.y", "y.y") + schema_update.update_column("id", field_type=LongType(), doc="unique id") + schema_update.update_column("locations.lat", DoubleType()) + schema_update.update_column_doc("locations.lat", "latitude") + schema_update.delete_column("locations.long") + schema_update.delete_column("properties") + schema_update.make_column_optional("points.x") + schema_update.require_column("data") + schema_update.add_column(("locations", "description"), StringType(), doc="location description") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True, doc="unique id"), + NestedField(field_id=2, name="json", field_type=StringType(), required=True), + NestedField( + field_id=3, + name="options", + field_type=StructType( + NestedField(field_id=8, name="feature1", field_type=BooleanType(), required=True), + NestedField(field_id=9, name="newfeature", field_type=BooleanType(), required=False), + ), + required=False, + ), + NestedField( + field_id=4, + name="locations", + field_type=MapType( + type="map", + key_id=10, + key_type=StructType( + NestedField(field_id=12, name="address", field_type=StringType(), required=True), + NestedField(field_id=13, name="city", 
field_type=StringType(), required=True), + NestedField(field_id=14, name="state", field_type=StringType(), required=True), + NestedField(field_id=15, name="zip", field_type=IntegerType(), required=True), + ), + value_id=11, + value_type=StructType( + NestedField(field_id=16, name="latitude", field_type=DoubleType(), required=True, doc="latitude"), + NestedField(field_id=25, name="alt", field_type=FloatType(), required=False), + NestedField( + field_id=28, name="description", field_type=StringType(), required=False, doc="location description" + ), + ), + value_required=True, + ), + required=True, + ), + NestedField( + field_id=5, + name="points", + field_type=ListType( + type="list", + element_id=18, + element_type=StructType( + NestedField(field_id=19, name="X", field_type=LongType(), required=False), + NestedField(field_id=20, name="y.y", field_type=LongType(), required=True), + NestedField(field_id=26, name="z", field_type=LongType(), required=False), + NestedField(field_id=27, name="t.t", field_type=LongType(), required=False, doc="name with '.'"), + ), + element_required=True, + ), + doc="2-D cartesian points", + required=True, + ), + NestedField( + field_id=6, + name="doubles", + field_type=ListType(type="list", element_id=21, element_type=DoubleType(), element_required=True), + required=True, + ), + NestedField(field_id=24, name="toplevel", field_type=DecimalType(precision=9, scale=2), required=False), + schema_id=1, + identifier_field_ids=[], + ) + + +def test_ambiguous_column(catalog: Catalog, table_schema_nested: Schema) -> None: + table = _create_table_with_schema(catalog, table_schema_nested) + update = UpdateSchema(table) + + with pytest.raises(ValueError) as exc_info: + update.add_column(path="location.latitude", field_type=IntegerType()) + assert "Cannot add column with ambiguous name: location.latitude, provide a tuple instead" in str(exc_info.value) + + +def test_delete_then_add(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + 
catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.delete_column("foo") + schema_update.add_column("foo", StringType()) + + assert tbl.schema() == Schema( + NestedField(field_id=2, name="foo", field_type=StringType(), required=False), + schema_id=1, + ) + + +def test_delete_then_add_nested(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField( + field_id=1, + name="preferences", + field_type=StructType( + NestedField(field_id=2, name="feature1", field_type=BooleanType()), + NestedField(field_id=3, name="feature2", field_type=BooleanType()), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.delete_column("preferences.feature1") + schema_update.add_column(("preferences", "feature1"), BooleanType()) + + assert tbl.schema() == Schema( + NestedField( + field_id=1, + name="preferences", + field_type=StructType( + NestedField(field_id=3, name="feature2", field_type=BooleanType()), + NestedField(field_id=4, name="feature1", field_type=BooleanType(), required=False), + ), + required=True, + ), + schema_id=1, + ) + + +def test_delete_missing_column(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.delete_column("bar") + + assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) + + +def test_add_delete_conflict(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() 
as schema_update: + schema_update.add_column("bar", BooleanType()) + schema_update.delete_column("bar") + assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) + + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField( + field_id=1, + name="preferences", + field_type=StructType( + NestedField(field_id=2, name="feature1", field_type=BooleanType()), + NestedField(field_id=3, name="feature2", field_type=BooleanType()), + ), + required=True, + ), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.add_column(("preferences", "feature3"), BooleanType()) + schema_update.delete_column("preferences") + assert "Cannot delete a column that has additions: preferences" in str(exc_info.value) + + +def test_rename_missing_column(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.rename_column("bar", "fail") + + assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) + + +def test_rename_missing_conflicts(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.rename_column("foo", "bar") + schema_update.delete_column("foo") + + assert "Cannot delete a column that has updates: foo" in str(exc_info.value) + + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as 
schema_update: + schema_update.rename_column("foo", "bar") + schema_update.delete_column("bar") + + assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) + + +def test_update_missing_column(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.update_column("bar", DateType()) + + assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) + + +def test_update_delete_conflict(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=IntegerType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.update_column("foo", LongType()) + schema_update.delete_column("foo") + + assert "Cannot delete a column that has updates: foo" in str(exc_info.value) + + +def test_delete_update_conflict(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=IntegerType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.delete_column("foo") + schema_update.update_column("foo", LongType()) + + assert "Cannot update a column that will be deleted: foo" in str(exc_info.value) + + +def test_delete_map_key(nested_table: Table) -> None: + with pytest.raises(ValueError) as exc_info: + with nested_table.update_schema() as schema_update: + schema_update.delete_column("quux.key") + + assert "Cannot delete map keys" in str(exc_info.value) + + +def test_add_field_to_map_key(nested_table: Table) -> None: + with pytest.raises(ValueError) as exc_info: + with 
nested_table.update_schema() as schema_update: + schema_update.add_column(("quux", "key"), StringType()) + + assert "Cannot add column 'key' to non-struct type: quux" in str(exc_info.value) + + +def test_alter_map_key(nested_table: Table) -> None: + with pytest.raises(ValueError) as exc_info: + with nested_table.update_schema() as schema_update: + schema_update.update_column(("quux", "key"), BinaryType()) + + assert "Cannot update map keys" in str(exc_info.value) + + +def test_update_map_key(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField( + field_id=1, name="m", field_type=MapType(key_id=2, value_id=3, key_type=IntegerType(), value_type=DoubleType()) + ) + ), + ) + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.update_column("m.key", LongType()) + + assert "Cannot update map keys: map" in str(exc_info.value) + + +def test_update_added_column_doc(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.add_column("value", LongType()) + schema_update.update_column_doc("value", "a value") + + assert "Could not find field with name value, case_sensitive=True" in str(exc_info.value) + + +def test_update_deleted_column_doc(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.delete_column("foo") + schema_update.update_column_doc("foo", "a value") + + assert "Cannot update a column that will be deleted: foo" in str(exc_info.value) + + +def test_multiple_moves(catalog: Catalog) -> None: + tbl = 
_create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="a", field_type=IntegerType(), required=True), + NestedField(field_id=2, name="b", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="c", field_type=IntegerType(), required=True), + NestedField(field_id=4, name="d", field_type=IntegerType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_first("d") + schema_update.move_first("c") + schema_update.move_after("b", "d") + schema_update.move_before("d", "a") + + assert tbl.schema() == Schema( + NestedField(field_id=3, name="c", field_type=IntegerType(), required=True), + NestedField(field_id=2, name="b", field_type=IntegerType(), required=True), + NestedField(field_id=4, name="d", field_type=IntegerType(), required=True), + NestedField(field_id=1, name="a", field_type=IntegerType(), required=True), + schema_id=1, + ) + + +def test_move_top_level_column_first(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_first("data") + + assert tbl.schema() == Schema( + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + schema_id=1, + ) + + +def test_move_top_level_column_before_first(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_before("data", "id") + + assert tbl.schema() == Schema( + NestedField(field_id=2, 
name="data", field_type=StringType(), required=True), + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + schema_id=1, + ) + + +def test_move_top_level_column_after_last(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_after("id", "data") + + assert tbl.schema() == Schema( + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + schema_id=1, + ) + + +def test_move_nested_field_first(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_first("struct.data") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + ), + required=True, + ), + schema_id=1, + ) + + +def test_move_nested_field_before_first(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + 
NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_before("struct.data", "struct.count") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + ), + required=True, + ), + schema_id=1, + ) + + +def test_move_nested_field_after_first(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_before("struct.count", "struct.data") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + ), + required=True, + ), + schema_id=1, + ) + + +def test_move_nested_field_after(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=3, name="count", 
field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_after("struct.ts", "struct.count") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), + ), + required=True, + ), + schema_id=1, + ) + + +def test_move_nested_field_before(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_before("struct.ts", "struct.data") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + required=True, + ), + schema_id=1, + ) + + +def 
test_move_map_value_struct_field(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="map", + field_type=MapType( + key_id=6, + value_id=7, + key_type=StringType(), + element_type=StructType( + NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.move_before("map.ts", "map.data") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="map", + field_type=MapType( + key_id=6, + value_id=7, + key_type=StringType(), + element_type=StructType( + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + ), + required=True, + ), + schema_id=1, + ) + + +def test_move_added_top_level_column(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.add_column("ts", TimestamptzType()) + schema_update.move_after("ts", "id") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=3, name="ts", field_type=TimestamptzType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + 
schema_id=1, + ) + + +def test_move_added_top_level_column_after_added_column(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.add_column("ts", TimestamptzType()) + schema_update.add_column("count", LongType()) + schema_update.move_after("ts", "id") + schema_update.move_after("count", "ts") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=3, name="ts", field_type=TimestamptzType(), required=False), + NestedField(field_id=4, name="count", field_type=LongType(), required=False), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ) + + +def test_move_added_nested_struct_field(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.add_column(("struct", "ts"), TimestamptzType()) + schema_update.move_before("struct.ts", "struct.count") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=False), + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", 
field_type=StringType(), required=True), + ), + required=True, + ), + schema_id=1, + ) + + +def test_move_added_nested_field_before_added_column(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + required=True, + ), + schema_id=1, + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.add_column(("struct", "ts"), TimestamptzType()) + schema_update.add_column(("struct", "size"), LongType()) + schema_update.move_before("struct.ts", "struct.count") + schema_update.move_before("struct.size", "struct.ts") + + assert tbl.schema() == Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=6, name="size", field_type=LongType(), required=False), + NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=False), + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + ), + required=True, + ), + schema_id=1, + ) + + +def test_move_self_reference_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_before("foo", "foo") + assert "Cannot move foo before itself" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_after("foo", "foo") + assert "Cannot move foo after itself" in str(exc_info.value) + 
+ +def test_move_missing_column_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_first("items") + assert "Cannot move missing column: items" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_before("items", "id") + assert "Cannot move missing column: items" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_after("items", "data") + assert "Cannot move missing column: items" in str(exc_info.value) + + +def test_move_before_add_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_first("ts") + update.add_column("ts", TimestamptzType()) + assert "Cannot move missing column: ts" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_before("ts", "id") + update.add_column("ts", TimestamptzType()) + assert "Cannot move missing column: ts" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_after("ts", "data") + update.add_column("ts", TimestamptzType()) + assert "Cannot move missing column: ts" in str(exc_info.value) + + +def test_move_missing_reference_column_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with 
pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_before("id", "items") + assert "Cannot move id before missing column: items" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_after("data", "items") + assert "Cannot move data after missing column: items" in str(exc_info.value) + + +def test_move_primitive_map_key_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + NestedField( + field_id=3, + name="map", + field_type=MapType(key_id=4, value_id=5, key_type=StringType(), value_type=StringType()), + required=False, + ), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_before("map.key", "map.value") + assert "Cannot move fields in non-struct type: map" in str(exc_info.value) + + +def test_move_primitive_map_value_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + NestedField( + field_id=3, + name="map", + field_type=MapType(key_id=4, value_id=5, key_type=StringType(), value_type=StructType()), + required=False, + ), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_before("map.value", "map.key") + assert "Cannot move fields in non-struct type: map>" in str(exc_info.value) + + +def test_move_top_level_between_structs_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="a", field_type=IntegerType(), required=True), + NestedField(field_id=2, name="b", 
field_type=IntegerType(), required=True), + NestedField( + field_id=3, + name="struct", + field_type=StructType( + NestedField(field_id=4, name="x", field_type=IntegerType(), required=True), + NestedField(field_id=5, name="y", field_type=IntegerType(), required=True), + ), + required=False, + ), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_before("a", "struct.x") + assert "Cannot move field a to a different struct" in str(exc_info.value) + + +def test_move_between_structs_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField( + field_id=1, + name="s1", + field_type=StructType( + NestedField(field_id=3, name="a", field_type=IntegerType(), required=True), + NestedField(field_id=4, name="b", field_type=IntegerType(), required=True), + ), + required=False, + ), + NestedField( + field_id=2, + name="s2", + field_type=StructType( + NestedField(field_id=5, name="x", field_type=IntegerType(), required=True), + NestedField(field_id=6, name="y", field_type=IntegerType(), required=True), + ), + required=False, + ), + schema_id=1, + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update: + update.move_before("s2.x", "s1.a") + + assert "Cannot move field s2.x to a different struct" in str(exc_info.value) + + +def test_add_existing_identifier_fields(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema() as update_schema: + update_schema.set_identifier_fields("foo") + + assert tbl.schema().identifier_field_names() == {"foo"} + + +def test_add_new_identifiers_field_columns(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), 
schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as update_schema: + update_schema.add_column("new_field", StringType(), required=True) + update_schema.set_identifier_fields("foo", "new_field") + + assert tbl.schema().identifier_field_names() == {"foo", "new_field"} + + +def test_add_new_identifiers_field_columns_out_of_order(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as update_schema: + update_schema.add_column("new_field", StringType(), required=True) + update_schema.set_identifier_fields("foo", "new_field") + + assert tbl.schema().identifier_field_names() == {"foo", "new_field"} + + +def test_add_nested_identifier_field_columns(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as update_schema: + update_schema.add_column( + "required_struct", StructType(NestedField(field_id=3, name="field", type=StringType(), required=True)), required=True + ) + + with tbl.update_schema() as update_schema: + update_schema.set_identifier_fields("required_struct.field") + + assert tbl.schema().identifier_field_names() == {"required_struct.field"} + + +def test_add_nested_identifier_field_columns_single_transaction(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as update_schema: + update_schema.add_column( + "new", StructType(NestedField(field_id=3, name="field", 
type=StringType(), required=True)), required=True + ) + update_schema.set_identifier_fields("new.field") + + assert tbl.schema().identifier_field_names() == {"new.field"} + + +def test_add_nested_nested_identifier_field_columns(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as update_schema: + update_schema.add_column( + "new", + StructType( + NestedField( + field_id=3, + name="field", + type=StructType(NestedField(field_id=4, name="nested", type=StringType(), required=False)), + required=True, + ) + ), + required=True, + ) + update_schema.set_identifier_fields("new.field.nested") + + assert tbl.schema().identifier_field_names() == {"new.field.nested"} + + +def test_add_dotted_identifier_field_columns(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as update_schema: + update_schema.add_column(("dot.field",), StringType(), required=True) + update_schema.set_identifier_fields("dot.field") + + assert tbl.schema().identifier_field_names() == {"dot.field"} + + +def test_remove_identifier_fields(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as update_schema: + update_schema.add_column(("new_field",), StringType(), required=True) + update_schema.add_column(("new_field2",), StringType(), required=True) + update_schema.set_identifier_fields("foo", "new_field", "new_field2") + + assert 
tbl.schema().identifier_field_names() == {"foo", "new_field", "new_field2"} + + with tbl.update_schema(allow_incompatible_changes=True) as update_schema: + update_schema.set_identifier_fields() + + assert tbl.schema().identifier_field_names() == set() + + +def test_set_identifier_field_fails_schema(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), + NestedField(field_id=2, name="float", field_type=FloatType(), required=True), + NestedField(field_id=3, name="double", field_type=DoubleType(), required=True), + schema_id=1, + identifier_field_ids=[1], + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update_schema: + update_schema.set_identifier_fields("id") + + assert "Cannot add field id as an identifier field: not a required field" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update_schema: + update_schema.set_identifier_fields("float") + + assert "Cannot add field float as an identifier field: must not be float or double field" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update_schema: + update_schema.set_identifier_fields("double") + + assert "Cannot add field float as an identifier field: must not be float or double field" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as update_schema: + update_schema.set_identifier_fields("unknown") + + assert "Cannot add field unknown as an identifier field: not found in current schema or added columns" in str(exc_info.value) + + +def test_set_identifier_field_fails(nested_table: Table) -> None: + with pytest.raises(ValueError) as exc_info: + with nested_table.update_schema() as update_schema: + update_schema.set_identifier_fields("location") + + assert "Cannot add field location as an identifier field: not a 
primitive type field" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with nested_table.update_schema() as update_schema: + update_schema.set_identifier_fields("baz") + + assert "Cannot add field baz as an identifier field: not a required field" in str(exc_info.value) + + with pytest.raises(ValueError) as exc_info: + with nested_table.update_schema() as update_schema: + update_schema.set_identifier_fields("person.name") + + assert "Cannot add field zip as an identifier field: must not be nested in" in str(exc_info.value) + + +def test_delete_identifier_field_columns(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.delete_column("foo") + schema_update.set_identifier_fields() + + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.set_identifier_fields() + schema_update.delete_column("foo") + + +def test_delete_containing_nested_identifier_field_columns_fails(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema(allow_incompatible_changes=True) as schema_update: + schema_update.add_column( + "out", StructType(NestedField(field_id=3, name="nested", field_type=StringType(), required=True)), required=True + ) + schema_update.set_identifier_fields("out.nested") + + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with 
pytest.raises(ValueError) as exc_info: + with tbl.update_schema() as schema_update: + schema_update.delete_column("out") + + assert ( + str(exc_info) + == "Cannot delete field 24: out: required struct<25: nested: required string> as it will delete nested identifier field 25: nested: required string" + ) + + +def test_rename_identifier_fields(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.rename_column("foo", "bar") + + assert tbl.schema().identifier_field_ids == [1] + assert tbl.schema().identifier_field_names() == {"bar"} + + +def test_move_identifier_fields(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema() as update: + update.move_before("data", "id") + + assert tbl.schema().identifier_field_ids == [1] + assert tbl.schema().identifier_field_names() == {"bar"} + + with tbl.update_schema() as update: + update.move_after("id", "data") + + assert tbl.schema().identifier_field_ids == [1] + assert tbl.schema().identifier_field_names() == {"bar"} + + with tbl.update_schema() as update: + update.move_first("data") + + assert tbl.schema().identifier_field_ids == [1] + assert tbl.schema().identifier_field_names() == {"bar"} + + +def test_move_identifier_fields_case_insensitive(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField(field_id=2, name="data", field_type=StringType(), required=True), + schema_id=1, + ), + ) + + with tbl.update_schema(case_sensitive=False) as update: + 
update.move_before("DATA", "ID") + + assert tbl.schema().identifier_field_ids == [1] + assert tbl.schema().identifier_field_names() == {"bar"} + + with tbl.update_schema(case_sensitive=False) as update: + update.move_after("ID", "DATA") + + assert tbl.schema().identifier_field_ids == [1] + assert tbl.schema().identifier_field_names() == {"bar"} + + with tbl.update_schema(case_sensitive=False) as update: + update.move_first("DATA") + + assert tbl.schema().identifier_field_ids == [1] + assert tbl.schema().identifier_field_names() == {"bar"} From eccff35286b08545583786bef97e8686a5e4cdf7 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Sun, 27 Aug 2023 23:07:18 +0200 Subject: [PATCH 12/17] Comments --- python/pyiceberg/table/__init__.py | 158 ++++++++++++------------ python/tests/test_integration_schema.py | 10 +- 2 files changed, 84 insertions(+), 84 deletions(-) diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index e631a6cd41ee..423ef0a20ee5 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -224,11 +224,8 @@ def commit_transaction(self) -> Table: # Strip the catalog name if len(self._updates) > 0: self._table._do_commit( # pylint: disable=W0212 - CommitTableRequest( - identifier=self._table.identifier[1:], - requirements=self._requirements, - updates=self._updates, - ) + updates=self._updates, + requirements=self._requirements, ) return self._table else: @@ -404,8 +401,8 @@ class AssertDefaultSortOrderId(TableRequirement): class CommitTableRequest(IcebergBaseModel): identifier: Identifier = Field() - requirements: List[SerializeAsAny[TableRequirement]] = Field(default_factory=list) - updates: List[SerializeAsAny[TableUpdate]] = Field(default_factory=list) + requirements: Tuple[SerializeAsAny[TableRequirement]] = Field(default_factory=tuple) + updates: Tuple[SerializeAsAny[TableUpdate]] = Field(default_factory=tuple) class CommitTableResponse(IcebergBaseModel): @@ -524,8 +521,10 @@ def 
history(self) -> List[SnapshotLogEntry]: def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive: bool = True) -> UpdateSchema: return UpdateSchema(self, allow_incompatible_changes=allow_incompatible_changes, case_sensitive=case_sensitive) - def _do_commit(self, request: CommitTableRequest) -> None: - response = self.catalog._commit_table(request) # pylint: disable=W0212 + def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...]) -> None: + response = self.catalog._commit_table( # pylint: disable=W0212 + CommitTableRequest(identifier=self.identifier[:-1], updates=updates, requirements=requirements) + ) # pylint: disable=W0212 self.metadata = response.metadata self.metadata_location = response.metadata_location @@ -897,6 +896,7 @@ class MoveOperation(Enum): @dataclass class Move: field_id: int + full_name: str op: MoveOperation other_field_id: Optional[int] = None @@ -928,7 +928,7 @@ def __init__( ) -> None: self._table = table self._schema = table.schema() - self._last_column_id = itertools.count(self._schema.highest_field_id + 1) + self._last_column_id = itertools.count(table.metadata.last_column_id + 1) self._identifier_field_names = self._schema.identifier_field_names() self._adds = {} @@ -968,6 +968,12 @@ def add_column( ) -> UpdateSchema: """Add a new column to a nested struct or Add a new top-level column. + Because "." may be interpreted as a column path separator or may be used in field names, it + is not allowed to add nested column by passing in a string. To add to nested structures or + to add fields with names that contain "." use a tuple instead to indicate the path. + + If type is a nested type, its field IDs are reassigned when added to the existing schema. + Args: path: Name for the new column. field_type: Type for the new column. 
@@ -990,10 +996,11 @@ def add_column( parent = path[:-1] full_name = ".".join(path) + parent_full_path = ".".join(parent) parent_id: int = TABLE_ROOT_ID if len(parent) > 0: - parent_field = self._schema.find_field(".".join(parent), self._case_sensitive) + parent_field = self._schema.find_field(parent_full_path, self._case_sensitive) parent_type = parent_field.field_type if isinstance(parent_type, MapType): parent_field = parent_type.value_field @@ -1019,11 +1026,15 @@ def add_column( # update tracking for moves self._added_name_to_id[full_name] = new_id + self._id_to_parent[new_id] = parent_full_path new_type = assign_fresh_schema_ids(field_type, self.assign_new_column_id) field = NestedField(field_id=new_id, name=name, field_type=new_type, required=required, doc=doc) - self._adds[parent_id] = self._adds.get(parent_id, []) + [field] + if parent_id in self._adds: + self._adds[parent_id].append(field) + else: + self._adds[parent_id] = [field] return self @@ -1060,18 +1071,14 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], new_name: str) - Returns: The UpdateSchema with the rename operation staged. 
""" - name_from = tuple(path_from.split(".")) if isinstance(path_from, str) else path_from + path_from = ".".join(path_from) if isinstance(path_from, tuple) else path_from + field_from = self._schema.find_field(path_from, self._case_sensitive) - parent_path = name_from[:-1] - full_name_from = ".".join(name_from) + if field_from.field_id in self._deletes: + raise ValueError(f"Cannot rename a column that will be deleted: {path_from}") - from_field = self._schema.find_field(full_name_from, self._case_sensitive) - - if from_field.field_id in self._deletes: - raise ValueError(f"Cannot rename a column that will be deleted: {full_name_from}") - - if updated := self._updates.get(from_field.field_id): - self._updates[from_field.field_id] = NestedField( + if updated := self._updates.get(field_from.field_id): + self._updates[field_from.field_id] = NestedField( field_id=updated.field_id, name=new_name, field_type=updated.field_type, @@ -1079,17 +1086,17 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], new_name: str) - required=updated.required, ) else: - self._updates[from_field.field_id] = NestedField( - field_id=from_field.field_id, + self._updates[field_from.field_id] = NestedField( + field_id=field_from.field_id, name=new_name, - field_type=from_field.field_type, - doc=from_field.doc, - required=from_field.required, + field_type=field_from.field_type, + doc=field_from.doc, + required=field_from.required, ) if path_from in self._identifier_field_names: - self._identifier_field_names.remove(full_name_from) - self._identifier_field_names.add(f"{'.'.join(parent_path)}{'.' if len(parent_path) > 0 else ''}{new_name}") + self._identifier_field_names.remove(path_from) + self._identifier_field_names.add(f"{path_from[:-len(field_from.name)]}{new_name}") return self @@ -1105,7 +1112,7 @@ def require_column(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: Returns: The UpdateSchema with the requirement change staged. 
""" - self._set_column_requirement(path, True) + self._set_column_requirement(path, required=True) return self def make_column_optional(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: @@ -1157,13 +1164,19 @@ def _set_column_requirement(self, path: Union[str, Tuple[str, ...]], required: b ) def update_column( - self, path: Union[str, Tuple[str, ...]], field_type: IcebergType, doc: Optional[str] = None + self, + path: Union[str, Tuple[str, ...]], + field_type: Optional[IcebergType] = None, + required: Optional[bool] = None, + doc: Optional[str] = None, ) -> UpdateSchema: """Update the type of column. Args: path: The path to the field. field_type: The new type + required: If the field should be required + doc: Documentation describing the column Returns: The UpdateSchema with the type update staged. @@ -1176,21 +1189,18 @@ def update_column( if field.field_id in self._deletes: raise ValueError(f"Cannot update a column that will be deleted: {full_name}") - if field.field_type == field_type: - # Nothing changed - return self - - if not self._allow_incompatible_changes: - try: - promote(field.field_type, field_type) - except ResolveError as e: - raise ValidationError(f"Cannot change column type: {full_name}: {field.field_type} -> {field_type}") from e + if field_type is not None: + if not self._allow_incompatible_changes: + try: + promote(field.field_type, field_type) + except ResolveError as e: + raise ValidationError(f"Cannot change column type: {full_name}: {field.field_type} -> {field_type}") from e if updated := self._updates.get(field.field_id): self._updates[field.field_id] = NestedField( field_id=updated.field_id, name=updated.name, - field_type=field_type, + field_type=field_type or updated.field_type, doc=doc or updated.doc, required=updated.required, ) @@ -1198,11 +1208,14 @@ def update_column( self._updates[field.field_id] = NestedField( field_id=field.field_id, name=field.name, - field_type=field_type, + field_type=field_type or field.field_type, 
doc=doc or field.doc, required=field.required, ) + if required is not None: + self._set_column_requirement(path, required=required) + return self def update_column_doc(self, path: Union[str, Tuple[str, ...]], doc: str) -> UpdateSchema: @@ -1254,7 +1267,7 @@ def _find_for_move(self, name: str) -> Optional[int]: return self._added_name_to_id.get(name) - def _move(self, full_name: str, move: Move) -> None: + def _move(self, move: Move) -> None: if parent_name := self._id_to_parent.get(move.field_id): parent_field = self._schema.find_field(parent_name, self._case_sensitive) if not parent_field.is_struct: @@ -1265,7 +1278,7 @@ def _move(self, full_name: str, move: Move) -> None: raise ValueError("Expected other field when performing before/after move") if self._id_to_parent.get(move.field_id) != self._id_to_parent.get(move.other_field_id): - raise ValueError(f"Cannot move field {full_name} to a different struct") + raise ValueError(f"Cannot move field {move.full_name} to a different struct") self._moves[parent_field.field_id] = self._moves.get(parent_field.field_id, []) + [move] else: @@ -1274,7 +1287,7 @@ def _move(self, full_name: str, move: Move) -> None: raise ValueError("Expected other field when performing before/after move") if self._id_to_parent.get(move.other_field_id) is not None: - raise ValueError(f"Cannot move field {full_name} to a different struct") + raise ValueError(f"Cannot move field {move.full_name} to a different struct") self._moves[TABLE_ROOT_ID] = self._moves.get(TABLE_ROOT_ID, []) + [move] @@ -1287,15 +1300,14 @@ def move_first(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: Returns: The UpdateSchema with the move operation staged. 
""" - path = (path,) if isinstance(path, str) else path - full_name = ".".join(path) + full_name = ".".join(path) if isinstance(path, tuple) else path field_id = self._find_for_move(full_name) if field_id is None: raise ValueError(f"Cannot move missing column: {full_name}") - self._move(full_name, Move(field_id=field_id, op=MoveOperation.First)) + self._move(Move(field_id=field_id, full_name=full_name, op=MoveOperation.First)) return self @@ -1308,16 +1320,19 @@ def move_before(self, path: Union[str, Tuple[str, ...]], before_path: Union[str, Returns: The UpdateSchema with the move operation staged. """ - path = (path,) if isinstance(path, str) else path - full_name = ".".join(path) - + full_name = ".".join(path) if isinstance(path, tuple) else path field_id = self._find_for_move(full_name) if field_id is None: raise ValueError(f"Cannot move missing column: {full_name}") - before_path = (before_path,) if isinstance(before_path, str) else before_path - before_full_name = ".".join(before_path) + before_full_name = ( + ".".join( + before_path, + ) + if isinstance(before_path, tuple) + else before_path + ) before_field_id = self._find_for_move(before_full_name) if before_field_id is None: @@ -1326,7 +1341,7 @@ def move_before(self, path: Union[str, Tuple[str, ...]], before_path: Union[str, if field_id == before_field_id: raise ValueError(f"Cannot move {full_name} before itself") - self._move(full_name, Move(field_id=field_id, other_field_id=before_field_id, op=MoveOperation.Before)) + self._move(Move(field_id=field_id, full_name=full_name, other_field_id=before_field_id, op=MoveOperation.Before)) return self @@ -1339,16 +1354,14 @@ def move_after(self, path: Union[str, Tuple[str, ...]], after_name: Union[str, T Returns: The UpdateSchema with the move operation staged. 
""" - path = (path,) if isinstance(path, str) else path - full_name = ".".join(path) + full_name = ".".join(path) if isinstance(path, tuple) else path field_id = self._find_for_move(full_name) if field_id is None: raise ValueError(f"Cannot move missing column: {full_name}") - after_path = (after_name,) if isinstance(after_name, str) else after_name - after_full_name = ".".join(after_path) + after_full_name = ".".join(after_name) if isinstance(after_name, tuple) else after_name after_field_id = self._find_for_move(after_full_name) if after_field_id is None: @@ -1357,38 +1370,27 @@ def move_after(self, path: Union[str, Tuple[str, ...]], after_name: Union[str, T if field_id == after_field_id: raise ValueError(f"Cannot move {full_name} after itself") - self._move(full_name, Move(field_id=field_id, other_field_id=after_field_id, op=MoveOperation.After)) + self._move(Move(field_id=field_id, full_name=full_name, other_field_id=after_field_id, op=MoveOperation.After)) return self - def allow_incompatible_changes(self) -> UpdateSchema: - """Allow incompatible changes to the schema. 
- - Returns: - This for method chaining - """ - self._allow_incompatible_changes = True - return self - def commit(self) -> None: """Apply the pending changes and commit.""" new_schema = self._apply() if new_schema != self._schema: - last_column_id = max(self._schema.highest_field_id, new_schema.highest_field_id) - updates = [ + last_column_id = max(self._table.metadata.last_column_id, new_schema.highest_field_id) + updates = ( AddSchemaUpdate(schema=new_schema, last_column_id=last_column_id), SetCurrentSchemaUpdate(schema_id=-1), - ] - requirements = [AssertCurrentSchemaId(current_schema_id=self._schema.schema_id)] + ) + requirements = (AssertCurrentSchemaId(current_schema_id=self._schema.schema_id),) if self._transaction is not None: self._transaction._append_updates(*updates) # pylint: disable=W0212 self._transaction._append_requirements(*requirements) # pylint: disable=W0212 else: - self._table._do_commit( # pylint: disable=W0212 - CommitTableRequest(identifier=self._table.identifier[1:], updates=updates, requirements=requirements) - ) + self._table._do_commit(updates=updates, requirements=requirements) # pylint: disable=W0212 def _apply(self) -> Schema: """Apply the pending changes to the original schema and returns the result. 
@@ -1555,9 +1557,9 @@ def primitive(self, primitive: PrimitiveType) -> Optional[IcebergType]: return primitive -def _add_fields(fields: Tuple[NestedField, ...], adds: Optional[List[NestedField]]) -> Optional[Tuple[NestedField, ...]]: +def _add_fields(fields: Tuple[NestedField, ...], adds: Optional[List[NestedField]]) -> Tuple[NestedField, ...]: adds = adds or [] - return None if len(adds) == 0 else fields + tuple(adds) + return fields + tuple(adds) def _move_fields(fields: Tuple[NestedField, ...], moves: List[Move]) -> Tuple[NestedField, ...]: @@ -1590,7 +1592,7 @@ def _add_and_move_fields( # always apply adds first so that added fields can be moved added = _add_fields(fields, adds) if moves: - return _move_fields(added, moves) # type: ignore + return _move_fields(added, moves) else: return added # add fields diff --git a/python/tests/test_integration_schema.py b/python/tests/test_integration_schema.py index c495169db90d..acc467e0d051 100644 --- a/python/tests/test_integration_schema.py +++ b/python/tests/test_integration_schema.py @@ -1033,8 +1033,7 @@ def test_add_required_column(catalog: Catalog) -> None: assert "Incompatible change: cannot add required column: data" in str(exc_info.value) new_schema = ( - UpdateSchema(table) # pylint: disable=W0212 - .allow_incompatible_changes() + UpdateSchema(table, allow_incompatible_changes=True) # pylint: disable=W0212 .add_column(path="data", field_type=IntegerType(), required=True) ._apply() ) @@ -1053,13 +1052,12 @@ def test_add_required_column_case_insensitive(catalog: Catalog) -> None: table = _create_table_with_schema(catalog, schema_) with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(table) - update.allow_incompatible_changes().case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) + update = UpdateSchema(table, allow_incompatible_changes=True) + update.case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) assert "already exists: ID" in 
str(exc_info.value) new_schema = ( - UpdateSchema(table) # pylint: disable=W0212 - .allow_incompatible_changes() + UpdateSchema(table, allow_incompatible_changes=True) # pylint: disable=W0212 .add_column(path="ID", field_type=IntegerType(), required=True) ._apply() ) From 1e735c236e7d6cc41d4bb769ee08f0b5642180f7 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Mon, 28 Aug 2023 22:59:04 +0200 Subject: [PATCH 13/17] Make all the tests pass --- python/Makefile | 2 +- python/pyiceberg/table/__init__.py | 55 +++--- python/tests/test_integration_schema.py | 249 ++++++++++++++++++------ 3 files changed, 218 insertions(+), 88 deletions(-) diff --git a/python/Makefile b/python/Makefile index db33d09b0438..8ee7efff41fd 100644 --- a/python/Makefile +++ b/python/Makefile @@ -30,7 +30,7 @@ lint: poetry run pre-commit run --all-files test: - poetry run pytest tests/ -m "unmarked or parametrize" ${PYTEST_ARGS} + poetry run pytest tests/ -m "(unmarked or parametrize) and not integration" ${PYTEST_ARGS} test-s3: sh ./dev/run-minio.sh diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index da3d84b3483b..431a8a9e512e 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -404,8 +404,8 @@ class AssertDefaultSortOrderId(TableRequirement): class CommitTableRequest(IcebergBaseModel): identifier: Identifier = Field() - requirements: Tuple[SerializeAsAny[TableRequirement]] = Field(default_factory=tuple) - updates: Tuple[SerializeAsAny[TableUpdate]] = Field(default_factory=tuple) + requirements: Tuple[SerializeAsAny[TableRequirement], ...] = Field(default_factory=tuple) + updates: Tuple[SerializeAsAny[TableUpdate], ...] 
= Field(default_factory=tuple) class CommitTableResponse(IcebergBaseModel): @@ -526,7 +526,7 @@ def update_schema(self, allow_incompatible_changes: bool = False, case_sensitive def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...]) -> None: response = self.catalog._commit_table( # pylint: disable=W0212 - CommitTableRequest(identifier=self.identifier[:-1], updates=updates, requirements=requirements) + CommitTableRequest(identifier=self.identifier[1:], updates=updates, requirements=requirements) ) # pylint: disable=W0212 self.metadata = response.metadata self.metadata_location = response.metadata_location @@ -940,7 +940,16 @@ def __init__( self._moves = {} self._added_name_to_id = {} - self._id_to_parent = {} + + def get_column_name(field_id: int) -> str: + column_name = self._schema.find_column_name(column_id=field_id) + if column_name is None: + raise ValueError(f"Could not find field-id: {field_id}") + return column_name + + self._id_to_parent = { + field_id: get_column_name(parent_field_id) for field_id, parent_field_id in self._schema._lazy_id_to_parent.items() + } self._allow_incompatible_changes = allow_incompatible_changes self._case_sensitive = case_sensitive @@ -1053,7 +1062,7 @@ def delete_column(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: name = (path,) if isinstance(path, str) else path full_name = ".".join(name) - field = self._schema.find_field(full_name, self._case_sensitive) + field = self._schema.find_field(full_name, case_sensitive=self._case_sensitive) if field.field_id in self._adds: raise ValueError(f"Cannot delete a column that has additions: {full_name}") @@ -1143,7 +1152,7 @@ def _set_column_requirement(self, path: Union[str, Tuple[str, ...]], required: b # if the change is a noop, allow it even if allowIncompatibleChanges is false return - if not self._allow_incompatible_changes and not required: + if not self._allow_incompatible_changes and required: raise ValueError(f"Cannot change 
column nullability: {name}: optional -> required") if field.field_id in self._deletes: @@ -1193,7 +1202,7 @@ def update_column( raise ValueError(f"Cannot update a column that will be deleted: {full_name}") if field_type is not None: - if not self._allow_incompatible_changes: + if not self._allow_incompatible_changes and field.field_type != field_type: try: promote(field.field_type, field_type) except ResolveError as e: @@ -1272,9 +1281,9 @@ def _find_for_move(self, name: str) -> Optional[int]: def _move(self, move: Move) -> None: if parent_name := self._id_to_parent.get(move.field_id): - parent_field = self._schema.find_field(parent_name, self._case_sensitive) - if not parent_field.is_struct: - raise ValueError(f"Cannot move fields in non-struct type: {parent_field}") + parent_field = self._schema.find_field(parent_name, case_sensitive=self._case_sensitive) + if not parent_field.field_type.is_struct: + raise ValueError(f"Cannot move fields in non-struct type: {parent_field.field_type}") if move.op == MoveOperation.After or move.op == MoveOperation.Before: if move.other_field_id is None: @@ -1289,8 +1298,8 @@ def _move(self, move: Move) -> None: if move.other_field_id is None: raise ValueError("Expected other field when performing before/after move") - if self._id_to_parent.get(move.other_field_id) is not None: - raise ValueError(f"Cannot move field {move.full_name} to a different struct") + if other_struct := self._id_to_parent.get(move.other_field_id): + raise ValueError(f"Cannot move field {move.full_name} to a different struct: {other_struct}") self._moves[TABLE_ROOT_ID] = self._moves.get(TABLE_ROOT_ID, []) + [move] @@ -1408,24 +1417,16 @@ def _apply(self) -> Schema: new_schema = Schema(*struct.fields) - # @TODO: This differs still a bit from the Java side, - # The validate identifier field is missing field_ids = set() for name in self._identifier_field_names: - field = new_schema.find_field(name, self._case_sensitive) + try: + field = 
new_schema.find_field(name, self._case_sensitive) + except ValueError as e: + raise ValueError( + f"Cannot find identifier field {name}. In case of deletion, update the identifier fields first." + ) from e + field_ids.add(field.field_id) - if field.field_id in self._deletes: - raise ValueError(f"Cannot delete identifier field {name}. To force deletion, update the identifier fields first.") - - # If it nested, also check if the parents aren't deleted - column_name = self._id_to_parent.get(field.field_id) - while column_name is not None: - parent = new_schema.find_field(column_name) - if parent.field_id in self._deletes: - raise ValueError( - f"Cannot delete field {parent.field_id} as it will delete nested identifier field {name}", - ) - column_name = self._id_to_parent.get(parent.field_id) return Schema(*struct.fields, identifier_field_ids=field_ids) diff --git a/python/tests/test_integration_schema.py b/python/tests/test_integration_schema.py index acc467e0d051..16e65cb6552e 100644 --- a/python/tests/test_integration_schema.py +++ b/python/tests/test_integration_schema.py @@ -65,6 +65,7 @@ def simple_table(catalog: Catalog, table_schema_simple: Schema) -> Table: return _create_table_with_schema(catalog, table_schema_simple) +@pytest.mark.integration def test_add_column(simple_table: Table) -> None: update = UpdateSchema(simple_table) update.add_column(path="b", field_type=IntegerType()) @@ -76,11 +77,13 @@ def test_add_column(simple_table: Table) -> None: NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), NestedField(field_id=4, name="b", field_type=IntegerType(), required=False), + identifier_field_ids=[2], ) assert apply_schema.schema_id == 0 assert apply_schema.highest_field_id == 4 +@pytest.mark.integration def test_add_primitive_type_column(simple_table: Table) -> None: primitive_type: Dict[str, PrimitiveType] = { "boolean": BooleanType(), @@ -108,6 +111,7 
@@ def test_add_primitive_type_column(simple_table: Table) -> None: assert field.doc == f"new_column_{name}" +@pytest.mark.integration def test_add_nested_type_column(simple_table: Table) -> None: # add struct type column field_name = "new_column_struct" @@ -126,6 +130,7 @@ def test_add_nested_type_column(simple_table: Table) -> None: assert schema_.highest_field_id == 6 +@pytest.mark.integration def test_add_nested_map_type_column(simple_table: Table) -> None: # add map type column field_name = "new_column_map" @@ -138,6 +143,7 @@ def test_add_nested_map_type_column(simple_table: Table) -> None: assert new_schema.highest_field_id == 6 +@pytest.mark.integration def test_add_nested_list_type_column(simple_table: Table) -> None: # add list type column field_name = "new_column_list" @@ -173,6 +179,7 @@ def _create_table_with_schema(catalog: Catalog, schema: Schema) -> Table: return catalog.create_table(identifier=tbl_name, schema=schema) +@pytest.mark.integration def test_add_already_exists(catalog: Catalog, table_schema_nested: Schema) -> None: table = _create_table_with_schema(catalog, table_schema_nested) update = UpdateSchema(table) @@ -186,6 +193,7 @@ def test_add_already_exists(catalog: Catalog, table_schema_nested: Schema) -> No assert "already exists: location.latitude" in str(exc_info.value) +@pytest.mark.integration def test_add_to_non_struct_type(catalog: Catalog, table_schema_simple: Schema) -> None: table = _create_table_with_schema(catalog, table_schema_simple) update = UpdateSchema(table) @@ -194,6 +202,7 @@ def test_add_to_non_struct_type(catalog: Catalog, table_schema_simple: Schema) - assert "Cannot add column 'lat' to non-struct type" in str(exc_info.value) +@pytest.mark.integration @pytest.mark.integration def test_schema_evolution_via_transaction(catalog: Catalog) -> None: schema = Schema( @@ -247,6 +256,7 @@ def test_schema_evolution_via_transaction(catalog: Catalog) -> None: ) +@pytest.mark.integration @pytest.mark.integration def 
test_schema_evolution_nested(catalog: Catalog) -> None: nested_schema = Schema( @@ -407,6 +417,7 @@ def nested_table(catalog: Catalog) -> Table: return _create_table_with_schema(catalog, schema_nested) +@pytest.mark.integration def test_no_changes(simple_table: Table, table_schema_simple: Schema) -> None: with simple_table.update_schema() as _: pass @@ -414,6 +425,7 @@ def test_no_changes(simple_table: Table, table_schema_simple: Schema) -> None: assert simple_table.schema() == table_schema_simple +@pytest.mark.integration def test_delete_field(simple_table: Table) -> None: with simple_table.update_schema() as schema_update: schema_update.delete_column("foo") @@ -427,6 +439,7 @@ def test_delete_field(simple_table: Table) -> None: ) +@pytest.mark.integration def test_delete_field_case_insensitive(simple_table: Table) -> None: with simple_table.update_schema(case_sensitive=False) as schema_update: schema_update.delete_column("FOO") @@ -440,15 +453,16 @@ def test_delete_field_case_insensitive(simple_table: Table) -> None: ) +@pytest.mark.integration def test_delete_identifier_fields(simple_table: Table) -> None: with pytest.raises(ValueError) as exc_info: with simple_table.update_schema() as schema_update: schema_update.delete_column("bar") - assert str(exc_info) == "Cannot delete identifier field bar. To force deletion, update the identifier fields first." + assert "Cannot find identifier field bar. In case of deletion, update the identifier fields first." 
in str(exc_info) -@pytest.mark.skip(reason="REST Catalog gives an error") +@pytest.mark.integration def test_delete_identifier_fields_nested(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -464,7 +478,7 @@ def test_delete_identifier_fields_nested(catalog: Catalog) -> None: required=True, ), schema_id=1, - identifier_field_ids=[], + identifier_field_ids=[3], ), ) @@ -472,7 +486,7 @@ def test_delete_identifier_fields_nested(catalog: Catalog) -> None: with tbl.update_schema() as schema_update: schema_update.delete_column("person") - assert str(exc_info) == "Cannot delete field person as it will delete nested identifier field name." + assert "Cannot find identifier field person.name. In case of deletion, update the identifier fields first." in str(exc_info) @pytest.mark.parametrize( @@ -490,6 +504,7 @@ def test_delete_identifier_fields_nested(catalog: Catalog) -> None: "person.age", ], ) +@pytest.mark.integration def test_deletes(field: str, nested_table: Table) -> None: with nested_table.update_schema() as schema_update: schema_update.delete_column(field) @@ -520,6 +535,7 @@ def test_deletes(field: str, nested_table: Table) -> None: "Person.age", ], ) +@pytest.mark.integration def test_deletes_case_insensitive(field: str, nested_table: Table) -> None: with nested_table.update_schema(case_sensitive=False) as schema_update: schema_update.delete_column(field) @@ -535,6 +551,7 @@ def test_deletes_case_insensitive(field: str, nested_table: Table) -> None: assert expected_schema == nested_table.schema() +@pytest.mark.integration def test_update_types(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -581,6 +598,7 @@ def test_update_types(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_update_types_case_insensitive(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -637,6 +655,7 @@ def test_update_types_case_insensitive(catalog: Catalog) -> None: @pytest.mark.parametrize("from_type, to_type", 
allowed_promotions) +@pytest.mark.integration def test_allowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -676,6 +695,7 @@ def test_allowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catal @pytest.mark.parametrize("from_type", disallowed_promotions_types) @pytest.mark.parametrize("to_type", disallowed_promotions_types) +@pytest.mark.integration def test_disallowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -699,6 +719,48 @@ def test_disallowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, ca ) +@pytest.mark.integration +def test_rename_simple(simple_table: Table) -> None: + with simple_table.update_schema() as schema_update: + schema_update.rename_column("foo", "vo") + + assert simple_table.schema() == Schema( + NestedField(field_id=1, name="vo", field_type=StringType(), required=False), + NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), + schema_id=1, + identifier_field_ids=[2], + ) + + +@pytest.mark.integration +def test_rename_simple_nested(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField( + field_id=1, + name="foo", + field_type=StructType(NestedField(field_id=2, name="bar", field_type=StringType())), + required=True, + ), + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.rename_column("foo.bar", "vo") + + assert tbl.schema() == Schema( + NestedField( + field_id=1, + name="foo", + field_type=StructType(NestedField(field_id=2, name="vo", field_type=StringType())), + required=True, + ), + ) + + +@pytest.mark.integration def test_rename(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -750,9 +812,9 @@ def test_rename(catalog: Catalog) -> None: with tbl.update_schema() 
as schema_update: schema_update.rename_column("foo", "bar") - schema_update.rename_column("location_lookup.x", "location_lookup.latitude") - schema_update.rename_column("locations.x", "locations.latitude") - schema_update.rename_column("person.leeftijd", "person.age") + schema_update.rename_column("location_lookup.x", "latitude") + schema_update.rename_column("locations.x", "latitude") + schema_update.rename_column("person.leeftijd", "age") assert tbl.schema() == Schema( NestedField( @@ -800,6 +862,7 @@ def test_rename(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_rename_case_insensitive(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -851,9 +914,9 @@ def test_rename_case_insensitive(catalog: Catalog) -> None: with tbl.update_schema(case_sensitive=False) as schema_update: schema_update.rename_column("Foo", "bar") - schema_update.rename_column("Location_lookup.X", "location_lookup.latitude") - schema_update.rename_column("Locations.X", "locations.latitude") - schema_update.rename_column("Person.Leeftijd", "person.age") + schema_update.rename_column("Location_lookup.X", "latitude") + schema_update.rename_column("Locations.X", "latitude") + schema_update.rename_column("Person.Leeftijd", "age") assert tbl.schema() == Schema( NestedField( @@ -901,6 +964,7 @@ def test_rename_case_insensitive(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_add_struct(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -925,6 +989,7 @@ def test_add_struct(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_add_nested_map_of_structs(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -980,6 +1045,7 @@ def test_add_nested_map_of_structs(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_add_nested_list_of_structs(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1022,6 +1088,7 @@ def test_add_nested_list_of_structs(catalog: Catalog) -> None: ) 
+@pytest.mark.integration def test_add_required_column(catalog: Catalog) -> None: schema_ = Schema( NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] @@ -1045,6 +1112,7 @@ def test_add_required_column(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_add_required_column_case_insensitive(catalog: Catalog) -> None: schema_ = Schema( NestedField(field_id=1, name="id", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] @@ -1052,8 +1120,8 @@ def test_add_required_column_case_insensitive(catalog: Catalog) -> None: table = _create_table_with_schema(catalog, schema_) with pytest.raises(ValueError) as exc_info: - update = UpdateSchema(table, allow_incompatible_changes=True) - update.case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) + with UpdateSchema(table, allow_incompatible_changes=True) as update: + update.case_sensitive(False).add_column(path="ID", field_type=IntegerType(), required=True) assert "already exists: ID" in str(exc_info.value) new_schema = ( @@ -1062,13 +1130,14 @@ def test_add_required_column_case_insensitive(catalog: Catalog) -> None: ._apply() ) assert new_schema == Schema( - NestedField(field_id=1, path="id", field_type=BooleanType(), required=False), + NestedField(field_id=1, name="id", field_type=BooleanType(), required=False), NestedField(field_id=2, name="ID", field_type=IntegerType(), required=True), schema_id=0, identifier_field_ids=[], ) +@pytest.mark.integration def test_make_column_optional(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1088,6 +1157,7 @@ def test_make_column_optional(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_mixed_changes(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1231,6 +1301,7 @@ def test_mixed_changes(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_ambiguous_column(catalog: Catalog, 
table_schema_nested: Schema) -> None: table = _create_table_with_schema(catalog, table_schema_nested) update = UpdateSchema(table) @@ -1240,6 +1311,7 @@ def test_ambiguous_column(catalog: Catalog, table_schema_nested: Schema) -> None assert "Cannot add column with ambiguous name: location.latitude, provide a tuple instead" in str(exc_info.value) +@pytest.mark.integration def test_delete_then_add(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1259,6 +1331,7 @@ def test_delete_then_add(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_delete_then_add_nested(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1294,6 +1367,7 @@ def test_delete_then_add_nested(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_delete_missing_column(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1310,6 +1384,7 @@ def test_delete_missing_column(catalog: Catalog) -> None: assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) +@pytest.mark.integration def test_add_delete_conflict(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1348,6 +1423,7 @@ def test_add_delete_conflict(catalog: Catalog) -> None: assert "Cannot delete a column that has additions: preferences" in str(exc_info.value) +@pytest.mark.integration def test_rename_missing_column(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1364,6 +1440,7 @@ def test_rename_missing_column(catalog: Catalog) -> None: assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) +@pytest.mark.integration def test_rename_missing_conflicts(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1396,6 +1473,7 @@ def test_rename_missing_conflicts(catalog: Catalog) -> None: assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) +@pytest.mark.integration def test_update_missing_column(catalog: 
Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1412,6 +1490,7 @@ def test_update_missing_column(catalog: Catalog) -> None: assert "Could not find field with name bar, case_sensitive=True" in str(exc_info.value) +@pytest.mark.integration def test_update_delete_conflict(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1429,6 +1508,7 @@ def test_update_delete_conflict(catalog: Catalog) -> None: assert "Cannot delete a column that has updates: foo" in str(exc_info.value) +@pytest.mark.integration def test_delete_update_conflict(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1446,6 +1526,7 @@ def test_delete_update_conflict(catalog: Catalog) -> None: assert "Cannot update a column that will be deleted: foo" in str(exc_info.value) +@pytest.mark.integration def test_delete_map_key(nested_table: Table) -> None: with pytest.raises(ValueError) as exc_info: with nested_table.update_schema() as schema_update: @@ -1454,6 +1535,7 @@ def test_delete_map_key(nested_table: Table) -> None: assert "Cannot delete map keys" in str(exc_info.value) +@pytest.mark.integration def test_add_field_to_map_key(nested_table: Table) -> None: with pytest.raises(ValueError) as exc_info: with nested_table.update_schema() as schema_update: @@ -1462,6 +1544,7 @@ def test_add_field_to_map_key(nested_table: Table) -> None: assert "Cannot add column 'key' to non-struct type: quux" in str(exc_info.value) +@pytest.mark.integration def test_alter_map_key(nested_table: Table) -> None: with pytest.raises(ValueError) as exc_info: with nested_table.update_schema() as schema_update: @@ -1470,6 +1553,7 @@ def test_alter_map_key(nested_table: Table) -> None: assert "Cannot update map keys" in str(exc_info.value) +@pytest.mark.integration def test_update_map_key(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1486,6 +1570,7 @@ def test_update_map_key(catalog: Catalog) -> None: assert "Cannot update map keys: map" in 
str(exc_info.value) +@pytest.mark.integration def test_update_added_column_doc(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1503,6 +1588,7 @@ def test_update_added_column_doc(catalog: Catalog) -> None: assert "Could not find field with name value, case_sensitive=True" in str(exc_info.value) +@pytest.mark.integration def test_update_deleted_column_doc(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1520,6 +1606,7 @@ def test_update_deleted_column_doc(catalog: Catalog) -> None: assert "Cannot update a column that will be deleted: foo" in str(exc_info.value) +@pytest.mark.integration def test_multiple_moves(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1547,6 +1634,7 @@ def test_multiple_moves(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_top_level_column_first(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1567,6 +1655,7 @@ def test_move_top_level_column_first(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_top_level_column_before_first(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1587,6 +1676,7 @@ def test_move_top_level_column_before_first(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_top_level_column_after_last(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1607,6 +1697,7 @@ def test_move_top_level_column_after_last(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_nested_field_first(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1643,6 +1734,7 @@ def test_move_nested_field_first(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_nested_field_before_first(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1679,6 +1771,7 @@ def test_move_nested_field_before_first(catalog: Catalog) -> None: ) +@pytest.mark.integration def 
test_move_nested_field_after_first(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1698,23 +1791,26 @@ def test_move_nested_field_after_first(catalog: Catalog) -> None: ) with tbl.update_schema() as schema_update: - schema_update.move_before("struct.count", "struct.data") + schema_update.move_before("struct.data", "struct.count") - assert tbl.schema() == Schema( - NestedField(field_id=1, name="id", field_type=LongType(), required=True), - NestedField( - field_id=2, - name="struct", - field_type=StructType( - NestedField(field_id=4, name="data", field_type=StringType(), required=True), - NestedField(field_id=3, name="count", field_type=LongType(), required=True), + assert str(tbl.schema()) == str( + Schema( + NestedField(field_id=1, name="id", field_type=LongType(), required=True), + NestedField( + field_id=2, + name="struct", + field_type=StructType( + NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=3, name="count", field_type=LongType(), required=True), + ), + required=True, ), - required=True, - ), - schema_id=1, + schema_id=1, + ) ) +@pytest.mark.integration def test_move_nested_field_after(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1744,8 +1840,8 @@ def test_move_nested_field_after(catalog: Catalog) -> None: name="struct", field_type=StructType( NestedField(field_id=3, name="count", field_type=LongType(), required=True), - NestedField(field_id=4, name="data", field_type=StringType(), required=True), NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), + NestedField(field_id=4, name="data", field_type=StringType(), required=True), ), required=True, ), @@ -1753,6 +1849,7 @@ def test_move_nested_field_after(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_nested_field_before(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1791,6 +1888,7 @@ def test_move_nested_field_before(catalog: 
Catalog) -> None: ) +@pytest.mark.integration def test_move_map_value_struct_field(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1800,13 +1898,13 @@ def test_move_map_value_struct_field(catalog: Catalog) -> None: field_id=2, name="map", field_type=MapType( - key_id=6, - value_id=7, + key_id=3, + value_id=4, key_type=StringType(), - element_type=StructType( + value_type=StructType( NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), - NestedField(field_id=3, name="count", field_type=LongType(), required=True), - NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=6, name="count", field_type=LongType(), required=True), + NestedField(field_id=7, name="data", field_type=StringType(), required=True), ), ), required=True, @@ -1824,13 +1922,13 @@ def test_move_map_value_struct_field(catalog: Catalog) -> None: field_id=2, name="map", field_type=MapType( - key_id=6, - value_id=7, + key_id=3, + value_id=4, key_type=StringType(), - element_type=StructType( - NestedField(field_id=3, name="count", field_type=LongType(), required=True), + value_type=StructType( + NestedField(field_id=6, name="count", field_type=LongType(), required=True), NestedField(field_id=5, name="ts", field_type=TimestamptzType(), required=True), - NestedField(field_id=4, name="data", field_type=StringType(), required=True), + NestedField(field_id=7, name="data", field_type=StringType(), required=True), ), ), required=True, @@ -1839,6 +1937,7 @@ def test_move_map_value_struct_field(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_added_top_level_column(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1855,12 +1954,13 @@ def test_move_added_top_level_column(catalog: Catalog) -> None: assert tbl.schema() == Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), - NestedField(field_id=3, name="ts", field_type=TimestamptzType(), 
required=True), + NestedField(field_id=3, name="ts", field_type=TimestamptzType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=True), schema_id=1, ) +@pytest.mark.integration def test_move_added_top_level_column_after_added_column(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1886,6 +1986,7 @@ def test_move_added_top_level_column_after_added_column(catalog: Catalog) -> Non ) +@pytest.mark.integration def test_move_added_nested_struct_field(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1924,6 +2025,7 @@ def test_move_added_nested_struct_field(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_added_nested_field_before_added_column(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1965,6 +2067,7 @@ def test_move_added_nested_field_before_added_column(catalog: Catalog) -> None: ) +@pytest.mark.integration def test_move_self_reference_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -1985,6 +2088,7 @@ def test_move_self_reference_fails(catalog: Catalog) -> None: assert "Cannot move foo after itself" in str(exc_info.value) +@pytest.mark.integration def test_move_missing_column_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2010,6 +2114,7 @@ def test_move_missing_column_fails(catalog: Catalog) -> None: assert "Cannot move missing column: items" in str(exc_info.value) +@pytest.mark.integration def test_move_before_add_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2038,6 +2143,7 @@ def test_move_before_add_fails(catalog: Catalog) -> None: assert "Cannot move missing column: ts" in str(exc_info.value) +@pytest.mark.integration def test_move_missing_reference_column_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2059,6 +2165,7 @@ def test_move_missing_reference_column_fails(catalog: Catalog) -> None: assert "Cannot 
move data after missing column: items" in str(exc_info.value) +@pytest.mark.integration def test_move_primitive_map_key_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2081,6 +2188,7 @@ def test_move_primitive_map_key_fails(catalog: Catalog) -> None: assert "Cannot move fields in non-struct type: map" in str(exc_info.value) +@pytest.mark.integration def test_move_primitive_map_value_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2103,6 +2211,7 @@ def test_move_primitive_map_value_fails(catalog: Catalog) -> None: assert "Cannot move fields in non-struct type: map>" in str(exc_info.value) +@pytest.mark.integration def test_move_top_level_between_structs_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2128,6 +2237,7 @@ def test_move_top_level_between_structs_fails(catalog: Catalog) -> None: assert "Cannot move field a to a different struct" in str(exc_info.value) +@pytest.mark.integration def test_move_between_structs_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2161,6 +2271,7 @@ def test_move_between_structs_fails(catalog: Catalog) -> None: assert "Cannot move field s2.x to a different struct" in str(exc_info.value) +@pytest.mark.integration def test_add_existing_identifier_fields(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2175,6 +2286,7 @@ def test_add_existing_identifier_fields(catalog: Catalog) -> None: assert tbl.schema().identifier_field_names() == {"foo"} +@pytest.mark.integration def test_add_new_identifiers_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2190,6 +2302,7 @@ def test_add_new_identifiers_field_columns(catalog: Catalog) -> None: assert tbl.schema().identifier_field_names() == {"foo", "new_field"} +@pytest.mark.integration def test_add_new_identifiers_field_columns_out_of_order(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ 
-2205,6 +2318,7 @@ def test_add_new_identifiers_field_columns_out_of_order(catalog: Catalog) -> Non assert tbl.schema().identifier_field_names() == {"foo", "new_field"} +@pytest.mark.integration def test_add_nested_identifier_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2224,6 +2338,7 @@ def test_add_nested_identifier_field_columns(catalog: Catalog) -> None: assert tbl.schema().identifier_field_names() == {"required_struct.field"} +@pytest.mark.integration def test_add_nested_identifier_field_columns_single_transaction(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2241,6 +2356,7 @@ def test_add_nested_identifier_field_columns_single_transaction(catalog: Catalog assert tbl.schema().identifier_field_names() == {"new.field"} +@pytest.mark.integration def test_add_nested_nested_identifier_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2256,7 +2372,7 @@ def test_add_nested_nested_identifier_field_columns(catalog: Catalog) -> None: NestedField( field_id=3, name="field", - type=StructType(NestedField(field_id=4, name="nested", type=StringType(), required=False)), + type=StructType(NestedField(field_id=4, name="nested", type=StringType(), required=True)), required=True, ) ), @@ -2267,6 +2383,7 @@ def test_add_nested_nested_identifier_field_columns(catalog: Catalog) -> None: assert tbl.schema().identifier_field_names() == {"new.field.nested"} +@pytest.mark.integration def test_add_dotted_identifier_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2282,6 +2399,7 @@ def test_add_dotted_identifier_field_columns(catalog: Catalog) -> None: assert tbl.schema().identifier_field_names() == {"dot.field"} +@pytest.mark.integration def test_remove_identifier_fields(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2303,6 +2421,7 @@ def test_remove_identifier_fields(catalog: Catalog) -> None: assert 
tbl.schema().identifier_field_names() == set() +@pytest.mark.integration def test_set_identifier_field_fails_schema(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2311,7 +2430,7 @@ def test_set_identifier_field_fails_schema(catalog: Catalog) -> None: NestedField(field_id=2, name="float", field_type=FloatType(), required=True), NestedField(field_id=3, name="double", field_type=DoubleType(), required=True), schema_id=1, - identifier_field_ids=[1], + identifier_field_ids=[], ), ) @@ -2319,47 +2438,49 @@ def test_set_identifier_field_fails_schema(catalog: Catalog) -> None: with tbl.update_schema() as update_schema: update_schema.set_identifier_fields("id") - assert "Cannot add field id as an identifier field: not a required field" in str(exc_info.value) + assert "Identifier field 1 invalid: not a required field" in str(exc_info.value) with pytest.raises(ValueError) as exc_info: with tbl.update_schema() as update_schema: update_schema.set_identifier_fields("float") - assert "Cannot add field float as an identifier field: must not be float or double field" in str(exc_info.value) + assert "Identifier field 2 invalid: must not be float or double field" in str(exc_info.value) with pytest.raises(ValueError) as exc_info: with tbl.update_schema() as update_schema: update_schema.set_identifier_fields("double") - assert "Cannot add field float as an identifier field: must not be float or double field" in str(exc_info.value) + assert "Identifier field 3 invalid: must not be float or double field" in str(exc_info.value) with pytest.raises(ValueError) as exc_info: with tbl.update_schema() as update_schema: update_schema.set_identifier_fields("unknown") - assert "Cannot add field unknown as an identifier field: not found in current schema or added columns" in str(exc_info.value) + assert "Cannot find identifier field unknown. In case of deletion, update the identifier fields first." 
in str(exc_info.value) +@pytest.mark.integration def test_set_identifier_field_fails(nested_table: Table) -> None: with pytest.raises(ValueError) as exc_info: with nested_table.update_schema() as update_schema: update_schema.set_identifier_fields("location") - assert "Cannot add field location as an identifier field: not a primitive type field" in str(exc_info.value) + assert "Identifier field 6 invalid: not a primitive type field" in str(exc_info.value) with pytest.raises(ValueError) as exc_info: with nested_table.update_schema() as update_schema: update_schema.set_identifier_fields("baz") - assert "Cannot add field baz as an identifier field: not a required field" in str(exc_info.value) + assert "Identifier field 3 invalid: not a required field" in str(exc_info.value) with pytest.raises(ValueError) as exc_info: with nested_table.update_schema() as update_schema: update_schema.set_identifier_fields("person.name") - assert "Cannot add field zip as an identifier field: must not be nested in" in str(exc_info.value) + assert "Identifier field 16 invalid: not a required field" in str(exc_info.value) +@pytest.mark.integration def test_delete_identifier_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2384,6 +2505,7 @@ def test_delete_identifier_field_columns(catalog: Catalog) -> None: schema_update.delete_column("foo") +@pytest.mark.integration def test_delete_containing_nested_identifier_field_columns_fails(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2398,23 +2520,26 @@ def test_delete_containing_nested_identifier_field_columns_fails(catalog: Catalo ) schema_update.set_identifier_fields("out.nested") - tbl = _create_table_with_schema( - catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] + assert tbl.schema() == Schema( + NestedField(field_id=1, name="foo", field_type=StringType(), required=True), + NestedField( + 
field_id=2, + name="out", + field_type=StructType(NestedField(field_id=3, name="nested", field_type=StringType(), required=True)), + required=True, ), + schema_id=1, + identifier_field_ids=[3], ) with pytest.raises(ValueError) as exc_info: with tbl.update_schema() as schema_update: schema_update.delete_column("out") - assert ( - str(exc_info) - == "Cannot delete field 24: out: required struct<25: nested: required string> as it will delete nested identifier field 25: nested: required string" - ) + assert "Cannot find identifier field out.nested. In case of deletion, update the identifier fields first." in str(exc_info) +@pytest.mark.integration def test_rename_identifier_fields(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2430,6 +2555,7 @@ def test_rename_identifier_fields(catalog: Catalog) -> None: assert tbl.schema().identifier_field_names() == {"bar"} +@pytest.mark.integration def test_move_identifier_fields(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2437,6 +2563,7 @@ def test_move_identifier_fields(catalog: Catalog) -> None: NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), schema_id=1, + identifier_field_ids=[1], ), ) @@ -2444,21 +2571,22 @@ def test_move_identifier_fields(catalog: Catalog) -> None: update.move_before("data", "id") assert tbl.schema().identifier_field_ids == [1] - assert tbl.schema().identifier_field_names() == {"bar"} + assert tbl.schema().identifier_field_names() == {"id"} with tbl.update_schema() as update: update.move_after("id", "data") assert tbl.schema().identifier_field_ids == [1] - assert tbl.schema().identifier_field_names() == {"bar"} + assert tbl.schema().identifier_field_names() == {"id"} with tbl.update_schema() as update: update.move_first("data") assert tbl.schema().identifier_field_ids == [1] - assert tbl.schema().identifier_field_names() == {"bar"} + assert 
tbl.schema().identifier_field_names() == {"id"} +@pytest.mark.integration def test_move_identifier_fields_case_insensitive(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, @@ -2466,6 +2594,7 @@ def test_move_identifier_fields_case_insensitive(catalog: Catalog) -> None: NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), schema_id=1, + identifier_field_ids=[1], ), ) @@ -2473,16 +2602,16 @@ def test_move_identifier_fields_case_insensitive(catalog: Catalog) -> None: update.move_before("DATA", "ID") assert tbl.schema().identifier_field_ids == [1] - assert tbl.schema().identifier_field_names() == {"bar"} + assert tbl.schema().identifier_field_names() == {"id"} with tbl.update_schema(case_sensitive=False) as update: update.move_after("ID", "DATA") assert tbl.schema().identifier_field_ids == [1] - assert tbl.schema().identifier_field_names() == {"bar"} + assert tbl.schema().identifier_field_names() == {"id"} with tbl.update_schema(case_sensitive=False) as update: update.move_first("DATA") assert tbl.schema().identifier_field_ids == [1] - assert tbl.schema().identifier_field_names() == {"bar"} + assert tbl.schema().identifier_field_names() == {"id"} From b8b807f397b31473433e5bb16d256e3237dc3749 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 29 Aug 2023 08:48:59 +0200 Subject: [PATCH 14/17] Coverage over all the tests --- python/Makefile | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/Makefile b/python/Makefile index 8ee7efff41fd..706b477c7f6e 100644 --- a/python/Makefile +++ b/python/Makefile @@ -53,15 +53,18 @@ test-adlfs: sh ./dev/run-azurite.sh poetry run pytest tests/ -m adlfs ${PYTEST_ARGS} +test-gcs: + sh ./dev/run-gcs-server.sh + poetry run pytest tests/ -m gcs ${PYTEST_ARGS} + test-coverage: + docker-compose -f dev/docker-compose-integration.yml kill + docker-compose -f 
dev/docker-compose-integration.yml rm -f + docker-compose -f dev/docker-compose-integration.yml up -d sh ./dev/run-minio.sh sh ./dev/run-azurite.sh sh ./dev/run-gcs-server.sh - poetry run coverage run --source=pyiceberg/ -m pytest tests/ -m "not integration" ${PYTEST_ARGS} + poetry run coverage run --source=pyiceberg/ -m pytest tests/ ${PYTEST_ARGS} poetry run coverage report -m --fail-under=90 poetry run coverage html poetry run coverage xml - -test-gcs: - sh ./dev/run-gcs-server.sh - poetry run pytest tests/ -m gcs ${PYTEST_ARGS} From 6584ad6a59197d01c03a2076d2f06013115c9813 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 29 Aug 2023 09:15:53 +0200 Subject: [PATCH 15/17] Less is more --- python/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/Makefile b/python/Makefile index 706b477c7f6e..dc72bf85d59a 100644 --- a/python/Makefile +++ b/python/Makefile @@ -61,9 +61,9 @@ test-coverage: docker-compose -f dev/docker-compose-integration.yml kill docker-compose -f dev/docker-compose-integration.yml rm -f docker-compose -f dev/docker-compose-integration.yml up -d - sh ./dev/run-minio.sh sh ./dev/run-azurite.sh sh ./dev/run-gcs-server.sh + docker-compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py poetry run coverage run --source=pyiceberg/ -m pytest tests/ ${PYTEST_ARGS} poetry run coverage report -m --fail-under=90 poetry run coverage html From 368e4f393c16e51288e5d6c59348c4e873250f5f Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Tue, 29 Aug 2023 21:33:38 +0200 Subject: [PATCH 16/17] Add docs --- python/mkdocs/docs/api.md | 230 ++++++++++++++++-------- python/mkdocs/docs/contributing.md | 2 +- python/tests/test_integration_schema.py | 6 +- 3 files changed, 158 insertions(+), 80 deletions(-) diff --git a/python/mkdocs/docs/api.md b/python/mkdocs/docs/api.md index 7af9053a030d..2ac86bec4fed 100644 --- a/python/mkdocs/docs/api.md +++ b/python/mkdocs/docs/api.md @@ -42,136 +42,216 @@ 
Then load the `prod` catalog: ```python from pyiceberg.catalog import load_catalog -catalog = load_catalog("prod") +catalog = load_catalog( + "docs", + **{ + "uri": "http://127.0.0.1:8181", + "s3.endpoint": "http://127.0.0.1:9000", + "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO", + "s3.access-key-id": "admin", + "s3.secret-access-key": "password", + } +) +``` -catalog.list_namespaces() +Let's create a namespace: + +```python +catalog.create_namespace("docs_example") ``` -Returns two namespaces: +And then list them: ```python -[("default",), ("nyc",)] +ns = catalog.list_namespaces() + +assert ns == [("docs_example",)] ``` -Listing the tables in the `nyc` namespace: +And then list tables in the namespace: ```python -catalog.list_tables("nyc") +catalog.list_tables("docs_example") ``` -Returns as list with tuples, containing a single table `taxis`: +## Create a table + +To create a table from a catalog: ```python -[("nyc", "taxis")] +from pyiceberg.schema import Schema +from pyiceberg.types import ( + TimestampType, + FloatType, + DoubleType, + StringType, + NestedField, + StructType, +) + +schema = Schema( + NestedField(field_id=1, name="datetime", field_type=TimestampType(), required=True), + NestedField(field_id=2, name="symbol", field_type=StringType(), required=True), + NestedField(field_id=3, name="bid", field_type=FloatType(), required=False), + NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False), + NestedField( + field_id=5, + name="details", + field_type=StructType( + NestedField( + field_id=4, name="created_by", field_type=StringType(), required=False + ), + ), + required=False, + ), +) + +from pyiceberg.partitioning import PartitionSpec, PartitionField +from pyiceberg.transforms import DayTransform + +partition_spec = PartitionSpec( + PartitionField( + source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day" + ) +) + +from pyiceberg.table.sorting import SortOrder, SortField +from pyiceberg.transforms import 
IdentityTransform + +# Sort on the symbol +sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform())) + +catalog.create_table( + identifier="docs_example.bids", + schema=schema, + partition_spec=partition_spec, + sort_order=sort_order, +) ``` ## Load a table -### From a catalog +### Catalog table -Loading the `taxis` table: +Loading the `bids` table: ```python -catalog.load_table("nyc.taxis") +table = catalog.load_table("docs_example.bids") # Equivalent to: -catalog.load_table(("nyc", "taxis")) +table = catalog.load_table(("docs_example", "bids")) # The tuple syntax can be used if the namespace or table contains a dot. ``` This returns a `Table` that represents an Iceberg table that can be queried and altered. -### Directly from a metadata file +### Static table To load a table directly from a metadata file (i.e., **without** using a catalog), you can use a `StaticTable` as follows: ```python from pyiceberg.table import StaticTable -table = StaticTable.from_metadata( - "s3a://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json" +static_table = StaticTable.from_metadata( + # For example: + # "s3a://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json", + tbl.metadata_location, + properties={ + "s3.endpoint": "http://127.0.0.1:9000", + "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO", + "s3.access-key-id": "admin", + "s3.secret-access-key": "password", + }, ) ``` -For the rest, this table behaves similarly as a table loaded using a catalog. Note that `StaticTable` is intended to be _read only_. +The static-table is considered read-only. -Any properties related to file IO can be passed accordingly: +## Schema evolution + +PyIceberg supports full schema evolution through the Python API. It takes care of setting the field-IDs and makes sure that only non-breaking changes are done (can be overridden).
+ +In the examples below, the `.update_schema()` is called from the table itself. ```python -table = StaticTable.from_metadata( - "s3a://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json", - {PY_IO_IMPL: "pyiceberg.some.FileIO.class"}, -) +with table.update_schema() as update: + update.add_column("some_field", IntegerType(), "doc") ``` -## Create a table - -To create a table from a catalog: +You can also initiate a transaction if you want to make more changes than just evolving the schema: ```python -from pyiceberg.catalog import load_catalog -from pyiceberg.schema import Schema -from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField +with table.transaction() as transaction: + with transaction.update_schema() as update_schema: + update.add_column("some_other_field", IntegerType(), "doc") + # ... Update properties etc +``` -schema = Schema( - NestedField( - field_id=1, name="datetime", field_type=TimestampType(), required=False - ), - NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False), - NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False), - NestedField(field_id=4, name="symbol", field_type=StringType(), required=False), -) +### Add column -from pyiceberg.partitioning import PartitionSpec, PartitionField -from pyiceberg.transforms import DayTransform - -partition_spec = PartitionSpec( - PartitionField( - source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day" - ) -) +Using `add_column` you can add a column, without having to worry about the field-id: -from pyiceberg.table.sorting import SortOrder, SortField -from pyiceberg.transforms import IdentityTransform +```python +with table.update_schema() as update: + update.add_column("retries", IntegerType(), "Number of retries to place the bid") + # In a struct + update.add_column("details.confirmed_by", StringType(), "Name of the exchange") +``` -sort_order = 
SortOrder(SortField(source_id=4, transform=IdentityTransform())) +### Rename column -catalog = load_catalog("prod") +Renaming a field in an Iceberg table is simple: -catalog.create_table( - identifier="default.bids", - location="/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/bids/", - schema=schema, - partition_spec=partition_spec, - sort_order=sort_order, -) +```python +with table.update_schema() as update: + update.rename("retries", "num_retries") + # In a struct, only the new name field + update.rename("properties.confirmed_by", "exchange") ``` -### Update table schema +### Move column -Add new columns through the `Transaction` or `UpdateSchema` API: +Move a field inside of a struct: ```python with table.update_schema() as update: - update.add_column("x", IntegerType(), "doc") + update.move_first("symbol") + update.move_after("bid", "ask") + # In a struct, only the new name field + update.move_before("details.exchange", "properties.created_by") ``` -Or, without a context manager by calling the `.commit()` explicitly: +### Update column + +Update a field's type, description, or whether it is required.
```python -table.update_schema().add_column("x", IntegerType(), "doc").commit() +with table.update_schema() as update: + # Promote a float to a double + update.update_column("bid", field_type=DoubleType()) + # Make a field optional + update.update_column("symbol", required=False) + # Update the documentation + update.update_column("symbol", doc="Name of the share on the exchange") ``` -Alternatively, use the transaction API to combine changes from multiple operations: +Be careful, some operations are not compatible, but can still be done at your own risk by setting `allow_incompatible_changes`: ```python -from datetime import datetime +with table.update_schema(allow_incompatible_changes=True) as update: + # Incompatible change, cannot require an optional field + update.update_column("symbol", required=True) +``` -with table.transaction() as transaction: - transaction.update_schema().add_column("x", IntegerType(), "doc").commit() - transaction.set_properties(schema_updated_at=str(datetime.now())) +### Delete column + +Delete a field. Be careful, this is an incompatible change (readers/writers might expect this field): + +```python +with table.update_schema(allow_incompatible_changes=True) as update: + update.delete_column("some_field") ``` -### Update table properties +## Table properties Set and remove properties through the `Transaction` API: @@ -187,7 +267,7 @@ with table.transaction() as transaction: assert table.properties == {} ``` -Or, without a context manager: +Or, without context manager: ```python table = table.transaction().set_properties(abc="def").commit_transaction() @@ -336,19 +416,17 @@ Dataset( Using [Ray Dataset API](https://docs.ray.io/en/latest/data/api/dataset.html) to interact with the dataset: ```python -print( - ray_dataset.take(2) -) +print(ray_dataset.take(2)) [ { - 'VendorID': 2, - 'tpep_pickup_datetime': datetime.datetime(2008, 12, 31, 23, 23, 50, tzinfo=), - 'tpep_dropoff_datetime': datetime.datetime(2009, 1, 1, 0, 34, 31, tzinfo=) +
"VendorID": 2, + "tpep_pickup_datetime": datetime.datetime(2008, 12, 31, 23, 23, 50), + "tpep_dropoff_datetime": datetime.datetime(2009, 1, 1, 0, 34, 31), }, { - 'VendorID': 2, - 'tpep_pickup_datetime': datetime.datetime(2008, 12, 31, 23, 5, 3, tzinfo=), - 'tpep_dropoff_datetime': datetime.datetime(2009, 1, 1, 16, 10, 18, tzinfo=) - } + "VendorID": 2, + "tpep_pickup_datetime": datetime.datetime(2008, 12, 31, 23, 5, 3), + "tpep_dropoff_datetime": datetime.datetime(2009, 1, 1, 16, 10, 18), + }, ] ``` diff --git a/python/mkdocs/docs/contributing.md b/python/mkdocs/docs/contributing.md index 989cbbea44f8..87a8cc701bb0 100644 --- a/python/mkdocs/docs/contributing.md +++ b/python/mkdocs/docs/contributing.md @@ -160,4 +160,4 @@ PyIceberg offers support from Python 3.8 onwards, we can't use the [type hints f ## Third party libraries -PyIceberg naturally integrates into the rich Python ecosystem, however it is important to be hesistant to add third party packages. Adding a lot of packages makes the library heavyweight, and causes incompatibilities with other projects if they use a different version of the library. Also, big libraries such as `s3fs`, `adlfs`, `pyarrow`, `thrift` should be optional to avoid downloading everything, while not being sure if is actually being used. +PyIceberg naturally integrates into the rich Python ecosystem, however it is important to be hesitant about adding third party packages. Adding a lot of packages makes the library heavyweight, and causes incompatibilities with other projects if they use a different version of the library. Also, big libraries such as `s3fs`, `adlfs`, `pyarrow`, `thrift` should be optional to avoid downloading everything, while not being sure if it is actually being used.
diff --git a/python/tests/test_integration_schema.py b/python/tests/test_integration_schema.py index 16e65cb6552e..b4fda50dc0db 100644 --- a/python/tests/test_integration_schema.py +++ b/python/tests/test_integration_schema.py @@ -654,7 +654,7 @@ def test_update_types_case_insensitive(catalog: Catalog) -> None: ] -@pytest.mark.parametrize("from_type, to_type", allowed_promotions) +@pytest.mark.parametrize("from_type, to_type", allowed_promotions, ids=str) @pytest.mark.integration def test_allowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catalog: Catalog) -> None: tbl = _create_table_with_schema( @@ -693,8 +693,8 @@ def test_allowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catal ] -@pytest.mark.parametrize("from_type", disallowed_promotions_types) -@pytest.mark.parametrize("to_type", disallowed_promotions_types) +@pytest.mark.parametrize("from_type", disallowed_promotions_types, ids=str) +@pytest.mark.parametrize("to_type", disallowed_promotions_types, ids=str) @pytest.mark.integration def test_disallowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catalog: Catalog) -> None: tbl = _create_table_with_schema( From 069179a4db1c508681c9d82376f3d4b1bcb71209 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Sun, 3 Sep 2023 23:12:51 +0200 Subject: [PATCH 17/17] Comments --- python/mkdocs/docs/api.md | 20 +- python/pyiceberg/table/__init__.py | 95 ++----- python/tests/table/test_init.py | 122 ++++++++- python/tests/test_integration_schema.py | 348 +++++++----------------- 4 files changed, 252 insertions(+), 333 deletions(-) diff --git a/python/mkdocs/docs/api.md b/python/mkdocs/docs/api.md index 2ac86bec4fed..55eadc5f5b45 100644 --- a/python/mkdocs/docs/api.md +++ b/python/mkdocs/docs/api.md @@ -152,15 +152,7 @@ To load a table directly from a metadata file (i.e., **without** using a catalog from pyiceberg.table import StaticTable static_table = StaticTable.from_metadata( - # For example: - # 
"s3a://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json", - tbl.metadata_location, - properties={ - "s3.endpoint": "http://127.0.0.1:9000", - "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO", - "s3.access-key-id": "admin", - "s3.secret-access-key": "password", - }, + "s3://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json" ) ``` @@ -204,11 +196,11 @@ Renaming a field in an Iceberg table is simple: ```python with table.update_schema() as update: update.rename("retries", "num_retries") - # In a struct, only the new name field + # This will rename `confirmed_by` to `exchange` update.rename("properties.confirmed_by", "exchange") ``` -### Rename column +### Move column Move a field inside of struct: @@ -216,8 +208,8 @@ Move a field inside of struct: with table.update_schema() as update: update.move_first("symbol") update.move_after("bid", "ask") - # In a struct, only the new name field - update.move_before("details.exchange", "properties.created_by") + # This will move `confirmed_by` before `exchange` + update.move_before("details.created_by", "details.exchange") ``` ### Update column @@ -308,7 +300,7 @@ The low level API `plan_files` methods returns a set of tasks that provide the f ```json [ - "s3a://warehouse/wh/nyc/taxis/data/00003-4-42464649-92dd-41ad-b83b-dea1a2fe4b58-00001.parquet" + "s3://warehouse/wh/nyc/taxis/data/00003-4-42464649-92dd-41ad-b83b-dea1a2fe4b58-00001.parquet" ] ``` diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py index 431a8a9e512e..b905c955c848 100644 --- a/python/pyiceberg/table/__init__.py +++ b/python/pyiceberg/table/__init__.py @@ -1020,7 +1020,7 @@ def add_column( parent_field = parent_type.element_field if not parent_field.field_type.is_struct: - raise ValueError(f"Cannot add column '{name}' to non-struct type: {'.'.join(parent)}") + raise ValueError(f"Cannot add column '{name}' to non-struct type: {parent_full_path}") 
parent_id = parent_field.field_id @@ -1106,27 +1106,15 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], new_name: str) - required=field_from.required, ) - if path_from in self._identifier_field_names: - self._identifier_field_names.remove(path_from) - self._identifier_field_names.add(f"{path_from[:-len(field_from.name)]}{new_name}") + # Lookup the field because of casing + from_field_correct_casing = self._schema.find_column_name(field_from.field_id) + if from_field_correct_casing in self._identifier_field_names: + self._identifier_field_names.remove(from_field_correct_casing) + new_identifier_path = f"{from_field_correct_casing[:-len(field_from.name)]}{new_name}" + self._identifier_field_names.add(new_identifier_path) return self - def require_column(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: - """Make a column required. - - This is a breaking change since writers have to make sure that - this value is not-null. - - Args: - path: The path to the field - - Returns: - The UpdateSchema with the requirement change staged. - """ - self._set_column_requirement(path, required=True) - return self - def make_column_optional(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchema: """Make a column optional. @@ -1136,7 +1124,7 @@ def make_column_optional(self, path: Union[str, Tuple[str, ...]]) -> UpdateSchem Returns: The UpdateSchema with the requirement change staged. 
""" - self._set_column_requirement(path, False) + self._set_column_requirement(path, required=False) return self def set_identifier_fields(self, *fields: str) -> None: @@ -1196,12 +1184,18 @@ def update_column( path = (path,) if isinstance(path, str) else path full_name = ".".join(path) + if field_type is None and required is None and doc is None: + return self + field = self._schema.find_field(full_name, self._case_sensitive) if field.field_id in self._deletes: raise ValueError(f"Cannot update a column that will be deleted: {full_name}") if field_type is not None: + if not field.field_type.is_primitive: + raise ValidationError(f"Cannot change column type: {field.field_type} is not a primitive") + if not self._allow_incompatible_changes and field.field_type != field_type: try: promote(field.field_type, field_type) @@ -1230,47 +1224,6 @@ def update_column( return self - def update_column_doc(self, path: Union[str, Tuple[str, ...]], doc: str) -> UpdateSchema: - """Update the documentation of column. - - Args: - path: The path to the field. - doc: The new documentation of the column - - Returns: - The UpdateSchema with the doc update staged. 
- """ - path = (path,) if isinstance(path, str) else path - full_name = ".".join(path) - - field = self._schema.find_field(full_name, self._case_sensitive) - - if field.field_id in self._deletes: - raise ValueError(f"Cannot update a column that will be deleted: {full_name}") - - if field.doc == doc: - # Noop - return self - - if updated := self._updates.get(field.field_id): - self._updates[field.field_id] = NestedField( - field_id=updated.field_id, - name=updated.name, - field_type=updated.field_type, - doc=doc, - required=updated.required, - ) - else: - self._updates[field.field_id] = NestedField( - field_id=field.field_id, - name=field.name, - field_type=field.field_type, - doc=doc, - required=field.required, - ) - - return self - def _find_for_move(self, name: str) -> Optional[int]: try: return self._schema.find_field(name, self._case_sensitive).field_id @@ -1294,6 +1247,7 @@ def _move(self, move: Move) -> None: self._moves[parent_field.field_id] = self._moves.get(parent_field.field_id, []) + [move] else: + # In the top level field if move.op == MoveOperation.After or move.op == MoveOperation.Before: if move.other_field_id is None: raise ValueError("Expected other field when performing before/after move") @@ -1373,11 +1327,11 @@ def move_after(self, path: Union[str, Tuple[str, ...]], after_name: Union[str, T if field_id is None: raise ValueError(f"Cannot move missing column: {full_name}") - after_full_name = ".".join(after_name) if isinstance(after_name, tuple) else after_name - after_field_id = self._find_for_move(after_full_name) + after_path = ".".join(after_name) if isinstance(after_name, tuple) else after_name + after_field_id = self._find_for_move(after_path) if after_field_id is None: - raise ValueError(f"Cannot move {full_name} after missing column: {after_full_name}") + raise ValueError(f"Cannot move {full_name} after missing column: {after_path}") if field_id == after_field_id: raise ValueError(f"Cannot move {full_name} after itself") @@ -1415,12 
+1369,12 @@ def _apply(self) -> Schema: # Should never happen raise ValueError("Could not apply changes") + # Check the field-ids new_schema = Schema(*struct.fields) - field_ids = set() for name in self._identifier_field_names: try: - field = new_schema.find_field(name, self._case_sensitive) + field = new_schema.find_field(name, case_sensitive=self._case_sensitive) except ValueError as e: raise ValueError( f"Cannot find identifier field {name}. In case of deletion, update the identifier fields first." @@ -1428,7 +1382,7 @@ def _apply(self) -> Schema: field_ids.add(field.field_id) - return Schema(*struct.fields, identifier_field_ids=field_ids) + return Schema(*struct.fields, schema_id=1 + max(self._table.schemas().keys()), identifier_field_ids=field_ids) def assign_new_column_id(self) -> int: return next(self._last_column_id) @@ -1592,14 +1546,13 @@ def _move_fields(fields: Tuple[NestedField, ...], moves: List[Move]) -> Tuple[Ne def _add_and_move_fields( fields: Tuple[NestedField, ...], adds: List[NestedField], moves: List[Move] ) -> Optional[Tuple[NestedField, ...]]: - if adds: + if len(adds) > 0: # always apply adds first so that added fields can be moved added = _add_fields(fields, adds) - if moves: + if len(moves) > 0: return _move_fields(added, moves) else: return added - # add fields - elif moves: + elif len(moves) > 0: return _move_fields(fields, moves) return None if len(adds) == 0 else tuple(*fields, *adds) diff --git a/python/tests/table/test_init.py b/python/tests/table/test_init.py index a7b14a8e27ce..3ee0cd37f2d9 100644 --- a/python/tests/table/test_init.py +++ b/python/tests/table/test_init.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. 
# pylint:disable=redefined-outer-name +from typing import Dict import pytest from sortedcontainers import SortedList @@ -39,6 +40,7 @@ SetPropertiesUpdate, StaticTable, Table, + UpdateSchema, _match_deletes_to_datafile, ) from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER @@ -55,7 +57,25 @@ SortOrder, ) from pyiceberg.transforms import BucketTransform, IdentityTransform -from pyiceberg.types import LongType, NestedField +from pyiceberg.types import ( + BinaryType, + BooleanType, + DateType, + DoubleType, + FloatType, + IntegerType, + ListType, + LongType, + MapType, + NestedField, + PrimitiveType, + StringType, + StructType, + TimestampType, + TimestamptzType, + TimeType, + UUIDType, +) def test_schema(table: Table) -> None: @@ -374,3 +394,103 @@ def test_match_deletes_to_datafile_duplicate_number() -> None: def test_serialize_set_properties_updates() -> None: assert SetPropertiesUpdate(updates={"abc": "🤪"}).model_dump_json() == """{"action":"set-properties","updates":{"abc":"🤪"}}""" + + +def test_add_column(table: Table) -> None: + update = UpdateSchema(table) + update.add_column(path="b", field_type=IntegerType()) + apply_schema: Schema = update._apply() # pylint: disable=W0212 + assert len(apply_schema.fields) == 4 + + assert apply_schema == Schema( + NestedField(field_id=1, name="x", field_type=LongType(), required=True), + NestedField(field_id=2, name="y", field_type=LongType(), required=True, doc="comment"), + NestedField(field_id=3, name="z", field_type=LongType(), required=True), + NestedField(field_id=4, name="b", field_type=IntegerType(), required=False), + identifier_field_ids=[1, 2], + ) + assert apply_schema.schema_id == 2 + assert apply_schema.highest_field_id == 4 + + +def test_add_primitive_type_column(table: Table) -> None: + primitive_type: Dict[str, PrimitiveType] = { + "boolean": BooleanType(), + "int": IntegerType(), + "long": LongType(), + "float": FloatType(), + "double": DoubleType(), + "date": DateType(), + "time": TimeType(), + 
"timestamp": TimestampType(), + "timestamptz": TimestamptzType(), + "string": StringType(), + "uuid": UUIDType(), + "binary": BinaryType(), + } + + for name, type_ in primitive_type.items(): + field_name = f"new_column_{name}" + update = UpdateSchema(table) + update.add_column(path=field_name, field_type=type_, doc=f"new_column_{name}") + new_schema = update._apply() # pylint: disable=W0212 + + field: NestedField = new_schema.find_field(field_name) + assert field.field_type == type_ + assert field.doc == f"new_column_{name}" + + +def test_add_nested_type_column(table: Table) -> None: + # add struct type column + field_name = "new_column_struct" + update = UpdateSchema(table) + struct_ = StructType( + NestedField(1, "lat", DoubleType()), + NestedField(2, "long", DoubleType()), + ) + update.add_column(path=field_name, field_type=struct_) + schema_ = update._apply() # pylint: disable=W0212 + field: NestedField = schema_.find_field(field_name) + assert field.field_type == StructType( + NestedField(5, "lat", DoubleType()), + NestedField(6, "long", DoubleType()), + ) + assert schema_.highest_field_id == 6 + + +def test_add_nested_map_type_column(table: Table) -> None: + # add map type column + field_name = "new_column_map" + update = UpdateSchema(table) + map_ = MapType(1, StringType(), 2, IntegerType(), False) + update.add_column(path=field_name, field_type=map_) + new_schema = update._apply() # pylint: disable=W0212 + field: NestedField = new_schema.find_field(field_name) + assert field.field_type == MapType(5, StringType(), 6, IntegerType(), False) + assert new_schema.highest_field_id == 6 + + +def test_add_nested_list_type_column(table: Table) -> None: + # add list type column + field_name = "new_column_list" + update = UpdateSchema(table) + list_ = ListType( + element_id=101, + element_type=StructType( + NestedField(102, "lat", DoubleType()), + NestedField(103, "long", DoubleType()), + ), + element_required=False, + ) + update.add_column(path=field_name, 
field_type=list_) + new_schema = update._apply() # pylint: disable=W0212 + field: NestedField = new_schema.find_field(field_name) + assert field.field_type == ListType( + element_id=5, + element_type=StructType( + NestedField(6, "lat", DoubleType()), + NestedField(7, "long", DoubleType()), + ), + element_required=False, + ) + assert new_schema.highest_field_id == 7 diff --git a/python/tests/test_integration_schema.py b/python/tests/test_integration_schema.py index b4fda50dc0db..f0ccb1b0e858 100644 --- a/python/tests/test_integration_schema.py +++ b/python/tests/test_integration_schema.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. # pylint:disable=redefined-outer-name -from typing import Dict import pytest @@ -65,111 +64,6 @@ def simple_table(catalog: Catalog, table_schema_simple: Schema) -> Table: return _create_table_with_schema(catalog, table_schema_simple) -@pytest.mark.integration -def test_add_column(simple_table: Table) -> None: - update = UpdateSchema(simple_table) - update.add_column(path="b", field_type=IntegerType()) - apply_schema: Schema = update._apply() # pylint: disable=W0212 - assert len(apply_schema.fields) == 4 - - assert apply_schema == Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=False), - NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), - NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), - NestedField(field_id=4, name="b", field_type=IntegerType(), required=False), - identifier_field_ids=[2], - ) - assert apply_schema.schema_id == 0 - assert apply_schema.highest_field_id == 4 - - -@pytest.mark.integration -def test_add_primitive_type_column(simple_table: Table) -> None: - primitive_type: Dict[str, PrimitiveType] = { - "boolean": BooleanType(), - "int": IntegerType(), - "long": LongType(), - "float": FloatType(), - "double": DoubleType(), - "date": DateType(), - "time": TimeType(), - "timestamp": 
TimestampType(), - "timestamptz": TimestamptzType(), - "string": StringType(), - "uuid": UUIDType(), - "binary": BinaryType(), - } - - for name, type_ in primitive_type.items(): - field_name = f"new_column_{name}" - update = UpdateSchema(simple_table) - update.add_column(path=field_name, field_type=type_, doc=f"new_column_{name}") - new_schema = update._apply() # pylint: disable=W0212 - - field: NestedField = new_schema.find_field(field_name) - assert field.field_type == type_ - assert field.doc == f"new_column_{name}" - - -@pytest.mark.integration -def test_add_nested_type_column(simple_table: Table) -> None: - # add struct type column - field_name = "new_column_struct" - update = UpdateSchema(simple_table) - struct_ = StructType( - NestedField(1, "lat", DoubleType()), - NestedField(2, "long", DoubleType()), - ) - update.add_column(path=field_name, field_type=struct_) - schema_ = update._apply() # pylint: disable=W0212 - field: NestedField = schema_.find_field(field_name) - assert field.field_type == StructType( - NestedField(5, "lat", DoubleType()), - NestedField(6, "long", DoubleType()), - ) - assert schema_.highest_field_id == 6 - - -@pytest.mark.integration -def test_add_nested_map_type_column(simple_table: Table) -> None: - # add map type column - field_name = "new_column_map" - update = UpdateSchema(simple_table) - map_ = MapType(1, StringType(), 2, IntegerType(), False) - update.add_column(path=field_name, field_type=map_) - new_schema = update._apply() # pylint: disable=W0212 - field: NestedField = new_schema.find_field(field_name) - assert field.field_type == MapType(5, StringType(), 6, IntegerType(), False) - assert new_schema.highest_field_id == 6 - - -@pytest.mark.integration -def test_add_nested_list_type_column(simple_table: Table) -> None: - # add list type column - field_name = "new_column_list" - update = UpdateSchema(simple_table) - list_ = ListType( - element_id=101, - element_type=StructType( - NestedField(102, "lat", DoubleType()), - 
NestedField(103, "long", DoubleType()), - ), - element_required=False, - ) - update.add_column(path=field_name, field_type=list_) - new_schema = update._apply() # pylint: disable=W0212 - field: NestedField = new_schema.find_field(field_name) - assert field.field_type == ListType( - element_id=5, - element_type=StructType( - NestedField(6, "lat", DoubleType()), - NestedField(7, "long", DoubleType()), - ), - element_required=False, - ) - assert new_schema.highest_field_id == 7 - - def _create_table_with_schema(catalog: Catalog, schema: Schema) -> Table: tbl_name = "default.test_schema_evolution" try: @@ -199,10 +93,30 @@ def test_add_to_non_struct_type(catalog: Catalog, table_schema_simple: Schema) - update = UpdateSchema(table) with pytest.raises(ValueError) as exc_info: update.add_column(path=("foo", "lat"), field_type=IntegerType()) - assert "Cannot add column 'lat' to non-struct type" in str(exc_info.value) + assert "Cannot add column 'lat' to non-struct type: foo" in str(exc_info.value) @pytest.mark.integration +def test_schema_evolution_nested_field(catalog: Catalog) -> None: + schema = Schema( + NestedField( + field_id=1, + name="foo", + field_type=StructType(NestedField(2, name="bar", field_type=StringType(), required=False)), + required=False, + ), + ) + tbl = _create_table_with_schema(catalog, schema) + + assert tbl.schema() == schema + + with pytest.raises(ValidationError) as exc_info: + with tbl.transaction() as tx: + tx.update_schema().update_column("foo", StringType()).commit() + + assert "Cannot change column type: struct<2: bar: optional string> is not a primitive" in str(exc_info.value) + + @pytest.mark.integration def test_schema_evolution_via_transaction(catalog: Catalog) -> None: schema = Schema( @@ -220,7 +134,6 @@ def test_schema_evolution_via_transaction(catalog: Catalog) -> None: NestedField(field_id=1, name="col_uuid", field_type=UUIDType(), required=False), NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), 
NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), - schema_id=1, ) tbl.update_schema().add_column("col_integer", IntegerType()).commit() @@ -230,7 +143,6 @@ def test_schema_evolution_via_transaction(catalog: Catalog) -> None: NestedField(field_id=2, name="col_fixed", field_type=FixedType(25), required=False), NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), NestedField(field_id=4, name="col_integer", field_type=IntegerType(), required=False), - schema_id=1, ) with pytest.raises(CommitFailedException) as exc_info: @@ -252,11 +164,9 @@ def test_schema_evolution_via_transaction(catalog: Catalog) -> None: NestedField(field_id=3, name="col_string", field_type=StringType(), required=False), NestedField(field_id=4, name="col_integer", field_type=IntegerType(), required=False), NestedField(field_id=5, name="col_long", field_type=LongType(), required=False), - schema_id=1, ) -@pytest.mark.integration @pytest.mark.integration def test_schema_evolution_nested(catalog: Catalog) -> None: nested_schema = Schema( @@ -297,7 +207,6 @@ def test_schema_evolution_nested(catalog: Catalog) -> None: ), required=False, ), - schema_id=1, ) tbl = _create_table_with_schema(catalog, nested_schema) @@ -353,8 +262,6 @@ def test_schema_evolution_nested(catalog: Catalog) -> None: ), required=False, ), - schema_id=1, - identifier_field_ids=[], ) ) @@ -407,7 +314,6 @@ def test_schema_evolution_nested(catalog: Catalog) -> None: ), required=False, ), - schema_id=0, identifier_field_ids=[2], ) @@ -425,6 +331,15 @@ def test_no_changes(simple_table: Table, table_schema_simple: Schema) -> None: assert simple_table.schema() == table_schema_simple +@pytest.mark.integration +def test_no_changes_empty_commit(simple_table: Table, table_schema_simple: Schema) -> None: + with simple_table.update_schema() as update: + # No updates, so this should be a noop + update.update_column(path="foo") + + assert simple_table.schema() == 
table_schema_simple + + @pytest.mark.integration def test_delete_field(simple_table: Table) -> None: with simple_table.update_schema() as schema_update: @@ -434,7 +349,6 @@ def test_delete_field(simple_table: Table) -> None: # foo is missing 👍 NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), - schema_id=1, identifier_field_ids=[2], ) @@ -448,7 +362,6 @@ def test_delete_field_case_insensitive(simple_table: Table) -> None: # foo is missing 👍 NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), - schema_id=1, identifier_field_ids=[2], ) @@ -477,7 +390,6 @@ def test_delete_identifier_fields_nested(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, identifier_field_ids=[3], ), ) @@ -689,7 +601,7 @@ def test_allowed_updates(from_type: PrimitiveType, to_type: PrimitiveType, catal # https://github.com/apache/iceberg/issues/8389 # DecimalType(9, 2), # DecimalType(9, 3), - # DecimalType(18, 2) + DecimalType(18, 2), ] @@ -728,7 +640,6 @@ def test_rename_simple(simple_table: Table) -> None: NestedField(field_id=1, name="vo", field_type=StringType(), required=False), NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True), NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False), - schema_id=1, identifier_field_ids=[2], ) @@ -760,6 +671,33 @@ def test_rename_simple_nested(catalog: Catalog) -> None: ) +@pytest.mark.integration +def test_rename_simple_nested_with_dots(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField( + field_id=1, + name="a.b", + field_type=StructType(NestedField(field_id=2, name="c.d", field_type=StringType())), + required=True, + ), + ), + ) + + with tbl.update_schema() as schema_update: + schema_update.rename_column(("a.b", "c.d"), "e.f") + + assert 
tbl.schema() == Schema( + NestedField( + field_id=1, + name="a.b", + field_type=StructType(NestedField(field_id=2, name="e.f", field_type=StringType())), + required=True, + ), + ) + + @pytest.mark.integration def test_rename(catalog: Catalog) -> None: tbl = _create_table_with_schema( @@ -805,7 +743,6 @@ def test_rename(catalog: Catalog) -> None: required=False, ), NestedField(field_id=4, name="foo", field_type=StringType(), required=True), - schema_id=0, identifier_field_ids=[], ), ) @@ -857,7 +794,6 @@ def test_rename(catalog: Catalog) -> None: required=False, ), NestedField(field_id=4, name="bar", field_type=StringType(), required=True), - schema_id=0, identifier_field_ids=[], ) @@ -904,11 +840,10 @@ def test_rename_case_insensitive(catalog: Catalog) -> None: NestedField(field_id=12, name="name", field_type=StringType(), required=False), NestedField(field_id=13, name="leeftijd", field_type=IntegerType(), required=True), ), - required=False, + required=True, ), NestedField(field_id=4, name="foo", field_type=StringType(), required=True), - schema_id=0, - identifier_field_ids=[], + identifier_field_ids=[13], ), ) @@ -956,11 +891,10 @@ def test_rename_case_insensitive(catalog: Catalog) -> None: NestedField(field_id=12, name="name", field_type=StringType(), required=False), NestedField(field_id=13, name="age", field_type=IntegerType(), required=True), ), - required=False, + required=True, ), NestedField(field_id=4, name="bar", field_type=StringType(), required=True), - schema_id=0, - identifier_field_ids=[], + identifier_field_ids=[13], ) @@ -970,7 +904,6 @@ def test_add_struct(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType()), - schema_id=1, ), ) @@ -985,7 +918,6 @@ def test_add_struct(catalog: Catalog) -> None: assert tbl.schema() == Schema( NestedField(field_id=1, name="foo", field_type=StringType()), NestedField(field_id=2, name="location", field_type=struct, required=False), - schema_id=1, ) @@ -995,7 +927,6 
@@ def test_add_nested_map_of_structs(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType()), - schema_id=1, ), ) @@ -1040,8 +971,6 @@ def test_add_nested_map_of_structs(catalog: Catalog) -> None: ), required=False, ), - schema_id=1, - identifier_field_ids=[], ) @@ -1051,7 +980,6 @@ def test_add_nested_list_of_structs(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType()), - schema_id=1, ), ) @@ -1083,16 +1011,12 @@ def test_add_nested_list_of_structs(catalog: Catalog) -> None: ), required=False, ), - schema_id=1, - identifier_field_ids=[], ) @pytest.mark.integration def test_add_required_column(catalog: Catalog) -> None: - schema_ = Schema( - NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] - ) + schema_ = Schema(NestedField(field_id=1, name="a", field_type=BooleanType(), required=False)) table = _create_table_with_schema(catalog, schema_) update = UpdateSchema(table) with pytest.raises(ValueError) as exc_info: @@ -1107,16 +1031,12 @@ def test_add_required_column(catalog: Catalog) -> None: assert new_schema == Schema( NestedField(field_id=1, name="a", field_type=BooleanType(), required=False), NestedField(field_id=2, name="data", field_type=IntegerType(), required=True), - schema_id=0, - identifier_field_ids=[], ) @pytest.mark.integration def test_add_required_column_case_insensitive(catalog: Catalog) -> None: - schema_ = Schema( - NestedField(field_id=1, name="id", field_type=BooleanType(), required=False), schema_id=1, identifier_field_ids=[] - ) + schema_ = Schema(NestedField(field_id=1, name="id", field_type=BooleanType(), required=False)) table = _create_table_with_schema(catalog, schema_) with pytest.raises(ValueError) as exc_info: @@ -1132,8 +1052,6 @@ def test_add_required_column_case_insensitive(catalog: Catalog) -> None: assert new_schema == Schema( NestedField(field_id=1, name="id", 
field_type=BooleanType(), required=False), NestedField(field_id=2, name="ID", field_type=IntegerType(), required=True), - schema_id=0, - identifier_field_ids=[], ) @@ -1143,7 +1061,6 @@ def test_make_column_optional(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1152,8 +1069,6 @@ def test_make_column_optional(catalog: Catalog) -> None: assert tbl.schema() == Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=False), - schema_id=0, - identifier_field_ids=[], ) @@ -1212,7 +1127,6 @@ def test_mixed_changes(catalog: Catalog) -> None: field_type=MapType(key_id=18, value_id=19, key_type=StringType(), value_type=StringType()), required=False, ), - schema_id=1, ), ) @@ -1229,11 +1143,11 @@ def test_mixed_changes(catalog: Catalog) -> None: schema_update.rename_column("points.y", "y.y") schema_update.update_column("id", field_type=LongType(), doc="unique id") schema_update.update_column("locations.lat", DoubleType()) - schema_update.update_column_doc("locations.lat", "latitude") + schema_update.update_column("locations.lat", doc="latitude") schema_update.delete_column("locations.long") schema_update.delete_column("properties") schema_update.make_column_optional("points.x") - schema_update.require_column("data") + schema_update.update_column("data", required=True) schema_update.add_column(("locations", "description"), StringType(), doc="location description") assert tbl.schema() == Schema( @@ -1296,8 +1210,6 @@ def test_mixed_changes(catalog: Catalog) -> None: required=True, ), NestedField(field_id=24, name="toplevel", field_type=DecimalType(precision=9, scale=2), required=False), - schema_id=1, - identifier_field_ids=[], ) @@ -1317,7 +1229,6 @@ def test_delete_then_add(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1327,7 +1238,6 @@ def 
test_delete_then_add(catalog: Catalog) -> None: assert tbl.schema() == Schema( NestedField(field_id=2, name="foo", field_type=StringType(), required=False), - schema_id=1, ) @@ -1345,7 +1255,6 @@ def test_delete_then_add_nested(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -1363,7 +1272,6 @@ def test_delete_then_add_nested(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) @@ -1373,7 +1281,6 @@ def test_delete_missing_column(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1390,7 +1297,6 @@ def test_add_delete_conflict(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1412,7 +1318,6 @@ def test_add_delete_conflict(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -1429,7 +1334,6 @@ def test_rename_missing_column(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1446,7 +1350,6 @@ def test_rename_missing_conflicts(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1461,7 +1364,6 @@ def test_rename_missing_conflicts(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1479,7 +1381,6 @@ def test_update_missing_column(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1496,7 +1397,6 @@ def test_update_delete_conflict(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=IntegerType(), required=True), - schema_id=1, ), ) @@ -1514,7 +1414,6 @@ def test_delete_update_conflict(catalog: Catalog) -> None: catalog, Schema( 
NestedField(field_id=1, name="foo", field_type=IntegerType(), required=True), - schema_id=1, ), ) @@ -1576,14 +1475,13 @@ def test_update_added_column_doc(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) with pytest.raises(ValueError) as exc_info: with tbl.update_schema() as schema_update: schema_update.add_column("value", LongType()) - schema_update.update_column_doc("value", "a value") + schema_update.update_column("value", doc="a value") assert "Could not find field with name value, case_sensitive=True" in str(exc_info.value) @@ -1594,14 +1492,13 @@ def test_update_deleted_column_doc(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType(), required=True), - schema_id=1, ), ) with pytest.raises(ValueError) as exc_info: with tbl.update_schema() as schema_update: schema_update.delete_column("foo") - schema_update.update_column_doc("foo", "a value") + schema_update.update_column("foo", doc="a value") assert "Cannot update a column that will be deleted: foo" in str(exc_info.value) @@ -1615,7 +1512,6 @@ def test_multiple_moves(catalog: Catalog) -> None: NestedField(field_id=2, name="b", field_type=IntegerType(), required=True), NestedField(field_id=3, name="c", field_type=IntegerType(), required=True), NestedField(field_id=4, name="d", field_type=IntegerType(), required=True), - schema_id=1, ), ) @@ -1630,7 +1526,6 @@ def test_multiple_moves(catalog: Catalog) -> None: NestedField(field_id=2, name="b", field_type=IntegerType(), required=True), NestedField(field_id=4, name="d", field_type=IntegerType(), required=True), NestedField(field_id=1, name="a", field_type=IntegerType(), required=True), - schema_id=1, ) @@ -1641,7 +1536,6 @@ def test_move_top_level_column_first(catalog: Catalog) -> None: Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", 
field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1651,7 +1545,6 @@ def test_move_top_level_column_first(catalog: Catalog) -> None: assert tbl.schema() == Schema( NestedField(field_id=2, name="data", field_type=StringType(), required=True), NestedField(field_id=1, name="id", field_type=LongType(), required=True), - schema_id=1, ) @@ -1662,7 +1555,6 @@ def test_move_top_level_column_before_first(catalog: Catalog) -> None: Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1672,7 +1564,6 @@ def test_move_top_level_column_before_first(catalog: Catalog) -> None: assert tbl.schema() == Schema( NestedField(field_id=2, name="data", field_type=StringType(), required=True), NestedField(field_id=1, name="id", field_type=LongType(), required=True), - schema_id=1, ) @@ -1683,7 +1574,6 @@ def test_move_top_level_column_after_last(catalog: Catalog) -> None: Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1693,7 +1583,6 @@ def test_move_top_level_column_after_last(catalog: Catalog) -> None: assert tbl.schema() == Schema( NestedField(field_id=2, name="data", field_type=StringType(), required=True), NestedField(field_id=1, name="id", field_type=LongType(), required=True), - schema_id=1, ) @@ -1712,7 +1601,6 @@ def test_move_nested_field_first(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -1730,7 +1618,6 @@ def test_move_nested_field_first(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) @@ -1749,7 +1636,6 @@ def test_move_nested_field_before_first(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -1767,7 +1653,6 @@ def test_move_nested_field_before_first(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) @@ -1786,7 +1671,6 @@ 
def test_move_nested_field_after_first(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -1805,7 +1689,6 @@ def test_move_nested_field_after_first(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) ) @@ -1826,7 +1709,6 @@ def test_move_nested_field_after(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -1845,7 +1727,6 @@ def test_move_nested_field_after(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) @@ -1865,7 +1746,6 @@ def test_move_nested_field_before(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -1884,7 +1764,6 @@ def test_move_nested_field_before(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) @@ -1909,7 +1788,6 @@ def test_move_map_value_struct_field(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -1933,7 +1811,6 @@ def test_move_map_value_struct_field(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) @@ -1944,7 +1821,6 @@ def test_move_added_top_level_column(catalog: Catalog) -> None: Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1956,7 +1832,6 @@ def test_move_added_top_level_column(catalog: Catalog) -> None: NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=3, name="ts", field_type=TimestamptzType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, ) @@ -1967,7 +1842,6 @@ def test_move_added_top_level_column_after_added_column(catalog: Catalog) -> Non Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -1982,7 +1856,6 @@ def test_move_added_top_level_column_after_added_column(catalog: Catalog) -> Non NestedField(field_id=3, name="ts", 
field_type=TimestamptzType(), required=False), NestedField(field_id=4, name="count", field_type=LongType(), required=False), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, ) @@ -2001,7 +1874,6 @@ def test_move_added_nested_struct_field(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -2021,7 +1893,6 @@ def test_move_added_nested_struct_field(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) @@ -2040,7 +1911,6 @@ def test_move_added_nested_field_before_added_column(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ), ) @@ -2063,7 +1933,6 @@ def test_move_added_nested_field_before_added_column(catalog: Catalog) -> None: ), required=True, ), - schema_id=1, ) @@ -2073,7 +1942,6 @@ def test_move_self_reference_fails(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType()), - schema_id=1, ), ) @@ -2094,7 +1962,6 @@ def test_move_missing_column_fails(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType()), - schema_id=1, ), ) @@ -2120,7 +1987,6 @@ def test_move_before_add_fails(catalog: Catalog) -> None: catalog, Schema( NestedField(field_id=1, name="foo", field_type=StringType()), - schema_id=1, ), ) @@ -2150,7 +2016,6 @@ def test_move_missing_reference_column_fails(catalog: Catalog) -> None: Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, ), ) @@ -2178,7 +2043,6 @@ def test_move_primitive_map_key_fails(catalog: Catalog) -> None: field_type=MapType(key_id=4, value_id=5, key_type=StringType(), value_type=StringType()), required=False, ), - schema_id=1, ), ) @@ -2201,7 +2065,6 @@ def test_move_primitive_map_value_fails(catalog: Catalog) -> None: field_type=MapType(key_id=4, value_id=5, key_type=StringType(), value_type=StructType()), required=False, ), - 
schema_id=1, ), ) @@ -2227,7 +2090,6 @@ def test_move_top_level_between_structs_fails(catalog: Catalog) -> None: ), required=False, ), - schema_id=1, ), ) @@ -2260,7 +2122,6 @@ def test_move_between_structs_fails(catalog: Catalog) -> None: ), required=False, ), - schema_id=1, ), ) @@ -2275,9 +2136,7 @@ def test_move_between_structs_fails(catalog: Catalog) -> None: def test_add_existing_identifier_fields(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema() as update_schema: @@ -2290,9 +2149,7 @@ def test_add_existing_identifier_fields(catalog: Catalog) -> None: def test_add_new_identifiers_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema(allow_incompatible_changes=True) as update_schema: @@ -2306,9 +2163,7 @@ def test_add_new_identifiers_field_columns(catalog: Catalog) -> None: def test_add_new_identifiers_field_columns_out_of_order(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema(allow_incompatible_changes=True) as update_schema: @@ -2322,9 +2177,7 @@ def test_add_new_identifiers_field_columns_out_of_order(catalog: Catalog) -> Non def test_add_nested_identifier_field_columns(catalog: Catalog) -> 
None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema(allow_incompatible_changes=True) as update_schema: @@ -2342,9 +2195,7 @@ def test_add_nested_identifier_field_columns(catalog: Catalog) -> None: def test_add_nested_identifier_field_columns_single_transaction(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema(allow_incompatible_changes=True) as update_schema: @@ -2360,9 +2211,7 @@ def test_add_nested_identifier_field_columns_single_transaction(catalog: Catalog def test_add_nested_nested_identifier_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema(allow_incompatible_changes=True) as update_schema: @@ -2387,9 +2236,7 @@ def test_add_nested_nested_identifier_field_columns(catalog: Catalog) -> None: def test_add_dotted_identifier_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema(allow_incompatible_changes=True) as update_schema: 
@@ -2403,9 +2250,7 @@ def test_add_dotted_identifier_field_columns(catalog: Catalog) -> None: def test_remove_identifier_fields(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema(allow_incompatible_changes=True) as update_schema: @@ -2429,7 +2274,6 @@ def test_set_identifier_field_fails_schema(catalog: Catalog) -> None: NestedField(field_id=1, name="id", field_type=IntegerType(), required=False), NestedField(field_id=2, name="float", field_type=FloatType(), required=True), NestedField(field_id=3, name="double", field_type=DoubleType(), required=True), - schema_id=1, identifier_field_ids=[], ), ) @@ -2484,9 +2328,7 @@ def test_set_identifier_field_fails(nested_table: Table) -> None: def test_delete_identifier_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema() as schema_update: @@ -2495,9 +2337,7 @@ def test_delete_identifier_field_columns(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema() as schema_update: @@ -2509,9 +2349,7 @@ def test_delete_identifier_field_columns(catalog: Catalog) -> None: def test_delete_containing_nested_identifier_field_columns_fails(catalog: Catalog) -> None: tbl = 
_create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema(allow_incompatible_changes=True) as schema_update: @@ -2528,7 +2366,6 @@ def test_delete_containing_nested_identifier_field_columns_fails(catalog: Catalo field_type=StructType(NestedField(field_id=3, name="nested", field_type=StringType(), required=True)), required=True, ), - schema_id=1, identifier_field_ids=[3], ) @@ -2543,9 +2380,7 @@ def test_delete_containing_nested_identifier_field_columns_fails(catalog: Catalo def test_rename_identifier_fields(catalog: Catalog) -> None: tbl = _create_table_with_schema( catalog, - Schema( - NestedField(field_id=1, name="foo", field_type=StringType(), required=True), schema_id=1, identifier_field_ids=[1] - ), + Schema(NestedField(field_id=1, name="foo", field_type=StringType(), required=True), identifier_field_ids=[1]), ) with tbl.update_schema() as schema_update: @@ -2562,7 +2397,6 @@ def test_move_identifier_fields(catalog: Catalog) -> None: Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, identifier_field_ids=[1], ), ) @@ -2593,7 +2427,6 @@ def test_move_identifier_fields_case_insensitive(catalog: Catalog) -> None: Schema( NestedField(field_id=1, name="id", field_type=LongType(), required=True), NestedField(field_id=2, name="data", field_type=StringType(), required=True), - schema_id=1, identifier_field_ids=[1], ), ) @@ -2615,3 +2448,24 @@ def test_move_identifier_fields_case_insensitive(catalog: Catalog) -> None: assert tbl.schema().identifier_field_ids == [1] assert tbl.schema().identifier_field_names() == {"id"} + + +@pytest.mark.integration +def 
test_two_add_schemas_in_a_single_transaction(catalog: Catalog) -> None: + tbl = _create_table_with_schema( + catalog, + Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + ), + ) + + with pytest.raises(ValueError) as exc_info: + with tbl.transaction() as tr: + with tr.update_schema() as update: + update.add_column("bar", field_type=StringType()) + with tr.update_schema() as update: + update.add_column("baz", field_type=StringType()) + + assert "Updates in a single commit need to be unique, duplicate: " in str( + exc_info.value + )