6 changes: 5 additions & 1 deletion dlt/common/libs/deltalake.py
@@ -118,6 +118,7 @@ def merge_delta_table(
     table: DeltaTable,
     data: Union[pa.Table, pa.RecordBatchReader],
     schema: TTableSchema,
+    load_table_name: str,
 ) -> None:
     """Merges in-memory Arrow data into on-disk Delta table."""

@@ -149,7 +150,10 @@ def merge_delta_table(

         qry.execute()
     else:
-        ValueError(f'Merge strategy "{strategy}" not supported.')
+        raise ValueError(
+            f'Merge strategy "{strategy}" is not supported for Delta tables. '
+            f'Table: "{load_table_name}".'
+        )


 def get_delta_tables(
55 changes: 54 additions & 1 deletion dlt/common/libs/pyiceberg.py
@@ -10,7 +10,8 @@
 from dlt.common.libs.pyarrow import cast_arrow_schema_types
 from dlt.common.libs.utils import load_open_tables
 from dlt.common.pipeline import SupportsPipeline
-from dlt.common.schema.typing import TWriteDisposition
+from dlt.common.schema.typing import TWriteDisposition, TTableSchema
+from dlt.common.schema.utils import get_first_column_name_with_prop, get_columns_names_with_prop
 from dlt.common.utils import assert_min_pkg_version
 from dlt.common.exceptions import MissingDependencyException
 from dlt.common.storages.configuration import FileSystemCredentials, FilesystemConfiguration
@@ -25,6 +26,7 @@
     from pyiceberg.catalog import Catalog as IcebergCatalog
     from pyiceberg.exceptions import NoSuchTableError
     import pyarrow as pa
+    import pyiceberg.io.pyarrow as _pio
 except ModuleNotFoundError:
     raise MissingDependencyException(
         "dlt pyiceberg helpers",
@@ -33,6 +35,20 @@
     )


+# TODO: remove with pyiceberg's release after 0.9.1
+_orig_get_kwargs = _pio._get_parquet_writer_kwargs
+
+
+def _patched_get_parquet_writer_kwargs(table_properties):  # type: ignore[no-untyped-def]
+    """Return the original kwargs **plus** store_decimal_as_integer=True."""
+    kwargs = _orig_get_kwargs(table_properties)
+    kwargs.setdefault("store_decimal_as_integer", True)
+    return kwargs
+
+
+_pio._get_parquet_writer_kwargs = _patched_get_parquet_writer_kwargs
+
+
 def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
     ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = {
         pa.types.is_time32: pa.time64("us"),
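A minimal sketch of how this temporary patch could be gated on the installed pyiceberg version, so it switches itself off once a newer release is picked up. The `packaging` dependency and the exact version threshold here are assumptions, not part of this PR:

```py
# Hypothetical version gate for the monkey-patch above; assumes `packaging`
# is available and that pyiceberg > 0.9.1 handles decimal storage itself.
from importlib.metadata import version

from packaging.version import Version
import pyiceberg.io.pyarrow as _pio

if Version(version("pyiceberg")) <= Version("0.9.1"):
    _orig_get_kwargs = _pio._get_parquet_writer_kwargs

    def _patched_get_parquet_writer_kwargs(table_properties):  # type: ignore[no-untyped-def]
        # Forward to the original implementation, then force decimals
        # to be stored as integers in the parquet writer.
        kwargs = _orig_get_kwargs(table_properties)
        kwargs.setdefault("store_decimal_as_integer", True)
        return kwargs

    _pio._get_parquet_writer_kwargs = _patched_get_parquet_writer_kwargs
```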
@@ -63,6 +79,43 @@ def write_iceberg_table(
     )


+def merge_iceberg_table(
+    table: IcebergTable,
+    data: pa.Table,
+    schema: TTableSchema,
+    load_table_name: str,
+) -> None:
+    """Merges in-memory Arrow data into on-disk Iceberg table."""
+    strategy = schema["x-merge-strategy"]  # type: ignore[typeddict-item]
+    if strategy == "upsert":
+        # evolve schema
+        with table.update_schema() as update:
+            update.union_by_name(ensure_iceberg_compatible_arrow_schema(data.schema))
+
+        if "parent" in schema:

> Collaborator: I'm not sure about this child table loading strategy. I know you took it from delta, but it seems to me that the first unique column will be the _dlt_id, which will always be new since it is generated in the normalize step, and thus the merge condition is never met and we could just append. But maybe let's leave it like this for now.
>
> Collaborator (reply): upsert uses a deterministic row_key that is computed from the primary_key of the root table.

+            join_cols = [get_first_column_name_with_prop(schema, "unique")]
+        else:
+            join_cols = get_columns_names_with_prop(schema, "primary_key")
+
+        # TODO: replace the batching method with transaction with pyiceberg's release after 0.9.1
+        for rb in data.to_batches(max_chunksize=1_000):
+            batch_tbl = pa.Table.from_batches([rb])
+            batch_tbl = ensure_iceberg_compatible_arrow_data(batch_tbl)
+
+            table.upsert(
+                df=batch_tbl,
+                join_cols=join_cols,
+                when_matched_update_all=True,
+                when_not_matched_insert_all=True,
+                case_sensitive=True,
+            )
+    else:
+        raise ValueError(
+            f'Merge strategy "{strategy}" is not supported for Iceberg tables. '
+            f'Table: "{load_table_name}".'
+        )
+
+
 def get_sql_catalog(
     catalog_name: str,
     uri: str,
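To make the review thread above concrete, here is a small sketch of how the join columns are chosen for root vs. child tables. The table schemas are hypothetical, reduced to the hints the two helpers read:

```py
from dlt.common.schema.utils import (
    get_columns_names_with_prop,
    get_first_column_name_with_prop,
)

# Hypothetical dlt table schemas, trimmed to the hints used by the helpers.
root_table = {
    "name": "orders",
    "columns": {
        "order_id": {"name": "order_id", "data_type": "bigint", "primary_key": True},
        "_dlt_id": {"name": "_dlt_id", "data_type": "text", "unique": True},
    },
}
child_table = {
    "name": "orders__items",
    "parent": "orders",  # presence of "parent" marks a child table
    "columns": {
        "_dlt_id": {"name": "_dlt_id", "data_type": "text", "unique": True},
    },
}

# Mirrors the branch in merge_iceberg_table: child tables join on the first
# unique column (typically _dlt_id), root tables on the primary key.
print([get_first_column_name_with_prop(child_table, "unique")])  # ['_dlt_id']
print(get_columns_names_with_prop(root_table, "primary_key"))  # ['order_id']
```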
2 changes: 1 addition & 1 deletion dlt/destinations/impl/filesystem/factory.py
@@ -30,7 +30,7 @@ def filesystem_merge_strategies_selector(
     *,
     table_schema: TTableSchema,
 ) -> Sequence[TLoaderMergeStrategy]:
-    if table_schema.get("table_format") == "delta":
+    if table_schema.get("table_format") in ["delta", "iceberg"]:
         return supported_merge_strategies
     else:
         return []
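In effect, the selector now treats both open table formats the same way. A standalone sketch of the logic, not importing dlt internals (the strategy names are illustrative):

```py
from typing import List, Optional

def merge_strategies_for(
    table_format: Optional[str],
    supported: tuple = ("delete-insert", "upsert"),
) -> List[str]:
    # Mirrors filesystem_merge_strategies_selector after this change:
    # plain-file tables still expose no merge support.
    return list(supported) if table_format in ("delta", "iceberg") else []

print(merge_strategies_for("iceberg"))  # ['delete-insert', 'upsert']
print(merge_strategies_for("delta"))  # ['delete-insert', 'upsert']
print(merge_strategies_for(None))  # []
```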
21 changes: 15 additions & 6 deletions dlt/destinations/impl/filesystem/filesystem.py
@@ -190,6 +190,7 @@ def run(self) -> None:
                 table=delta_table,
                 data=arrow_rbr,
                 schema=self._load_table,
+                load_table_name=self.load_table_name,
             )
         else:
             location = self._job_client.get_open_table_location("delta", self.load_table_name)
@@ -212,7 +213,7 @@

 class IcebergLoadFilesystemJob(TableFormatLoadFilesystemJob):
     def run(self) -> None:
-        from dlt.common.libs.pyiceberg import write_iceberg_table, create_table
+        from dlt.common.libs.pyiceberg import write_iceberg_table, merge_iceberg_table, create_table

         try:
             table = self._job_client.load_open_table(
@@ -234,11 +235,19 @@ def run(self) -> None:
                 self.run()
                 return

-        write_iceberg_table(
-            table=table,
-            data=self.arrow_dataset.to_table(),
-            write_disposition=self._load_table["write_disposition"],
-        )
+        if self._load_table["write_disposition"] == "merge" and table is not None:
+            merge_iceberg_table(
+                table=table,
+                data=self.arrow_dataset.to_table(),
+                schema=self._load_table,
+                load_table_name=self.load_table_name,
+            )
+        else:
+            write_iceberg_table(
+                table=table,
+                data=self.arrow_dataset.to_table(),
+                write_disposition=self._load_table["write_disposition"],
+            )


 class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob):
16 changes: 15 additions & 1 deletion docs/website/docs/dlt-ecosystem/destinations/iceberg.md
@@ -120,4 +120,18 @@
 The `az` [scheme](./filesystem.md#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which dlt uses under the hood, currently does not support `az`.

 ## Table format `merge` support
-The `merge` write disposition is not supported for Iceberg and falls back to `append`. If you're interested in support for the `merge` write disposition with Iceberg, check out [dlt+ Iceberg destination](../../plus/ecosystem/iceberg.md).
+The [`upsert`](../../general-usage/merge-loading.md#upsert-strategy) merge strategy is supported for `iceberg`. This strategy requires that the input data contains no duplicate rows based on the key columns, and that the target table also does not contain duplicates on those keys.
+
+:::warning
+Until _pyiceberg_ > 0.9.1 is released, upsert is executed in chunks of **1000** rows.
+:::
+
+```py
+@dlt.resource(
+    write_disposition={"disposition": "merge", "strategy": "upsert"},
+    primary_key="my_primary_key",
+    table_format="iceberg"
+)
+def my_upsert_resource():
+    ...
+```
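Beyond the snippet added to the docs, a hedged end-to-end sketch of how such a resource would be used (pipeline name, data, and key are illustrative; assumes the filesystem destination is configured with a `bucket_url` and the pyiceberg extra is installed):

```py
import dlt

@dlt.resource(
    write_disposition={"disposition": "merge", "strategy": "upsert"},
    primary_key="id",
    table_format="iceberg",
)
def users():
    yield [{"id": 1, "name": "alice"}, {"id": 2, "name": "bob"}]

pipeline = dlt.pipeline("iceberg_upsert_demo", destination="filesystem")
pipeline.run(users())  # first load: both rows inserted
pipeline.run(users())  # second load: rows match on "id" and are updated in place
```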
2 changes: 1 addition & 1 deletion docs/website/docs/general-usage/merge-loading.md
@@ -554,7 +554,7 @@ The `upsert` merge strategy is currently supported for these destinations:
 - `mssql`
 - `postgres`
 - `snowflake`
-- `filesystem` with `delta` table format (see limitations [here](../dlt-ecosystem/destinations/delta-iceberg#known-limitations))
+- `filesystem` with `delta` or `iceberg` table format (see limitations [here](../dlt-ecosystem/destinations/delta-iceberg#known-limitations))
 :::

 The `upsert` merge strategy does primary-key based *upserts*:
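The rest of that section is elided in this diff; for intuition, a minimal plain-Python stand-in for what a primary-key upsert does (not dlt code, hypothetical rows):

```py
# Target table keyed by primary key "id".
target = {1: {"id": 1, "name": "old"}}
incoming = [{"id": 1, "name": "new"}, {"id": 2, "name": "fresh"}]

for row in incoming:
    # matched key -> update in place; unmatched key -> insert
    target[row["id"]] = row

print(sorted(target.values(), key=lambda r: r["id"]))
# [{'id': 1, 'name': 'new'}, {'id': 2, 'name': 'fresh'}]
```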
68 changes: 34 additions & 34 deletions poetry.lock

Some generated files are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
@@ -29,7 +29,7 @@ packages = [
 ]

 [tool.poetry.dependencies]
-python = ">=3.9,<3.14"
+python = ">=3.9.2, <3.14, !=3.9.7"
 requests = ">=2.26.0"
 pendulum = ">=2.1.2"
 simplejson = ">=3.17.5"
@@ -104,7 +104,8 @@ db-dtypes = { version = ">=1.2.0", optional = true }
 # https://github.com/apache/airflow/issues/28723
 # pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] }
 # we will rely on manual installation of `sqlalchemy>=2.0.18` instead
-pyiceberg = { version = ">=0.9.0", optional = true }
+pyiceberg = { version = ">=0.9.1", optional = true }
+
 databricks-sdk = {version = ">=0.38.0", optional = true}
 pywin32 = {version = ">=306", optional = true, platform = "win32"}
 rich-argparse = "^1.6.0"
5 changes: 4 additions & 1 deletion tests/common/destination/test_destination_capabilities.py
@@ -42,7 +42,10 @@ def test_resolve_merge_strategy() -> None:
     )

     # unknown table formats
-    assert resolve_merge_strategy(schema.tables, iceberg_table, filesystem().capabilities()) is None
+    assert (
+        resolve_merge_strategy(schema.tables, iceberg_table, filesystem().capabilities())
+        == "upsert"
+    )
     assert resolve_merge_strategy(schema.tables, delta_table, athena().capabilities()) is None

     # not supported strategy