17 changes: 10 additions & 7 deletions python/Makefile
@@ -30,7 +30,7 @@ lint:
poetry run pre-commit run --all-files

test:
poetry run pytest tests/ -m "unmarked or parametrize" ${PYTEST_ARGS}
poetry run pytest tests/ -m "(unmarked or parametrize) and not integration" ${PYTEST_ARGS}

test-s3:
sh ./dev/run-minio.sh
@@ -53,15 +53,18 @@ test-adlfs:
sh ./dev/run-azurite.sh
poetry run pytest tests/ -m adlfs ${PYTEST_ARGS}

test-gcs:
sh ./dev/run-gcs-server.sh
poetry run pytest tests/ -m gcs ${PYTEST_ARGS}

test-coverage:
sh ./dev/run-minio.sh
docker-compose -f dev/docker-compose-integration.yml kill
docker-compose -f dev/docker-compose-integration.yml rm -f
docker-compose -f dev/docker-compose-integration.yml up -d
sh ./dev/run-azurite.sh
sh ./dev/run-gcs-server.sh
poetry run coverage run --source=pyiceberg/ -m pytest tests/ -m "not integration" ${PYTEST_ARGS}
docker-compose -f dev/docker-compose-integration.yml exec -T spark-iceberg ipython ./provision.py
poetry run coverage run --source=pyiceberg/ -m pytest tests/ ${PYTEST_ARGS}
poetry run coverage report -m --fail-under=90
poetry run coverage html
poetry run coverage xml

test-gcs:
sh ./dev/run-gcs-server.sh
poetry run pytest tests/ -m gcs ${PYTEST_ARGS}
227 changes: 145 additions & 82 deletions python/mkdocs/docs/api.md
@@ -42,83 +42,68 @@ Then load the `prod` catalog:
```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("prod")

catalog.list_namespaces()
catalog = load_catalog(
"docs",
**{
"uri": "http://127.0.0.1:8181",
"s3.endpoint": "http://127.0.0.1:9000",
"py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
"s3.access-key-id": "admin",
"s3.secret-access-key": "password",
}
)
```

Returns two namespaces:
Let's create a namespace:

```python
[("default",), ("nyc",)]
catalog.create_namespace("docs_example")
```

Listing the tables in the `nyc` namespace:
And then list them:

```python
catalog.list_tables("nyc")
```

Returns as list with tuples, containing a single table `taxis`:
ns = catalog.list_namespaces()

```python
[("nyc", "taxis")]
assert ns == [("docs_example",)]
```

## Load a table

### From a catalog

Loading the `taxis` table:
And then list tables in the namespace:

```python
catalog.load_table("nyc.taxis")
# Equivalent to:
catalog.load_table(("nyc", "taxis"))
# The tuple syntax can be used if the namespace or table contains a dot.
```

This returns a `Table` that represents an Iceberg table that can be queried and altered.

### Directly from a metadata file

To load a table directly from a metadata file (i.e., **without** using a catalog), you can use a `StaticTable` as follows:

```python
from pyiceberg.table import StaticTable

table = StaticTable.from_metadata(
"s3a://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json"
)
```

For the rest, this table behaves similarly as a table loaded using a catalog. Note that `StaticTable` is intended to be _read only_.

Any properties related to file IO can be passed accordingly:

```python
table = StaticTable.from_metadata(
"s3a://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json",
{PY_IO_IMPL: "pyiceberg.some.FileIO.class"},
)
catalog.list_tables("docs_example")
```

## Create a table

To create a table from a catalog:

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.schema import Schema
from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField
from pyiceberg.types import (
TimestampType,
FloatType,
DoubleType,
StringType,
NestedField,
StructType,
)

schema = Schema(
NestedField(field_id=1, name="datetime", field_type=TimestampType(), required=True),
NestedField(field_id=2, name="symbol", field_type=StringType(), required=True),
NestedField(field_id=3, name="bid", field_type=FloatType(), required=False),
NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
NestedField(
field_id=1, name="datetime", field_type=TimestampType(), required=False
field_id=5,
name="details",
field_type=StructType(
NestedField(
field_id=4, name="created_by", field_type=StringType(), required=False
),
),
required=False,
),
NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False),
NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False),
NestedField(field_id=4, name="symbol", field_type=StringType(), required=False),
)

from pyiceberg.partitioning import PartitionSpec, PartitionField
@@ -133,52 +118,132 @@ partition_spec = PartitionSpec(
from pyiceberg.table.sorting import SortOrder, SortField
from pyiceberg.transforms import IdentityTransform

sort_order = SortOrder(SortField(source_id=4, transform=IdentityTransform()))

catalog = load_catalog("prod")
# Sort on the symbol
sort_order = SortOrder(SortField(source_id=2, transform=IdentityTransform()))

catalog.create_table(
identifier="default.bids",
location="/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/bids/",
identifier="docs_example.bids",
schema=schema,
partition_spec=partition_spec,
sort_order=sort_order,
)
```
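
The `create_table` call registers the table in the catalog, so it should now show up when listing tables in the namespace. A minimal sketch, assuming the `docs_example.bids` table created above:

```python
# list_tables returns the table identifiers as (namespace, table name) tuples
assert catalog.list_tables("docs_example") == [("docs_example", "bids")]
```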

### Update table schema
## Load a table

Add new columns through the `Transaction` or `UpdateSchema` API:
### Catalog table

Use the Transaction API:
Loading the `bids` table:

```python
table = catalog.load_table("docs_example.bids")
# Equivalent to:
table = catalog.load_table(("docs_example", "bids"))
# The tuple syntax can be used if the namespace or table contains a dot.
```

This returns a `Table` that represents an Iceberg table that can be queried and altered.
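
For example, here are a few quick ways to inspect the loaded table — a minimal sketch, assuming the `docs_example.bids` table created above:

```python
table = catalog.load_table("docs_example.bids")

print(table.schema())      # The current schema, including field IDs
print(table.spec())        # The partition spec
print(table.sort_order())  # The sort order
print(table.location())    # The table's base location
```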

### Static table

To load a table directly from a metadata file (i.e., **without** using a catalog), you can use a `StaticTable` as follows:

```python
from pyiceberg.table import StaticTable

static_table = StaticTable.from_metadata(
"s3://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json"
)
```

The static table is considered read-only.
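
Apart from committing changes, a `StaticTable` supports the same read paths as a catalog-loaded table. A minimal sketch (assuming `pyarrow` is installed and the metadata file above is reachable):

```python
# Reading still works on a static table; only write/commit operations are unsupported
arrow_table = static_table.scan().to_arrow()
print(arrow_table.num_rows)
```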

## Schema evolution

PyIceberg supports full schema evolution through the Python API. It takes care of setting the field IDs and makes sure that only non-breaking changes are applied (this can be overridden).

In the examples below, `.update_schema()` is called on the table itself.

```python
with table.update_schema() as update:
update.add_column("some_field", IntegerType(), "doc")
```

You can also initiate a transaction if you want to make more changes than just evolving the schema:

```python
with table.transaction() as transaction:
transaction.update_schema().add_column("x", IntegerType(), "doc").commit()
with transaction.update_schema() as update_schema:
update.add_column("some_other_field", IntegerType(), "doc")
# ... Update properties etc
```

### Add column

Using `add_column` you can add a column without having to worry about the field ID:

```python
with table.update_schema() as update:
update.add_column("retries", IntegerType(), "Number of retries to place the bid")
# In a struct
update.add_column("details.confirmed_by", StringType(), "Name of the exchange")
```

### Rename column

Renaming a field in an Iceberg table is simple:

```python
with table.update_schema() as update:
update.rename("retries", "num_retries")
# This will rename `confirmed_by` to `exchange`
update.rename("properties.confirmed_by", "exchange")
```

Or, without a context manager:
### Move column

Move fields, including fields nested inside a struct:

```python
transaction = table.transaction()
transaction.update_schema().add_column("x", IntegerType(), "doc").commit()
transaction.commit_transaction()
with table.update_schema() as update:
update.move_first("symbol")
update.move_after("bid", "ask")
    # This will move `created_by` before `exchange`
update.move_before("details.created_by", "details.exchange")
```

Or, use the UpdateSchema API directly:
### Update column

Update a field's type, description, or whether it is required.

```python
with table.update_schema() as update:
update.add_column("x", IntegerType(), "doc")
# Promote a float to a double
update.update_column("bid", field_type=DoubleType())
# Make a field optional
update.update_column("symbol", required=False)
# Update the documentation
update.update_column("symbol", doc="Name of the share on the exchange")
```

Be careful: some operations are incompatible, but they can still be applied at your own risk by setting `allow_incompatible_changes`:

```python
with table.update_schema(allow_incompatible_changes=True) as update:
# Incompatible change, cannot require an optional field
update.update_column("symbol", required=True)
```

Or, without a context manager:
### Delete column

Delete a field. Be careful: this is an incompatible change (readers/writers might expect this field):

```python
table.update_schema().add_column("x", IntegerType(), "doc").commit()
with table.update_schema(allow_incompatible_changes=True) as update:
update.delete_column("some_field")
```

### Update table properties
## Table properties

Set and remove properties through the `Transaction` API:

@@ -194,7 +259,7 @@ with table.transaction() as transaction:
assert table.properties == {}
```

Or, without a context manager:
Or, without a context manager:

```python
table = table.transaction().set_properties(abc="def").commit_transaction()
@@ -235,7 +300,7 @@ The low level API `plan_files` methods returns a set of tasks that provide the f

```json
[
"s3a://warehouse/wh/nyc/taxis/data/00003-4-42464649-92dd-41ad-b83b-dea1a2fe4b58-00001.parquet"
"s3://warehouse/wh/nyc/taxis/data/00003-4-42464649-92dd-41ad-b83b-dea1a2fe4b58-00001.parquet"
]
```

@@ -343,19 +408,17 @@ Dataset(
Using [Ray Dataset API](https://docs.ray.io/en/latest/data/api/dataset.html) to interact with the dataset:

```python
print(
ray_dataset.take(2)
)
print(ray_dataset.take(2))
[
{
'VendorID': 2,
'tpep_pickup_datetime': datetime.datetime(2008, 12, 31, 23, 23, 50, tzinfo=<UTC>),
'tpep_dropoff_datetime': datetime.datetime(2009, 1, 1, 0, 34, 31, tzinfo=<UTC>)
"VendorID": 2,
"tpep_pickup_datetime": datetime.datetime(2008, 12, 31, 23, 23, 50),
"tpep_dropoff_datetime": datetime.datetime(2009, 1, 1, 0, 34, 31),
},
{
'VendorID': 2,
'tpep_pickup_datetime': datetime.datetime(2008, 12, 31, 23, 5, 3, tzinfo=<UTC>),
'tpep_dropoff_datetime': datetime.datetime(2009, 1, 1, 16, 10, 18, tzinfo=<UTC>)
}
"VendorID": 2,
"tpep_pickup_datetime": datetime.datetime(2008, 12, 31, 23, 5, 3),
"tpep_dropoff_datetime": datetime.datetime(2009, 1, 1, 16, 10, 18),
},
]
```
2 changes: 1 addition & 1 deletion python/mkdocs/docs/contributing.md
@@ -160,4 +160,4 @@ PyIceberg offers support from Python 3.8 onwards, we can't use the [type hints f

## Third party libraries

PyIceberg naturally integrates into the rich Python ecosystem, however it is important to be hesistant to add third party packages. Adding a lot of packages makes the library heavyweight, and causes incompatibilities with other projects if they use a different version of the library. Also, big libraries such as `s3fs`, `adlfs`, `pyarrow`, `thrift` should be optional to avoid downloading everything, while not being sure if is actually being used.
PyIceberg naturally integrates into the rich Python ecosystem; however, it is important to be hesitant to add third party packages. Adding a lot of packages makes the library heavyweight and causes incompatibilities with other projects if they use a different version of the library. Also, big libraries such as `s3fs`, `adlfs`, `pyarrow`, and `thrift` should be optional, to avoid downloading everything while not being sure it is actually being used.