diff --git a/Makefile b/Makefile
index 2b33f8f4..0ee41ce6 100644
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,11 @@ MODULE := lance-spark-$(SPARK_VERSION)_$(SCALA_VERSION)
BUNDLE_MODULE := lance-spark-bundle-$(SPARK_VERSION)_$(SCALA_VERSION)
BASE_MODULE := lance-spark-base_$(SCALA_VERSION)
+# Full Spark download and Py4J versions for Docker, resolved per SPARK_VERSION from docker/versions.mk
+include docker/versions.mk
+SPARK_DOWNLOAD_VERSION := $(SPARK_DOWNLOAD_VERSION_$(SPARK_VERSION))
+PY4J_VERSION := $(PY4J_VERSION_$(SPARK_VERSION))
+
DOCKER_COMPOSE := $(shell \
if docker compose version >/dev/null 2>&1; then \
echo "docker compose"; \
@@ -95,9 +100,15 @@ clean:
.PHONY: docker-build
docker-build:
- $(MAKE) bundle SPARK_VERSION=3.5 SCALA_VERSION=2.12
- cp lance-spark-bundle-3.5_2.12/target/lance-spark-bundle-3.5_2.12-*.jar docker/
- cd docker && docker compose build --no-cache spark-lance
+ @ls $(BUNDLE_MODULE)/target/$(BUNDLE_MODULE)-*.jar >/dev/null 2>&1 || \
+ (echo "Error: Bundle jar not found. Run 'make bundle' first." && exit 1)
+ rm -f docker/lance-spark-bundle-*.jar
+ cp $(BUNDLE_MODULE)/target/$(BUNDLE_MODULE)-*.jar docker/
+ cd docker && $(DOCKER_COMPOSE) build --no-cache \
+ --build-arg SPARK_DOWNLOAD_VERSION=$(SPARK_DOWNLOAD_VERSION) \
+ --build-arg SPARK_MAJOR_VERSION=$(SPARK_VERSION) \
+ --build-arg SCALA_VERSION=$(SCALA_VERSION) \
+ spark-lance
.PHONY: docker-up
docker-up: check-docker-compose
@@ -111,6 +122,30 @@ docker-shell:
docker-down: check-docker-compose
cd docker && ${DOCKER_COMPOSE} down
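+
+# Build the minimal test image directly with docker (no compose); override versions as needed,
+# e.g. `make docker-build-minimal SPARK_VERSION=4.0 SCALA_VERSION=2.13`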
+.PHONY: docker-build-minimal
+docker-build-minimal:
+ @ls $(BUNDLE_MODULE)/target/$(BUNDLE_MODULE)-*.jar >/dev/null 2>&1 || \
+ (echo "Error: Bundle jar not found. Run 'make bundle' first." && exit 1)
+ rm -f docker/lance-spark-bundle-*.jar
+ cp $(BUNDLE_MODULE)/target/$(BUNDLE_MODULE)-*.jar docker/
+ cd docker && docker build \
+ --build-arg SPARK_DOWNLOAD_VERSION=$(SPARK_DOWNLOAD_VERSION) \
+ --build-arg SPARK_MAJOR_VERSION=$(SPARK_VERSION) \
+ --build-arg SCALA_VERSION=$(SCALA_VERSION) \
+ --build-arg PY4J_VERSION=$(PY4J_VERSION) \
+ -f Dockerfile.minimal \
+ -t spark-lance-minimal:$(SPARK_VERSION)_$(SCALA_VERSION) \
+ .
+
+.PHONY: docker-test
+docker-test:
+ @docker image inspect spark-lance-minimal:$(SPARK_VERSION)_$(SCALA_VERSION) >/dev/null 2>&1 || \
+ (echo "Error: Docker image 'spark-lance-minimal:$(SPARK_VERSION)_$(SCALA_VERSION)' not found. Run 'make docker-build-minimal' first." && exit 1)
+ docker run --rm --hostname spark-lance \
+ -e SPARK_VERSION=$(SPARK_VERSION) \
+ spark-lance-minimal:$(SPARK_VERSION)_$(SCALA_VERSION) \
+ "pytest /home/lance/tests/ -v --timeout=120"
+
# =============================================================================
# Documentation
# =============================================================================
@@ -151,6 +186,7 @@ help:
@echo " docker-up - Start docker containers"
@echo " docker-shell - Open shell in spark-lance container"
@echo " docker-down - Stop docker containers"
+ @echo " docker-build-minimal - Build minimal Docker image for integration tests"
+ @echo " docker-test - Run integration tests in spark-lance-minimal container"
@echo ""
@echo "Documentation:"
@echo " serve-docs - Serve documentation locally"
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 3da5293a..1ab7104a 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -27,14 +27,20 @@ RUN python3 -m venv /opt/venv && \
ENV PATH="/opt/venv/bin:$PATH"
+# Build arguments
+ARG SPARK_DOWNLOAD_VERSION=4.0.0
+ARG SPARK_MAJOR_VERSION=4.0
+ARG SCALA_VERSION=2.13
+
# Optional env variables
ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH
WORKDIR ${SPARK_HOME}
-ENV SPARK_VERSION=3.5.8
-ENV SPARK_MAJOR_VERSION=3.5
+ENV SPARK_VERSION=${SPARK_DOWNLOAD_VERSION}
+ENV SPARK_MAJOR_VERSION=${SPARK_MAJOR_VERSION}
+ENV SCALA_VERSION=${SCALA_VERSION}
ENV LANCE_SPARK_VERSION=0.1.3-beta.8
ENV LANCE_NS_VERSION=0.4.5
@@ -53,7 +59,7 @@ RUN curl -L https://repo1.maven.org/maven2/com/lancedb/lance-namespace-glue/${LA
-o /opt/spark/jars/lance-namespace-glue-${LANCE_NS_VERSION}.jar
# For local testing, uncomment the lines below and comment out the Maven downloads above:
-#COPY lance-spark-bundle-${SPARK_MAJOR_VERSION}_2.12-${LANCE_SPARK_VERSION}.jar /opt/spark/jars/
+#COPY lance-spark-bundle-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-${LANCE_SPARK_VERSION}.jar /opt/spark/jars/
#COPY lance-namespace-glue-${LANCE_NS_VERSION}.jar /opt/spark/jars/
# Download OpenDAL native libraries for Linux architectures
@@ -82,6 +88,9 @@ RUN mkdir -p /home/lance/warehouse /home/lance/notebooks /home/lance/spark-event
# Copy notebooks if available
COPY notebooks/ /home/lance/notebooks/
+# Copy tests
+COPY tests/ /home/lance/tests/
+
# Add a notebook command
RUN echo '#! /bin/sh' >> /bin/notebook \
&& echo 'export PYSPARK_DRIVER_PYTHON=jupyter-lab' >> /bin/notebook \
diff --git a/docker/Dockerfile.minimal b/docker/Dockerfile.minimal
new file mode 100644
index 00000000..ce61f9c7
--- /dev/null
+++ b/docker/Dockerfile.minimal
@@ -0,0 +1,67 @@
+# syntax=docker/dockerfile:1
+#
+# Minimal image for running lance-spark integration tests.
+# Layers are ordered so that package installation and the Spark download are
+# cached; only the bundle JAR, Spark config, and test files are copied on each rebuild.
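+#
+# Typically built via `make docker-build-minimal`, which supplies the build args below.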
+
+FROM ubuntu:24.04
+
+# --- Cached layers: system packages and Python test deps ---
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ curl \
+ openjdk-17-jdk-headless \
+ python3 \
+ python3-dev \
+ python3-venv \
+ python3-pip && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+RUN python3 -m venv /opt/venv && \
+ . /opt/venv/bin/activate && \
+ pip install --upgrade pip && \
+ pip install pytest pytest-timeout packaging
+
+ENV PATH="/opt/venv/bin:$PATH"
+ENV SPARK_HOME="/opt/spark"
+ENV PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}"
+ENV PATH="${SPARK_HOME}/sbin:${SPARK_HOME}/bin:${PATH}"
+
+ARG SPARK_DOWNLOAD_VERSION=4.0.0
+ARG SPARK_MAJOR_VERSION=4.0
+ARG SCALA_VERSION=2.13
+ARG PY4J_VERSION=0.10.9.9
+
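+# PySpark loads Py4J from the source zip shipped with the Spark distribution, so
+# PY4J_VERSION must match the downloaded Spark release (see docker/versions.mk)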
+ENV PYTHONPATH="${SPARK_HOME}/python/lib/py4j-${PY4J_VERSION}-src.zip:${PYTHONPATH}"
+
+RUN mkdir -p ${SPARK_HOME} \
+ && curl -fL https://archive.apache.org/dist/spark/spark-${SPARK_DOWNLOAD_VERSION}/spark-${SPARK_DOWNLOAD_VERSION}-bin-hadoop3.tgz \
+ -o spark.tgz \
+ && tar xzf spark.tgz --directory ${SPARK_HOME} --strip-components 1 \
+ && rm spark.tgz
+
+# --- Uncached layers: bundle JAR, Spark config, and tests ---
+
+# Fetching random bytes busts the Docker build cache so every layer below always reruns
+ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
+
+COPY lance-spark-bundle-${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-*.jar ${SPARK_HOME}/jars/
+
+# Spark configuration and entrypoint
+COPY spark-defaults.conf ${SPARK_HOME}/conf/
+RUN chmod u+x ${SPARK_HOME}/sbin/* && \
+ chmod u+x ${SPARK_HOME}/bin/*
+
+# Create directories for Spark events and test data
+RUN mkdir -p /home/lance/warehouse /home/lance/spark-events /home/lance/data
+
+# Copy tests
+RUN mkdir -p /home/lance/tests
+COPY tests/ /home/lance/tests/
+
+WORKDIR ${SPARK_HOME}
+COPY entrypoint.sh .
+
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["tail", "-f", "/dev/null"]
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index 152d95e2..8c0c9302 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -3,12 +3,9 @@ services:
spark-lance:
image: spark-lance:latest
container_name: spark-lance
- build:
+ build:
context: .
dockerfile: Dockerfile
- args:
- - SPARK_VERSION=3.5.6
- - LANCE_VERSION=0.0.5
networks:
lance_net:
depends_on:
@@ -16,7 +13,6 @@ services:
volumes:
- ./warehouse:/home/lance/warehouse
- ./notebooks:/home/lance/notebooks
- - ../lance-spark-bundle-3.5_2.12/target:/lance-jars
environment:
- AWS_ACCESS_KEY_ID=admin
- AWS_SECRET_ACCESS_KEY=password
@@ -66,4 +62,4 @@ services:
"
networks:
- lance_net:
\ No newline at end of file
+ lance_net:
diff --git a/docker/requirements.txt b/docker/requirements.txt
index 240e41a8..46965f77 100644
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
@@ -1,3 +1,5 @@
+pytest==8.3.5
+pytest-timeout==2.3.1
jupyterlab==3.6.7
jupyter-server==1.24.0
pandas==2.3.1
diff --git a/docker/tests/test_lance_spark.py b/docker/tests/test_lance_spark.py
new file mode 100644
index 00000000..23855748
--- /dev/null
+++ b/docker/tests/test_lance_spark.py
@@ -0,0 +1,1198 @@
+"""
+Automated integration tests for Lance-Spark.
+
+These tests run inside the Docker container against a real Spark session using a local Lance catalog.
+
+Test organization follows the Lance documentation structure:
+- DDL (Data Definition Language): Namespace, Table, Index, Optimize, Vacuum operations
+- DQL (Data Query Language): SELECT queries and data retrieval
+- DML (Data Manipulation Language): INSERT, UPDATE, DELETE, MERGE, ADD COLUMN operations
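+
+Run with `make docker-test`, which executes pytest inside the spark-lance-minimal container.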
+"""
+
+import os
+import time
+import pytest
+from packaging.version import Version
+from pyspark.sql import SparkSession
+
+SPARK_VERSION = Version(os.environ.get("SPARK_VERSION", "3.5"))
+
+
+@pytest.fixture(scope="module")
+def spark():
+ """Create a Spark session configured with Lance catalog."""
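+    # Uses the directory-based ('dir') Lance catalog rooted at /home/lance/data inside the container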
+ session = (
+ SparkSession.builder
+ .appName("LanceSparkTests")
+ .config("spark.sql.catalog.lance", "org.lance.spark.LanceNamespaceSparkCatalog")
+ .config("spark.sql.catalog.lance.impl", "dir")
+ .config("spark.sql.catalog.lance.root", "/home/lance/data")
+ .config("spark.sql.extensions", "org.lance.spark.extensions.LanceSparkSessionExtensions")
+ .getOrCreate()
+ )
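+    # Make the Lance catalog the default so that names like default.test_table resolve inside it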
+ session.sql("SET spark.sql.defaultCatalog=lance")
+ yield session
+ session.stop()
+
+
+def drop_table(spark, table_name):
+ """Drop a table, using PURGE only on Spark >= 3.5."""
+ purge = " PURGE" if SPARK_VERSION >= Version("3.5") else ""
+ spark.sql(f"DROP TABLE IF EXISTS {table_name}{purge}")
+
+
+@pytest.fixture(autouse=True)
+def cleanup_tables(spark):
+ """Clean up test tables before and after each test."""
+ drop_table(spark, "default.test_table")
+ drop_table(spark, "default.employees")
+ # TODO - reenable once `tableExists` works on Spark 4.0
+ #spark.catalog.dropTempView("source") if spark.catalog.tableExists("source") else None
+ #spark.catalog.dropTempView("tmp_view") if spark.catalog.tableExists("tmp_view") else None
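+    # dropTempView returns False rather than raising when the view does not exist, so these calls are safe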
+ spark.catalog.dropTempView("source")
+ spark.catalog.dropTempView("tmp_view")
+ yield
+ drop_table(spark, "default.test_table")
+ drop_table(spark, "default.employees")
+ # TODO - reenable once `tableExists` works on Spark 4.0
+ #spark.catalog.dropTempView("source") if spark.catalog.tableExists("source") else None
+ #spark.catalog.dropTempView("tmp_view") if spark.catalog.tableExists("tmp_view") else None
+ spark.catalog.dropTempView("source")
+ spark.catalog.dropTempView("tmp_view")
+
+
+# =============================================================================
+# DDL (Data Definition Language) Tests
+# =============================================================================
+
+class TestDDLNamespace:
+ """Test DDL namespace operations: CREATE, DROP, SHOW, DESCRIBE NAMESPACE."""
+
+ def test_show_catalogs(self, spark):
+ """Verify Lance catalog is registered."""
+ catalogs = spark.sql("SHOW CATALOGS").collect()
+ catalog_names = [row[0] for row in catalogs]
+ assert "lance" in catalog_names
+
+ def test_create_namespace(self, spark):
+ """Test CREATE NAMESPACE."""
+ spark.sql("CREATE NAMESPACE IF NOT EXISTS default")
+ namespaces = spark.sql("SHOW NAMESPACES").collect()
+ namespace_names = [row[0] for row in namespaces]
+ assert "default" in namespace_names
+
+ def test_show_namespaces(self, spark):
+ """Test SHOW NAMESPACES."""
+ spark.sql("CREATE NAMESPACE IF NOT EXISTS default")
+ namespaces = spark.sql("SHOW NAMESPACES").collect()
+ assert len(namespaces) >= 1
+ namespace_names = [row[0] for row in namespaces]
+ assert "default" in namespace_names
+
+
+class TestDDLTable:
+ """Test DDL table operations: CREATE, SHOW, DESCRIBE, DROP TABLE."""
+
+ def test_create_table(self, spark):
+ """Test CREATE TABLE."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ tables = spark.sql("SHOW TABLES IN default").collect()
+ table_names = [row.tableName for row in tables]
+ assert "test_table" in table_names
+
+ def test_show_tables(self, spark):
+ """Test SHOW TABLES."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING
+ )
+ """)
+
+ tables = spark.sql("SHOW TABLES IN default").collect()
+ assert len(tables) >= 1
+ table_names = [row.tableName for row in tables]
+ assert "test_table" in table_names
+
+ def test_describe_table(self, spark):
+ """Test DESCRIBE TABLE."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ schema = spark.sql("DESCRIBE TABLE default.test_table").collect()
+ col_names = [row.col_name for row in schema if row.col_name and not row.col_name.startswith("#")]
+ assert "id" in col_names
+ assert "name" in col_names
+ assert "value" in col_names
+
+ def test_drop_table(self, spark):
+ """Test DROP TABLE."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING
+ )
+ """)
+
+ drop_table(spark, "default.test_table")
+
+ tables = spark.sql("SHOW TABLES IN default").collect()
+ table_names = [row.tableName for row in tables]
+ assert "test_table" not in table_names
+
+
+class TestDDLIndex:
+ """Test DDL index operations: CREATE INDEX (BTree, FTS)."""
+
+ def test_create_btree_index_on_int(self, spark):
+ """Test CREATE INDEX with BTree on integer column."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ # Insert data first (index requires data)
+ data = [(i, f"Name{i}", float(i * 10)) for i in range(100)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Create BTree index on id column
+ result = spark.sql("""
+ ALTER TABLE default.test_table
+ CREATE INDEX idx_id USING btree (id)
+ """).collect()
+
+ # Verify index was created (returns fragment count and index name)
+ assert len(result) == 1
+ assert result[0][1] == "idx_id"
+
+ # Verify queries still work after indexing
+ query_result = spark.sql("""
+ SELECT * FROM default.test_table WHERE id = 50
+ """).collect()
+ assert len(query_result) == 1
+ assert query_result[0].id == 50
+
+ def test_create_btree_index_on_string(self, spark):
+ """Test CREATE INDEX with BTree on string column."""
+ spark.sql("""
+ CREATE TABLE default.employees (
+ id INT,
+ name STRING,
+ department STRING,
+ salary INT
+ )
+ """)
+
+ data = [
+ (1, "Alice", "Engineering", 75000),
+ (2, "Bob", "Marketing", 65000),
+ (3, "Charlie", "Engineering", 70000),
+ (4, "Diana", "Sales", 80000),
+ (5, "Eve", "Engineering", 60000),
+ ]
+ df = spark.createDataFrame(data, ["id", "name", "department", "salary"])
+ df.writeTo("default.employees").append()
+
+ # Create BTree index on department column
+ result = spark.sql("""
+ ALTER TABLE default.employees
+ CREATE INDEX idx_dept USING btree (department)
+ """).collect()
+
+ assert len(result) == 1
+ assert result[0][1] == "idx_dept"
+
+ # Query using the indexed column
+ query_result = spark.sql("""
+ SELECT * FROM default.employees WHERE department = 'Engineering'
+ """).collect()
+ assert len(query_result) == 3
+
+ def test_create_fts_index(self, spark):
+ """Test CREATE INDEX with full-text search (FTS)."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ title STRING,
+ content STRING
+ )
+ """)
+
+ data = [
+ (1, "Introduction to Python", "Python is a programming language"),
+ (2, "Java Basics", "Java is an object-oriented language"),
+ (3, "Python Advanced", "Advanced Python topics like decorators"),
+ (4, "Database Design", "SQL and database normalization"),
+ (5, "Web Development", "Building web applications with Python"),
+ ]
+ df = spark.createDataFrame(data, ["id", "title", "content"])
+ df.writeTo("default.test_table").append()
+
+ # Create FTS index on content column
+ result = spark.sql("""
+ ALTER TABLE default.test_table
+ CREATE INDEX idx_content_fts USING fts (content)
+ WITH ( base_tokenizer = 'simple', language = 'English' )
+ """).collect()
+
+ assert len(result) == 1
+ assert result[0][1] == "idx_content_fts"
+
+ def test_create_index_empty_table(self, spark):
+ """Test CREATE INDEX on empty table."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING
+ )
+ """)
+
+ # Creating index on empty table should return 0 fragments indexed
+ result = spark.sql("""
+ ALTER TABLE default.test_table
+ CREATE INDEX idx_id USING btree (id)
+ """).collect()
+
+ # Should return with 0 fragments indexed
+ assert result[0][0] == 0
+
+
+class TestDDLOptimize:
+ """Test DDL OPTIMIZE operations for compacting table fragments."""
+
+ def test_optimize_without_args(self, spark):
+ """Test OPTIMIZE without options."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Insert data in multiple batches to create multiple fragments
+ for batch in range(5):
+ data = [(batch * 10 + i, f"Name{batch * 10 + i}", batch * 10 + i) for i in range(10)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Run OPTIMIZE
+ result = spark.sql("OPTIMIZE default.test_table").collect()
+
+ # Verify output schema and that compaction occurred
+ assert len(result) == 1
+ row = result[0]
+ # OPTIMIZE returns: fragments_removed, fragments_added, files_removed, files_added
+ assert row.fragments_removed >= 0
+ assert row.fragments_added >= 0
+ assert row.files_removed >= 0
+ assert row.files_added >= 0
+
+ # Verify data integrity after optimization
+ count = spark.table("default.test_table").count()
+ assert count == 50 # 5 batches * 10 rows
+
+ def test_optimize_with_target_rows(self, spark):
+ """Test OPTIMIZE with target_rows_per_fragment option."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Insert data in multiple small batches
+ for batch in range(5):
+ data = [(batch * 10 + i, f"Name{batch * 10 + i}", batch * 10 + i) for i in range(10)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Optimize with target rows per fragment
+ result = spark.sql("""
+ OPTIMIZE default.test_table WITH (target_rows_per_fragment = 100)
+ """).collect()
+
+ assert len(result) == 1
+ row = result[0]
+ assert row.fragments_removed >= 0
+ assert row.fragments_added >= 0
+
+ # Verify data integrity
+ count = spark.table("default.test_table").count()
+ assert count == 50
+
+ def test_optimize_with_multiple_options(self, spark):
+ """Test OPTIMIZE with multiple options."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Insert data
+ for batch in range(3):
+ data = [(batch * 20 + i, f"Name{batch * 20 + i}", batch * 20 + i) for i in range(20)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Optimize with multiple options
+ result = spark.sql("""
+ OPTIMIZE default.test_table WITH (
+ target_rows_per_fragment = 100,
+ num_threads = 2,
+ materialize_deletions = true
+ )
+ """).collect()
+
+ assert len(result) == 1
+ row = result[0]
+ assert row.fragments_removed >= 0
+ assert row.fragments_added >= 0
+
+ # Verify data integrity
+ count = spark.table("default.test_table").count()
+ assert count == 60 # 3 batches * 20 rows
+
+ def test_optimize_after_deletes(self, spark):
+ """Test OPTIMIZE after DELETE to materialize soft deletes."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Insert data
+ data = [(i, f"Name{i}", i) for i in range(100)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Delete some rows
+ spark.sql("DELETE FROM default.test_table WHERE id < 20")
+
+ # Optimize to materialize deletions
+ result = spark.sql("""
+ OPTIMIZE default.test_table WITH (materialize_deletions = true)
+ """).collect()
+
+ assert len(result) == 1
+
+ # Verify correct row count after optimization
+ count = spark.table("default.test_table").count()
+ assert count == 80 # 100 - 20 deleted
+
+ def test_optimize_empty_table(self, spark):
+ """Test OPTIMIZE on empty table."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING
+ )
+ """)
+
+ # Optimize empty table should succeed without error
+ result = spark.sql("OPTIMIZE default.test_table").collect()
+
+ assert len(result) == 1
+ row = result[0]
+ # Empty table should have 0 fragments to compact
+ assert row.fragments_removed == 0
+ assert row.fragments_added == 0
+
+
+class TestDDLVacuum:
+ """Test DDL VACUUM operations for removing old versions."""
+
+ def test_vacuum_without_args(self, spark):
+ """Test VACUUM without options."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Insert data to create a version
+ data = [(i, f"Name{i}", i) for i in range(10)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Run VACUUM
+ result = spark.sql("VACUUM default.test_table").collect()
+
+ # Verify output schema
+ assert len(result) == 1
+ row = result[0]
+ # VACUUM returns: bytes_removed, old_versions
+ assert row.bytes_removed >= 0
+ assert row.old_versions >= 0
+
+ def test_vacuum_with_before_version(self, spark):
+ """Test VACUUM with before_version option."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Create multiple versions by inserting data multiple times
+ for version in range(5):
+ data = [(version * 10 + i, f"Name{version * 10 + i}", version * 10 + i) for i in range(10)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Vacuum with before_version to remove old versions
+ result = spark.sql("""
+ VACUUM default.test_table WITH (before_version = 1000000)
+ """).collect()
+
+ assert len(result) == 1
+ row = result[0]
+ # With multiple versions, we expect some cleanup
+ assert row.bytes_removed >= 0
+ assert row.old_versions >= 0
+
+ # Verify data is still accessible
+ count = spark.table("default.test_table").count()
+ assert count == 50 # 5 versions * 10 rows
+
+ def test_vacuum_with_timestamp(self, spark):
+ """Test VACUUM with before_timestamp_millis option."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Insert initial data
+ data = [(i, f"Name{i}", i) for i in range(10)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Small delay to separate versions
+ time.sleep(0.1)
+ before_ts = int(time.time() * 1000)
+
+ # Insert more data to create newer versions
+ for version in range(3):
+ data = [(100 + version * 10 + i, f"NewName{version * 10 + i}", version * 10 + i) for i in range(10)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Vacuum versions older than the timestamp
+ result = spark.sql(f"""
+ VACUUM default.test_table WITH (before_timestamp_millis = {before_ts})
+ """).collect()
+
+ assert len(result) == 1
+ row = result[0]
+ assert row.bytes_removed >= 0
+ assert row.old_versions >= 0
+
+ # Verify current data is accessible
+ count = spark.table("default.test_table").count()
+ assert count == 40 # 10 initial + 3 versions * 10 rows
+
+ def test_vacuum_after_optimize(self, spark):
+ """Test VACUUM after OPTIMIZE to clean up removed fragments."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Create fragmented data
+ for batch in range(5):
+ data = [(batch * 10 + i, f"Name{batch * 10 + i}", batch * 10 + i) for i in range(10)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ # Optimize to compact fragments
+ spark.sql("""
+ OPTIMIZE default.test_table WITH (target_rows_per_fragment = 100)
+ """)
+
+ # Vacuum to remove old fragment files
+ result = spark.sql("""
+ VACUUM default.test_table WITH (before_version = 1000000)
+ """).collect()
+
+ assert len(result) == 1
+ row = result[0]
+ # After optimize, vacuum should clean up the old fragments
+ assert row.bytes_removed >= 0
+ assert row.old_versions >= 0
+
+ # Verify data integrity
+ count = spark.table("default.test_table").count()
+ assert count == 50
+
+ def test_vacuum_preserves_current_data(self, spark):
+ """Test VACUUM preserves all current data."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ # Insert and update data multiple times
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 100),
+ (2, 'Bob', 200),
+ (3, 'Charlie', 300)
+ """)
+
+ # Update to create new versions
+ spark.sql("UPDATE default.test_table SET value = 150 WHERE id = 1")
+ spark.sql("UPDATE default.test_table SET value = 250 WHERE id = 2")
+
+ # Delete and re-insert
+ spark.sql("DELETE FROM default.test_table WHERE id = 3")
+ spark.sql("INSERT INTO default.test_table VALUES (3, 'Charlie', 350)")
+
+ # Vacuum
+ result = spark.sql("""
+ VACUUM default.test_table WITH (before_version = 1000000)
+ """).collect()
+
+ assert len(result) == 1
+
+ # Verify all current data is preserved
+ rows = spark.table("default.test_table").orderBy("id").collect()
+ assert len(rows) == 3
+ assert rows[0].id == 1 and rows[0].value == 150
+ assert rows[1].id == 2 and rows[1].value == 250
+ assert rows[2].id == 3 and rows[2].value == 350
+
+
+# =============================================================================
+# DQL (Data Query Language) Tests
+# =============================================================================
+
+class TestDQLSelect:
+ """Test DQL SELECT operations."""
+
+ def test_select_all(self, spark):
+ """Test SELECT * query."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ data = [(1, "Alice", 10.5), (2, "Bob", 20.3), (3, "Charlie", 30.1)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ result = spark.sql("SELECT * FROM default.test_table").collect()
+ assert len(result) == 3
+
+ ids = sorted([row.id for row in result])
+ assert ids == [1, 2, 3]
+
+ def test_select_with_where(self, spark):
+ """Test SELECT with WHERE clause."""
+ spark.sql("""
+ CREATE TABLE default.employees (
+ id INT,
+ name STRING,
+ age INT,
+ department STRING,
+ salary INT
+ )
+ """)
+
+ data = [
+ (1, "Alice", 25, "Engineering", 75000),
+ (2, "Bob", 30, "Marketing", 65000),
+ (3, "Charlie", 35, "Sales", 70000),
+ (4, "Diana", 28, "Engineering", 80000),
+ (5, "Eve", 32, "HR", 60000),
+ ]
+ df = spark.createDataFrame(data, ["id", "name", "age", "department", "salary"])
+ df.writeTo("default.employees").append()
+
+ result = spark.sql("""
+ SELECT * FROM default.employees
+ WHERE department = 'Engineering'
+ """).collect()
+
+ assert len(result) == 2
+ names = sorted([row.name for row in result])
+ assert names == ["Alice", "Diana"]
+
+ def test_select_with_group_by(self, spark):
+ """Test SELECT with GROUP BY aggregation."""
+ spark.sql("""
+ CREATE TABLE default.employees (
+ id INT,
+ name STRING,
+ age INT,
+ department STRING,
+ salary INT
+ )
+ """)
+
+ data = [
+ (1, "Alice", 25, "Engineering", 75000),
+ (2, "Bob", 30, "Marketing", 65000),
+ (3, "Charlie", 35, "Sales", 70000),
+ (4, "Diana", 28, "Engineering", 80000),
+ (5, "Eve", 32, "HR", 60000),
+ ]
+ df = spark.createDataFrame(data, ["id", "name", "age", "department", "salary"])
+ df.writeTo("default.employees").append()
+
+ result = spark.sql("""
+ SELECT department, COUNT(*) as count, AVG(salary) as avg_salary
+ FROM default.employees
+ GROUP BY department
+ ORDER BY count DESC
+ """).collect()
+
+ eng_row = [row for row in result if row.department == "Engineering"][0]
+ assert eng_row["count"] == 2
+ assert eng_row.avg_salary == 77500.0
+
+ def test_select_with_order_by(self, spark):
+ """Test SELECT with ORDER BY clause."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ data = [(3, "Charlie", 30.1), (1, "Alice", 10.5), (2, "Bob", 20.3)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ result = spark.sql("""
+ SELECT * FROM default.test_table ORDER BY id ASC
+ """).collect()
+
+ ids = [row.id for row in result]
+ assert ids == [1, 2, 3]
+
+ def test_select_with_limit(self, spark):
+ """Test SELECT with LIMIT clause."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ data = [(i, f"Name{i}", float(i)) for i in range(10)]
+ df = spark.createDataFrame(data, ["id", "name", "value"])
+ df.writeTo("default.test_table").append()
+
+ result = spark.sql("""
+ SELECT * FROM default.test_table LIMIT 5
+ """).collect()
+
+ assert len(result) == 5
+
+ def test_select_data_types(self, spark):
+ """Test SELECT with various data types: INT, BIGINT, FLOAT, DOUBLE, STRING, BOOLEAN."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ int_col INT,
+ long_col BIGINT,
+ float_col FLOAT,
+ double_col DOUBLE,
+ string_col STRING,
+ bool_col BOOLEAN
+ )
+ """)
+
+ data = [(1, 100000000000, 1.5, 2.5, "test", True)]
+ df = spark.createDataFrame(
+ data,
+ ["int_col", "long_col", "float_col", "double_col", "string_col", "bool_col"]
+ )
+ df.writeTo("default.test_table").append()
+
+ result = spark.table("default.test_table").collect()[0]
+ assert result.int_col == 1
+ assert result.long_col == 100000000000
+ assert abs(result.float_col - 1.5) < 0.01
+ assert result.double_col == 2.5
+ assert result.string_col == "test"
+ assert result.bool_col is True
+
+
+# =============================================================================
+# DML (Data Manipulation Language) Tests
+# =============================================================================
+
+class TestDMLInsert:
+ """Test DML INSERT INTO operations."""
+
+ def test_insert_into_values(self, spark):
+ """Test INSERT INTO with VALUES clause."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ # Insert using VALUES
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 10.5),
+ (2, 'Bob', 20.3),
+ (3, 'Charlie', 30.1)
+ """)
+
+ result = spark.table("default.test_table").collect()
+ assert len(result) == 3
+
+ ids = sorted([row.id for row in result])
+ assert ids == [1, 2, 3]
+
+ def test_insert_into_select(self, spark):
+ """Test INSERT INTO with SELECT clause."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ # Insert initial data
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 10.5),
+ (2, 'Bob', 20.3)
+ """)
+
+ # Insert from select (duplicate the data with modified ids)
+ spark.sql("""
+ INSERT INTO default.test_table
+ SELECT id + 10, name, value * 2
+ FROM default.test_table
+ """)
+
+ result = spark.table("default.test_table").collect()
+ assert len(result) == 4
+
+ ids = sorted([row.id for row in result])
+ assert ids == [1, 2, 11, 12]
+
+ def test_insert_append_data(self, spark):
+ """Test INSERT by appending data with DataFrame API."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ data1 = [(1, "Alice", 10.5), (2, "Bob", 20.3)]
+ df1 = spark.createDataFrame(data1, ["id", "name", "value"])
+ df1.writeTo("default.test_table").append()
+
+ data2 = [(3, "Charlie", 30.1), (4, "Diana", 40.2)]
+ df2 = spark.createDataFrame(data2, ["id", "name", "value"])
+ df2.writeTo("default.test_table").append()
+
+ count = spark.table("default.test_table").count()
+ assert count == 4
+
+
+class TestDMLUpdate:
+ """Test DML UPDATE SET operations."""
+
+ def test_update_single_column(self, spark):
+ """Test UPDATE SET single column."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 10),
+ (2, 'Bob', 20),
+ (3, 'Charlie', 30)
+ """)
+
+ # Update value for specific id
+ spark.sql("""
+ UPDATE default.test_table SET value = 100 WHERE id = 2
+ """)
+
+ result = spark.sql("""
+ SELECT * FROM default.test_table WHERE id = 2
+ """).collect()
+
+ assert len(result) == 1
+ assert result[0].value == 100
+
+ def test_update_multiple_rows(self, spark):
+ """Test UPDATE SET affecting multiple rows."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 10),
+ (2, 'Bob', 20),
+ (3, 'Charlie', 30),
+ (4, 'Diana', 40)
+ """)
+
+ # Update all rows where value < 30
+ spark.sql("""
+ UPDATE default.test_table SET value = value + 100 WHERE value < 30
+ """)
+
+ result = spark.table("default.test_table").orderBy("id").collect()
+
+ assert result[0].value == 110 # id=1: 10 + 100
+ assert result[1].value == 120 # id=2: 20 + 100
+ assert result[2].value == 30 # id=3: unchanged
+ assert result[3].value == 40 # id=4: unchanged
+
+
+class TestDMLDelete:
+ """Test DML DELETE FROM operations."""
+
+ def test_delete_with_condition(self, spark):
+ """Test DELETE FROM with WHERE clause."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value DOUBLE
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 10.5),
+ (2, 'Bob', 20.3),
+ (3, 'Charlie', 30.1),
+ (4, 'Diana', 40.2),
+ (5, 'Eve', 50.0)
+ """)
+
+ # Delete rows where id > 3
+ spark.sql("""
+ DELETE FROM default.test_table WHERE id > 3
+ """)
+
+ result = spark.table("default.test_table").collect()
+ assert len(result) == 3
+
+ ids = sorted([row.id for row in result])
+ assert ids == [1, 2, 3]
+
+ def test_delete_with_string_condition(self, spark):
+ """Test DELETE FROM with string column condition."""
+ spark.sql("""
+ CREATE TABLE default.employees (
+ id INT,
+ name STRING,
+ department STRING,
+ salary INT
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.employees VALUES
+ (1, 'Alice', 'Engineering', 75000),
+ (2, 'Bob', 'Marketing', 65000),
+ (3, 'Charlie', 'Engineering', 70000),
+ (4, 'Diana', 'Sales', 80000)
+ """)
+
+ # Delete all Marketing employees
+ spark.sql("""
+ DELETE FROM default.employees WHERE department = 'Marketing'
+ """)
+
+ result = spark.table("default.employees").collect()
+ assert len(result) == 3
+
+ departments = [row.department for row in result]
+ assert "Marketing" not in departments
+
+
+class TestDMLMerge:
+ """Test DML MERGE INTO operations."""
+
+ def test_merge_into(self, spark):
+ """Test MERGE INTO with WHEN MATCHED and WHEN NOT MATCHED."""
+ # Create target table
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 10),
+ (2, 'Bob', 20),
+ (3, 'Charlie', 30)
+ """)
+
+ # Create source data as temp view
+ source_data = [(2, "Bob_Updated", 200), (4, "Diana", 40)]
+ source_df = spark.createDataFrame(source_data, ["id", "name", "value"])
+ source_df.createOrReplaceTempView("source")
+
+ # Merge: update matching rows, insert new rows
+ spark.sql("""
+ MERGE INTO default.test_table t
+ USING source s
+ ON t.id = s.id
+ WHEN MATCHED THEN UPDATE SET name = s.name, value = s.value
+ WHEN NOT MATCHED THEN INSERT (id, name, value) VALUES (s.id, s.name, s.value)
+ """)
+
+ result = spark.table("default.test_table").orderBy("id").collect()
+
+ assert len(result) == 4
+
+ # Check updated row
+ bob_row = [r for r in result if r.id == 2][0]
+ assert bob_row.name == "Bob_Updated"
+ assert bob_row.value == 200
+
+ # Check inserted row
+ diana_row = [r for r in result if r.id == 4][0]
+ assert diana_row.name == "Diana"
+ assert diana_row.value == 40
+
+
+class TestDMLAddColumn:
+    """Test DML ADD COLUMNS FROM operations for schema evolution with backfill."""
+
+ def test_add_column_from_view(self, spark):
+ """Test ALTER TABLE ADD COLUMNS FROM with single column."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 100),
+ (2, 'Bob', 200),
+ (3, 'Charlie', 300)
+ """)
+
+        # Create a temp view with the Lance metadata columns (_rowaddr, _fragid) plus the new column to backfill
+ spark.sql("""
+ CREATE TEMPORARY VIEW tmp_view AS
+ SELECT _rowaddr, _fragid, value * 2 as doubled_value
+ FROM default.test_table
+ """)
+
+ # Add column from the view
+ spark.sql("""
+ ALTER TABLE default.test_table ADD COLUMNS doubled_value FROM tmp_view
+ """)
+
+ # Verify the new column was added with correct values
+ result = spark.sql("""
+ SELECT id, name, value, doubled_value
+ FROM default.test_table
+ ORDER BY id
+ """).collect()
+
+ assert len(result) == 3
+ assert result[0].doubled_value == 200 # 100 * 2
+ assert result[1].doubled_value == 400 # 200 * 2
+ assert result[2].doubled_value == 600 # 300 * 2
+
+ def test_add_multiple_columns(self, spark):
+ """Test ALTER TABLE ADD COLUMNS FROM with multiple columns."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 100),
+ (2, 'Bob', 200),
+ (3, 'Charlie', 300)
+ """)
+
+ # Create temp view with multiple new columns
+ spark.sql("""
+ CREATE TEMPORARY VIEW tmp_view AS
+ SELECT _rowaddr, _fragid,
+ value * 2 as doubled,
+ value + 50 as plus_fifty,
+ CONCAT(name, '_suffix') as name_with_suffix
+ FROM default.test_table
+ """)
+
+ # Add multiple columns
+ spark.sql("""
+ ALTER TABLE default.test_table ADD COLUMNS doubled, plus_fifty, name_with_suffix FROM tmp_view
+ """)
+
+ # Verify all columns were added
+ result = spark.sql("""
+ SELECT id, doubled, plus_fifty, name_with_suffix
+ FROM default.test_table
+ ORDER BY id
+ """).collect()
+
+ assert len(result) == 3
+ assert result[0].doubled == 200
+ assert result[0].plus_fifty == 150
+ assert result[0].name_with_suffix == "Alice_suffix"
+ assert result[1].doubled == 400
+ assert result[1].plus_fifty == 250
+ assert result[1].name_with_suffix == "Bob_suffix"
+
+ def test_add_column_partial_rows(self, spark):
+ """Test ADD COLUMNS FROM with data for only some rows (others get null)."""
+ spark.sql("""
+ CREATE TABLE default.test_table (
+ id INT,
+ name STRING,
+ value INT
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.test_table VALUES
+ (1, 'Alice', 100),
+ (2, 'Bob', 200),
+ (3, 'Charlie', 300),
+ (4, 'Diana', 400),
+ (5, 'Eve', 500)
+ """)
+
+ # Create temp view with data for only some rows
+ spark.sql("""
+ CREATE TEMPORARY VIEW tmp_view AS
+ SELECT _rowaddr, _fragid, CONCAT('special_', name) as special_name
+ FROM default.test_table
+ WHERE id IN (1, 3, 5)
+ """)
+
+ # Add column - rows not in view should get null
+ spark.sql("""
+ ALTER TABLE default.test_table ADD COLUMNS special_name FROM tmp_view
+ """)
+
+ result = spark.sql("""
+ SELECT id, special_name
+ FROM default.test_table
+ ORDER BY id
+ """).collect()
+
+ assert len(result) == 5
+ assert result[0].special_name == "special_Alice"
+ assert result[1].special_name is None # id=2 not in view
+ assert result[2].special_name == "special_Charlie"
+ assert result[3].special_name is None # id=4 not in view
+ assert result[4].special_name == "special_Eve"
+
+ def test_add_column_computed_values(self, spark):
+ """Test ADD COLUMNS FROM with computed/derived values."""
+ spark.sql("""
+ CREATE TABLE default.employees (
+ id INT,
+ name STRING,
+ salary INT,
+ bonus_percent INT
+ )
+ """)
+
+ spark.sql("""
+ INSERT INTO default.employees VALUES
+ (1, 'Alice', 50000, 10),
+ (2, 'Bob', 60000, 15),
+ (3, 'Charlie', 70000, 20)
+ """)
+
+ # Compute total compensation as a new column
+ spark.sql("""
+ CREATE TEMPORARY VIEW tmp_view AS
+ SELECT _rowaddr, _fragid,
+ salary + (salary * bonus_percent / 100) as total_compensation
+ FROM default.employees
+ """)
+
+ spark.sql("""
+ ALTER TABLE default.employees ADD COLUMNS total_compensation FROM tmp_view
+ """)
+
+ result = spark.sql("""
+ SELECT id, name, salary, bonus_percent, total_compensation
+ FROM default.employees
+ ORDER BY id
+ """).collect()
+
+ assert len(result) == 3
+ assert result[0].total_compensation == 55000 # 50000 + 5000
+ assert result[1].total_compensation == 69000 # 60000 + 9000
+ assert result[2].total_compensation == 84000 # 70000 + 14000
+
+
+if __name__ == "__main__":
+ pytest.main([__file__, "-v"])
diff --git a/docker/versions.mk b/docker/versions.mk
new file mode 100644
index 00000000..385b472d
--- /dev/null
+++ b/docker/versions.mk
@@ -0,0 +1,9 @@
+# Full Spark download versions for Docker images
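+# (selected in the Makefile via computed variable names, e.g. $(SPARK_DOWNLOAD_VERSION_$(SPARK_VERSION)))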
+SPARK_DOWNLOAD_VERSION_3.4 := 3.4.4
+SPARK_DOWNLOAD_VERSION_3.5 := 3.5.8
+SPARK_DOWNLOAD_VERSION_4.0 := 4.0.2
+
+# Py4J versions bundled with each Spark release
+PY4J_VERSION_3.4 := 0.10.9.7
+PY4J_VERSION_3.5 := 0.10.9.7
+PY4J_VERSION_4.0 := 0.10.9.9
diff --git a/lance-spark-3.4_2.12/pom.xml b/lance-spark-3.4_2.12/pom.xml
index f88c87a2..ab5ea4f5 100644
--- a/lance-spark-3.4_2.12/pom.xml
+++ b/lance-spark-3.4_2.12/pom.xml
@@ -15,8 +15,7 @@
jar
- ${scala212.version}
- ${scala212.compat.version}
+ ${arrow14.version}
@@ -180,4 +179,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-3.4_2.13/pom.xml b/lance-spark-3.4_2.13/pom.xml
index 2207ebed..47ef8149 100644
--- a/lance-spark-3.4_2.13/pom.xml
+++ b/lance-spark-3.4_2.13/pom.xml
@@ -17,6 +17,7 @@
${scala213.version}
${scala213.compat.version}
+ ${arrow14.version}
@@ -184,4 +185,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-3.5_2.12/pom.xml b/lance-spark-3.5_2.12/pom.xml
index 5b1119a6..97d0d970 100644
--- a/lance-spark-3.5_2.12/pom.xml
+++ b/lance-spark-3.5_2.12/pom.xml
@@ -14,10 +14,6 @@
Apache Spark 3.5 Connector for Lance
jar
-
- ${scala212.version}
- ${scala212.compat.version}
-
@@ -181,4 +177,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-3.5_2.13/pom.xml b/lance-spark-3.5_2.13/pom.xml
index 56f05daa..c3b1d3c1 100644
--- a/lance-spark-3.5_2.13/pom.xml
+++ b/lance-spark-3.5_2.13/pom.xml
@@ -185,4 +185,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-4.0_2.13/pom.xml b/lance-spark-4.0_2.13/pom.xml
index a7af240e..17a67e65 100644
--- a/lance-spark-4.0_2.13/pom.xml
+++ b/lance-spark-4.0_2.13/pom.xml
@@ -17,8 +17,9 @@
${scala213.version}
${scala213.compat.version}
+ ${arrow18.version}
+ ${java17.release}
4.13.1
- 17
diff --git a/lance-spark-base_2.12/pom.xml b/lance-spark-base_2.12/pom.xml
index 8acbb4c4..bd0d3bf3 100644
--- a/lance-spark-base_2.12/pom.xml
+++ b/lance-spark-base_2.12/pom.xml
@@ -102,4 +102,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentColumnarBatchScanner.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentColumnarBatchScanner.java
index 34b4d610..32feddd6 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentColumnarBatchScanner.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentColumnarBatchScanner.java
@@ -79,7 +79,9 @@ public boolean loadNextBatch() throws IOException {
return false;
}
- /** @return the current batch, the caller responsible for closing the batch */
+ /**
+ * @return the current batch; the caller is responsible for closing the batch
+ */
public ColumnarBatch getCurrentBatch() {
return currentColumnarBatch;
}
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentScanner.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentScanner.java
index f5de7a4c..658770d6 100644
--- a/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentScanner.java
+++ b/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentScanner.java
@@ -132,7 +132,9 @@ public static LanceFragmentScanner create(int fragmentId, LanceInputPartition in
}
}
- /** @return the arrow reader. The caller is responsible for closing the reader */
+ /**
+ * @return the arrow reader. The caller is responsible for closing the reader
+ */
public ArrowReader getArrowReader() {
return scanner.scanBatches();
}
diff --git a/lance-spark-base_2.13/pom.xml b/lance-spark-base_2.13/pom.xml
index 2c01de10..aa19758a 100644
--- a/lance-spark-base_2.13/pom.xml
+++ b/lance-spark-base_2.13/pom.xml
@@ -146,4 +146,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-bundle-3.4_2.12/pom.xml b/lance-spark-bundle-3.4_2.12/pom.xml
index a2ff6448..aba9c2e4 100644
--- a/lance-spark-bundle-3.4_2.12/pom.xml
+++ b/lance-spark-bundle-3.4_2.12/pom.xml
@@ -15,8 +15,7 @@
jar
- ${scala212.version}
- ${scala212.compat.version}
+ ${arrow14.version}
@@ -108,4 +107,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-bundle-3.4_2.12/src/main/java/org/lance/spark/bundle/BundleInfo.java b/lance-spark-bundle-3.4_2.12/src/main/java/org/lance/spark/bundle/BundleInfo.java
index 69381c91..987faabc 100644
--- a/lance-spark-bundle-3.4_2.12/src/main/java/org/lance/spark/bundle/BundleInfo.java
+++ b/lance-spark-bundle-3.4_2.12/src/main/java/org/lance/spark/bundle/BundleInfo.java
@@ -23,7 +23,9 @@ private BundleInfo() {
// Utility class
}
- /** @return the bundle name */
+ /**
+ * @return the bundle name
+ */
public static String getBundleName() {
return "lance-spark-bundle-3.4_2.12";
}
diff --git a/lance-spark-bundle-3.4_2.13/pom.xml b/lance-spark-bundle-3.4_2.13/pom.xml
index 06864f0c..0ec0f03c 100644
--- a/lance-spark-bundle-3.4_2.13/pom.xml
+++ b/lance-spark-bundle-3.4_2.13/pom.xml
@@ -17,6 +17,7 @@
${scala213.version}
${scala213.compat.version}
+ ${arrow14.version}
@@ -108,4 +109,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-bundle-3.4_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java b/lance-spark-bundle-3.4_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
index 5227b50a..8bb430e1 100644
--- a/lance-spark-bundle-3.4_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
+++ b/lance-spark-bundle-3.4_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
@@ -23,7 +23,9 @@ private BundleInfo() {
// Utility class
}
- /** @return the bundle name */
+ /**
+ * @return the bundle name
+ */
public static String getBundleName() {
return "lance-spark-bundle-3.4_2.13";
}
diff --git a/lance-spark-bundle-3.5_2.12/pom.xml b/lance-spark-bundle-3.5_2.12/pom.xml
index c227dc0c..9decf88e 100644
--- a/lance-spark-bundle-3.5_2.12/pom.xml
+++ b/lance-spark-bundle-3.5_2.12/pom.xml
@@ -14,10 +14,6 @@
Bundled Jar of Apache Spark 3.5 Connector for Lance
jar
-
- ${scala212.version}
- ${scala212.compat.version}
-
@@ -108,4 +104,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-bundle-3.5_2.12/src/main/java/org/lance/spark/bundle/BundleInfo.java b/lance-spark-bundle-3.5_2.12/src/main/java/org/lance/spark/bundle/BundleInfo.java
index f0a2719d..c8c40763 100644
--- a/lance-spark-bundle-3.5_2.12/src/main/java/org/lance/spark/bundle/BundleInfo.java
+++ b/lance-spark-bundle-3.5_2.12/src/main/java/org/lance/spark/bundle/BundleInfo.java
@@ -23,7 +23,9 @@ private BundleInfo() {
// Utility class
}
- /** @return the bundle name */
+ /**
+ * @return the bundle name
+ */
public static String getBundleName() {
return "lance-spark-bundle-3.5_2.12";
}
diff --git a/lance-spark-bundle-3.5_2.13/pom.xml b/lance-spark-bundle-3.5_2.13/pom.xml
index 53cc92e5..196de1f6 100644
--- a/lance-spark-bundle-3.5_2.13/pom.xml
+++ b/lance-spark-bundle-3.5_2.13/pom.xml
@@ -108,4 +108,4 @@
-
\ No newline at end of file
+
diff --git a/lance-spark-bundle-3.5_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java b/lance-spark-bundle-3.5_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
index 30772b11..e98e4c07 100644
--- a/lance-spark-bundle-3.5_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
+++ b/lance-spark-bundle-3.5_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
@@ -23,7 +23,9 @@ private BundleInfo() {
// Utility class
}
- /** @return the bundle name */
+ /**
+ * @return the bundle name
+ */
public static String getBundleName() {
return "lance-spark-bundle-3.5_2.13";
}
diff --git a/lance-spark-bundle-4.0_2.13/pom.xml b/lance-spark-bundle-4.0_2.13/pom.xml
index 092c78a0..ccb5de47 100644
--- a/lance-spark-bundle-4.0_2.13/pom.xml
+++ b/lance-spark-bundle-4.0_2.13/pom.xml
@@ -17,6 +17,8 @@
${scala213.version}
${scala213.compat.version}
+ ${arrow18.version}
+ ${java17.release}
diff --git a/lance-spark-bundle-4.0_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java b/lance-spark-bundle-4.0_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
index bd3e02ae..a3c42072 100644
--- a/lance-spark-bundle-4.0_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
+++ b/lance-spark-bundle-4.0_2.13/src/main/java/org/lance/spark/bundle/BundleInfo.java
@@ -23,7 +23,9 @@ private BundleInfo() {
// Utility class
}
- /** @return the bundle name */
+ /**
+ * @return the bundle name
+ */
public static String getBundleName() {
return "lance-spark-bundle-4.0_2.13";
}
diff --git a/pom.xml b/pom.xml
index 8c6d014d..7633e0ea 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,6 +54,12 @@
2.0.0
0.1.3
+ 14.0.2
+ 15.0.2
+ 18.1.0
+
+ ${arrow15.version}
+
4.9.3
3.4.4
@@ -75,8 +81,8 @@
${scala212.compat.version}
false
- 2.30.0
- 1.7
+ 2.43.0
+ 1.19.2
3.7.5
package
@@ -112,7 +118,9 @@
2.5.2
3.0.2
3.2.5
- 11
+ 11
+ 17
+ ${java11.release}
@@ -132,13 +140,48 @@
-
org.apache.spark
spark-sql_${scala.compat.version}
${spark.version}
provided
+
+
+ org.apache.arrow
+ arrow-vector
+ ${arrow.version}
+
+
+ org.apache.arrow
+ arrow-format
+ ${arrow.version}
+
+
+ org.apache.arrow
+ arrow-memory-core
+ ${arrow.version}
+
+
+ org.apache.arrow
+ arrow-memory-netty
+ ${arrow.version}
+
+
+ org.apache.arrow
+ arrow-memory-netty-buffer-patch
+ ${arrow.version}
+
+
+ org.apache.arrow
+ arrow-c-data
+ ${arrow.version}
+
+
+ org.apache.arrow
+ arrow-dataset
+ ${arrow.version}
+
@@ -417,56 +460,6 @@
-
- scala-2.12
-
- false
-
-
- ${scala212.version}
- ${scala212.compat.version}
-
-
-
- scala-2.13
-
- false
-
-
- ${scala213.version}
- ${scala213.compat.version}
-
-
-
- spark-3.4
-
- false
-
-
- ${spark34.version}
- ${spark34.compat.version}
-
-
-
- spark-3.5
-
- false
-
-
- ${spark35.version}
- ${spark35.compat.version}
-
-
-
- spark-4.0
-
- false
-
-
- ${spark40.version}
- ${spark40.compat.version}
-
-
deploy-to-ossrh