# Fixed the limit bug and added test for count() method and documentation for count() #2423
**New documentation file** (diff `@@ -0,0 +1,96 @@`):
---
title: Count Recipe - Efficiently Count Rows in Iceberg Tables
---

# Counting Rows in an Iceberg Table

This recipe demonstrates how to use the `count()` method to efficiently count rows in an Iceberg table using PyIceberg. The count operation is optimized for performance: it reads file metadata rather than scanning the actual data.

## How Count Works

The `count()` method leverages Iceberg's metadata architecture to provide fast row counts through the following steps (sketched in code after the list):

1. **Reading file manifests**: examines metadata about data files without loading the actual data
2. **Aggregating record counts**: sums the per-file record counts recorded in manifest entries (originally taken from Parquet file footers at write time)
3. **Applying filters at the metadata level**: pushes down predicates to skip irrelevant files
4. **Handling deletes**: accounts for delete files so that deleted rows are excluded from the count
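
A minimal, illustrative sketch of that metadata-only strategy (not the actual library implementation; it assumes each planned task exposes `file.record_count`, a `residual` expression, and `delete_files`, as PyIceberg's `FileScanTask` does):

```python
from pyiceberg.expressions import AlwaysTrue

def sketch_count(scan) -> int:
    """Sum per-file record counts from scan-plan metadata (illustrative)."""
    total = 0
    for task in scan.plan_files():
        # A file can be counted wholesale only when the pushed-down filter
        # matches it entirely (residual is AlwaysTrue) and no delete files
        # apply; otherwise its rows would need to be read to stay exact.
        if task.residual == AlwaysTrue() and not task.delete_files:
            total += task.file.record_count
    return total
```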

## Basic Usage

Count all rows in a table:

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")
table = catalog.load_table("default.cities")

# Get the total row count
row_count = table.scan().count()
print(f"Total rows in table: {row_count}")
```

## Count with Filters

Count rows matching specific conditions:

```python
from pyiceberg.expressions import GreaterThan, EqualTo, And

# Count rows with population > 1,000,000
large_cities = table.scan().filter(GreaterThan("population", 1000000)).count()
```

**Suggested change** (from review):

```diff
-large_cities = table.scan().filter(GreaterThan("population", 1000000)).count()
+large_cities = table.scan().filter("population > 1000000").count()
```
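
As the suggestion above implies, `filter()` accepts either expression objects or string predicates. For combining conditions, a sketch using the `And` and `EqualTo` imports above (the `country` column is hypothetical, not part of the recipe's sample data):

```python
from pyiceberg.expressions import And, EqualTo, GreaterThan

# Equivalent string form: "population > 1000000 AND country = 'US'";
# `country` is a hypothetical column used only for illustration.
us_large_cities = table.scan().filter(
    And(GreaterThan("population", 1000000), EqualTo("country", "US"))
).count()
```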
**New test file** (diff `@@ -0,0 +1,129 @@`):
| """ | ||
| Unit tests for the DataScan.count() method in PyIceberg. | ||
|
|
||
| The count() method is essential for determining the number of rows in an Iceberg table | ||
| without having to load the actual data. It works by examining file metadata and task | ||
| plans to efficiently calculate row counts across distributed data files. | ||
|
|
||
| These tests validate the count functionality across different scenarios: | ||
| 1. Basic counting with single file tasks | ||
| 2. Empty table handling (zero records) | ||
| 3. Large-scale counting with multiple file tasks | ||
|
|
||
| The tests use mocking to simulate different table states without requiring actual | ||
| Iceberg table infrastructure, ensuring fast and isolated unit tests. | ||
| """ | ||
|
|
||
| import pytest | ||
| from unittest.mock import MagicMock, Mock, patch | ||
| from pyiceberg.table import DataScan | ||
| from pyiceberg.expressions import AlwaysTrue | ||
|
|
||
|
|
||

class DummyFile:
    """
    Mock representation of an Iceberg data file.

    In real scenarios, this would contain metadata about Parquet files,
    including record counts, file paths, and statistics.
    """

    def __init__(self, record_count):
        self.record_count = record_count
```

> **Contributor:** I think we could write real data files and use those for testing, wdyt? Here are some fixtures we could use to get a […]. Maybe we can also add some more fixtures to get FileScanTasks for empty files and large ones.
>
> **Author:** Yep, it will be a good addition actually.
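
A rough sketch of what such a fixture-backed test could look like (purely illustrative; the `SqlCatalog` setup and all names here are assumptions, not part of this PR):

```python
import pyarrow as pa
from pyiceberg.catalog.sql import SqlCatalog

def test_count_with_real_data(tmp_path):
    # Hypothetical integration-style test: write a real data file through
    # a SQLite-backed catalog, then count it with a real scan.
    catalog = SqlCatalog(
        "default",
        uri=f"sqlite:///{tmp_path}/catalog.db",
        warehouse=f"file://{tmp_path}",
    )
    catalog.create_namespace("default")
    data = pa.table({"n": pa.array([1, 2, 3], type=pa.int64())})
    table = catalog.create_table("default.t", schema=data.schema)
    table.append(data)
    assert table.scan().count() == 3
```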

```python
class DummyTask:
    """
    Mock representation of a scan task in Iceberg query planning.

    A scan task represents work to be done on a specific data file,
    including any residual filters and delete files that need to be applied.
    In actual usage, tasks are generated by the query planner based on
    partition pruning and filter pushdown optimizations.
    """

    def __init__(self, record_count, residual=None, delete_files=None):
        self.file = DummyFile(record_count)
        self.residual = residual if residual is not None else AlwaysTrue()
        self.delete_files = delete_files or []


def test_count_basic():
    """
    Test basic count functionality with a single file containing data.

    This test verifies that the count() method correctly aggregates record
    counts from a single scan task. It simulates a table with one data file
    containing 42 records and validates that the count method returns the
    correct total.

    The test demonstrates the typical use case where:
    - A table has one or more data files
    - Each file has metadata containing record counts
    - The count() method aggregates these counts efficiently
    """
    # Create a mock DataScan with the necessary attributes
    scan = Mock(spec=DataScan)

    # Mock the plan_files method to return our dummy task
    task = DummyTask(42, residual=AlwaysTrue(), delete_files=[])
    scan.plan_files = MagicMock(return_value=[task])

    # Bind the real count method to the mock and call it
    scan.count = DataScan.count.__get__(scan, DataScan)

    assert scan.count() == 42


def test_count_empty():
    """
    Test count functionality on an empty table.

    This test ensures that the count() method correctly handles empty tables
    that have no data files or scan tasks. It validates that an empty table
    returns a count of 0 without raising any errors.

    This scenario is important for:
    - Newly created tables before any data is inserted
    - Tables where all data has been deleted
    - Tables with restrictive filters that match no data
    """
    # Create a mock DataScan with the necessary attributes
    scan = Mock(spec=DataScan)

    # Mock the plan_files method to return no tasks
    scan.plan_files = MagicMock(return_value=[])

    # Bind the real count method to the mock and call it
    scan.count = DataScan.count.__get__(scan, DataScan)

    assert scan.count() == 0


def test_count_large():
    """
    Test count functionality with multiple files containing large datasets.

    This test validates that the count() method can efficiently handle tables
    with multiple data files and large record counts. It simulates a
    distributed scenario where data is split across multiple files, each
    containing 500,000 records, for a total of 1 million records.

    This test covers:
    - Aggregation across multiple scan tasks
    - Handling of large record counts (performance implications)
    - Distributed data scenarios common in big data environments
    """
    # Create a mock DataScan with the necessary attributes
    scan = Mock(spec=DataScan)

    # Mock the plan_files method to return multiple tasks
    tasks = [
        DummyTask(500000, residual=AlwaysTrue(), delete_files=[]),
        DummyTask(500000, residual=AlwaysTrue(), delete_files=[]),
    ]
    scan.plan_files = MagicMock(return_value=tasks)

    # Bind the real count method to the mock and call it
    scan.count = DataScan.count.__get__(scan, DataScan)

    assert scan.count() == 1000000
```
> **Reviewer:** It could be worth mentioning as a note that we could get the total count of a table from the snapshot properties by doing:
>
> ```python
> table.current_snapshot().summary.additional_properties["total-records"]
> ```
>
> so users can avoid doing a full table scan.
>
> **Author:** Thank you for the comments, I will work on them 😊
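
For context, a minimal sketch of that snapshot-summary shortcut (`quick_total_records` is a hypothetical helper built on the property named in the comment above, not part of this PR):

```python
from typing import Optional

def quick_total_records(table) -> Optional[int]:
    """Read the table-level record count from snapshot metadata.

    Returns None for a table with no snapshot (never written to). Unlike
    scan().count(), this cannot apply row filters, and depending on how
    delete files are tracked it may differ from an exact scan-based count.
    """
    snapshot = table.current_snapshot()
    if snapshot is None:
        return None
    return int(snapshot.summary.additional_properties["total-records"])
```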