Commit 6cdf64b

fix: Fix issue with stream upload batch size upload limit (#2290)
1 parent c4cb39d · commit 6cdf64b

File tree: 3 files changed, +86 additions, -12 deletions

bigframes/core/local_data.py

Lines changed: 2 additions & 1 deletion
@@ -124,12 +124,13 @@ def to_arrow(
         geo_format: Literal["wkb", "wkt"] = "wkt",
         duration_type: Literal["int", "duration"] = "duration",
         json_type: Literal["string"] = "string",
+        max_chunksize: Optional[int] = None,
     ) -> tuple[pa.Schema, Iterable[pa.RecordBatch]]:
         if geo_format != "wkt":
             raise NotImplementedError(f"geo format {geo_format} not yet implemented")
         assert json_type == "string"
 
-        batches = self.data.to_batches()
+        batches = self.data.to_batches(max_chunksize=max_chunksize)
         schema = self.data.schema
         if duration_type == "int":
             schema = _schema_durations_to_ints(schema)
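For reference, the new parameter is passed straight through to pyarrow's Table.to_batches, whose max_chunksize caps the number of rows per RecordBatch. A minimal standalone illustration (toy table; the exact batch split shown assumes a single-chunk table):

import pyarrow as pa

table = pa.table({"x": list(range(10))})

# With max_chunksize=None the whole table may come back as one batch;
# with max_chunksize=4 no batch exceeds 4 rows.
batches = table.to_batches(max_chunksize=4)
print([b.num_rows for b in batches])  # [4, 4, 2]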

bigframes/session/loader.py

Lines changed: 47 additions & 11 deletions
@@ -19,6 +19,7 @@
 import datetime
 import io
 import itertools
+import math
 import os
 import typing
 from typing import (
@@ -397,6 +398,15 @@ def stream_data(
         offsets_col: str,
     ) -> bq_data.BigqueryDataSource:
         """Load managed data into bigquery"""
+        MAX_BYTES = 10000000  # streaming api has 10MB limit
+        SAFETY_MARGIN = (
+            40  # Perf seems bad for large chunks, so do 40x smaller than max
+        )
+        batch_count = math.ceil(
+            data.metadata.total_bytes / (MAX_BYTES // SAFETY_MARGIN)
+        )
+        rows_per_batch = math.ceil(data.metadata.row_count / batch_count)
+
         schema_w_offsets = data.schema.append(
             schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE)
         )
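To make the new sizing arithmetic concrete, here is a small standalone sketch of the streaming-path formula; the total_bytes and row_count values are invented purely for illustration:

import math

MAX_BYTES = 10000000  # streaming api has 10MB limit
SAFETY_MARGIN = 40    # streaming path targets MAX_BYTES // 40 = 250,000 bytes per batch

# Hypothetical local table: ~100 MB spread over 1,000,000 rows.
total_bytes = 100_000_000
row_count = 1_000_000

batch_count = math.ceil(total_bytes / (MAX_BYTES // SAFETY_MARGIN))  # 400 batches
rows_per_batch = math.ceil(row_count / batch_count)                  # 2,500 rows each
print(batch_count, rows_per_batch)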
@@ -410,16 +420,24 @@ def stream_data(
         )
         rows_w_offsets = ((*row, offset) for offset, row in enumerate(rows))
 
-        for errors in self._bqclient.insert_rows(
-            load_table_destination,
-            rows_w_offsets,
-            selected_fields=bq_schema,
-            row_ids=map(str, itertools.count()),  # used to ensure only-once insertion
-        ):
-            if errors:
-                raise ValueError(
-                    f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}"
-                )
+        # TODO: don't use batched
+        batches = _batched(rows_w_offsets, rows_per_batch)
+        ids_iter = map(str, itertools.count())
+
+        for batch in batches:
+            batch_rows = list(batch)
+            row_ids = itertools.islice(ids_iter, len(batch_rows))
+
+            for errors in self._bqclient.insert_rows(
+                load_table_destination,
+                batch_rows,
+                selected_fields=bq_schema,
+                row_ids=row_ids,  # used to ensure only-once insertion
+            ):
+                if errors:
+                    raise ValueError(
+                        f"Problem loading at least one row from DataFrame: {errors}. {constants.FEEDBACK_LINK}"
+                    )
         destination_table = self._bqclient.get_table(load_table_destination)
         return bq_data.BigqueryDataSource(
             bq_data.GbqTable.from_table(destination_table),
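The ids_iter / islice pairing keeps the insert IDs globally sequential even though rows are now sent in chunks. A toy illustration of that pattern (the rows here are made up):

import itertools

rows = [("a",), ("b",), ("c",), ("d",), ("e",)]
ids_iter = map(str, itertools.count())

for start in range(0, len(rows), 2):
    batch_rows = rows[start : start + 2]
    # Each slice consumes exactly len(batch_rows) ids, so numbering never resets.
    row_ids = list(itertools.islice(ids_iter, len(batch_rows)))
    print(batch_rows, row_ids)
# [('a',), ('b',)] ['0', '1']
# [('c',), ('d',)] ['2', '3']
# [('e',)] ['4']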
@@ -434,6 +452,15 @@ def write_data(
         offsets_col: str,
     ) -> bq_data.BigqueryDataSource:
         """Load managed data into bigquery"""
+        MAX_BYTES = 10000000  # streaming api has 10MB limit
+        SAFETY_MARGIN = (
+            4  # aim for 2.5mb to account for row variance, format differences, etc.
+        )
+        batch_count = math.ceil(
+            data.metadata.total_bytes / (MAX_BYTES // SAFETY_MARGIN)
+        )
+        rows_per_batch = math.ceil(data.metadata.row_count / batch_count)
+
         schema_w_offsets = data.schema.append(
             schemata.SchemaItem(offsets_col, bigframes.dtypes.INT_DTYPE)
         )
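The write path applies the same formula with a smaller margin (roughly 2.5 MB of source data per batch), and rows_per_batch is then fed to to_arrow as max_chunksize in the hunk below. As a rough sanity check that a resulting RecordBatch stays under the 10 MB request ceiling, one could inspect its in-memory size; this is only an approximation of the serialized request payload, and the batch here is a toy stand-in:

import pyarrow as pa

MAX_BYTES = 10000000  # per-request limit assumed for the streaming/write APIs

# Toy batch standing in for one chunk produced by to_arrow(max_chunksize=...).
batch = pa.record_batch({"int_col": pa.array(range(100_000), type=pa.int64())})

# nbytes reports the Arrow buffer size, a rough proxy for the request size.
print(batch.nbytes)  # 800000 bytes for 100,000 int64 values
assert batch.nbytes < MAX_BYTES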
@@ -450,7 +477,9 @@ def write_data(
 
         def request_gen() -> Generator[bq_storage_types.AppendRowsRequest, None, None]:
             schema, batches = data.to_arrow(
-                offsets_col=offsets_col, duration_type="int"
+                offsets_col=offsets_col,
+                duration_type="int",
+                max_chunksize=rows_per_batch,
             )
             offset = 0
             for batch in batches:
@@ -1332,3 +1361,10 @@ def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype):
             f"Nested JSON types, found in column `{name}`: `{column_type}`', "
             f"are currently unsupported for upload. {constants.FEEDBACK_LINK}"
         )
+
+
+# itertools.batched not available in python <3.12, so we use this instead
+def _batched(iterator: Iterable, n: int) -> Iterable:
+    assert n > 0
+    while batch := tuple(itertools.islice(iterator, n)):
+        yield batch
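A note on this helper: it mirrors itertools.batched (Python 3.12+), but it must be fed an iterator or generator; repeatedly islice-ing a plain list would restart from the front each time and never terminate, which is why the caller passes the rows_w_offsets generator. A minimal standalone sketch of the behavior (type hints dropped for brevity):

import itertools


def _batched(iterator, n):
    assert n > 0
    # islice advances the shared iterator, so each pass yields the next n items.
    while batch := tuple(itertools.islice(iterator, n)):
        yield batch


print(list(_batched(iter(range(7)), 3)))
# [(0, 1, 2), (3, 4, 5), (6,)]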

tests/system/large/test_session.py

Lines changed: 37 additions & 0 deletions
@@ -17,13 +17,50 @@
 
 import google.cloud.bigquery as bigquery
 import google.cloud.exceptions
+import numpy as np
+import pandas as pd
 import pytest
 
 import bigframes
 import bigframes.pandas as bpd
 import bigframes.session._io.bigquery
 
 
+@pytest.fixture
+def large_pd_df():
+    nrows = 1000000
+
+    np_int1 = np.random.randint(0, 1000, size=nrows, dtype=np.int32)
+    np_int2 = np.random.randint(10000, 20000, size=nrows, dtype=np.int64)
+    np_bool = np.random.choice([True, False], size=nrows)
+    np_float1 = np.random.rand(nrows).astype(np.float32)
+    np_float2 = np.random.normal(loc=50.0, scale=10.0, size=nrows).astype(np.float64)
+
+    return pd.DataFrame(
+        {
+            "int_col_1": np_int1,
+            "int_col_2": np_int2,
+            "bool_col": np_bool,
+            "float_col_1": np_float1,
+            "float_col_2": np_float2,
+        }
+    )
+
+
+@pytest.mark.parametrize(
+    ("write_engine"),
+    [
+        ("bigquery_load"),
+        ("bigquery_streaming"),
+        ("bigquery_write"),
+    ],
+)
+def test_read_pandas_large_df(session, large_pd_df, write_engine: str):
+    df = session.read_pandas(large_pd_df, write_engine=write_engine)
+    assert len(df.peek(5)) == 5
+    assert len(large_pd_df) == 1000000
+
+
 def test_close(session: bigframes.Session):
     # we will create two tables and confirm that they are deleted
     # when the session is closed