From dd4853dea9f944d56c4613b2c6effa76eeca1bdd Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Thu, 24 Jul 2025 21:26:05 +0000
Subject: [PATCH 01/45] remove expensive len() call

---
 bigframes/display/anywidget.py | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 3d12a2032c..131a81b26e 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -46,8 +46,10 @@
 class TableWidget(WIDGET_BASE):
-    """
-    An interactive, paginated table widget for BigFrames DataFrames.
+    """An interactive, paginated table widget for BigFrames DataFrames.
+
+    This widget provides a user-friendly way to display and navigate through
+    large BigQuery DataFrames within a Jupyter environment.
     """
 
     def __init__(self, dataframe: bigframes.dataframe.DataFrame):
@@ -74,19 +76,20 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         initial_page_size = bigframes.options.display.max_rows
 
         # Initialize data fetching attributes.
-        self._batches = dataframe._to_pandas_batches(page_size=initial_page_size)
+        self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
+
+        # Access total_rows through type casting (internal use only)
+        from bigframes.core.blocks import PandasBatches
+
+        if isinstance(self._batches, PandasBatches):
+            self.row_count = self._batches.total_rows or 0
+        else:
+            # Fallback for compatibility
+            self.row_count = 0
 
         # set traitlets properties that trigger observers
         self.page_size = initial_page_size
 
-        # len(dataframe) is expensive, since it will trigger a
-        # SELECT COUNT(*) query. It is a must have however.
-        # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
-        # before we get here so that the count might already be cached.
-        # TODO(b/452747934): Allow row_count to be None and check to see if
-        # there are multiple pages and show "page 1 of many" in this case.
-        self.row_count = self._batches.total_rows or 0
-
         # get the initial page
         self._set_table_html()

From 6afa44b3d3cbf21c2335a47bb70ea356ff0339f6 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Thu, 24 Jul 2025 23:21:12 +0000
Subject: [PATCH 02/45] add testcase

---
 notebooks/dataframes/anywidget_mode.ipynb |  28 +++++-
 tests/system/small/test_anywidget.py      | 116 ++++++++++++++++++++--
 2 files changed, 137 insertions(+), 7 deletions(-)

diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb
index 328d4a05f1..7de6d43d6c 100644
--- a/notebooks/dataframes/anywidget_mode.ipynb
+++ b/notebooks/dataframes/anywidget_mode.ipynb
@@ -73,6 +73,18 @@
    "id": "f289d250",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 1ea2b594-2bd7-46de-a3c8-6aeee5884ba2 is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -127,10 +139,22 @@
    "id": "ce250157",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 67e679e9-94da-47f7-8be1-8b4a496fbfbd is DONE. 171.4 MB processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9e3e413eb0774a62818c58d217af8488",
+       "model_id": "e74c3920b93644a0b2afdaa3841cad31",
        "version_major": 2,
        "version_minor": 1
       },
@@ -182,6 +206,7 @@
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "df5e93f0d03f45cda67aa6da7f9ef1ae",
+      "model_id": "b4f7a3f86ef54e07b24ef10061088391",
       "version_major": 2,
       "version_minor": 1
      },
@@ -268,6 +293,7 @@
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a4ec5248708442fabc59c446c78a1304",
+      "model_id": "44a829aca2f24cfdba4b61afd1a259fe",
       "version_major": 2,
       "version_minor": 1
      },
diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py
index 8a91176dd9..fbb8851ef9 100644
--- a/tests/system/small/test_anywidget.py
+++ b/tests/system/small/test_anywidget.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import typing
+
 import pandas as pd
 import pytest
 
@@ -61,11 +63,12 @@ def table_widget(paginated_bf_df: bf.dataframe.DataFrame):
     Helper fixture to create a TableWidget instance with a fixed page size.
     This reduces duplication across tests that use the same widget configuration.
     """
-    from bigframes import display
+
+    from bigframes.display import TableWidget
 
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
         # Delay context manager cleanup of `max_rows` until after tests finish.
-        yield display.TableWidget(paginated_bf_df)
+        yield TableWidget(paginated_bf_df)
 
 
 @pytest.fixture(scope="module")
@@ -90,10 +93,10 @@ def small_bf_df(
 @pytest.fixture
 def small_widget(small_bf_df):
     """Helper fixture for tests using a DataFrame smaller than the page size."""
-    from bigframes import display
+    from bigframes.display import TableWidget
 
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 5):
-        yield display.TableWidget(small_bf_df)
+        yield TableWidget(small_bf_df)
 
 
 @pytest.fixture(scope="module")
@@ -135,10 +138,10 @@ def test_widget_initialization_should_calculate_total_row_count(
     paginated_bf_df: bf.dataframe.DataFrame,
 ):
     """A TableWidget should correctly calculate the total row count on creation."""
-    from bigframes import display
+    from bigframes.display import TableWidget
 
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
-        widget = display.TableWidget(paginated_bf_df)
+        widget = TableWidget(paginated_bf_df)
 
     assert widget.row_count == EXPECTED_ROW_COUNT
 
@@ -436,6 +439,107 @@ def test_widget_creation_should_load_css_for_rendering(table_widget):
     assert ".bigframes-widget .footer" in css_content
 
 
+def test_widget_row_count_should_be_immutable_after_creation(
+    paginated_bf_df: bf.dataframe.DataFrame,
+):
+    """
+    Given a widget created with a specific configuration when global display
+    options are changed later, the widget's original row_count should remain
+    unchanged.
+    """
+    from bigframes.display import TableWidget
+
+    with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
+        widget = TableWidget(paginated_bf_df)
+        initial_row_count = widget.row_count
+        assert initial_row_count == EXPECTED_ROW_COUNT
+
+        # Change a global option that could influence row count
+        bf.options.display.max_rows = 10
+
+        # The widget's row count was fixed at creation and should not change.
+        assert widget.row_count == initial_row_count
+
+
+def test_widget_should_fallback_to_zero_rows_when_total_rows_is_none(
+    paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch
+):
+    """
+    Given an internal component that fails to provide a total row count,
+    when the widget is created, the row_count should safely fall back to 0.
+    """
+    from bigframes.core.blocks import PandasBatches
+
+    # Simulate an internal failure where total_rows returns None
+    monkeypatch.setattr(PandasBatches, "total_rows", property(lambda self: None))
+
+    with bf.option_context("display.repr_mode", "anywidget"):
+        from bigframes.display import TableWidget
+
+        widget = TableWidget(paginated_bf_df)
+
+    assert widget.row_count == 0
+
+
+def test_widget_should_fallback_to_zero_rows_when_batches_are_invalid_type(
+    paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch
+):
+    """
+    Given an internal component that returns an unexpected data type,
+    when the widget is created, the row_count should safely fall back to 0.
+    """
+    # Simulate internal method returning an unexpected type (a simple iterator)
+    def mock_to_pandas_batches(self, **kwargs):
+        return iter([paginated_bf_df.to_pandas().iloc[:2]])
+
+    monkeypatch.setattr(
+        "bigframes.dataframe.DataFrame.to_pandas_batches", mock_to_pandas_batches
+    )
+
+    with bf.option_context("display.repr_mode", "anywidget"):
+        from bigframes.display import TableWidget
+
+        widget = TableWidget(paginated_bf_df)
+
+    assert widget.row_count == 0
+
+
+@pytest.mark.parametrize(
+    "max_results, expected_rows",
+    [
+        (None, EXPECTED_ROW_COUNT),
+        (3, 3),
+        (10, EXPECTED_ROW_COUNT),
+    ],
+    ids=["no_limit", "limit_is_less_than_total", "limit_is_greater_than_total"],
+)
+def test_widget_row_count_should_respect_max_results_on_creation(
+    paginated_bf_df: bf.dataframe.DataFrame,
+    max_results: typing.Optional[int],
+    expected_rows: int,
+):
+    """
+    Given a max_results value, when a TableWidget is created with custom batches,
+    its row_count should be correctly capped by that value.
+    """
+    with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
+        from bigframes.core.blocks import PandasBatches
+        from bigframes.display import TableWidget
+
+        widget = TableWidget(paginated_bf_df)
+
+        # Override batches with max_results to test the behavior
+        widget._batches = paginated_bf_df.to_pandas_batches(
+            page_size=widget.page_size, max_results=max_results
+        )
+
+        # Re-apply thelogic to update row_count
+        if isinstance(widget._batches, PandasBatches):
+            widget.row_count = widget._batches.total_rows or 0
+
+        assert widget.row_count == expected_rows
+
+
 # TODO(shuowei): Add tests for custom index and multiindex
 # This may not be necessary for the SQL Cell use case but should be
 # considered for completeness.
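A note on the pattern patches 01-02 introduce, with a minimal sketch for anyone trying it outside the widget. The table name below is a placeholder, and `total_rows` is an attribute of the internal `PandasBatches` object returned by `to_pandas_batches()` (as the diff above shows), so treat this as an illustration of the idea rather than a stable public API:

    import bigframes.pandas as bpd

    df = bpd.read_gbq("my-project.my_dataset.my_table")  # placeholder table

    # Expensive: len(df) issues a separate SELECT COUNT(*) query.
    # row_count = len(df)

    # Cheaper: the query behind to_pandas_batches() already reports how
    # many rows it produced, so the count comes from job metadata.
    batches = df.to_pandas_batches(page_size=10)
    row_count = getattr(batches, "total_rows", None) or 0  # None falls back to 0
    first_page = next(iter(batches), None)
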
From 878e350c7e821434e36b1e2260df38f5a2771ebe Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Thu, 24 Jul 2025 23:24:46 +0000
Subject: [PATCH 03/45] fix a typo

---
 tests/system/small/test_anywidget.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py
index fbb8851ef9..37ff53fa01 100644
--- a/tests/system/small/test_anywidget.py
+++ b/tests/system/small/test_anywidget.py
@@ -533,7 +533,7 @@ def test_widget_row_count_should_respect_max_results_on_creation(
             page_size=widget.page_size, max_results=max_results
         )
 
-        # Re-apply thelogic to update row_count
+        # Re-apply the logic to update row_count
         if isinstance(widget._batches, PandasBatches):
             widget.row_count = widget._batches.total_rows or 0
 

From 01781dfe224e58c66fd92382b744cdbdc013cac0 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Fri, 25 Jul 2025 22:40:53 +0000
Subject: [PATCH 04/45] change how row_count is updated

---
 bigframes/display/anywidget.py       | 13 ++---
 notebooks/dataframes/dataframe.ipynb |  4 +-
 tests/system/small/test_anywidget.py | 81 ++++++++++++++++++----------
 3 files changed, 59 insertions(+), 39 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 131a81b26e..69a31b885e 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -78,14 +78,11 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         # Initialize data fetching attributes.
         self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
 
-        # Access total_rows through type casting (internal use only)
-        from bigframes.core.blocks import PandasBatches
-
-        if isinstance(self._batches, PandasBatches):
-            self.row_count = self._batches.total_rows or 0
-        else:
-            # Fallback for compatibility
-            self.row_count = 0
+        # Get total rows efficiently by executing the query once
+        execute_result = dataframe._block.session._executor.execute(
+            dataframe._block.expr, ordered=True
+        )
+        self.row_count = execute_result.total_rows or 0
 
         # set traitlets properties that trigger observers
         self.page_size = initial_page_size
diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb
index de9bb1d04f..ae03b56c72 100644
--- a/notebooks/dataframes/dataframe.ipynb
+++ b/notebooks/dataframes/dataframe.ipynb
@@ -5366,7 +5366,7 @@
 ],
 "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "venv",
    "language": "python",
    "name": "python3"
   },
@@ -5380,7 +5380,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.1"
+   "version": "3.10.15"
   }
 },
 "nbformat": 4,
diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py
index 37ff53fa01..fa49c41f97 100644
--- a/tests/system/small/test_anywidget.py
+++ b/tests/system/small/test_anywidget.py
@@ -112,6 +112,32 @@ def empty_bf_df(
     return session.read_pandas(empty_pandas_df)
 
 
+def mock_execute_total_rows_is_none(self, schema, *args, **kwargs):
+    """Mocks an execution result where the total row count is missing."""
+    from bigframes.session.executor import ExecuteResult
+
+    return ExecuteResult(
+        iter([]),  # arrow_batches
+        schema=schema,
+        query_job=None,
+        total_bytes=None,
+        total_rows=None,  # The specific failure condition for this case
+    )
+
+
+def mock_execute_batches_are_invalid(self, schema, *args, **kwargs):
+    """Mocks an execution result where the batch data is an invalid type."""
+    from bigframes.session.executor import ExecuteResult
+
+    return ExecuteResult(
+        None,  # Invalid type for arrow_batches, which should be an iterator
+        schema=schema,
+        query_job=None,
+        total_bytes=None,
+        total_rows=100,  # A valid row count, as the error is in the batch data
+    )
+
+
 def _assert_html_matches_pandas_slice(
     table_html: str,
     expected_pd_slice: pd.DataFrame,
@@ -461,46 +487,43 @@ def test_widget_creation_should_load_css_for_rendering(table_widget):
-def test_widget_should_fallback_to_zero_rows_when_total_rows_is_none(
-    paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch
-):
-    """
-    Given an internal component that fails to provide a total row count,
-    when the widget is created, the row_count should safely fall back to 0.
-    """
-    from bigframes.core.blocks import PandasBatches
-
-    # Simulate an internal failure where total_rows returns None
-    monkeypatch.setattr(PandasBatches, "total_rows", property(lambda self: None))
-
-    with bf.option_context("display.repr_mode", "anywidget"):
-        from bigframes.display import TableWidget
-
-        widget = TableWidget(paginated_bf_df)
-
-    assert widget.row_count == 0
-
-
-def test_widget_should_fallback_to_zero_rows_when_batches_are_invalid_type(
-    paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch
-):
-    """
-    Given an internal component that returns an unexpected data type,
-    when the widget is created, the row_count should safely fall back to 0.
-    """
-    # Simulate internal method returning an unexpected type (a simple iterator)
-    def mock_to_pandas_batches(self, **kwargs):
-        return iter([paginated_bf_df.to_pandas().iloc[:2]])
-
-    monkeypatch.setattr(
-        "bigframes.dataframe.DataFrame.to_pandas_batches", mock_to_pandas_batches
-    )
-
-    with bf.option_context("display.repr_mode", "anywidget"):
-        from bigframes.display import TableWidget
-
-        widget = TableWidget(paginated_bf_df)
-
-    assert widget.row_count == 0
+@pytest.mark.parametrize(
+    "mock_function",
+    [
+        mock_execute_total_rows_is_none,
+        mock_execute_batches_are_invalid,
+    ],
+    # 'ids' provides descriptive names for each test run in the pytest report.
+    ids=[
+        "when_total_rows_is_None",
+        "when_arrow_batches_are_invalid",
+    ],
+)
+def test_widget_should_fallback_to_zero_rows_on_error(
+    paginated_bf_df: bf.dataframe.DataFrame,
+    monkeypatch: pytest.MonkeyPatch,
+    mock_function,
+):
+    """
+    Given an internal component fails to return valid execution data,
+    when the TableWidget is created, its row_count should safely fall back to 0.
+    """
+    # The 'self' argument is automatically handled when monkeypatch calls the method.
+    # We use a lambda to pass the DataFrame's schema to our mock function.
+    monkeypatch.setattr(
+        "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute",
+        lambda self, *args, **kwargs: mock_function(
+            self, paginated_bf_df._block.expr.schema, *args, **kwargs
+        ),
+    )
+
+    with bf.option_context("display.repr_mode", "anywidget"):
+        from bigframes.display import TableWidget
+
+        # The widget should handle the faulty data from the mock without crashing.
+        widget = TableWidget(paginated_bf_df)
+
+    # The key assertion: The widget safely defaults to 0 rows.
+    assert widget.row_count == 0

From 5496e74207076b9f60d6b31936b38657ef5b3c8f Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Tue, 29 Jul 2025 18:59:06 +0000
Subject: [PATCH 05/45] test case still fails, needs #1888 to be merged in

---
 bigframes/display/anywidget.py       | 32 ++++++----
 tests/system/small/test_anywidget.py | 96 +++++++++++-----------------
 2 files changed, 57 insertions(+), 71 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 69a31b885e..c487ddff6e 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -66,29 +66,35 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         super().__init__()
         self._dataframe = dataframe
 
-        # Initialize attributes that might be needed by observers FIRST
+        # Initialize attributes that might be needed by observers first
         self._table_id = str(uuid.uuid4())
         self._all_data_loaded = False
         self._batch_iter: Optional[Iterator[pd.DataFrame]] = None
         self._cached_batches: List[pd.DataFrame] = []
 
-        # respect display options for initial page size
+        # Respect display options for initial page size
         initial_page_size = bigframes.options.display.max_rows
 
-        # Initialize data fetching attributes.
-        self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)
+        try:
+            # Fetches initial data batches and row count for display.
+            # `to_pandas_batches` provides an iterable of pandas DataFrames
+            # and eagerly retrieves the total row count
+            self._batches = dataframe.to_pandas_batches(
+                page_size=initial_page_size,
+            )
 
-        # Get total rows efficiently by executing the query once
-        execute_result = dataframe._block.session._executor.execute(
-            dataframe._block.expr, ordered=True
-        )
-        self.row_count = execute_result.total_rows or 0
+            # Access the total_rows property directly
+            self.row_count = self._batches.total_rows or 0
+            self.page_size = initial_page_size
 
-        # set traitlets properties that trigger observers
-        self.page_size = initial_page_size
+            # Generates the initial HTML table content
+            self._set_table_html()
 
-        # get the initial page
-        self._set_table_html()
+        except Exception:
+            self.row_count = 0
+            self.page_size = initial_page_size
+            self._batches = iter([])
+            self.table_html = ""
diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py
index fa49c41f97..cba700edec 100644
--- a/tests/system/small/test_anywidget.py
+++ b/tests/system/small/test_anywidget.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import typing
-
 import pandas as pd
 import pytest
 
@@ -112,29 +110,20 @@ def empty_bf_df(
     return session.read_pandas(empty_pandas_df)
 
 
-def mock_execute_total_rows_is_none(self, schema, *args, **kwargs):
-    """Mocks an execution result where the total row count is missing."""
-    from bigframes.session.executor import ExecuteResult
-
-    return ExecuteResult(
-        iter([]),  # arrow_batches
-        schema=schema,
-        query_job=None,
-        total_bytes=None,
-        total_rows=None,  # The specific failure condition for this case
-    )
-
-
-def mock_execute_batches_are_invalid(self, schema, *args, **kwargs):
-    """Mocks an execution result where the batch data is an invalid type."""
+def mock_execute_result_with_params(
+    self, schema, total_rows_val, arrow_batches_val, *args, **kwargs
+):
+    """
+    Mocks an execution result with configurable total_rows and arrow_batches.
+ """ from bigframes.session.executor import ExecuteResult return ExecuteResult( - None, # Invalid type for arrow_batches, which should be an iterator + iter(arrow_batches_val), schema=schema, query_job=None, total_bytes=None, - total_rows=100, # A valid row count, as the error is in the batch data + total_rows=total_rows_val, ) @@ -475,25 +464,27 @@ def test_widget_row_count_should_be_immutable_after_creation( """ from bigframes.display import TableWidget + # Use a context manager to ensure the option is reset with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): widget = TableWidget(paginated_bf_df) initial_row_count = widget.row_count - assert initial_row_count == EXPECTED_ROW_COUNT # Change a global option that could influence row count bf.options.display.max_rows = 10 - # The widget's row count was fixed at creation and should not change. + # Verify the row count remains immutable. assert widget.row_count == initial_row_count @pytest.mark.parametrize( - "mock_function", + "total_rows_param, arrow_batches_param", [ - mock_execute_total_rows_is_none, - mock_execute_batches_are_invalid, + # Corresponds to mock_execute_total_rows_is_none + (None, []), + # Corresponds to mock_execute_batches_are_invalid (assuming empty list + # for invalid batches for now) + (100, []), ], - # 'ids' provides descriptive names for each test run in the pytest report. ids=[ "when_total_rows_is_None", "when_arrow_batches_are_invalid", @@ -502,65 +493,54 @@ def test_widget_row_count_should_be_immutable_after_creation( def test_widget_should_fallback_to_zero_rows_on_error( paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch, - mock_function, + total_rows_param, + arrow_batches_param, ): """ Given an internal component fails to return valid execution data, when the TableWidget is created, its row_count should safely fall back to 0. """ - # The 'self' argument is automatically handled when monkeypatch calls the method. - # We use a lambda to pass the DataFrame's schema to our mock function. + # Patch the executor's 'execute' method to simulate an error. monkeypatch.setattr( "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", - lambda self, *args, **kwargs: mock_function( - self, paginated_bf_df._block.expr.schema, *args, **kwargs + lambda self, *args, **kwargs: mock_execute_result_with_params( + self, + paginated_bf_df._block.expr.schema, + total_rows_param, + arrow_batches_param, + *args, + **kwargs ), ) + # Create the TableWidget under the error condition. with bf.option_context("display.repr_mode", "anywidget"): from bigframes.display import TableWidget # The widget should handle the faulty data from the mock without crashing. widget = TableWidget(paginated_bf_df) - # The key assertion: The widget safely defaults to 0 rows. + # The widget safely defaults to 0 rows. assert widget.row_count == 0 -@pytest.mark.parametrize( - "max_results, expected_rows", - [ - (None, EXPECTED_ROW_COUNT), - (3, 3), - (10, EXPECTED_ROW_COUNT), - ], - ids=["no_limit", "limit_is_less_than_total", "limit_is_greater_than_total"], -) -def test_widget_row_count_should_respect_max_results_on_creation( +def test_widget_row_count_reflects_actual_data_available( paginated_bf_df: bf.dataframe.DataFrame, - max_results: typing.Optional[int], - expected_rows: int, ): """ - Given a max_results value, when a TableWidget is created with custom batches, - its row_count should be correctly capped by that value. 
+ Test that widget row_count reflects the actual data available, + regardless of theoretical limits. """ - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - from bigframes.core.blocks import PandasBatches - from bigframes.display import TableWidget + from bigframes.display import TableWidget + # Set up display options that define a page size. + with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): widget = TableWidget(paginated_bf_df) - # Override batches with max_results to test the behavior - widget._batches = paginated_bf_df.to_pandas_batches( - page_size=widget.page_size, max_results=max_results - ) - - # Re-apply the logic to update row_count - if isinstance(widget._batches, PandasBatches): - widget.row_count = widget._batches.total_rows or 0 - - assert widget.row_count == expected_rows + # The widget should report the total rows in the DataFrame, + # not limited by page_size (which only affects pagination) + assert widget.row_count == EXPECTED_ROW_COUNT + assert widget.page_size == 2 # Respects the display option # TODO(shuowei): Add tests for custom index and multiindex From 0b364b04348fb8fe7be0d5369aa2be7fd32650f2 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 30 Jul 2025 03:27:32 +0000 Subject: [PATCH 06/45] update the method of using PandasBatches.total_rows --- bigframes/display/anywidget.py | 28 ++++++++++++++++++---------- tests/system/small/test_anywidget.py | 15 +++++++++++---- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index c487ddff6e..198fc1604d 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -17,13 +17,14 @@ from importlib import resources import functools import math -from typing import Any, Dict, Iterator, List, Optional, Type +import typing +from typing import Any, cast, Dict, Iterator, List, Optional, Type import uuid import pandas as pd import bigframes -import bigframes.dataframe +import bigframes.core.blocks import bigframes.display.html # anywidget and traitlets are optional dependencies. We don't want the import of this @@ -70,6 +71,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._table_id = str(uuid.uuid4()) self._all_data_loaded = False self._batch_iter: Optional[Iterator[pd.DataFrame]] = None + self._batches: Optional[bigframes.core.blocks.PandasBatches] = None self._cached_batches: List[pd.DataFrame] = [] # Respect display options for initial page size @@ -77,14 +79,16 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): try: # Fetches initial data batches and row count for display. - # `to_pandas_batches` provides an iterable of pandas DataFrames - # and eagerly retrieves the total row count - self._batches = dataframe.to_pandas_batches( + batches = dataframe.to_pandas_batches( page_size=initial_page_size, ) + self._batches = cast(bigframes.core.blocks.PandasBatches, batches) - # Access the total_rows property directly - self.row_count = self._batches.total_rows or 0 + # Use total_rows if available, otherwise default to 0. 
+ if self._batches: + self.row_count = self._batches.total_rows or 0 + else: + self.row_count = 0 self.page_size = initial_page_size # Generates the initial HTML table content @@ -93,7 +97,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): except Exception: self.row_count = 0 self.page_size = initial_page_size - self._batches = iter([]) + self._batches = None self.table_html = "" @functools.cached_property @@ -177,7 +181,10 @@ def _get_next_batch(self) -> bool: def _batch_iterator(self) -> Iterator[pd.DataFrame]: """Lazily initializes and returns the batch iterator.""" if self._batch_iter is None: - self._batch_iter = iter(self._batches) + if self._batches is None: + self._batch_iter = iter([]) + else: + self._batch_iter = iter(self._batches) return self._batch_iter @property @@ -189,7 +196,8 @@ def _cached_data(self) -> pd.DataFrame: def _reset_batches_for_new_page_size(self): """Reset the batch iterator when page size changes.""" - self._batches = self._dataframe._to_pandas_batches(page_size=self.page_size) + batches = self._dataframe.to_pandas_batches(page_size=self.page_size) + self._batches = typing.cast(bigframes.core.blocks.PandasBatches, batches) self._cached_batches = [] self._batch_iter = None self._all_data_loaded = False diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index cba700edec..716f44f039 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -476,14 +476,21 @@ def test_widget_row_count_should_be_immutable_after_creation( assert widget.row_count == initial_row_count +class FaultyIterator: + def __iter__(self): + return self + + def __next__(self): + raise ValueError("Simulated read error") + + @pytest.mark.parametrize( "total_rows_param, arrow_batches_param", [ - # Corresponds to mock_execute_total_rows_is_none + # Case 1: total_rows is None, which should be handled gracefully. (None, []), - # Corresponds to mock_execute_batches_are_invalid (assuming empty list - # for invalid batches for now) - (100, []), + # Case 2: Batches are invalid and will raise an error during iteration. + (100, FaultyIterator()), ], ids=[ "when_total_rows_is_None", From 858488d03d121936d6e9f31ec0b50f1742ed681a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 1 Aug 2025 08:06:01 +0000 Subject: [PATCH 07/45] change tests in read_gbq_colab --- bigframes/display/anywidget.py | 31 ++++++++------------ tests/benchmark/read_gbq_colab/first_page.py | 8 ++--- tests/benchmark/read_gbq_colab/last_page.py | 7 ++--- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 198fc1604d..131bf7fe5a 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -77,28 +77,21 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Respect display options for initial page size initial_page_size = bigframes.options.display.max_rows - try: - # Fetches initial data batches and row count for display. - batches = dataframe.to_pandas_batches( - page_size=initial_page_size, - ) - self._batches = cast(bigframes.core.blocks.PandasBatches, batches) + # Fetches initial data batches and row count for display. + batches = dataframe.to_pandas_batches( + page_size=initial_page_size, + ) + self._batches = cast(bigframes.core.blocks.PandasBatches, batches) - # Use total_rows if available, otherwise default to 0. 
- if self._batches: - self.row_count = self._batches.total_rows or 0 - else: - self.row_count = 0 - self.page_size = initial_page_size + # Use total_rwos from batches directly + self.row_count = self._batches.total_rows or 0 - # Generates the initial HTML table content - self._set_table_html() + # Set page_size after _batches is initialized so observers have + # access to batch data + self.page_size = initial_page_size - except Exception: - self.row_count = 0 - self.page_size = initial_page_size - self._batches = None - self.table_html = "" + # Generates the initial HTML table content + self._set_table_html() @functools.cached_property def _esm(self): diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index 33e2a24bd7..2c57750d1f 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -27,10 +27,10 @@ def first_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Get number of rows (to calculate number of pages) and the first page. - batches = df._to_pandas_batches(page_size=PAGE_SIZE) - assert (tr := batches.total_rows) is not None and tr >= 0 - next(iter(batches)) + # Use total_rows from batches directly and the first page + execute_result = df._block.session._executor.execute(df._block.expr, ordered=True) + execute_result.total_rows or 0 + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) if __name__ == "__main__": diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 2e485a070a..57796cab88 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -27,10 +27,9 @@ def last_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Get number of rows (to calculate number of pages) and then all pages. 
- batches = df._to_pandas_batches(page_size=PAGE_SIZE) - assert (tr := batches.total_rows) is not None and tr >= 0 - for _ in batches: + execute_result = df._block.session._executor.execute(df._block.expr, ordered=True) + execute_result.total_rows or 0 + for _ in df.to_pandas_batches(page_size=PAGE_SIZE): pass From 91b2c5ee1a7bee616be755ca2c9ee586e13b94e3 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 1 Aug 2025 08:12:03 +0000 Subject: [PATCH 08/45] polish comment --- bigframes/display/anywidget.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 131bf7fe5a..3f78875a8a 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -86,8 +86,8 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Use total_rwos from batches directly self.row_count = self._batches.total_rows or 0 - # Set page_size after _batches is initialized so observers have - # access to batch data + # Set page_size after _batches is available since traitlets observers + # may depend on _batches being initialized when the change trigger happens self.page_size = initial_page_size # Generates the initial HTML table content From 0b50d0cec89e355e848b3f3a3778ab1411cff861 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 6 Aug 2025 20:54:27 +0000 Subject: [PATCH 09/45] fix a test --- bigframes/display/anywidget.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 3f78875a8a..f782505ca2 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -21,6 +21,7 @@ from typing import Any, cast, Dict, Iterator, List, Optional, Type import uuid +import google.api_core.exceptions import pandas as pd import bigframes @@ -166,8 +167,16 @@ def _get_next_batch(self) -> bool: batch = next(iterator) self._cached_batches.append(batch) return True - except StopIteration: + except ( + StopIteration, + google.api_core.exceptions.GoogleAPICallError, + TypeError, + ValueError, + ) as e: self._all_data_loaded = True + if not isinstance(e, StopIteration): + # If we fail to get a batch, assume no more data is available. + self.row_count = 0 return False @property From 7549f74cbe409f8670851de32f4c4e7ec0cf3175 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 12 Aug 2025 00:21:40 +0000 Subject: [PATCH 10/45] change code and update more testcase --- bigframes/display/anywidget.py | 17 +++++++---------- .../read_gbq_colab/aggregate_output.py | 8 +++----- tests/benchmark/read_gbq_colab/filter_output.py | 12 ++++++------ tests/benchmark/read_gbq_colab/first_page.py | 11 +++++++---- tests/benchmark/read_gbq_colab/last_page.py | 6 +++--- tests/benchmark/read_gbq_colab/sort_output.py | 8 +++----- 6 files changed, 29 insertions(+), 33 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index f782505ca2..c364405db5 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -72,26 +72,27 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._table_id = str(uuid.uuid4()) self._all_data_loaded = False self._batch_iter: Optional[Iterator[pd.DataFrame]] = None - self._batches: Optional[bigframes.core.blocks.PandasBatches] = None self._cached_batches: List[pd.DataFrame] = [] # Respect display options for initial page size initial_page_size = bigframes.options.display.max_rows - # Fetches initial data batches and row count for display. 
batches = dataframe.to_pandas_batches( page_size=initial_page_size, ) - self._batches = cast(bigframes.core.blocks.PandasBatches, batches) + self._batches: bigframes.core.blocks.PandasBatches = cast( + bigframes.core.blocks.PandasBatches, batches + ) - # Use total_rwos from batches directly + # The query issued by `to_pandas_batches()` already contains metadata + # about how many results there were. Use that to avoid doing an extra + # COUNT(*) query that `len(...)` would do. self.row_count = self._batches.total_rows or 0 # Set page_size after _batches is available since traitlets observers # may depend on _batches being initialized when the change trigger happens self.page_size = initial_page_size - # Generates the initial HTML table content self._set_table_html() @functools.cached_property @@ -182,11 +183,7 @@ def _get_next_batch(self) -> bool: @property def _batch_iterator(self) -> Iterator[pd.DataFrame]: """Lazily initializes and returns the batch iterator.""" - if self._batch_iter is None: - if self._batches is None: - self._batch_iter = iter([]) - else: - self._batch_iter = iter(self._batches) + self._batch_iter = iter(self._batches) return self._batch_iter @property diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index e5620d8e16..52ed95678e 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -26,8 +26,7 @@ def aggregate_output(*, project_id, dataset_id, table_id): df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") # Simulate getting the first page, since we'll always do that first in the UI. - batches = df._to_pandas_batches(page_size=PAGE_SIZE) - assert (tr := batches.total_rows) is not None and tr >= 0 + batches = df.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) # To simulate very small rows that can only fit a boolean, @@ -44,9 +43,8 @@ def aggregate_output(*, project_id, dataset_id, table_id): .sum(numeric_only=True) ) - batches = df_aggregated._to_pandas_batches(page_size=PAGE_SIZE) - assert (tr := batches.total_rows) is not None and tr >= 0 - next(iter(batches)) + batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE) + next(iter(batches_aggregated)) if __name__ == "__main__": diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index dc88d31366..7ae0398a6e 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -31,19 +31,19 @@ def filter_output( df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") # Simulate getting the first page, since we'll always do that first in the UI. - batches = df._to_pandas_batches(page_size=PAGE_SIZE) - assert (tr := batches.total_rows) is not None and tr >= 0 + batches = df.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] - batches = df_filtered._to_pandas_batches(page_size=PAGE_SIZE) - assert (tr := batches.total_rows) is not None and tr >= 0 - first_page = next(iter(batches)) + batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE) # It's possible we don't have any pages at all, since we filtered out all # matching rows. 
-    assert len(first_page.index) <= tr
+    first_page = next(iter(batches_filtered))
+    rows = batches_filtered.total_rows
+    assert rows is not None
+    assert len(first_page.index) <= rows
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py
index 33e2a24bd7..3f21693522 100644
--- a/tests/benchmark/read_gbq_colab/first_page.py
+++ b/tests/benchmark/read_gbq_colab/first_page.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
+import typing
 
 import benchmark.utils as utils
 
@@ -27,10 +28,12 @@ def first_page(*, project_id, dataset_id, table_id):
         f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
     )
 
-    # Use total_rows from batches directly and the first page
-    execute_result = df._block.session._executor.execute(df._block.expr, ordered=True)
-    execute_result.total_rows or 0
-    next(iter(df.to_pandas_batches(page_size=PAGE_SIZE)))
+    # Get number of rows (to calculate number of pages) and the first page.
+    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    first_page = next(iter(batches))
+    assert first_page is not None
+    total_rows = typing.cast(typing.Any, batches).total_rows
+    assert total_rows is not None
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py
index 57796cab88..e00b304900 100644
--- a/tests/benchmark/read_gbq_colab/last_page.py
+++ b/tests/benchmark/read_gbq_colab/last_page.py
@@ -27,9 +27,9 @@ def last_page(*, project_id, dataset_id, table_id):
         f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}"
     )
 
-    execute_result = df._block.session._executor.execute(df._block.expr, ordered=True)
-    execute_result.total_rows or 0
-    for _ in df.to_pandas_batches(page_size=PAGE_SIZE):
+    # Get number of rows (to calculate number of pages) and then all pages.
+    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    for _ in batches:
         pass
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py
index 3044e0c2a3..ded42b77e5 100644
--- a/tests/benchmark/read_gbq_colab/sort_output.py
+++ b/tests/benchmark/read_gbq_colab/sort_output.py
@@ -28,8 +28,7 @@ def sort_output(*, project_id, dataset_id, table_id):
     )
 
     # Simulate getting the first page, since we'll always do that first in the UI.
-    batches = df._to_pandas_batches(page_size=PAGE_SIZE)
-    assert (tr := batches.total_rows) is not None and tr >= 0
+    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches))
 
     # Simulate the user sorting by a column and visualizing those results
@@ -38,9 +37,8 @@ def sort_output(*, project_id, dataset_id, table_id):
     sort_column = "col_bool_0"
     df_sorted = df.sort_values(sort_column)
 
-    batches = df_sorted._to_pandas_batches(page_size=PAGE_SIZE)
-    assert (tr := batches.total_rows) is not None and tr >= 0
-    next(iter(batches))
+    batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
+    next(iter(batches_sorted))
 
 
 if __name__ == "__main__":

From aa25d1ed2869f8d252148123e1cd8fe2a4fad34b Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Thu, 14 Aug 2025 21:55:53 +0000
Subject: [PATCH 11/45] remove unneeded except

---
 bigframes/display/anywidget.py                  | 8 +-------
 tests/benchmark/read_gbq_colab/filter_output.py | 1 +
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index c364405db5..95fd6ddd68 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -21,7 +21,6 @@
 from typing import Any, cast, Dict, Iterator, List, Optional, Type
 import uuid
 
-import google.api_core.exceptions
 import pandas as pd
 
 import bigframes
@@ -168,12 +167,7 @@ def _get_next_batch(self) -> bool:
             batch = next(iterator)
             self._cached_batches.append(batch)
             return True
-        except (
-            StopIteration,
-            google.api_core.exceptions.GoogleAPICallError,
-            TypeError,
-            ValueError,
-        ) as e:
+        except StopIteration as e:
             self._all_data_loaded = True
             if not isinstance(e, StopIteration):
                 # If we fail to get a batch, assume no more data is available.
                 self.row_count = 0
             return False
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
index 7ae0398a6e..d8a8fd1abb 100644
--- a/tests/benchmark/read_gbq_colab/filter_output.py
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -37,6 +37,7 @@ def filter_output(
     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
     batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)
+    assert batches_filtered.total_rows >= 0
 
     # It's possible we don't have any pages at all, since we filtered out all
     # matching rows.

From a8cc856639e630cc0b8dc2dac4264dac4bc5f968 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Thu, 14 Aug 2025 22:11:59 +0000
Subject: [PATCH 12/45] add assert for total_rows

---
 tests/benchmark/read_gbq_colab/aggregate_output.py |  2 ++
 tests/benchmark/read_gbq_colab/filter_output.py    | 11 +++++++----
 tests/benchmark/read_gbq_colab/first_page.py       |  3 +--
 tests/benchmark/read_gbq_colab/sort_output.py      |  3 +++
 4 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
index 52ed95678e..891991d9f7 100644
--- a/tests/benchmark/read_gbq_colab/aggregate_output.py
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
+import typing
 
 import benchmark.utils as utils
 
@@ -27,6 +28,7 @@ def aggregate_output(*, project_id, dataset_id, table_id):
     # Simulate getting the first page, since we'll always do that first in the UI.
     batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    assert typing.cast(typing.Any, batches).total_rows >= 0
     next(iter(batches))
 
     # To simulate very small rows that can only fit a boolean,
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
index d8a8fd1abb..363203fd83 100644
--- a/tests/benchmark/read_gbq_colab/filter_output.py
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
+import typing
 
 import benchmark.utils as utils
 
+import bigframes.core.blocks
 import bigframes.pandas as bpd
 
 PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
@@ -37,13 +39,14 @@ def filter_output(
     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
     batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)
-    assert batches_filtered.total_rows >= 0
-
+    batches_filtered = typing.cast(
+        bigframes.core.blocks.PandasBatches, batches_filtered
+    )
+    rows = batches_filtered.total_rows
+    assert rows >= 0
     # It's possible we don't have any pages at all, since we filtered out all
     # matching rows.
     first_page = next(iter(batches_filtered))
-    rows = batches_filtered.total_rows
-    assert rows is not None
     assert len(first_page.index) <= rows
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py
index 3f21693522..16d4d9ad01 100644
--- a/tests/benchmark/read_gbq_colab/first_page.py
+++ b/tests/benchmark/read_gbq_colab/first_page.py
@@ -30,10 +30,9 @@ def first_page(*, project_id, dataset_id, table_id):
 
     # Get number of rows (to calculate number of pages) and the first page.
     batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    assert typing.cast(typing.Any, batches).total_rows >= 0
     first_page = next(iter(batches))
     assert first_page is not None
-    total_rows = typing.cast(typing.Any, batches).total_rows
-    assert total_rows is not None
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py
index ded42b77e5..2443cb25a5 100644
--- a/tests/benchmark/read_gbq_colab/sort_output.py
+++ b/tests/benchmark/read_gbq_colab/sort_output.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
+import typing
 
 import benchmark.utils as utils
 
@@ -29,6 +30,7 @@ def sort_output(*, project_id, dataset_id, table_id):
     # Simulate getting the first page, since we'll always do that first in the UI.
     batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    assert typing.cast(typing.Any, batches).total_rows >= 0
     next(iter(batches))
 
     # Simulate the user sorting by a column and visualizing those results
@@ -38,6 +40,7 @@ def sort_output(*, project_id, dataset_id, table_id):
     df_sorted = df.sort_values(sort_column)
 
     batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
+    assert typing.cast(typing.Any, batches_sorted).total_rows >= 0
     next(iter(batches_sorted))
 
 
 if __name__ == "__main__":

From 9a5ad86fb70ac9eab66d396a092715d0ba648c01 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Tue, 19 Aug 2025 21:11:38 +0000
Subject: [PATCH 13/45] get actual row_counts

---
 bigframes/display/anywidget.py                     |  29 +++--
 notebooks/dataframes/anywidget_mode.ipynb          | 100 +++++++++++++-----
 tests/benchmark/read_gbq_colab/aggregate_output.py |  24 ++++-
 tests/benchmark/read_gbq_colab/filter_output.py    |  24 +++--
 tests/benchmark/read_gbq_colab/first_page.py       |  10 +-
 tests/benchmark/read_gbq_colab/sort_output.py      |  22 +++-
 tests/system/small/test_anywidget.py               |  24 +----
 7 files changed, 155 insertions(+), 78 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 95fd6ddd68..34e6ae1933 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -17,8 +17,7 @@
 from importlib import resources
 import functools
 import math
-import typing
-from typing import Any, cast, Dict, Iterator, List, Optional, Type
+from typing import Any, Dict, Iterator, List, Optional, Type
 import uuid
 
 import pandas as pd
@@ -76,17 +75,19 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         # Respect display options for initial page size
         initial_page_size = bigframes.options.display.max_rows
 
-        batches = dataframe.to_pandas_batches(
-            page_size=initial_page_size,
-        )
-        self._batches: bigframes.core.blocks.PandasBatches = cast(
-            bigframes.core.blocks.PandasBatches, batches
-        )
+        execute_result = dataframe._block.session._executor.execute(
+            dataframe._block.expr,
+            ordered=True,
+            use_explicit_destination=True,
+        )
 
         # The query issued by `to_pandas_batches()` already contains metadata
         # about how many results there were. Use that to avoid doing an extra
         # COUNT(*) query that `len(...)` would do.
-        self.row_count = self._batches.total_rows or 0
+        self.row_count = execute_result.total_rows or 0
+
+        # Create pandas batches from the ExecuteResult
+        self._batches = execute_result.to_pandas_batches(page_size=initial_page_size)
 
         # Set page_size after _batches is available since traitlets observers
         # may depend on _batches being initialized when the change trigger happens
         self.page_size = initial_page_size
@@ -189,7 +190,15 @@ def _cached_data(self) -> pd.DataFrame:
     def _reset_batches_for_new_page_size(self):
         """Reset the batch iterator when page size changes."""
-        batches = self._dataframe.to_pandas_batches(page_size=self.page_size)
-        self._batches = typing.cast(bigframes.core.blocks.PandasBatches, batches)
+        # Execute with explicit destination for consistency with __init__
+        execute_result = self._dataframe._block.session._executor.execute(
+            self._dataframe._block.expr,
+            ordered=True,
+            use_explicit_destination=True,
+        )
+
+        # Create pandas batches from the ExecuteResult
+        self._batches = execute_result.to_pandas_batches(page_size=self.page_size)
+
         self._cached_batches = []
         self._batch_iter = None
         self._all_data_loaded = False
diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb
index 7de6d43d6c..f1ab0eee6c 100644
--- a/notebooks/dataframes/anywidget_mode.ipynb
+++ b/notebooks/dataframes/anywidget_mode.ipynb
@@ -73,18 +73,6 @@
    "id": "f289d250",
    "metadata": {},
    "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job 1ea2b594-2bd7-46de-a3c8-6aeee5884ba2 is DONE. 0 Bytes processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -127,7 +115,19 @@
    "outputs": [
     {
     "data": {
      "text/html": [
-      "Query job 67e679e9-94da-47f7-8be1-8b4a496fbfbd is DONE. 171.4 MB processed. Open Job"
+      "Query job 3245c62b-5969-4b78-b1f2-4330592d3c65 is DONE. 171.4 MB processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
+   {
+    "data": {
+     "text/html": [
+      "Query job 1a5cec48-7128-4986-86a6-369a8f366974 is DONE. 171.4 MB processed. Open Job"
+     ],
+     "text/plain": [
+      ""
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
+   },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "e74c3920b93644a0b2afdaa3841cad31",
+      "model_id": "d59362abcff6445ea879b5f43e0ca9b3",
       "version_major": 2,
       "version_minor": 1
      },
@@ -195,6 +195,30 @@
    "id": "6920d49b",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 356f561b-5017-413f-950b-2bc4c7798a24 is DONE. 171.4 MB processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 72162728-56a3-47ce-bdb1-61b038cc2146 is DONE. 171.4 MB processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "name": "stdout",
      "output_type": "stream",
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "df5e93f0d03f45cda67aa6da7f9ef1ae",
-       "model_id": "b4f7a3f86ef54e07b24ef10061088391",
+       "model_id": "8fac39e9b92e42d283883137f155526f",
        "version_major": 2,
        "version_minor": 1
       },
       "text/plain": [
        "TableWidget(page_size=10, row_count=5552452, table_html='<table>...')"
       ]
      },
@@ -293,6 +317,30 @@
    "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 77f0582b-b68c-46a7-bf25-463837a4ef3f is DONE. 171.4 MB processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job ec2bcbc2-0f5a-45e9-affc-485183cb245e is DONE. 171.4 MB processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a4ec5248708442fabc59c446c78a1304",
-       "model_id": "44a829aca2f24cfdba4b61afd1a259fe",
+       "model_id": "fe6358fd83d6431198944e601ea00372",
        "version_major": 2,
        "version_minor": 1
       },
diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
index 891991d9f7..52ed95678e 100644
--- a/tests/benchmark/read_gbq_colab/aggregate_output.py
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import typing
 
 import benchmark.utils as utils
 
@@ -27,8 +26,13 @@ def aggregate_output(*, project_id, dataset_id, table_id):
     # Simulate getting the first page, since we'll always do that first in the UI.
-    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
-    assert typing.cast(typing.Any, batches).total_rows >= 0
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches))
 
     # To simulate very small rows that can only fit a boolean,
@@ -44,8 +48,18 @@ def aggregate_output(*, project_id, dataset_id, table_id):
         .groupby("rounded")
         .sum(numeric_only=True)
     )
-
-    batches_aggregated = df_aggregated.to_pandas_batches(page_size=PAGE_SIZE)
+    execute_result_aggregated = df_aggregated._block.session._executor.execute(
+        df_aggregated._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert (
+        execute_result_aggregated.total_rows is not None
+        and execute_result_aggregated.total_rows >= 0
+    )
+    batches_aggregated = execute_result_aggregated.to_pandas_batches(
+        page_size=PAGE_SIZE
+    )
     next(iter(batches_aggregated))
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py
index 363203fd83..b1dfdf3424 100644
--- a/tests/benchmark/read_gbq_colab/filter_output.py
+++ b/tests/benchmark/read_gbq_colab/filter_output.py
@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import typing
 
 import benchmark.utils as utils
 
-import bigframes.core.blocks
 import bigframes.pandas as bpd
 
 PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE
@@ -33,17 +31,29 @@ def filter_output(
     df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}")
 
     # Simulate getting the first page, since we'll always do that first in the UI.
-    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
+    # Force BigQuery execution to get total_rows metadata
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches))
 
     # Simulate the user filtering by a column and visualizing those results
     df_filtered = df[df["col_bool_0"]]
-    batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE)
-    batches_filtered = typing.cast(
-        bigframes.core.blocks.PandasBatches, batches_filtered
+    # Force BigQuery execution for filtered DataFrame to get total_rows metadata
+    execute_result_filtered = df_filtered._block.session._executor.execute(
+        df_filtered._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
     )
-    rows = batches_filtered.total_rows
+
+    rows = execute_result_filtered.total_rows or 0
     assert rows >= 0
+
+    batches_filtered = execute_result_filtered.to_pandas_batches(page_size=PAGE_SIZE)
+
     # It's possible we don't have any pages at all, since we filtered out all
     # matching rows.
     first_page = next(iter(batches_filtered))
     assert len(first_page.index) <= rows
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py
index 16d4d9ad01..90bd4024cb 100644
--- a/tests/benchmark/read_gbq_colab/first_page.py
+++ b/tests/benchmark/read_gbq_colab/first_page.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import typing
 
 import benchmark.utils as utils
 
@@ -29,8 +28,13 @@ def first_page(*, project_id, dataset_id, table_id):
     )
 
     # Get number of rows (to calculate number of pages) and the first page.
-    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
-    assert typing.cast(typing.Any, batches).total_rows >= 0
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
     first_page = next(iter(batches))
     assert first_page is not None
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py
index 2443cb25a5..9724373dde 100644
--- a/tests/benchmark/read_gbq_colab/sort_output.py
+++ b/tests/benchmark/read_gbq_colab/sort_output.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import pathlib
-import typing
 
 import benchmark.utils as utils
 
@@ -29,8 +28,13 @@ def sort_output(*, project_id, dataset_id, table_id):
     )
 
     # Simulate getting the first page, since we'll always do that first in the UI.
-    batches = df.to_pandas_batches(page_size=PAGE_SIZE)
-    assert typing.cast(typing.Any, batches).total_rows >= 0
+    execute_result = df._block.session._executor.execute(
+        df._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert execute_result.total_rows is not None and execute_result.total_rows >= 0
+    batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches))
 
     # Simulate the user sorting by a column and visualizing those results
@@ -39,8 +43,16 @@ def sort_output(*, project_id, dataset_id, table_id):
     sort_column = "col_bool_0"
     df_sorted = df.sort_values(sort_column)
 
-    batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE)
-    assert typing.cast(typing.Any, batches_sorted).total_rows >= 0
+    execute_result_sorted = df_sorted._block.session._executor.execute(
+        df_sorted._block.expr,
+        ordered=True,
+        use_explicit_destination=True,
+    )
+    assert (
+        execute_result_sorted.total_rows is not None
+        and execute_result_sorted.total_rows >= 0
+    )
+    batches_sorted = execute_result_sorted.to_pandas_batches(page_size=PAGE_SIZE)
     next(iter(batches_sorted))
 
 
 if __name__ == "__main__":
diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py
index 716f44f039..2103c52dbb 100644
--- a/tests/system/small/test_anywidget.py
+++ b/tests/system/small/test_anywidget.py
@@ -484,24 +484,9 @@ class FaultyIterator:
     def __next__(self):
         raise ValueError("Simulated read error")
 
 
-@pytest.mark.parametrize(
-    "total_rows_param, arrow_batches_param",
-    [
-        # Case 1: total_rows is None, which should be handled gracefully.
-        (None, []),
-        # Case 2: Batches are invalid and will raise an error during iteration.
-        (100, FaultyIterator()),
-    ],
-    ids=[
-        "when_total_rows_is_None",
-        "when_arrow_batches_are_invalid",
-    ],
-)
-def test_widget_should_fallback_to_zero_rows_on_error(
+def test_widget_should_fallback_to_zero_rows_with_invalid_total_rows(
     paginated_bf_df: bf.dataframe.DataFrame,
     monkeypatch: pytest.MonkeyPatch,
-    total_rows_param,
-    arrow_batches_param,
 ):
     """
     Given an internal component fails to return valid execution data,
     when the TableWidget is created, its row_count should safely fall back to 0.
     """
     # Patch the executor's 'execute' method to simulate an error.
     monkeypatch.setattr(
         "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute",
         lambda self, *args, **kwargs: mock_execute_result_with_params(
-            self,
-            paginated_bf_df._block.expr.schema,
-            total_rows_param,
-            arrow_batches_param,
-            *args,
-            **kwargs
+            self, paginated_bf_df._block.expr.schema, None, [], *args, **kwargs
         ),
     )

From 1d70cfdb7bed304382bde53477690d1b48722bfe Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Tue, 19 Aug 2025 21:21:29 +0000
Subject: [PATCH 14/45] avoid two query calls

---
 bigframes/display/anywidget.py            | 12 +++---
 notebooks/dataframes/anywidget_mode.ipynb | 48 +++++------------------
 2 files changed, 15 insertions(+), 45 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 34e6ae1933..1ca57e89ef 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -89,9 +89,10 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
         # Create pandas batches from the ExecuteResult
         self._batches = execute_result.to_pandas_batches(page_size=initial_page_size)
 
-        # Set page_size after _batches is available since traitlets observers
-        # may depend on _batches being initialized when the change trigger happens
-        self.page_size = initial_page_size
+        # Set page_size after _batches is available, but avoid triggering observers
+        # by setting the underlying traitlet value directly
+        self._trait_values["page_size"] = initial_page_size
+        self._trait_notifiers["page_size"] = {}  # Initialize notifiers if needed
 
         self._set_table_html()
@@ -169,11 +170,8 @@ def _get_next_batch(self) -> bool:
             batch = next(iterator)
             self._cached_batches.append(batch)
             return True
-        except StopIteration as e:
+        except StopIteration:
             self._all_data_loaded = True
-            if not isinstance(e, StopIteration):
-                # If we fail to get a batch, assume no more data is available.
-                self.row_count = 0
             return False
diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb
index f1ab0eee6c..e93ea1ded2 100644
--- a/notebooks/dataframes/anywidget_mode.ipynb
+++ b/notebooks/dataframes/anywidget_mode.ipynb
@@ -130,19 +130,7 @@
     {
      "data": {
       "text/html": [
-       "Query job 3245c62b-5969-4b78-b1f2-4330592d3c65 is DONE. 171.4 MB processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job 1a5cec48-7128-4986-86a6-369a8f366974 is DONE. 171.4 MB processed. Open Job"
+       "Query job 087c4276-8c26-467f-852b-c0d31848f666 is DONE. 171.4 MB processed. Open Job"
       ],
       "text/plain": [
        ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "d59362abcff6445ea879b5f43e0ca9b3",
+      "model_id": "c2a4111b39c3462a8d0f4f2e4a01635b",
       "version_major": 2,
       "version_minor": 1
      },
@@ -186,19 +174,7 @@
     {
      "data": {
       "text/html": [
-       "Query job 356f561b-5017-413f-950b-2bc4c7798a24 is DONE. 171.4 MB processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job 72162728-56a3-47ce-bdb1-61b038cc2146 is DONE. 171.4 MB processed. Open Job"
+       "Query job 86d748cf-699c-407c-8eba-2d6421375aad is DONE. 171.4 MB processed. Open Job"
       ],
       "text/plain": [
        ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "8fac39e9b92e42d283883137f155526f",
+      "model_id": "401985bd2b3f40f3a2f7e48eeabb272d",
       "version_major": 2,
       "version_minor": 1
      },
@@ -293,19 +269,7 @@
     {
      "data": {
       "text/html": [
-       "Query job 77f0582b-b68c-46a7-bf25-463837a4ef3f is DONE. 171.4 MB processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job ec2bcbc2-0f5a-45e9-affc-485183cb245e is DONE. 171.4 MB processed. Open Job"
+       "Query job 2cb31c3a-ccbc-40fc-b548-ce8503fd2cc3 is DONE. 171.4 MB processed. Open Job"
       ],
       "text/plain": [
        ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "fe6358fd83d6431198944e601ea00372",
+      "model_id": "9d2e3ced089a4cadbec9eb06d3724237",
       "version_major": 2,
       "version_minor": 1
      },

From 1d70cfdb7bed304382bde53477690d1b48722bfe Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Thu, 21 Aug 2025 21:06:22 +0000
Subject: [PATCH 15/45] remove double query when displaying widget

---
 bigframes/display/anywidget.py            | 13 ++++++++-----
 notebooks/dataframes/anywidget_mode.ipynb | 14 +++++++-------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 1ca57e89ef..a916823e9c 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -63,8 +63,9 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
             "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget."
) - super().__init__() self._dataframe = dataframe + self._initializing = True + super().__init__() # Initialize attributes that might be needed by observers first self._table_id = str(uuid.uuid4()) @@ -89,12 +90,10 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Create pandas batches from the ExecuteResult self._batches = execute_result.to_pandas_batches(page_size=initial_page_size) - # Set page_size after _batches is available, but avoid triggering observers - # by setting the underlying traitlet value directly - self._trait_values["page_size"] = initial_page_size - self._trait_notifiers["page_size"] = {} # Initialize notifiers if needed + self.page_size = initial_page_size self._set_table_html() + self._initializing = False @functools.cached_property def _esm(self): @@ -227,11 +226,15 @@ def _set_table_html(self): @traitlets.observe("page") def _page_changed(self, _change: Dict[str, Any]): """Handler for when the page number is changed from the frontend.""" + if self._initializing: + return self._set_table_html() @traitlets.observe("page_size") def _page_size_changed(self, _change: Dict[str, Any]): """Handler for when the page size is changed from the frontend.""" + if self._initializing: + return # Reset the page to 0 when page size changes to avoid invalid page states self.page = 0 diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index e93ea1ded2..c9e7a32df0 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -130,7 +130,7 @@ { "data": { "text/html": [ - "Query job 087c4276-8c26-467f-852b-c0d31848f666 is DONE. 171.4 MB processed. Open Job" + "Query job 1171b7b3-3f65-4165-a69d-69dad5a100d1 is DONE. 171.4 MB processed. Open Job" ], "text/plain": [ "" @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "c2a4111b39c3462a8d0f4f2e4a01635b", + "model_id": "6b70bf0e30a04a3cab11e03b2ed80856", "version_major": 2, "version_minor": 1 }, @@ -186,7 +186,7 @@ { "data": { "text/html": [ - "Query job 86d748cf-699c-407c-8eba-2d6421375aad is DONE. 171.4 MB processed. Open Job" + "Query job 3100859b-c57c-42fe-a5fb-abb4f2f25db2 is DONE. 171.4 MB processed. Open Job" ], "text/plain": [ "" @@ -205,7 +205,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "401985bd2b3f40f3a2f7e48eeabb272d", + "model_id": "4714b0794f55435a8d3e136517158a5c", "version_major": 2, "version_minor": 1 }, @@ -293,7 +293,7 @@ { "data": { "text/html": [ - "Query job 2cb31c3a-ccbc-40fc-b548-ce8503fd2cc3 is DONE. 171.4 MB processed. Open Job" + "Query job b4143f15-4bac-44a5-bb29-c5056f95b30b is DONE. 171.4 MB processed. 
Open Job" ], "text/plain": [ "" @@ -312,7 +312,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "9d2e3ced089a4cadbec9eb06d3724237", + "model_id": "c70b5611db6b4e6a806a16d0a8287cd3", "version_major": 2, "version_minor": 1 }, @@ -351,7 +351,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.16" + "version": "3.10.15" } }, "nbformat": 4, From 223183a60b0ea2b67c46829e8f5df4c53b9b35d7 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Sat, 13 Sep 2025 03:19:35 +0000 Subject: [PATCH 16/45] get row count directly --- bigframes/display/anywidget.py | 41 +++++++++++------------ notebooks/dataframes/anywidget_mode.ipynb | 18 +++++----- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index a916823e9c..56a0ce9a4e 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -17,7 +17,7 @@ from importlib import resources import functools import math -from typing import Any, Dict, Iterator, List, Optional, Type +from typing import Any, cast, Dict, Iterator, List, Optional, Type import uuid import pandas as pd @@ -52,6 +52,17 @@ class TableWidget(WIDGET_BASE): large BigQuery DataFrames within a Jupyter environment. """ + page_size = traitlets.Int(0).tag(sync=True) + # Use dynamic default + page_size = traitlets.Int().tag(sync=True) + row_count = traitlets.Int(0).tag(sync=True) + table_html = traitlets.Unicode().tag(sync=True) + + @traitlets.default("page_size") + def _page_size_default(self): + """Set the default page size from display options.""" + return bigframes.options.display.max_rows + def __init__(self, dataframe: bigframes.dataframe.DataFrame): """Initialize the TableWidget. @@ -76,21 +87,15 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Respect display options for initial page size initial_page_size = bigframes.options.display.max_rows - execute_result = dataframe._block.session._executor.execute( - dataframe._block.expr, - ordered=True, - use_explicit_destination=True, + self._batches: bigframes.core.blocks.PandasBatches = cast( + bigframes.core.blocks.PandasBatches, + dataframe.to_pandas_batches(page_size=initial_page_size), ) # The query issued by `to_pandas_batches()` already contains metadata # about how many results there were. Use that to avoid doing an extra # COUNT(*) query that `len(...)` would do. 
- self.row_count = execute_result.total_rows or 0 - - # Create pandas batches from the ExecuteResult - self._batches = execute_result.to_pandas_batches(page_size=initial_page_size) - - self.page_size = initial_page_size + self.row_count = self._batches.total_rows or 0 self._set_table_html() self._initializing = False @@ -106,7 +111,7 @@ def _css(self): return resources.read_text(bigframes.display, "table_widget.css") page = traitlets.Int(0).tag(sync=True) - page_size = traitlets.Int(25).tag(sync=True) + page_size = traitlets.Int().tag(sync=True) row_count = traitlets.Int(0).tag(sync=True) table_html = traitlets.Unicode().tag(sync=True) @@ -187,16 +192,10 @@ def _cached_data(self) -> pd.DataFrame: def _reset_batches_for_new_page_size(self): """Reset the batch iterator when page size changes.""" - # Execute with explicit destination for consistency with __init__ - execute_result = self._dataframe._block.session._executor.execute( - self._dataframe._block.expr, - ordered=True, - use_explicit_destination=True, + self._batches = cast( + bigframes.core.blocks.PandasBatches, + self._dataframe.to_pandas_batches(page_size=self.page_size), ) - - # Create pandas batches from the ExecuteResult - self._batches = execute_result.to_pandas_batches(page_size=self.page_size) - self._cached_batches = [] self._batch_iter = None self._all_data_loaded = False diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index c9e7a32df0..1d6de2e92e 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "d10bfca4", "metadata": {}, "outputs": [], @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "ca22f059", "metadata": {}, "outputs": [], @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "1bc5aaf3", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "f289d250", "metadata": {}, "outputs": [ @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "42bb02ab", "metadata": {}, "outputs": [ @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "ce250157", "metadata": {}, "outputs": [ @@ -179,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "6920d49b", "metadata": {}, "outputs": [ @@ -241,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "12b68f15", "metadata": {}, "outputs": [ @@ -278,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "a9d5d13a", "metadata": {}, "outputs": [ From ac55b503d9a9840effa923720260bf531fc6d5c8 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 16 Sep 2025 05:39:01 +0000 Subject: [PATCH 17/45] restore notebook --- notebooks/dataframes/anywidget_mode.ipynb | 76 +++++++---------------- 1 file changed, 23 insertions(+), 53 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 1d6de2e92e..d4911ad5b9 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "d10bfca4", "metadata": {}, 
"outputs": [], @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "ca22f059", "metadata": {}, "outputs": [], @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "1bc5aaf3", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f289d250", "metadata": {}, "outputs": [ @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "42bb02ab", "metadata": {}, "outputs": [ @@ -123,26 +123,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "ce250157", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 1171b7b3-3f65-4165-a69d-69dad5a100d1 is DONE. 171.4 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6b70bf0e30a04a3cab11e03b2ed80856", + "model_id": "a85f5799996d4de1a7912182c43fdf54", "version_major": 2, "version_minor": 1 }, @@ -179,22 +167,10 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "6920d49b", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job 3100859b-c57c-42fe-a5fb-abb4f2f25db2 is DONE. 171.4 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stdout", "output_type": "stream", @@ -205,13 +181,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "4714b0794f55435a8d3e136517158a5c", + "model_id": "261075a0d1d2487f804926884fe55eb2", "version_major": 2, "version_minor": 1 }, "text/plain": [ "TableWidget(page_size=10, row_count=5552452, table_html='
bool: @property def _batch_iterator(self) -> Iterator[pd.DataFrame]: """Lazily initializes and returns the batch iterator.""" - self._batch_iter = iter(self._batches) + if self._batch_iter is None: + self._batch_iter = iter(self._batches) return self._batch_iter @property diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index 572b020154..394015487e 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -25,12 +25,13 @@ def aggregate_output(*, project_id, dataset_id, table_id): # e.g. "{local_inline}" or "{local_large}" df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") - # Simulate getting the first page, since we'll always do that first in the UI. + # Call the executor directly to isolate the query execution time + # from other DataFrame overhead for this benchmark. execute_result = df._block.session._executor.execute( df._block.expr, ordered=True, use_explicit_destination=True, - ) + ) # type: ignore[call-arg] assert execute_result.total_rows is not None and execute_result.total_rows >= 0 batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) @@ -52,7 +53,7 @@ def aggregate_output(*, project_id, dataset_id, table_id): df_aggregated._block.expr, ordered=True, use_explicit_destination=True, - ) + ) # type: ignore[call-arg] assert ( execute_result_aggregated.total_rows is not None and execute_result_aggregated.total_rows >= 0 diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index b1dfdf3424..49eba4d688 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -30,13 +30,13 @@ def filter_output( # e.g. "{local_inline}" or "{local_large}" df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") - # Simulate getting the first page, since we'll always do that first in the UI. - # Force BigQuery execution to get total_rows metadata + # Call the executor directly to isolate the query execution time + # from other DataFrame overhead for this benchmark. execute_result = df._block.session._executor.execute( df._block.expr, ordered=True, use_explicit_destination=True, - ) + ) # type: ignore[call-arg] batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index 90bd4024cb..02337dd39d 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -27,12 +27,13 @@ def first_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Get number of rows (to calculate number of pages) and the first page. + # Call the executor directly to isolate the query execution time + # from other DataFrame overhead for this benchmark. 
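(Hedged note on the call below: per the comments elsewhere in this series,
use_explicit_destination=True asks the executor to write results to a
destination table, which is what makes total_rows available as query metadata
rather than via a separate COUNT(*) query.)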
execute_result = df._block.session._executor.execute( df._block.expr, ordered=True, use_explicit_destination=True, - ) + ) # type: ignore[call-arg] assert execute_result.total_rows is not None and execute_result.total_rows >= 0 batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) first_page = next(iter(batches)) diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index 9724373dde..45a3c1c97d 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -27,12 +27,13 @@ def sort_output(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Simulate getting the first page, since we'll always do that first in the UI. + # Call the executor directly to isolate the query execution time + # from other DataFrame overhead for this benchmark. execute_result = df._block.session._executor.execute( df._block.expr, ordered=True, use_explicit_destination=True, - ) + ) # type: ignore[call-arg] assert execute_result.total_rows is not None and execute_result.total_rows >= 0 batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) @@ -47,7 +48,7 @@ def sort_output(*, project_id, dataset_id, table_id): df_sorted._block.expr, ordered=True, use_explicit_destination=True, - ) + ) # type: ignore[call-arg] assert ( execute_result_sorted.total_rows is not None and execute_result_sorted.total_rows >= 0 diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 2103c52dbb..f02c398f33 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -484,7 +484,7 @@ def __next__(self): raise ValueError("Simulated read error") -def test_widget_should_fallback_to_zero_rows_with_invlid_total_rows( +def test_widget_should_fallback_to_zero_rows_with_invalid_total_rows( paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch, ): From 635e8219e4b9e491d8e281503d15c3beddc31f2c Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 3 Oct 2025 13:42:49 +0000 Subject: [PATCH 21/45] still have zero total rows issue --- bigframes/display/anywidget.py | 52 ++++--- bigframes/session/bq_caching_executor.py | 5 +- notebooks/dataframes/anywidget_mode.ipynb | 127 +++++------------- .../read_gbq_colab/aggregate_output.py | 15 ++- .../benchmark/read_gbq_colab/filter_output.py | 13 +- tests/benchmark/read_gbq_colab/first_page.py | 8 +- tests/benchmark/read_gbq_colab/sort_output.py | 15 ++- 7 files changed, 104 insertions(+), 131 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index c74acafb0e..2429053a19 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -25,11 +25,12 @@ import bigframes import bigframes.core.blocks import bigframes.display.html +import bigframes.session.execution_spec -# anywidget and traitlets are optional dependencies. We don't want the import of this -# module to fail if they aren't installed, though. Instead, we try to limit the surface that -# these packages could affect. This makes unit testing easier and ensures we don't -# accidentally make these required packages. +# anywidget and traitlets are optional dependencies. We don't want the import of +# this module to fail if they aren't installed, though. Instead, we try to +# limit the surface that these packages could affect. This makes unit testing +# easier and ensures we don't accidentally make these required packages. 
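(The unit-testing benefit this comment mentions shows up in the system tests,
which skip cleanly when the optional packages are missing. A minimal sketch of
that pattern, mirroring tests/system/small/test_anywidget.py in this series:

    import pytest

    # Skip the whole test module when the optional dependency is absent.
    pytest.importorskip("anywidget")
)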
try: import anywidget import traitlets @@ -65,7 +66,8 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): """ if not ANYWIDGET_INSTALLED: raise ImportError( - "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget." + "Please `pip install anywidget traitlets` or " + "`pip install 'bigframes[anywidget]'` to use TableWidget." ) self._dataframe = dataframe @@ -87,16 +89,22 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Respect display options for initial page size self.page_size = bigframes.options.display.max_rows - self._batches: bigframes.core.blocks.PandasBatches = cast( + # Force execution with explicit destination to get total_rows metadata + execute_result = dataframe._block.session._executor.execute( + dataframe._block.expr, + execution_spec=bigframes.session.execution_spec.ExecutionSpec( + ordered=True, promise_under_10gb=False + ), + ) + # The query issued by `to_pandas_batches()` already contains + # metadata about how many results there were. Use that to avoid + # doing an extra COUNT(*) query that `len(...)` would do. + self.row_count = execute_result.total_rows or 0 + self._batches = cast( bigframes.core.blocks.PandasBatches, - dataframe.to_pandas_batches(page_size=self.page_size), + execute_result.to_pandas_batches(page_size=self.page_size), ) - # The query issued by `to_pandas_batches()` already contains metadata - # about how many results there were. Use that to avoid doing an extra - # COUNT(*) query that `len(...)` would do. - self.row_count = self._batches.total_rows or 0 - self._set_table_html() self._initializing = False @@ -186,17 +194,27 @@ def _cached_data(self) -> pd.DataFrame: return pd.DataFrame(columns=self._dataframe.columns) return pd.concat(self._cached_batches, ignore_index=True) - def _reset_batches_for_new_page_size(self): + def _reset_batches_for_new_page_size(self) -> None: """Reset the batch iterator when page size changes.""" + # Execute with explicit destination for consistency with __init__ + execute_result = self._dataframe._block.session._executor.execute( + self._dataframe._block.expr, + execution_spec=bigframes.session.execution_spec.ExecutionSpec( + ordered=True, promise_under_10gb=False + ), + ) + + # Create pandas batches from the ExecuteResult self._batches = cast( bigframes.core.blocks.PandasBatches, - self._dataframe.to_pandas_batches(page_size=self.page_size), + execute_result.to_pandas_batches(page_size=self.page_size), ) + self._cached_batches = [] self._batch_iter = None self._all_data_loaded = False - def _set_table_html(self): + def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" start = self.page * self.page_size end = start + self.page_size @@ -219,14 +237,14 @@ def _set_table_html(self): ) @traitlets.observe("page") - def _page_changed(self, _change: Dict[str, Any]): + def _page_changed(self, _change: Dict[str, Any]) -> None: """Handler for when the page number is changed from the frontend.""" if self._initializing: return self._set_table_html() @traitlets.observe("page_size") - def _page_size_changed(self, _change: Dict[str, Any]): + def _page_size_changed(self, _change: Dict[str, Any]) -> None: """Handler for when the page size is changed from the frontend.""" if self._initializing: return diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index c830ca1e29..78683ead61 100644 --- a/bigframes/session/bq_caching_executor.py +++ 
b/bigframes/session/bq_caching_executor.py @@ -677,11 +677,14 @@ def _execute_plan_gbq( ) table_info: Optional[bigquery.Table] = None + total_rows = None if query_job and query_job.destination: table_info = self.bqclient.get_table(query_job.destination) size_bytes = table_info.num_bytes + total_rows = table_info.num_rows else: size_bytes = None + total_rows = iterator.total_rows # we could actually cache even when caching is not explicitly requested, but being conservative for now if cache_spec is not None: @@ -708,7 +711,7 @@ def _execute_plan_gbq( schema=og_schema, query_job=query_job, total_bytes=size_bytes, - total_rows=iterator.total_rows, + total_rows=total_rows, total_bytes_processed=iterator.total_bytes_processed, ) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index d4911ad5b9..d8fdeb2173 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "ca22f059", "metadata": {}, "outputs": [], @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "1bc5aaf3", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "f289d250", "metadata": {}, "outputs": [ @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "42bb02ab", "metadata": {}, "outputs": [ @@ -123,27 +123,27 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "ce250157", "metadata": {}, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a85f5799996d4de1a7912182c43fdf54", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=5552452, table_html='
345\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 346\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mPANDAS_PARAM_TRACKING_TASK\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 194\u001b[0m )\n\u001b[0;32m--> 195\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 196\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[0m_call_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 180\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 181\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mNotImplementedError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0;31m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py\u001b[0m in \u001b[0;36m_repr_html_\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;31m# This ensures that each cell gets its own widget and prevents\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;31m# unintended sharing between cells\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m \u001b[0mwidget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTableWidget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 860\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0mipython_display\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwidget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/display/anywidget.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dataframe)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;31m# Force execution with explicit destination to get total_rows metadata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m execute_result = dataframe._block.session._executor.execute( \n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0mbigframes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArrayValue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_block\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# Wrap in ArrayValue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mordered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: BigQueryCachingExecutor.execute() got an unexpected keyword argument 'ordered'" + ] }, { "data": { - "text/html": [], "text/plain": [ "Computation deferred. Computation will process 171.4 MB" ] @@ -167,31 +167,21 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "6920d49b", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total pages: 555246\n" + "ename": "TypeError", + "evalue": "BigQueryCachingExecutor.execute() got an unexpected keyword argument 'ordered'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Create widget programmatically\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mwidget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTableWidget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Total pages: {math.ceil(widget.row_count / widget.page_size)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/display/anywidget.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dataframe)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;31m# Force execution with explicit destination to get total_rows metadata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m execute_result = dataframe._block.session._executor.execute( \n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0mbigframes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArrayValue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_block\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# Wrap in 
ArrayValue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mordered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: BigQueryCachingExecutor.execute() got an unexpected keyword argument 'ordered'" ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "261075a0d1d2487f804926884fe55eb2", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=5552452, table_html='
345\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 346\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mPANDAS_PARAM_TRACKING_TASK\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 194\u001b[0m )\n\u001b[0;32m--> 195\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 196\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[0m_call_stack\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 179\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 180\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 181\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mNotImplementedError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 182\u001b[0m \u001b[0;31m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py\u001b[0m in \u001b[0;36m_repr_html_\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;31m# This ensures that each cell gets its own widget and prevents\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;31m# unintended sharing between cells\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m \u001b[0mwidget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdisplay\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTableWidget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 860\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0mipython_display\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwidget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/display/anywidget.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dataframe)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;31m# Force execution with explicit destination to get total_rows metadata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m execute_result = dataframe._block.session._executor.execute( \n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0mbigframes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArrayValue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_block\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# Wrap in ArrayValue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mordered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: BigQueryCachingExecutor.execute() got an unexpected keyword argument 'ordered'" - ] + "data": { + "text/html": [ + "Query job 6d85c081-49c7-408a-ab96-e0e9e5102419 is DONE. 171.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "31ba8e41e4ca4579b85409237cb7a566", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5552452, table_html='
\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Create widget programmatically\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mwidget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTableWidget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Total pages: {math.ceil(widget.row_count / widget.page_size)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/display/anywidget.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dataframe)\u001b[0m\n\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;31m# Force execution with explicit destination to get total_rows metadata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 92\u001b[0;31m execute_result = dataframe._block.session._executor.execute( \n\u001b[0m\u001b[1;32m 93\u001b[0m \u001b[0mbigframes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArrayValue\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataframe\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_block\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# Wrap in ArrayValue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0mordered\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: BigQueryCachingExecutor.execute() got an unexpected keyword argument 'ordered'" + "data": { + "text/html": [ + "Query job 48cb4908-a59a-420f-8fcb-200d0d9187ef is DONE. 171.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total pages: 555246\n" ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5d22f3f19e4140b0ba51869e97c3f690", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5552452, table_html='
Date: Fri, 3 Oct 2025 14:10:14 +0000
Subject: [PATCH 23/45] update benchmark utils imports and port last_page to
 the direct executor

---
 .../benchmark/read_gbq_colab/aggregate_output.py  |  3 +--
 tests/benchmark/read_gbq_colab/filter_output.py   |  3 +--
 tests/benchmark/read_gbq_colab/first_page.py      |  3 +--
 tests/benchmark/read_gbq_colab/last_page.py       | 15 +++++++++++----
 tests/benchmark/read_gbq_colab/sort_output.py     |  3 +--
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py
index fd1b5494ea..a8b46d839f 100644
--- a/tests/benchmark/read_gbq_colab/aggregate_output.py
+++ b/tests/benchmark/read_gbq_colab/aggregate_output.py
@@ -13,10 +13,9 @@
 # limitations under the License.
import pathlib -import benchmark.utils as utils - import bigframes.pandas import bigframes.session.execution_spec +import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE From dc332ef4e809cfc70eb7135a66f9b1475c3bb329 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 7 Oct 2025 19:52:11 +0000 Subject: [PATCH 24/45] revert a benchmark --- .../read_gbq_colab/aggregate_output.py | 35 +++++-------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index a8b46d839f..cd33ed2640 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -13,9 +13,9 @@ # limitations under the License. import pathlib +import benchmark.utils as utils + import bigframes.pandas as bpd -import bigframes.session.execution_spec -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE @@ -25,17 +25,9 @@ def aggregate_output(*, project_id, dataset_id, table_id): # e.g. "{local_inline}" or "{local_large}" df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") - # Call the executor directly to isolate the query execution time - # from other DataFrame overhead for this benchmark. - execute_result = df._block.session._executor.execute( - df._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) - assert execute_result.total_rows is not None and execute_result.total_rows >= 0 - batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) - next(iter(batches)) + # Simulate getting the first page, since we'll always do that first in the UI. + df.shape + next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) # To simulate very small rows that can only fit a boolean, # some tables don't have an integer column. 
If an integer column is available, @@ -50,20 +42,9 @@ def aggregate_output(*, project_id, dataset_id, table_id): .groupby("rounded") .sum(numeric_only=True) ) - execute_result_aggregated = df_aggregated._block.session._executor.execute( - df_aggregated._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) - assert ( - execute_result_aggregated.total_rows is not None - and execute_result_aggregated.total_rows >= 0 - ) - batches_aggregated = execute_result_aggregated.to_pandas_batches( - page_size=PAGE_SIZE - ) - next(iter(batches_aggregated)) + + df_aggregated.shape + next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE))) if __name__ == "__main__": From 47193f1a860c55ce77e095f7610a5d4ea309c387 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 8 Oct 2025 05:54:20 +0000 Subject: [PATCH 25/45] revert executor change --- bigframes/session/bq_caching_executor.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 8d5808eafa..c830ca1e29 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -196,10 +196,7 @@ def execute( self._publisher.publish(bigframes.core.events.ExecutionStarted()) # TODO: Support export jobs in combination with semi executors - if ( - execution_spec.destination_spec is None - and execution_spec.promise_under_10gb - ): + if execution_spec.destination_spec is None: plan = self.prepare_plan(array_value.node, target="simplify") for exec in self._semi_executors: maybe_result = exec.execute( @@ -680,14 +677,11 @@ def _execute_plan_gbq( ) table_info: Optional[bigquery.Table] = None - total_rows = None if query_job and query_job.destination: table_info = self.bqclient.get_table(query_job.destination) size_bytes = table_info.num_bytes - total_rows = table_info.num_rows else: size_bytes = None - total_rows = iterator.total_rows # we could actually cache even when caching is not explicitly requested, but being conservative for now if cache_spec is not None: @@ -714,7 +708,7 @@ def _execute_plan_gbq( schema=og_schema, query_job=query_job, total_bytes=size_bytes, - total_rows=total_rows, + total_rows=iterator.total_rows, total_bytes_processed=iterator.total_bytes_processed, ) From f4b633620574c01a86d8ad8fcd7fc2f04b64c33c Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 9 Oct 2025 04:31:50 +0000 Subject: [PATCH 26/45] raising a NotImplementedError when the row count is none --- bigframes/dataframe.py | 8 ++++++++ tests/system/small/test_anywidget.py | 30 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c3735ca3c2..4c40256769 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -794,6 +794,10 @@ def __repr__(self) -> str: pandas_df, row_count, query_job = self._block.retrieve_repr_request_results( max_results ) + if row_count is None: + raise NotImplementedError( + "Cannot determine total number of rows. Please use .to_pandas() to display." + ) self._set_internal_query_job(query_job) @@ -879,6 +883,10 @@ def _repr_html_(self) -> str: pandas_df, row_count, query_job = df._block.retrieve_repr_request_results( max_results ) + if row_count is None: + raise NotImplementedError( + "Cannot determine total number of rows. Please use .to_pandas() to display." 
+ ) self._set_internal_query_job(query_job) column_count = len(pandas_df.columns) diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index f02c398f33..0969f19d05 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. + import pandas as pd import pytest import bigframes as bf +import bigframes.pandas as bpd pytest.importorskip("anywidget") @@ -530,6 +532,34 @@ def test_widget_row_count_reflects_actual_data_available( assert widget.page_size == 2 # Respects the display option +def test_repr_html_raises_when_row_count_is_none(monkeypatch): + df = bpd.DataFrame({"col1": [1, 2, 3]}) + + def mock_retrieve_repr_request_results(*args, **kwargs): + return df.to_pandas(), None, None + + monkeypatch.setattr( + df._block, "retrieve_repr_request_results", mock_retrieve_repr_request_results + ) + + with pytest.raises(NotImplementedError): + df._repr_html_() + + +def test_repr_raises_when_row_count_is_none(monkeypatch): + df = bpd.DataFrame({"col1": [1, 2, 3]}) + + def mock_retrieve_repr_request_results(*args, **kwargs): + return df.to_pandas(), None, None + + monkeypatch.setattr( + df._block, "retrieve_repr_request_results", mock_retrieve_repr_request_results + ) + + with pytest.raises(NotImplementedError): + df.__repr__() + + # TODO(shuowei): Add tests for custom index and multiindex # This may not be necessary for the SQL Cell use case but should be # considered for completeness. From 085687fed3e6a04df3259d9ccd25a2c6e7c8f99e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Sat, 11 Oct 2025 05:45:47 +0000 Subject: [PATCH 27/45] change return type --- bigframes/core/blocks.py | 5 ++++- bigframes/dataframe.py | 4 ++-- bigframes/display/anywidget.py | 32 ++++------------------------ notebooks/dataframes/dataframe.ipynb | 4 ++-- 4 files changed, 12 insertions(+), 33 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 1900b7208a..c207d93d19 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -99,7 +99,7 @@ @dataclasses.dataclass -class PandasBatches(Iterator[pd.DataFrame]): +class PandasBatches: """Interface for mutable objects with state represented by a block value object.""" def __init__( @@ -124,6 +124,9 @@ def total_bytes_processed(self) -> Optional[int]: def __next__(self) -> pd.DataFrame: return next(self._dataframes) + def __iter__(self) -> Iterator[pd.DataFrame]: + return self + @dataclasses.dataclass() class MaterializationOptions: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4c40256769..ab264af277 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1895,7 +1895,7 @@ def to_pandas_batches( max_results: Optional[int] = None, *, allow_large_results: Optional[bool] = None, - ) -> Iterable[pandas.DataFrame]: + ) -> blocks.PandasBatches: """Stream DataFrame results to an iterable of pandas DataFrame. page_size and max_results determine the size and number of batches, @@ -1938,7 +1938,7 @@ def to_pandas_batches( over the default size limit of 10 GB. Returns: - Iterable[pandas.DataFrame]: + bigframes.core.blocks.PandasBatches: An iterable of smaller dataframes which combine to form the original dataframe. 
Results stream from bigquery, see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index b42380242d..545d684e35 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -17,15 +17,13 @@ from importlib import resources import functools import math -from typing import Any, cast, Dict, Iterator, List, Optional, Type +from typing import Any, Dict, Iterator, List, Optional, Type import uuid import pandas as pd import bigframes -import bigframes.core.blocks import bigframes.display.html -import bigframes.session.execution_spec # anywidget and traitlets are optional dependencies. We don't want the import of # this module to fail if they aren't installed, though. Instead, we try to @@ -90,21 +88,11 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # Respect display options for initial page size self.page_size = bigframes.options.display.max_rows - # Force execution with explicit destination to get total_rows metadata - execute_result = dataframe._block.session._executor.execute( - dataframe._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) # The query issued by `to_pandas_batches()` already contains # metadata about how many results there were. Use that to avoid # doing an extra COUNT(*) query that `len(...)` would do. - self.row_count = execute_result.total_rows or 0 - self._batches = cast( - bigframes.core.blocks.PandasBatches, - execute_result.to_pandas_batches(page_size=self.page_size), - ) + self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) + self.row_count = self._batches.total_rows or 0 self._set_table_html() self._initial_load_complete = True @@ -198,19 +186,7 @@ def _cached_data(self) -> pd.DataFrame: def _reset_batches_for_new_page_size(self) -> None: """Reset the batch iterator when page size changes.""" - # Execute with explicit destination for consistency with __init__ - execute_result = self._dataframe._block.session._executor.execute( - self._dataframe._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) - - # Create pandas batches from the ExecuteResult - self._batches = cast( - bigframes.core.blocks.PandasBatches, - execute_result.to_pandas_batches(page_size=self.page_size), - ) + self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) self._cached_batches = [] self._batch_iter = None diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb index de9bb1d04f..c5dbb0564b 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -5366,7 +5366,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "3.10.18", "language": "python", "name": "python3" }, @@ -5380,7 +5380,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.10.18" } }, "nbformat": 4, From 1767bdce89f8f434e76b5675a4e44779716ec79c Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 20:57:53 +0000 Subject: [PATCH 28/45] Revert accidental change of dataframe.ipynb --- notebooks/dataframes/dataframe.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/dataframes/dataframe.ipynb 
b/notebooks/dataframes/dataframe.ipynb index c5dbb0564b..de9bb1d04f 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -5366,7 +5366,7 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10.18", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -5380,7 +5380,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.18" + "version": "3.11.1" } }, "nbformat": 4, From c9f27a19b510daa5f2942610f476dbf1c9c424eb Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 21:04:15 +0000 Subject: [PATCH 29/45] remove unnecessary execution in benchmark --- .../benchmark/read_gbq_colab/filter_output.py | 23 +++------------- tests/benchmark/read_gbq_colab/first_page.py | 13 ++-------- tests/benchmark/read_gbq_colab/last_page.py | 12 ++------- tests/benchmark/read_gbq_colab/sort_output.py | 26 +++---------------- 4 files changed, 11 insertions(+), 63 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index d321b7f799..a03e8d52c6 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -14,7 +14,6 @@ import pathlib import bigframes.pandas as bpd -import bigframes.session.execution_spec import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE @@ -30,32 +29,16 @@ def filter_output( # e.g. "{local_inline}" or "{local_large}" df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") - # Call the executor directly to isolate the query execution time - # from other DataFrame overhead for this benchmark. - execute_result = df._block.session._executor.execute( - df._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) - batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) + batches = df.to_pandas_batches(page_size=PAGE_SIZE) next(iter(batches)) # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] - # Force BigQuery execution for filtered DataFrame to get total_rows metadata - execute_result_filtered = df_filtered._block.session._executor.execute( - df_filtered._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) + batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE) - rows = execute_result_filtered.total_rows or 0 + rows = batches_filtered.total_rows or 0 assert rows >= 0 - batches_filtered = execute_result_filtered.to_pandas_batches(page_size=PAGE_SIZE) - # It's possible we don't have any pages at all, since we filtered out all # matching rows. first_page = next(iter(batches_filtered)) diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index f731192cd0..a626d171fa 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -14,7 +14,6 @@ import pathlib import bigframes.pandas -import bigframes.session.execution_spec import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE @@ -27,16 +26,8 @@ def first_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Call the executor directly to isolate the query execution time - # from other DataFrame overhead for this benchmark. 
- execute_result = df._block.session._executor.execute( - df._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) - assert execute_result.total_rows is not None and execute_result.total_rows >= 0 - batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) + batches = df.to_pandas_batches(page_size=PAGE_SIZE) + assert batches.total_rows is not None and batches.total_rows >= 0 first_page = next(iter(batches)) assert first_page is not None diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index b24c2babb5..64f40f65f5 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -26,16 +26,8 @@ def last_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Call the executor directly to isolate the query execution time - # from other DataFrame overhead for this benchmark. - execute_result = df._block.session._executor.execute( - df._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) - assert execute_result.total_rows is not None and execute_result.total_rows >= 0 - batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) + batches = df.to_pandas_batches(page_size=PAGE_SIZE) + assert batches.total_rows is not None and batches.total_rows >= 0 for _ in batches: pass diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index 470feb1be4..a33548400b 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -14,7 +14,6 @@ import pathlib import bigframes.pandas -import bigframes.session.execution_spec import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE @@ -27,16 +26,8 @@ def sort_output(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - # Call the executor directly to isolate the query execution time - # from other DataFrame overhead for this benchmark. 
- execute_result = df._block.session._executor.execute( - df._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) - assert execute_result.total_rows is not None and execute_result.total_rows >= 0 - batches = execute_result.to_pandas_batches(page_size=PAGE_SIZE) + batches = df.to_pandas_batches(page_size=PAGE_SIZE) + assert batches.total_rows is not None and batches.total_rows >= 0 next(iter(batches)) # Simulate the user sorting by a column and visualizing those results @@ -45,17 +36,8 @@ def sort_output(*, project_id, dataset_id, table_id): sort_column = "col_bool_0" df_sorted = df.sort_values(sort_column) - execute_result_sorted = df_sorted._block.session._executor.execute( - df_sorted._block.expr, - execution_spec=bigframes.session.execution_spec.ExecutionSpec( - ordered=True, promise_under_10gb=False - ), - ) - assert ( - execute_result_sorted.total_rows is not None - and execute_result_sorted.total_rows >= 0 - ) - batches_sorted = execute_result_sorted.to_pandas_batches(page_size=PAGE_SIZE) + batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE) + assert batches_sorted.total_rows is not None and batches_sorted.total_rows >= 0 next(iter(batches_sorted)) From e7bbea1b8e316e9e57a5d5e1552d657bee752e29 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 21:09:33 +0000 Subject: [PATCH 30/45] remove row_count check --- bigframes/dataframe.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ab264af277..c0b78eb79b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -794,10 +794,6 @@ def __repr__(self) -> str: pandas_df, row_count, query_job = self._block.retrieve_repr_request_results( max_results ) - if row_count is None: - raise NotImplementedError( - "Cannot determine total number of rows. Please use .to_pandas() to display." - ) self._set_internal_query_job(query_job) @@ -883,10 +879,6 @@ def _repr_html_(self) -> str: pandas_df, row_count, query_job = df._block.retrieve_repr_request_results( max_results ) - if row_count is None: - raise NotImplementedError( - "Cannot determine total number of rows. Please use .to_pandas() to display." 
- ) self._set_internal_query_job(query_job) column_count = len(pandas_df.columns) From 3bb01149a74270e5781987a4a8f9accb78697500 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 21:15:40 +0000 Subject: [PATCH 31/45] remove extra execute_result --- tests/benchmark/read_gbq_colab/filter_output.py | 2 +- tests/benchmark/read_gbq_colab/first_page.py | 2 +- tests/benchmark/read_gbq_colab/last_page.py | 2 +- tests/benchmark/read_gbq_colab/sort_output.py | 8 ++++++-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index a03e8d52c6..666a4ef366 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -56,4 +56,4 @@ def filter_output( project_id=config.project_id, dataset_id=config.dataset_id, table_id=config.table_id, - ) + ) \ No newline at end of file diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index a626d171fa..c9f546485a 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -43,4 +43,4 @@ def first_page(*, project_id, dataset_id, table_id): project_id=config.project_id, dataset_id=config.dataset_id, table_id=config.table_id, - ) + ) \ No newline at end of file diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 64f40f65f5..5de6b6c5ac 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -43,4 +43,4 @@ def last_page(*, project_id, dataset_id, table_id): project_id=config.project_id, dataset_id=config.dataset_id, table_id=config.table_id, - ) + ) \ No newline at end of file diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index a33548400b..383660c54e 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -37,7 +37,10 @@ def sort_output(*, project_id, dataset_id, table_id): df_sorted = df.sort_values(sort_column) batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE) - assert batches_sorted.total_rows is not None and batches_sorted.total_rows >= 0 + assert ( + batches_sorted.total_rows is not None + and batches_sorted.total_rows >= 0 + ) next(iter(batches_sorted)) @@ -47,9 +50,10 @@ def sort_output(*, project_id, dataset_id, table_id): utils.get_execution_time( sort_output, +. 
current_path, config.benchmark_suffix, project_id=config.project_id, dataset_id=config.dataset_id, table_id=config.table_id, - ) + ) \ No newline at end of file From 9e5c8be56467acab86971a3aede2c2affb0dcdf1 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 22:16:59 +0000 Subject: [PATCH 32/45] remove unnecessary tests --- .../benchmark/read_gbq_colab/filter_output.py | 2 +- tests/benchmark/read_gbq_colab/first_page.py | 2 +- tests/benchmark/read_gbq_colab/last_page.py | 2 +- tests/benchmark/read_gbq_colab/sort_output.py | 8 ++--- tests/system/small/test_anywidget.py | 29 ------------------- 5 files changed, 5 insertions(+), 38 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index 666a4ef366..a03e8d52c6 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -56,4 +56,4 @@ def filter_output( project_id=config.project_id, dataset_id=config.dataset_id, table_id=config.table_id, - ) \ No newline at end of file + ) diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index c9f546485a..a626d171fa 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -43,4 +43,4 @@ def first_page(*, project_id, dataset_id, table_id): project_id=config.project_id, dataset_id=config.dataset_id, table_id=config.table_id, - ) \ No newline at end of file + ) diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 5de6b6c5ac..64f40f65f5 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -43,4 +43,4 @@ def last_page(*, project_id, dataset_id, table_id): project_id=config.project_id, dataset_id=config.dataset_id, table_id=config.table_id, - ) \ No newline at end of file + ) diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index 383660c54e..a33548400b 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -37,10 +37,7 @@ def sort_output(*, project_id, dataset_id, table_id): df_sorted = df.sort_values(sort_column) batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE) - assert ( - batches_sorted.total_rows is not None - and batches_sorted.total_rows >= 0 - ) + assert batches_sorted.total_rows is not None and batches_sorted.total_rows >= 0 next(iter(batches_sorted)) @@ -50,10 +47,9 @@ def sort_output(*, project_id, dataset_id, table_id): utils.get_execution_time( sort_output, -. 
current_path, config.benchmark_suffix, project_id=config.project_id, dataset_id=config.dataset_id, table_id=config.table_id, - ) \ No newline at end of file + ) diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 0969f19d05..29083c241b 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -17,7 +17,6 @@ import pytest import bigframes as bf -import bigframes.pandas as bpd pytest.importorskip("anywidget") @@ -532,34 +531,6 @@ def test_widget_row_count_reflects_actual_data_available( assert widget.page_size == 2 # Respects the display option -def test_repr_html_raises_when_row_count_is_none(monkeypatch): - df = bpd.DataFrame({"col1": [1, 2, 3]}) - - def mock_retrieve_repr_request_results(*args, **kwargs): - return df.to_pandas(), None, None - - monkeypatch.setattr( - df._block, "retrieve_repr_request_results", mock_retrieve_repr_request_results - ) - - with pytest.raises(NotImplementedError): - df._repr_html_() - - -def test_repr_raises_when_row_count_is_none(monkeypatch): - df = bpd.DataFrame({"col1": [1, 2, 3]}) - - def mock_retrieve_repr_request_results(*args, **kwargs): - return df.to_pandas(), None, None - - monkeypatch.setattr( - df._block, "retrieve_repr_request_results", mock_retrieve_repr_request_results - ) - - with pytest.raises(NotImplementedError): - df.__repr__() - - # TODO(shuowei): Add tests for custom index and multiindex # This may not be necessary for the SQL Cell use case but should be # considered for completeness. From 50c92a5854b0f75be447c2ca506f979b84803c5a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 16 Oct 2025 18:18:52 +0000 Subject: [PATCH 33/45] Fix: Address review comments on PandasBatches and docstring - Reinstated 'Iterator[pd.DataFrame]' inheritance for 'PandasBatches' in 'bigframes/core/blocks.py'. - Removed internal type hint 'bigframes.core.blocks.PandasBatches:' from 'to_pandas_batches' docstring in 'bigframes/dataframe.py' to avoid exposing internal types in public documentation. --- bigframes/core/blocks.py | 2 +- bigframes/dataframe.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c207d93d19..1a3e95b585 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -99,7 +99,7 @@ @dataclasses.dataclass -class PandasBatches: +class PandasBatches(Iterator[pd.DataFrame]): """Interface for mutable objects with state represented by a block value object.""" def __init__( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c0b78eb79b..841ef68e02 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1930,10 +1930,9 @@ def to_pandas_batches( over the default size limit of 10 GB. Returns: - bigframes.core.blocks.PandasBatches: - An iterable of smaller dataframes which combine to - form the original dataframe. Results stream from bigquery, - see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable + An iterable of smaller dataframes which combine to + form the original dataframe. 
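(Illustrative sketch, not part of the patch.) The review comment this commit addresses is easiest to see with a toy class: inheriting from Iterator[pd.DataFrame] supplies the __iter__ mixin, so only __next__ needs defining and the object keeps working in for-loops and with iter():

    from typing import Iterator

    import pandas as pd

    class ToyBatches(Iterator[pd.DataFrame]):
        def __init__(self, frames):
            self._frames = iter(frames)

        def __next__(self) -> pd.DataFrame:
            return next(self._frames)

    batches = ToyBatches([pd.DataFrame({"a": [1]}), pd.DataFrame({"a": [2]})])
    for frame in batches:  # works because the Iterator ABC provides __iter__
        print(len(frame))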
Results stream from bigquery, + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable """ return self._to_pandas_batches( page_size=page_size, From 8f33c057243048445132013ecf1cc67eab2f9af0 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 16 Oct 2025 19:50:26 +0000 Subject: [PATCH 34/45] Revert: Revert import change in read_gbq_colab benchmark This reverts the import path for the benchmark utils to 'benchmark.utils' to address concerns about google3 imports. --- bigframes/dataframe.py | 7 +- notebooks/Untitled-2.ipynb | 386 ++++++++++++++++++ .../audio_transcribe_partial_ordering.ipynb | 185 +++++++++ .../benchmark/read_gbq_colab/filter_output.py | 3 +- tests/benchmark/read_gbq_colab/first_page.py | 3 +- tests/benchmark/read_gbq_colab/last_page.py | 3 +- tests/benchmark/read_gbq_colab/sort_output.py | 3 +- 7 files changed, 583 insertions(+), 7 deletions(-) create mode 100644 notebooks/Untitled-2.ipynb create mode 100644 notebooks/multimodal/audio_transcribe_partial_ordering.ipynb diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 841ef68e02..d76718cfa6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1930,9 +1930,10 @@ def to_pandas_batches( over the default size limit of 10 GB. Returns: - An iterable of smaller dataframes which combine to - form the original dataframe. Results stream from bigquery, - see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable + Iterable[pandas.DataFrame]: + An iterable of smaller dataframes which combine to + form the original dataframe. Results stream from bigquery, + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterabl """ return self._to_pandas_batches( page_size=page_size, diff --git a/notebooks/Untitled-2.ipynb b/notebooks/Untitled-2.ipynb new file mode 100644 index 0000000000..64c224a414 --- /dev/null +++ b/notebooks/Untitled-2.ipynb @@ -0,0 +1,386 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "845b9090", + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "bpd.options.display.repr_mode = \"anywidget\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ae042351", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. 
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "68a3c265b6f94778a4c31a7432e9da9f", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5, table_html='
18157874.1 \n", + "1 03.10.2018 G06F 11/30 18157347.8 \n", + "2 03.10.2018 A01K 31/00 18171005.4 \n", + "3 03.10.2018 H01L 21/20 18166536.5 \n", + "4 03.10.2018 H05B 6/12 18165514.3 \n", + "\n", + " filing_date priority_date_eu representative_line_1_eu \\\n", + "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", + "1 19.02.2018 31.03.2017 Hoffmann Eitle \n", + "2 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", + "3 16.02.2016 Scheider, Sascha et al \n", + "4 03.04.2018 30.03.2017 \n", + "\n", + " applicant_line_1 inventor_line_1 \\\n", + "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", + "1 FUJITSU LIMITED Kukihara, Kensuke \n", + "2 Linco Food Systems A/S Thrane, Uffe \n", + "3 EV Group E. Thallner GmbH Kurz, Florian \n", + "4 BSH Hausger√§te GmbH Acero Acero, Jesus \n", + "\n", + " title_line_1 number \n", + "0 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", + "1 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", + "2 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", + "3 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", + "4 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", + "\n", + "[5 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bpd._read_gbq_colab(\"\"\"\n", + "SELECT *\n", + "FROM `bigquery-public-data.labeled_patents.extracted_data`\n", + "LIMIT 5;\n", + "\"\"\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d936b25f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cce2a6be23414e58a5310a5ec7a1c50d", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=4, table_html='
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bpd._read_gbq_colab(\"\"\"\n", + "SELECT\n", + " AI.GENERATE(\n", + " prompt=>(\"Extract the values.\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \"us.conn\")), \"r\")),\n", + " connection_id=>\"bigframes-dev.us.bigframes-default-connection\",\n", + " output_schema=>\"publication_date string, class_international string, application_number string, filing_date string\") AS result,\n", + " *\n", + "FROM `bigquery-public-data.labeled_patents.extracted_data`\n", + "LIMIT 5;\n", + "\"\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/multimodal/audio_transcribe_partial_ordering.ipynb b/notebooks/multimodal/audio_transcribe_partial_ordering.ipynb new file mode 100644 index 0000000000..643b3db3ea --- /dev/null +++ b/notebooks/multimodal/audio_transcribe_partial_ordering.ipynb @@ -0,0 +1,185 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Audio Transcription with Partial Ordering Mode in BigQuery DataFrames" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates how to use the `audio_transcribe` function from BigQuery DataFrames' LLM module in partial ordering mode. Partial ordering mode can improve performance by allowing BigQuery to process data in a more optimized, non-sequential manner." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT = \"bigframes-dev\" # replace with your project. \n", + "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n", + "\n", + "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n", + "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. 
\n", + "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n", + "# In this Notebook it uses bigframes-default-connection by default. You can also bring in your own connections in each method.\n", + "\n", + "import bigframes\n", + "# Setup project\n", + "bigframes.options.bigquery.project = PROJECT\n", + "\n", + "# Display options\n", + "bigframes.options.display.blob_display_width = 300\n", + "bigframes.options.display.progress_bar = None\n", + "\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Set to Partial Ordering Mode" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ordering mode set to: partial\n" + ] + } + ], + "source": [ + "bpd.options.compute.ordering_mode = \"partial\"\n", + "print(f\"Ordering mode set to: {bpd.options.compute.ordering_mode}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Prepare Audio Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use a publicly available audio file stored in a Google Cloud Storage bucket. We use `from_glob_path` to correctly load the audio file as a blob object." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + } + ], + "source": [ + "audio_gcs_path = \"gs://bigframes_blob_test/audio/*\"\n", + "df = bpd.from_glob_path(\n", + " audio_gcs_path, name=\"audio\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Run Audio Transcription" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we'll use the `audio_transcribe` function with the `gemini-2.0-flash-001` model to transcribe the audio file. This operation will be executed in partial ordering mode as configured." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0 Now, as all books, not primarily intended as p...\n", + "Name: transcribed_content, dtype: string" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transcribed_series = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=False)\n", + "transcribed_series" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index a03e8d52c6..a4afcb2774 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -13,8 +13,9 @@ # limitations under the License. import pathlib +import benchmark.utils as utils + import bigframes.pandas as bpd -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index a626d171fa..be54fc2634 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -13,8 +13,9 @@ # limitations under the License. import pathlib +import benchmark.utils as utils + import bigframes.pandas -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 64f40f65f5..0b50433438 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -13,8 +13,9 @@ # limitations under the License. import pathlib +import benchmark.utils as utils + import bigframes.pandas -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index a33548400b..bd9400fc2c 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -13,8 +13,9 @@ # limitations under the License. 
import pathlib +import benchmark.utils as utils + import bigframes.pandas -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE From 9c450f2c24d16e1c487afa9b8dd84d551aba2603 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 16 Oct 2025 20:09:16 +0000 Subject: [PATCH 35/45] Revert: Revert unnecessary changes in read_gbq_colab benchmarks --- tests/benchmark/read_gbq_colab/filter_output.py | 3 +-- tests/benchmark/read_gbq_colab/first_page.py | 3 +-- tests/benchmark/read_gbq_colab/last_page.py | 3 +-- tests/benchmark/read_gbq_colab/sort_output.py | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index a4afcb2774..a03e8d52c6 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -13,9 +13,8 @@ # limitations under the License. import pathlib -import benchmark.utils as utils - import bigframes.pandas as bpd +import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index be54fc2634..a626d171fa 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -13,9 +13,8 @@ # limitations under the License. import pathlib -import benchmark.utils as utils - import bigframes.pandas +import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 0b50433438..64f40f65f5 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -13,9 +13,8 @@ # limitations under the License. import pathlib -import benchmark.utils as utils - import bigframes.pandas +import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index bd9400fc2c..a33548400b 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -13,9 +13,8 @@ # limitations under the License. import pathlib -import benchmark.utils as utils - import bigframes.pandas +import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE From 885bf895306e64b80e84af8b2b285794403a477d Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 16 Oct 2025 20:12:59 +0000 Subject: [PATCH 36/45] Remove notebooks/Untitled-2.ipynb --- notebooks/Untitled-2.ipynb | 386 ------------------------------------- 1 file changed, 386 deletions(-) delete mode 100644 notebooks/Untitled-2.ipynb diff --git a/notebooks/Untitled-2.ipynb b/notebooks/Untitled-2.ipynb deleted file mode 100644 index 64c224a414..0000000000 --- a/notebooks/Untitled-2.ipynb +++ /dev/null @@ -1,386 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "845b9090", - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd\n", - "bpd.options.display.repr_mode = \"anywidget\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ae042351", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. 
\n", - " Query processed 0 Bytes in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "68a3c265b6f94778a4c31a7432e9da9f", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=5, table_html='
18157874.1 \n", - "1 03.10.2018 G06F 11/30 18157347.8 \n", - "2 03.10.2018 A01K 31/00 18171005.4 \n", - "3 03.10.2018 H01L 21/20 18166536.5 \n", - "4 03.10.2018 H05B 6/12 18165514.3 \n", - "\n", - " filing_date priority_date_eu representative_line_1_eu \\\n", - "0 21.02.2018 22.02.2017 Liedtke & Partner Patentanw√§lte \n", - "1 19.02.2018 31.03.2017 Hoffmann Eitle \n", - "2 05.02.2015 05.02.2014 Stork Bamberger Patentanw√§lte \n", - "3 16.02.2016 Scheider, Sascha et al \n", - "4 03.04.2018 30.03.2017 \n", - "\n", - " applicant_line_1 inventor_line_1 \\\n", - "0 SHB Hebezeugbau GmbH VOLGER, Alexander \n", - "1 FUJITSU LIMITED Kukihara, Kensuke \n", - "2 Linco Food Systems A/S Thrane, Uffe \n", - "3 EV Group E. Thallner GmbH Kurz, Florian \n", - "4 BSH Hausger√§te GmbH Acero Acero, Jesus \n", - "\n", - " title_line_1 number \n", - "0 STEUERUNGSSYSTEM F√úR AUTOMATISCHE PARKH√ÑUSER EP 3 366 869 A1 \n", - "1 METHOD EXECUTED BY A COMPUTER, INFORMATION PRO... EP 3 382 553 A1 \n", - "2 MASTH√ÑHNCHENCONTAINER ALS BESTANDTEIL EINER E... EP 3 381 276 A1 \n", - "3 VORRICHTUNG ZUM BONDEN VON SUBSTRATEN EP 3 382 744 A1 \n", - "4 VORRICHTUNG ZUR INDUKTIVEN ENERGIE√úBERTRAGUNG EP 3 383 141 A2 \n", - "\n", - "[5 rows x 14 columns]" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bpd._read_gbq_colab(\"\"\"\n", - "SELECT *\n", - "FROM `bigquery-public-data.labeled_patents.extracted_data`\n", - "LIMIT 5;\n", - "\"\"\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d936b25f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 0 Bytes in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "cce2a6be23414e58a5310a5ec7a1c50d", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=4, table_html='
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "bpd._read_gbq_colab(\"\"\"\n", - "SELECT\n", - " AI.GENERATE(\n", - " prompt=>(\"Extract the values.\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \"us.conn\")), \"r\")),\n", - " connection_id=>\"bigframes-dev.us.bigframes-default-connection\",\n", - " output_schema=>\"publication_date string, class_international string, application_number string, filing_date string\") AS result,\n", - " *\n", - "FROM `bigquery-public-data.labeled_patents.extracted_data`\n", - "LIMIT 5;\n", - "\"\"\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.18" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 9a793e85067692ea492b2385b261996841191e75 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 16 Oct 2025 20:13:36 +0000 Subject: [PATCH 37/45] Remove notebooks/multimodal/audio_transcribe_partial_ordering.ipynb --- .../audio_transcribe_partial_ordering.ipynb | 185 ------------------ 1 file changed, 185 deletions(-) delete mode 100644 notebooks/multimodal/audio_transcribe_partial_ordering.ipynb diff --git a/notebooks/multimodal/audio_transcribe_partial_ordering.ipynb b/notebooks/multimodal/audio_transcribe_partial_ordering.ipynb deleted file mode 100644 index 643b3db3ea..0000000000 --- a/notebooks/multimodal/audio_transcribe_partial_ordering.ipynb +++ /dev/null @@ -1,185 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Audio Transcription with Partial Ordering Mode in BigQuery DataFrames" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook demonstrates how to use the `audio_transcribe` function from BigQuery DataFrames' LLM module in partial ordering mode. Partial ordering mode can improve performance by allowing BigQuery to process data in a more optimized, non-sequential manner." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. Setup" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "PROJECT = \"bigframes-dev\" # replace with your project. 
\n", - "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n", - "\n", - "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. \n", - "# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n", - "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n", - "# In this Notebook it uses bigframes-default-connection by default. You can also bring in your own connections in each method.\n", - "\n", - "import bigframes\n", - "# Setup project\n", - "bigframes.options.bigquery.project = PROJECT\n", - "\n", - "# Display options\n", - "bigframes.options.display.blob_display_width = 300\n", - "bigframes.options.display.progress_bar = None\n", - "\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2. Set to Partial Ordering Mode" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ordering mode set to: partial\n" - ] - } - ], - "source": [ - "bpd.options.compute.ordering_mode = \"partial\"\n", - "print(f\"Ordering mode set to: {bpd.options.compute.ordering_mode}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3. Prepare Audio Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will use a publicly available audio file stored in a Google Cloud Storage bucket. We use `from_glob_path` to correctly load the audio file as a blob object." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - } - ], - "source": [ - "audio_gcs_path = \"gs://bigframes_blob_test/audio/*\"\n", - "df = bpd.from_glob_path(\n", - " audio_gcs_path, name=\"audio\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4. Run Audio Transcription" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we'll use the `audio_transcribe` function with the `gemini-2.0-flash-001` model to transcribe the audio file. This operation will be executed in partial ordering mode as configured." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/plain": [ - "0 Now, as all books, not primarily intended as p...\n", - "Name: transcribed_content, dtype: string" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "transcribed_series = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=False)\n", - "transcribed_series" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 37eab08d8729f32d254909ff7725d912f0de7d34 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 16 Oct 2025 20:16:11 +0000 Subject: [PATCH 38/45] remove unnecessary change --- tests/benchmark/read_gbq_colab/filter_output.py | 3 ++- tests/benchmark/read_gbq_colab/first_page.py | 3 ++- tests/benchmark/read_gbq_colab/last_page.py | 3 ++- tests/benchmark/read_gbq_colab/sort_output.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index a03e8d52c6..a4afcb2774 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -13,8 +13,9 @@ # limitations under the License. import pathlib +import benchmark.utils as utils + import bigframes.pandas as bpd -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index a626d171fa..be54fc2634 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -13,8 +13,9 @@ # limitations under the License. import pathlib +import benchmark.utils as utils + import bigframes.pandas -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 64f40f65f5..0b50433438 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -13,8 +13,9 @@ # limitations under the License. 
import pathlib +import benchmark.utils as utils + import bigframes.pandas -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index a33548400b..bd9400fc2c 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -13,8 +13,9 @@ # limitations under the License. import pathlib +import benchmark.utils as utils + import bigframes.pandas -import tests.benchmark.utils as utils PAGE_SIZE = utils.READ_GBQ_COLAB_PAGE_SIZE From 058b7b7e43eae55e7c0aeafa2726b70d8ea337c1 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 16 Oct 2025 20:18:24 +0000 Subject: [PATCH 39/45] revert typo --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d76718cfa6..fb4435ff7d 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1933,7 +1933,7 @@ def to_pandas_batches( Iterable[pandas.DataFrame]: An iterable of smaller dataframes which combine to form the original dataframe. Results stream from bigquery, - see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterabl + see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable """ return self._to_pandas_batches( page_size=page_size, From bedfed491fdd7152d20e01259b5c50c83c1570b6 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 17 Oct 2025 04:25:20 +0000 Subject: [PATCH 40/45] add todo --- bigframes/display/anywidget.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 545d684e35..a78e332978 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -92,6 +92,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # metadata about how many results there were. Use that to avoid # doing an extra COUNT(*) query that `len(...)` would do. self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) + # TODO (shuowei): total_rows=None Incorrectly Defaults to 0. b/452747934 self.row_count = self._batches.total_rows or 0 self._set_table_html() From 02cf22722b38e9e94f6b7412e010a0b620b7d317 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 17 Oct 2025 04:28:24 +0000 Subject: [PATCH 41/45] change docstring --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index fb4435ff7d..028bd16f64 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1930,7 +1930,7 @@ def to_pandas_batches( over the default size limit of 10 GB. Returns: - Iterable[pandas.DataFrame]: + blocks.PandasBatches: An iterable of smaller dataframes which combine to form the original dataframe. 
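(Illustrative sketch, not part of the patch.) The TODO added in the previous commit (b/452747934) concerns the `or 0` fallback: when the engine cannot report a count, total_rows is None, and coercing None to 0 makes an unknown size indistinguishable from an empty result. A small function showing the distinction the bug tracks:

    import math
    from typing import Optional

    def describe_page_count(total_rows: Optional[int], page_size: int) -> str:
        if total_rows is None:
            # Unknown size: paging still works, but "page 1 of N" cannot be shown.
            return "page 1 of many"
        return f"page 1 of {max(1, math.ceil(total_rows / page_size))}"

    print(describe_page_count(None, 10))  # page 1 of many
    print(describe_page_count(25, 10))    # page 1 of 3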
Results stream from bigquery, see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable From af9a4e8198d89b19bf3067abbfe9b66a74e71a05 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 22 Oct 2025 00:11:33 +0000 Subject: [PATCH 42/45] revert changes to tests/benchmark/read_gbq_colab --- tests/benchmark/read_gbq_colab/aggregate_output.py | 10 ++++++---- tests/benchmark/read_gbq_colab/filter_output.py | 14 +++++++------- tests/benchmark/read_gbq_colab/first_page.py | 8 ++++---- tests/benchmark/read_gbq_colab/last_page.py | 5 +++-- tests/benchmark/read_gbq_colab/sort_output.py | 11 ++++++----- 5 files changed, 26 insertions(+), 22 deletions(-) diff --git a/tests/benchmark/read_gbq_colab/aggregate_output.py b/tests/benchmark/read_gbq_colab/aggregate_output.py index cd33ed2640..e5620d8e16 100644 --- a/tests/benchmark/read_gbq_colab/aggregate_output.py +++ b/tests/benchmark/read_gbq_colab/aggregate_output.py @@ -26,8 +26,9 @@ def aggregate_output(*, project_id, dataset_id, table_id): df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") # Simulate getting the first page, since we'll always do that first in the UI. - df.shape - next(iter(df.to_pandas_batches(page_size=PAGE_SIZE))) + batches = df._to_pandas_batches(page_size=PAGE_SIZE) + assert (tr := batches.total_rows) is not None and tr >= 0 + next(iter(batches)) # To simulate very small rows that can only fit a boolean, # some tables don't have an integer column. If an integer column is available, @@ -43,8 +44,9 @@ def aggregate_output(*, project_id, dataset_id, table_id): .sum(numeric_only=True) ) - df_aggregated.shape - next(iter(df_aggregated.to_pandas_batches(page_size=PAGE_SIZE))) + batches = df_aggregated._to_pandas_batches(page_size=PAGE_SIZE) + assert (tr := batches.total_rows) is not None and tr >= 0 + next(iter(batches)) if __name__ == "__main__": diff --git a/tests/benchmark/read_gbq_colab/filter_output.py b/tests/benchmark/read_gbq_colab/filter_output.py index a4afcb2774..dc88d31366 100644 --- a/tests/benchmark/read_gbq_colab/filter_output.py +++ b/tests/benchmark/read_gbq_colab/filter_output.py @@ -30,20 +30,20 @@ def filter_output( # e.g. "{local_inline}" or "{local_large}" df = bpd._read_gbq_colab(f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}") - batches = df.to_pandas_batches(page_size=PAGE_SIZE) + # Simulate getting the first page, since we'll always do that first in the UI. + batches = df._to_pandas_batches(page_size=PAGE_SIZE) + assert (tr := batches.total_rows) is not None and tr >= 0 next(iter(batches)) # Simulate the user filtering by a column and visualizing those results df_filtered = df[df["col_bool_0"]] - batches_filtered = df_filtered.to_pandas_batches(page_size=PAGE_SIZE) - - rows = batches_filtered.total_rows or 0 - assert rows >= 0 + batches = df_filtered._to_pandas_batches(page_size=PAGE_SIZE) + assert (tr := batches.total_rows) is not None and tr >= 0 + first_page = next(iter(batches)) # It's possible we don't have any pages at all, since we filtered out all # matching rows. 
- first_page = next(iter(batches_filtered)) - assert len(first_page.index) <= rows + assert len(first_page.index) <= tr if __name__ == "__main__": diff --git a/tests/benchmark/read_gbq_colab/first_page.py b/tests/benchmark/read_gbq_colab/first_page.py index be54fc2634..33e2a24bd7 100644 --- a/tests/benchmark/read_gbq_colab/first_page.py +++ b/tests/benchmark/read_gbq_colab/first_page.py @@ -27,10 +27,10 @@ def first_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - batches = df.to_pandas_batches(page_size=PAGE_SIZE) - assert batches.total_rows is not None and batches.total_rows >= 0 - first_page = next(iter(batches)) - assert first_page is not None + # Get number of rows (to calculate number of pages) and the first page. + batches = df._to_pandas_batches(page_size=PAGE_SIZE) + assert (tr := batches.total_rows) is not None and tr >= 0 + next(iter(batches)) if __name__ == "__main__": diff --git a/tests/benchmark/read_gbq_colab/last_page.py b/tests/benchmark/read_gbq_colab/last_page.py index 0b50433438..2e485a070a 100644 --- a/tests/benchmark/read_gbq_colab/last_page.py +++ b/tests/benchmark/read_gbq_colab/last_page.py @@ -27,8 +27,9 @@ def last_page(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - batches = df.to_pandas_batches(page_size=PAGE_SIZE) - assert batches.total_rows is not None and batches.total_rows >= 0 + # Get number of rows (to calculate number of pages) and then all pages. + batches = df._to_pandas_batches(page_size=PAGE_SIZE) + assert (tr := batches.total_rows) is not None and tr >= 0 for _ in batches: pass diff --git a/tests/benchmark/read_gbq_colab/sort_output.py b/tests/benchmark/read_gbq_colab/sort_output.py index bd9400fc2c..3044e0c2a3 100644 --- a/tests/benchmark/read_gbq_colab/sort_output.py +++ b/tests/benchmark/read_gbq_colab/sort_output.py @@ -27,8 +27,9 @@ def sort_output(*, project_id, dataset_id, table_id): f"SELECT * FROM `{project_id}`.{dataset_id}.{table_id}" ) - batches = df.to_pandas_batches(page_size=PAGE_SIZE) - assert batches.total_rows is not None and batches.total_rows >= 0 + # Simulate getting the first page, since we'll always do that first in the UI. 
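    # Illustrative sketch, not part of the patch: the `(tr := ...)` assert used
    # throughout this commit binds and checks the row count in one expression,
    # keeping the checked value available afterwards without a second lookup.
    total_rows = 25  # stand-in for batches.total_rows
    assert (tr := total_rows) is not None and tr >= 0
    print(f"benchmark saw {tr} rows")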
+ batches = df._to_pandas_batches(page_size=PAGE_SIZE) + assert (tr := batches.total_rows) is not None and tr >= 0 next(iter(batches)) # Simulate the user sorting by a column and visualizing those results @@ -37,9 +38,9 @@ def sort_output(*, project_id, dataset_id, table_id): sort_column = "col_bool_0" df_sorted = df.sort_values(sort_column) - batches_sorted = df_sorted.to_pandas_batches(page_size=PAGE_SIZE) - assert batches_sorted.total_rows is not None and batches_sorted.total_rows >= 0 - next(iter(batches_sorted)) + batches = df_sorted._to_pandas_batches(page_size=PAGE_SIZE) + assert (tr := batches.total_rows) is not None and tr >= 0 + next(iter(batches)) if __name__ == "__main__": From 93dbb4df58d3e7717f86461f4b1e929028f27f16 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 22 Oct 2025 00:48:28 +0000 Subject: [PATCH 43/45] merge change --- bigframes/core/blocks.py | 3 - bigframes/dataframe.py | 4 +- bigframes/display/anywidget.py | 28 ++++--- notebooks/dataframes/anywidget_mode.ipynb | 93 +++++++---------------- 4 files changed, 46 insertions(+), 82 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 1a3e95b585..1900b7208a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -124,9 +124,6 @@ def total_bytes_processed(self) -> Optional[int]: def __next__(self) -> pd.DataFrame: return next(self._dataframes) - def __iter__(self) -> Iterator[pd.DataFrame]: - return self - @dataclasses.dataclass() class MaterializationOptions: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 028bd16f64..c3735ca3c2 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1887,7 +1887,7 @@ def to_pandas_batches( max_results: Optional[int] = None, *, allow_large_results: Optional[bool] = None, - ) -> blocks.PandasBatches: + ) -> Iterable[pandas.DataFrame]: """Stream DataFrame results to an iterable of pandas DataFrame. page_size and max_results determine the size and number of batches, @@ -1930,7 +1930,7 @@ def to_pandas_batches( over the default size limit of 10 GB. Returns: - blocks.PandasBatches: + Iterable[pandas.DataFrame]: An iterable of smaller dataframes which combine to form the original dataframe. Results stream from bigquery, see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index a78e332978..79b4b5ccd2 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -23,6 +23,8 @@ import pandas as pd import bigframes +from bigframes.core.blocks import PandasBatches +import bigframes.dataframe import bigframes.display.html # anywidget and traitlets are optional dependencies. We don't want the import of @@ -56,6 +58,7 @@ class TableWidget(WIDGET_BASE): row_count = traitlets.Int(0).tag(sync=True) table_html = traitlets.Unicode().tag(sync=True) _initial_load_complete = traitlets.Bool(False).tag(sync=True) + _batches: PandasBatches def __init__(self, dataframe: bigframes.dataframe.DataFrame): """Initialize the TableWidget. @@ -65,8 +68,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): """ if not ANYWIDGET_INSTALLED: raise ImportError( - "Please `pip install anywidget traitlets` or " - "`pip install 'bigframes[anywidget]'` to use TableWidget." + "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget." 
) self._dataframe = dataframe @@ -85,16 +87,22 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): self._batch_iter: Optional[Iterator[pd.DataFrame]] = None self._cached_batches: List[pd.DataFrame] = [] - # Respect display options for initial page size - self.page_size = bigframes.options.display.max_rows + # respect display options for initial page size + initial_page_size = bigframes.options.display.max_rows - # The query issued by `to_pandas_batches()` already contains - # metadata about how many results there were. Use that to avoid - # doing an extra COUNT(*) query that `len(...)` would do. - self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) - # TODO (shuowei): total_rows=None Incorrectly Defaults to 0. b/452747934 + # set traitlets properties that trigger observers + self.page_size = initial_page_size + + # len(dataframe) is expensive, since it will trigger a + # SELECT COUNT(*) query. It is a must have however. + # TODO(b/428238610): Start iterating over the result of `to_pandas_batches()` + # before we get here so that the count might already be cached. + # TODO(b/452747934): Allow row_count to be None and check to see if + # there are multiple pages and show "page 1 of many" in this case + self._reset_batches_for_new_page_size() self.row_count = self._batches.total_rows or 0 + # get the initial page self._set_table_html() self._initial_load_complete = True self._initializing = False @@ -187,7 +195,7 @@ def _cached_data(self) -> pd.DataFrame: def _reset_batches_for_new_page_size(self) -> None: """Reset the batch iterator when page size changes.""" - self._batches = self._dataframe.to_pandas_batches(page_size=self.page_size) + self._batches = self._dataframe._to_pandas_batches(page_size=self.page_size) self._cached_batches = [] self._batch_iter = None diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index ce7fac1488..ffb1718407 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -128,34 +128,24 @@ "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "Query job 6d85c081-49c7-408a-ab96-e0e9e5102419 is DONE. 171.4 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "31ba8e41e4ca4579b85409237cb7a566", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=5552452, table_html='
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total pages: 555246\n" + "ename": "AttributeError", + "evalue": "'TableWidget' object has no attribute '_batches'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Create widget programmatically\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mwidget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTableWidget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Total pages: {math.ceil(widget.row_count / widget.page_size)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/display/anywidget.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dataframe)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;31m# TODO(b/452747934): Allow row_count to be None and check to see if\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0;31m# there are multiple pages and show \"page 1 of many\" in this case\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 100\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrow_count\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_batches\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtotal_rows\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 101\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;31m# get the initial page\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'TableWidget' object has no attribute '_batches'" ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5d22f3f19e4140b0ba51869e97c3f690", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=5552452, table_html='
Date: Fri, 24 Oct 2025 21:19:42 +0000 Subject: [PATCH 44/45] update how we handle invalid row count --- bigframes/display/anywidget.py | 24 +++- notebooks/dataframes/anywidget_mode.ipynb | 133 +++++++++++++++++----- tests/system/small/test_anywidget.py | 7 +- 3 files changed, 130 insertions(+), 34 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 79b4b5ccd2..1ca441be42 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -23,7 +23,7 @@ import pandas as pd import bigframes -from bigframes.core.blocks import PandasBatches +from bigframes.core import blocks import bigframes.dataframe import bigframes.display.html @@ -58,7 +58,10 @@ class TableWidget(WIDGET_BASE): row_count = traitlets.Int(0).tag(sync=True) table_html = traitlets.Unicode().tag(sync=True) _initial_load_complete = traitlets.Bool(False).tag(sync=True) - _batches: PandasBatches + _batches: Optional[blocks.PandasBatches] = None + _error_message = traitlets.Unicode(allow_none=True, default_value=None).tag( + sync=True + ) def __init__(self, dataframe: bigframes.dataframe.DataFrame): """Initialize the TableWidget. @@ -100,7 +103,11 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): # TODO(b/452747934): Allow row_count to be None and check to see if # there are multiple pages and show "page 1 of many" in this case self._reset_batches_for_new_page_size() - self.row_count = self._batches.total_rows or 0 + if self._batches is None or self._batches.total_rows is None: + self._error_message = "Could not determine total row count. Data might be unavailable or an error occurred." + self.row_count = 0 + else: + self.row_count = self._batches.total_rows # get the initial page self._set_table_html() @@ -183,7 +190,10 @@ def _get_next_batch(self) -> bool: def _batch_iterator(self) -> Iterator[pd.DataFrame]: """Lazily initializes and returns the batch iterator.""" if self._batch_iter is None: - self._batch_iter = iter(self._batches) + if self._batches is None: + self._batch_iter = iter([]) + else: + self._batch_iter = iter(self._batches) return self._batch_iter @property @@ -203,6 +213,12 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" + if self._error_message: + self.table_html = ( + f"
<div>{self._error_message}</div>
" + ) + return + start = self.page * self.page_size end = start + self.page_size diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index ffb1718407..c2af915721 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -127,25 +127,35 @@ "id": "ce250157", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py:868: UserWarning: Anywidget mode is not available. Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. Falling back to deferred mode. Error: Traceback (most recent call last):\n", - " File \"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py\", line 861, in _repr_html_\n", - " widget = display.TableWidget(df.copy())\n", - " File \"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/display/anywidget.py\", line 100, in __init__\n", - " self.row_count = self._batches.total_rows or 0\n", - "AttributeError: 'TableWidget' object has no attribute '_batches'\n", - "\n", - " warnings.warn(\n" - ] - }, { "data": { "text/html": [ - "Computation deferred. Computation will process 171.4 MB" + "✅ Completed. " ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "aafd4f912b5f42e0896aa5f0c2c62620", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5552452, table_html='
\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m# Create widget programmatically\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mwidget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTableWidget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Total pages: {math.ceil(widget.row_count / widget.page_size)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/src/github.com/googleapis/python-bigquery-dataframes/bigframes/display/anywidget.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, dataframe)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;31m# TODO(b/452747934): Allow row_count to be None and check to see if\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 99\u001b[0m \u001b[0;31m# there are multiple pages and show \"page 1 of many\" in this case\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 100\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrow_count\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_batches\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtotal_rows\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 101\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;31m# get the initial page\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'TableWidget' object has no attribute '_batches'" + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total pages: 555246\n" ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5ec0ad9f11874d4f9d8edbc903ee7b5d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5552452, table_html='
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Small dataset pages: 1\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "651b5aac958c408183775152c2573a03", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5, table_html='
Date: Fri, 24 Oct 2025 21:21:53 +0000
Subject: [PATCH 45/45] eliminate duplicated flags

---
 bigframes/display/anywidget.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index 1ca441be42..a0b4f809d8 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -78,12 +78,6 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
 
         super().__init__()
 
-        # This flag prevents observers from firing during initialization.
-        # When traitlets like `page` and `page_size` are set in `__init__`, we
-        # don't want their corresponding `_..._changed` methods to execute
-        # until the widget is fully constructed.
-        self._initializing = True
-
         # Initialize attributes that might be needed by observers first
         self._table_id = str(uuid.uuid4())
         self._all_data_loaded = False
@@ -111,8 +105,10 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
 
         # get the initial page
         self._set_table_html()
+
+        # Signals to the frontend that the initial data load is complete.
+        # Also used as a guard to prevent observers from firing during initialization.
         self._initial_load_complete = True
-        self._initializing = False
 
     @functools.cached_property
     def _esm(self):
@@ -242,14 +238,14 @@ def _set_table_html(self) -> None:
     @traitlets.observe("page")
     def _page_changed(self, _change: Dict[str, Any]) -> None:
         """Handler for when the page number is changed from the frontend."""
-        if self._initializing:
+        if not self._initial_load_complete:
             return
         self._set_table_html()
 
     @traitlets.observe("page_size")
     def _page_size_changed(self, _change: Dict[str, Any]) -> None:
         """Handler for when the page size is changed from the frontend."""
-        if self._initializing:
+        if not self._initial_load_complete:
             return
         # Reset the page to 0 when page size changes to avoid invalid page states
         self.page = 0
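
The pattern PATCH 42 through PATCH 44 converge on is easier to see outside a diff: fetch the batch iterator once, read the row count from its result metadata, and treat a missing count as an explicit unknown rather than coercing it to zero. The sketch below is illustrative only; it assumes a configured BigFrames session, the table path and PAGE_SIZE are placeholders, and _read_gbq_colab and _to_pandas_batches are the same private helpers the benchmarks themselves call.

import math

import bigframes.pandas as bpd

PAGE_SIZE = 10  # illustrative; the benchmarks define their own constant

# Placeholder table path: substitute a real `project.dataset.table`.
df = bpd._read_gbq_colab("SELECT * FROM `my-project`.my_dataset.my_table")

# The batches object carries the total row count in its query metadata, so
# no separate SELECT COUNT(*) job (which is what `len(df)` costs) is needed.
batches = df._to_pandas_batches(page_size=PAGE_SIZE)

# `total_rows` is Optional: report an unknown count instead of defaulting to 0.
if (total := batches.total_rows) is None:
    print("Could not determine total row count.")
else:
    print(f"Total pages: {math.ceil(total / PAGE_SIZE)}")

next(iter(batches))  # the UI always fetches the first page first

Avoiding len(df) matters because it issues its own SELECT COUNT(*) query, while total_rows arrives for free with the query that produces the batches.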
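PATCH 45/45 collapses two flags into one: _initial_load_complete both signals the frontend that the first render is done and keeps traitlets observers inert while __init__ assigns traits. Here is a minimal, runnable miniature of that guard, assuming only the traitlets package; MiniPager is a hypothetical stand-in for TableWidget, not part of bigframes.

import traitlets


class MiniPager(traitlets.HasTraits):
    """Hypothetical stand-in for TableWidget; not part of bigframes."""

    page = traitlets.Int(0)
    page_size = traitlets.Int(10)
    _initial_load_complete = traitlets.Bool(False)

    def __init__(self, page_size: int):
        super().__init__()
        # This assignment fires _page_size_changed, but the guard below
        # turns it into a no-op until construction has finished.
        self.page_size = page_size
        self._initial_load_complete = True

    @traitlets.observe("page")
    def _page_changed(self, change):
        if not self._initial_load_complete:
            return
        print(f"render page {change['new']}")

    @traitlets.observe("page_size")
    def _page_size_changed(self, change):
        if not self._initial_load_complete:
            return
        # A new page size invalidates the old pagination: go back to page 0.
        self.page = 0
        print(f"refetch with page_size={change['new']}")


pager = MiniPager(page_size=25)  # constructs silently: the guard suppresses both observers
pager.page_size = 50             # prints: refetch with page_size=50
pager.page = 3                   # prints: render page 3

Because the observers consult the same flag the frontend already synchronizes on, the separate _initializing flag was redundant, which is exactly why this patch can delete it.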