Skip to content
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions bigframes/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,9 +376,7 @@ def __repr__(self) -> __builtins__.str:
# metadata, like we do with DataFrame.
opts = bigframes.options.display
max_results = opts.max_rows
# anywidget mode uses the same display logic as the "deferred" mode
# for faster execution
if opts.repr_mode in ("deferred", "anywidget"):
if opts.repr_mode == "deferred":
_, dry_run_query_job = self._block._compute_dry_run()
return formatter.repr_query_job(dry_run_query_job)

Expand Down
163 changes: 123 additions & 40 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,9 +789,7 @@ def __repr__(self) -> str:

opts = bigframes.options.display
max_results = opts.max_rows
# anywidget mode uses the same display logic as the "deferred" mode
# for faster execution
if opts.repr_mode in ("deferred", "anywidget"):
if opts.repr_mode == "deferred":
return formatter.repr_query_job(self._compute_dry_run())

# TODO(swast): pass max_columns and get the true column count back. Maybe
Expand Down Expand Up @@ -829,7 +827,7 @@ def __repr__(self) -> str:
lines.append(f"[{row_count} rows x {column_count} columns]")
return "\n".join(lines)

def _repr_html_(self) -> str:
def _repr_html_fallback(self) -> str:
"""
Returns an html string primarily for use by notebooks for displaying
a representation of the DataFrame. Displays 20 rows by default since
Expand All @@ -840,57 +838,148 @@ def _repr_html_(self) -> str:
if opts.repr_mode == "deferred":
return formatter.repr_query_job(self._compute_dry_run())

# Process blob columns first, regardless of display mode
# Process blob columns first for non-deferred modes
df, blob_cols = self._process_blob_columns()

pandas_df, row_count, query_job = df._block.retrieve_repr_request_results(
max_results
)

self._set_internal_query_job(query_job)
column_count = len(pandas_df.columns)

return self._create_html_representation(
pandas_df, row_count, column_count, blob_cols
)

def _process_blob_columns(self) -> tuple[DataFrame, list[str]]:
"""Process blob columns for display."""
self._cached()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like the wrong function in which to call _cached(). _cached() can be very expensive (causes a query). Seems better to just call it once, if needed, in `_repr_mimebundle_`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The call to _cached() was removed from _process_blob_columns (now _get_display_df_and_blob_cols) to avoid unnecessary queries, as suggested.

df = self.copy()
df = self
blob_cols = []
if bigframes.options.display.blob_display:
blob_cols = [
series_name
for series_name, series in df.items()
for series_name, series in self.items()
if series.dtype == bigframes.dtypes.OBJ_REF_DTYPE
]
for col in blob_cols:
# TODO(garrettwu): Not necessary to get access urls for all the rows. Update when having a way to get URLs from local data.
df[col] = df[col].blob._get_runtime(mode="R", with_metadata=True)
if blob_cols:
df = self.copy()
Comment on lines +856 to +865
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we assign df twice (first to self, then to self.copy())? copy() is a pretty cheap operation (just makes a new block referencing the same expression), so I think we can just define it once.

Suggested change
df = self
blob_cols = []
if bigframes.options.display.blob_display:
blob_cols = [
series_name
for series_name, series in df.items()
for series_name, series in self.items()
if series.dtype == bigframes.dtypes.OBJ_REF_DTYPE
]
for col in blob_cols:
# TODO(garrettwu): Not necessary to get access urls for all the rows. Update when having a way to get URLs from local data.
df[col] = df[col].blob._get_runtime(mode="R", with_metadata=True)
if blob_cols:
df = self.copy()
df = self.copy()
blob_cols = []
if bigframes.options.display.blob_display:
blob_cols = [
series_name
for series_name, series in df.items()
if series.dtype == bigframes.dtypes.OBJ_REF_DTYPE
]
if blob_cols:

for col in blob_cols:
df[col] = df[col].blob._get_runtime(mode="R", with_metadata=True)
return df, blob_cols

def _get_anywidget_bundle(self, include=None, exclude=None):
"""
Helper method to create and return the anywidget mimebundle.
This function encapsulates the logic for anywidget display.
"""
from bigframes import display

df, _ = self._process_blob_columns()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are we doing when we are "processing"? Seems like we should have a more descriptive name here that describes the purpose.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've renamed _process_blob_columns to _get_display_df_and_blob_cols to better reflect its purpose of preparing the DataFrame for display, including handling blob columns.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
df, _ = self._process_blob_columns()
# TODO(shuowei): Keep blob_cols and pass them to TableWidget so that they can render properly.
df, _ = self._process_blob_columns()


# Create and display the widget
widget = display.TableWidget(df)
widget_repr_result = widget._repr_mimebundle_(include=include, exclude=exclude)

# Handle both tuple (data, metadata) and dict returns
if isinstance(widget_repr_result, tuple):
widget_repr = dict(widget_repr_result[0]) # Extract data dict from tuple
else:
blob_cols = []
widget_repr = dict(widget_repr_result)

if opts.repr_mode == "anywidget":
try:
from IPython.display import display as ipython_display
# At this point, we have already executed the query as part of the
# widget construction. Let's use the information available to render
# the HTML and plain text versions.
widget_repr["text/html"] = widget.table_html

widget_repr["text/plain"] = self._create_text_representation(
widget._cached_data, widget.row_count
)

from bigframes import display
return widget_repr

# Always create a new widget instance for each display call
# This ensures that each cell gets its own widget and prevents
# unintended sharing between cells
widget = display.TableWidget(df.copy())
def _create_text_representation(
self, pandas_df: pandas.DataFrame, total_rows: typing.Optional[int]
) -> str:
"""Create a text representation of the DataFrame."""
opts = bigframes.options.display
with display_options.pandas_repr(opts):
import pandas.io.formats

ipython_display(widget)
return "" # Return empty string since we used display()
# safe to mutate this, this dict is owned by this code, and does not affect global config
to_string_kwargs = (
pandas.io.formats.format.get_dataframe_repr_params() # type: ignore
)
if not self._has_index:
to_string_kwargs.update({"index": False})

except (AttributeError, ValueError, ImportError):
# Fallback if anywidget is not available
# We add our own dimensions string, so don't want pandas to.
to_string_kwargs.update({"show_dimensions": False})
repr_string = pandas_df.to_string(**to_string_kwargs)

lines = repr_string.split("\n")

if total_rows is not None and total_rows > len(pandas_df):
lines.append("...")

lines.append("")
column_count = len(self.columns)
lines.append(f"[{total_rows or '?'} rows x {column_count} columns]")
return "\n".join(lines)

def _repr_mimebundle_(self, include=None, exclude=None):
"""
Custom display method for IPython/Jupyter environments.
This is called by IPython's display system when the object is displayed.
"""
opts = bigframes.options.display
# Only handle widget display in anywidget mode
if opts.repr_mode == "anywidget":
try:
return self._get_anywidget_bundle(include=include, exclude=exclude)

except ImportError:
# Fallback: let IPython use _repr_html_fallback() instead
warnings.warn(
"Anywidget mode is not available. "
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. "
f"Falling back to deferred mode. Error: {traceback.format_exc()}"
f"Falling back to static HTML. Error: {traceback.format_exc()}"
)
return formatter.repr_query_job(self._compute_dry_run())
# Don't return anything - let IPython fall back to _repr_html_fallback()
pass
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: pass is only necessary if there is no body, but we do have a body with the warning above.

Also, we are manually falling back.

Suggested change
# Fallback: let IPython use _repr_html_fallback() instead
warnings.warn(
"Anywidget mode is not available. "
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. "
f"Falling back to deferred mode. Error: {traceback.format_exc()}"
f"Falling back to static HTML. Error: {traceback.format_exc()}"
)
return formatter.repr_query_job(self._compute_dry_run())
# Don't return anything - let IPython fall back to _repr_html_fallback()
pass
# Anywidget is an optional dependency, so warn rather than fail.
# TODO(shuowei): When Anywidget becomes the default for all repr modes,
# remove this warning.
warnings.warn(
"Anywidget mode is not available. "
"Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. "
f"Falling back to static HTML. Error: {traceback.format_exc()}"
)


# In non-anywidget mode, fetch data once and use it for both HTML
# and plain text representations to avoid multiple queries.
opts = bigframes.options.display
max_results = opts.max_rows

df, blob_cols = self._process_blob_columns()

# Continue with regular HTML rendering for non-anywidget modes
# TODO(swast): pass max_columns and get the true column count back. Maybe
# get 1 more column than we have requested so that pandas can add the
# ... for us?
pandas_df, row_count, query_job = df._block.retrieve_repr_request_results(
max_results
)

self._set_internal_query_job(query_job)
column_count = len(pandas_df.columns)

html_string = self._create_html_representation(
pandas_df, row_count, column_count, blob_cols
)

text_representation = self._create_text_representation(pandas_df, row_count)

return {"text/html": html_string, "text/plain": text_representation}

def _create_html_representation(
self,
pandas_df: pandas.DataFrame,
row_count: int,
column_count: int,
blob_cols: list[str],
) -> str:
"""Create an HTML representation of the DataFrame."""
opts = bigframes.options.display
with display_options.pandas_repr(opts):
# Allows previewing images in the DataFrame. The implementation changes the string repr as well, in that it doesn't truncate strings or escape HTML characters such as "<" and ">". We may need to implement a full-fledged repr module to better support types not in pandas.
if bigframes.options.display.blob_display and blob_cols:

def obj_ref_rt_to_html(obj_ref_rt) -> str:
Expand All @@ -916,22 +1005,16 @@ def obj_ref_rt_to_html(obj_ref_rt) -> str:
return f'uri: {obj_ref_rt_json["objectref"]["uri"]}, authorizer: {obj_ref_rt_json["objectref"]["authorizer"]}'

formatters = {blob_col: obj_ref_rt_to_html for blob_col in blob_cols}

# set max_colwidth so not to truncate the image url
with pandas.option_context("display.max_colwidth", None):
max_rows = pandas.get_option("display.max_rows")
max_cols = pandas.get_option("display.max_columns")
show_dimensions = pandas.get_option("display.show_dimensions")
html_string = pandas_df.to_html(
escape=False,
notebook=True,
max_rows=max_rows,
max_cols=max_cols,
show_dimensions=show_dimensions,
max_rows=pandas.get_option("display.max_rows"),
max_cols=pandas.get_option("display.max_columns"),
show_dimensions=pandas.get_option("display.show_dimensions"),
formatters=formatters, # type: ignore
)
else:
# _repr_html_ stub is missing so mypy thinks it's a Series. Ignore mypy.
html_string = pandas_df._repr_html_() # type:ignore

html_string += f"[{row_count} rows x {column_count} columns in total]"
Expand Down
8 changes: 4 additions & 4 deletions bigframes/streaming/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,13 +291,13 @@ def __repr__(self, *args, **kwargs):

__repr__.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.__repr__))

def _repr_html_(self, *args, **kwargs):
return _return_type_wrapper(self._df._repr_html_, StreamingDataFrame)(
def _repr_html_fallback(self, *args, **kwargs):
return _return_type_wrapper(self._df._repr_html_fallback, StreamingDataFrame)(
*args, **kwargs
)

_repr_html_.__doc__ = _curate_df_doc(
inspect.getdoc(dataframe.DataFrame._repr_html_)
_repr_html_fallback.__doc__ = _curate_df_doc(
inspect.getdoc(dataframe.DataFrame._repr_html_fallback)
)

@property
Expand Down
Loading