Skip to content

Commit 429bea3

Browse files
committed
immediately query for information_schema tables
1 parent 36fb4ef commit 429bea3

File tree

4 files changed

+34
-16
lines changed

4 files changed

+34
-16
lines changed

bigframes/core/array_value.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,11 @@ def from_bq_data_source(
141141
)
142142
return cls(node)
143143

144+
@staticmethod
145+
def is_table_type_supported(table_type: Optional[str]):
146+
# Some external tables like those in GCS support all the features we want, such as time travel.
147+
return table_type in ("TABLE", "MATERIALIZED_VIEW", "EXTERNAL")
148+
144149
@property
145150
def column_ids(self) -> typing.Sequence[str]:
146151
"""Returns column ids as strings."""

bigframes/session/_io/bigquery/read_gbq_table.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ def _convert_information_schema_table_id_to_table_reference(
4242
default_project: Optional[str],
4343
) -> bigquery.TableReference:
4444
"""Squeeze an INFORMATION_SCHEMA reference into a TableReference.
45-
4645
This is kind of a hack. INFORMATION_SCHEMA is a view that isn't available
4746
via the tables.get REST API.
4847
"""
@@ -152,13 +151,7 @@ def get_table_metadata(
152151

153152
return cached_table
154153

155-
table_id_casefold = table_id.casefold()
156-
if (
157-
# Ensure we don't have false positives for some user defined dataset
158-
# like MY_INFORMATION_SCHEMA or tables called INFORMATION_SCHEMA.
159-
".INFORMATION_SCHEMA.".casefold() in table_id_casefold
160-
or table_id_casefold.startswith("INFORMATION_SCHEMA.".casefold())
161-
):
154+
if is_information_schema(table_id):
162155
table = get_information_schema_metadata(
163156
bqclient=bqclient, table_id=table_id, default_project=default_project
164157
)
@@ -179,6 +172,17 @@ def get_table_metadata(
179172
return cached_table
180173

181174

175+
def is_information_schema(table_id: str):
176+
table_id_casefold = table_id.casefold()
177+
# Include the "."s to ensure we don't have false positives for some user
178+
# defined dataset like MY_INFORMATION_SCHEMA or tables called
179+
# INFORMATION_SCHEMA.
180+
return (
181+
".INFORMATION_SCHEMA.".casefold() in table_id_casefold
182+
or table_id_casefold.startswith("INFORMATION_SCHEMA.".casefold())
183+
)
184+
185+
182186
def is_time_travel_eligible(
183187
bqclient: bigquery.Client,
184188
table: google.cloud.bigquery.table.Table,
@@ -245,6 +249,8 @@ def is_time_travel_eligible(
245249
msg, category=bfe.TimeTravelDisabledWarning, stacklevel=stacklevel
246250
)
247251
return False
252+
elif table.table_type == "VIEW":
253+
return False
248254

249255
# table might support time travel, let's do a dry-run query with time travel
250256
if should_dry_run:

bigframes/session/loader.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@
4747
import pandas
4848
import pyarrow as pa
4949

50+
import bigframes._tools
51+
import bigframes._tools.strings
5052
from bigframes.core import guid, identifiers, local_data, nodes, ordering, utils
5153
import bigframes.core as core
5254
import bigframes.core.blocks as blocks
@@ -701,18 +703,23 @@ def read_gbq_table(
701703
# Optionally, execute the query
702704
# -----------------------------
703705

704-
# max_results introduces non-determinism and limits the cost on
705-
# clustered tables, so fall back to a query. We do this here so that
706-
# the index is consistent with tables that have primary keys, even
707-
# when max_results is set.
708-
if max_results is not None:
706+
if (
707+
# max_results introduces non-determinism and limits the cost on
708+
# clustered tables, so fall back to a query. We do this here so that
709+
# the index is consistent with tables that have primary keys, even
710+
# when max_results is set.
711+
max_results is not None
712+
# Views such as INFORMATION_SCHEMA also introduce non-determinism.
713+
# They can update frequently and don't support time travel.
714+
or not core.ArrayValue.is_table_type_supported(table.table_type)
715+
):
709716
# TODO(b/338111344): If we are running a query anyway, we might as
710717
# well generate ROW_NUMBER() at the same time.
711718
all_columns: Iterable[str] = (
712719
itertools.chain(index_cols, columns) if columns else ()
713720
)
714721
query = bf_io_bigquery.to_query(
715-
table_id,
722+
f"{table.project}.{table.dataset_id}.{table.table_id}",
716723
columns=all_columns,
717724
sql_predicate=bf_io_bigquery.compile_filters(filters)
718725
if filters

tests/system/small/pandas/test_read_gbq_information_schema.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
"view_id",
2121
[
2222
# https://cloud.google.com/bigquery/docs/information-schema-intro
23-
"region-US.INFORMATION_SCHEMA.JOBS_BY_USER",
23+
"region-US.INFORMATION_SCHEMA.SESSIONS_BY_USER",
2424
"region-US.INFORMATION_SCHEMA.SCHEMATA",
2525
],
2626
)
@@ -32,7 +32,7 @@ def test_read_gbq_jobs_by_user_returns_schema(
3232
else:
3333
table_id = view_id
3434

35-
df = unordered_session.read_gbq(table_id)
35+
df = unordered_session.read_gbq(table_id, max_results=10)
3636
assert df.dtypes is not None
3737

3838

0 commit comments

Comments
 (0)