perf(python): Micro-optimise internal DataFrame height and width checks (#21071)

alexander-beedie authored Feb 4, 2025
1 parent aeead20 commit 7f4dc50
Showing 16 changed files with 39 additions and 38 deletions.
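
The change is mechanical: every internal len(df) becomes df.height and every len(df.columns) becomes df.width, including in the test suite. The results are identical, but len(df) dispatches through DataFrame.__len__ (which, in py-polars, simply returns the height), and df.columns builds a fresh Python list of column-name strings that len() then merely counts, whereas df.width reads the column count straight from the underlying Rust frame. On hot internal paths this saves a dunder call and, for the width case, a list allocation per check.

A minimal sketch of how one might measure the difference; the frame shape, call count and any resulting timings are illustrative, not taken from the PR:

    import timeit

    import polars as pl

    df = pl.DataFrame({f"col_{i}": range(1_000) for i in range(50)})

    for label, stmt in [
        ("len(df)        ", "len(df)"),
        ("df.height      ", "df.height"),
        ("len(df.columns)", "len(df.columns)"),
        ("df.width       ", "df.width"),
    ]:
        elapsed = timeit.timeit(stmt, globals={"df": df}, number=100_000)
        print(f"{label}  {elapsed:.3f}s per 100k calls")

The per-call difference is tiny, which is why the optimisation only pays off on internal code paths that run very frequently.
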
py-polars/polars/_utils/construction/dataframe.py (2 additions, 2 deletions)

@@ -366,7 +366,7 @@ def _expand_dict_values(
         if isinstance(val, dict) and dtype != Struct:
             vdf = pl.DataFrame(val, strict=strict)
             if (
-                len(vdf) == 1
+                vdf.height == 1
                 and array_len > 1
                 and all(not d.is_nested() for d in vdf.schema.values())
             ):
@@ -1019,7 +1019,7 @@ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFrame:
             if not original_schema:
                 original_schema = list(df.schema.items())
             if chunk_size != adaptive_chunk_size:
-                if (n_columns := len(df.columns)) > 0:
+                if (n_columns := df.width) > 0:
                     chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns
         else:
             df.vstack(frame_chunk, in_place=True)

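In the to_frame_chunk branch above, the adaptive chunk size is just the per-chunk element budget divided by the column count. A worked example with assumed numbers (n_chunk_elems is defined elsewhere in the module and is not part of this hunk):

    # hypothetical values, purely for illustration
    n_chunk_elems = 10_000_000
    n_columns = 25
    chunk_size = n_chunk_elems // n_columns
    print(chunk_size)  # 400000 rows per chunk

Guarding on n_columns > 0 avoids a ZeroDivisionError for zero-width frames; df.width simply makes that guard cheaper.
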
py-polars/polars/_utils/getitem.py (2 additions, 2 deletions)

@@ -291,9 +291,9 @@ def _select_rows(
 ) -> DataFrame | Series:
     """Select one or more rows from the DataFrame."""
     if isinstance(key, int):
-        num_rows = len(df)
+        num_rows = df.height
         if (key >= num_rows) or (key < -num_rows):
-            msg = f"index {key} is out of bounds for DataFrame of height {len(df)}"
+            msg = f"index {key} is out of bounds for DataFrame of height {num_rows}"
             raise IndexError(msg)
         return df.slice(key, 1)
 

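The guard in _select_rows accepts integer keys in the range [-height, height). A quick illustration of the user-facing behaviour this code backs (the error message text is taken from the diff above; the frame itself is made up):

    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})

    df[0]   # first row, returned as a one-row DataFrame via df.slice(0, 1)
    df[-3]  # negative indices count back from the end, so this is also the first row
    try:
        df[3]  # out of range: valid indices are -3 through 2
    except IndexError as exc:
        print(exc)  # index 3 is out of bounds for DataFrame of height 3
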
py-polars/polars/dataframe/frame.py (18 additions, 17 deletions)

@@ -1097,8 +1097,8 @@ def _div(self, other: Any, *, floordiv: bool) -> DataFrame:
             return self.select(F.all() / lit(other))
 
         elif not isinstance(other, DataFrame):
-            s = _prepare_other_arg(other, length=len(self))
-            other = DataFrame([s.alias(f"n{i}") for i in range(len(self.columns))])
+            s = _prepare_other_arg(other, length=self.height)
+            other = DataFrame([s.alias(f"n{i}") for i in range(self.width)])
 
         orig_dtypes = other.dtypes
         # TODO: Dispatch to a native floordiv
@@ -3316,7 +3316,7 @@ def write_excel(
         ...     df.write_excel(
         ...         workbook=wb,
         ...         worksheet="data",
-        ...         position=(len(df) + 7, 1),
+        ...         position=(df.height + 7, 1),
         ...         table_style={
         ...             "style": "Table Style Light 4",
         ...             "first_column": True,
@@ -3352,7 +3352,9 @@ def write_excel(
         ...         }
         ...     )
         ...     ws.write(2, 1, "Basic/default conditional formatting", fmt_title)
-        ...     ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title)
+        ...     ws.write(
+        ...         df.height + 6, 1, "Customised conditional formatting", fmt_title
+        ...     )
 
         Export a table containing two different types of sparklines. Use default
         options for the "trend" sparkline and customized options (and positioning)
@@ -3475,13 +3477,12 @@ def write_excel(
 
         # setup workbook/worksheet
         wb, ws, can_close = _xl_setup_workbook(workbook, worksheet)
-        df, is_empty = self, not len(self)
+        df, is_empty = self, self.is_empty()
 
-        # The _xl_setup_table_columns function in the below section
-        # converts all collection types (e.g. List, Struct, Object) to strings
-        # Hence, we need to store the original schema so that it can be used
-        # when selecting columns using column selectors based on datatypes
-        df_original = df.clear()
+        # note: `_xl_setup_table_columns` converts nested data (List, Struct, etc.) to
+        # string, so we keep a reference to the original so that column selection with
+        # selectors that target such types remains correct
+        df_original = df
 
         # setup table format/columns
         fmt_cache = _XLFormatCache(wb)
@@ -3509,11 +3510,11 @@
         )
         table_finish = (
             table_start[0]
-            + len(df)
+            + df.height
            + int(is_empty)
             - int(not include_header)
             + int(bool(column_totals)),
-            table_start[1] + len(df.columns) - 1,
+            table_start[1] + df.width - 1,
         )
 
         # write table structure and formats into the target sheet

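The table_finish tuple computes the bottom-right cell of the written Excel table from the frame's shape plus the header/totals options. A worked example with assumed inputs (none of these values come from the diff itself):

    table_start = (2, 1)      # anchor cell: row 2, column B
    height, width = 3, 4      # 3 data rows, 4 columns
    is_empty = False
    include_header = True
    column_totals = True

    table_finish = (
        table_start[0]
        + height
        + int(is_empty)           # an empty frame still reserves one blank row
        - int(not include_header)
        + int(bool(column_totals)),
        table_start[1] + width - 1,
    )
    print(table_finish)  # (6, 4): rows 2..6 (header, 3 data rows, totals), columns B..E
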
@@ -4864,12 +4865,12 @@ def insert_column(self, index: int, column: IntoExprColumn) -> DataFrame:
         └─────┴──────┴───────┴──────┘
         """
         if (original_index := index) < 0:
-            index = len(self.columns) + index
+            index = self.width + index
             if index < 0:
-                msg = f"column index {original_index} is out of range (frame has {len(self.columns)} columns)"
+                msg = f"column index {original_index} is out of range (frame has {self.width} columns)"
                 raise IndexError(msg)
-        elif index > len(self.columns):
-            msg = f"column index {original_index} is out of range (frame has {len(self.columns)} columns)"
+        elif index > self.width:
+            msg = f"column index {original_index} is out of range (frame has {self.width} columns)"
             raise IndexError(msg)
 
         if isinstance(column, pl.Series):
@@ -5316,7 +5317,7 @@ def replace_column(self, index: int, column: Series) -> DataFrame:
         └───────┴─────┴─────┘
         """
         if index < 0:
-            index = len(self.columns) + index
+            index = self.width + index
         self._df.replace_column(index, column._s)
         return self
 

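Both insert_column and replace_column normalise a negative index by adding the frame width, mirroring Python list semantics. A small sketch of the effect (the frame and column names are made up for illustration):

    import polars as pl

    df = pl.DataFrame({"a": [1], "b": [2], "c": [3]})  # width == 3

    # -1 normalises to 3 + (-1) == 2, so the new column lands just before
    # the original last column
    df = df.insert_column(-1, pl.Series("x", [9]))
    print(df.columns)  # ['a', 'b', 'x', 'c']

    # indices below -width (or above width) raise the IndexError shown in the diff
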
py-polars/polars/io/spreadsheet/_write_utils.py (2 additions, 2 deletions)

@@ -318,7 +318,7 @@ def _xl_inject_sparklines(
     if "negative_points" not in options:
         options["negative_points"] = options.get("type") in ("column", "win_loss")
 
-    for _ in range(len(df)):
+    for _ in range(df.height):
         data_start = xl_rowcol_to_cell(spk_row, data_start_col)
         data_end = xl_rowcol_to_cell(spk_row, data_end_col)
         options["range"] = f"{data_start}:{data_end}"
@@ -389,7 +389,7 @@ def _map_str(s: Series) -> Series:
                 )
             )
             n_ucase = sum((c[0] if c else "").isupper() for c in df.columns)
-            total = f"{'T' if (n_ucase > len(df.columns) // 2) else 't'}otal"
+            total = f"{'T' if (n_ucase > df.width // 2) else 't'}otal"
             row_total_funcs = {total: _xl_table_formula(df, sum_cols, "sum")}
             row_totals = [total]
         else:

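The quoted expression decides whether the totals row is labelled "Total" or "total" by following the dominant capitalisation of the column names. Restated as a standalone helper (this function is an illustration, not part of the library; df.width // 2 in the source is equivalent to len(columns) // 2 here):

    def total_label(columns: list[str]) -> str:
        # capitalise the label when most column names start with an uppercase letter
        n_ucase = sum((c[0] if c else "").isupper() for c in columns)
        return f"{'T' if (n_ucase > len(columns) // 2) else 't'}otal"

    print(total_label(["Sales", "Costs", "margin"]))  # Total
    print(total_label(["sales", "costs", "Margin"]))  # total
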
py-polars/polars/io/spreadsheet/functions.py (1 addition, 1 deletion)

@@ -1059,7 +1059,7 @@ def _read_spreadsheet_calamine(
     if read_options.get("header_row", False) is None and not read_options.get(
         "column_names"
     ):
-        df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)]
+        df.columns = [f"column_{i}" for i in range(1, df.width + 1)]
 
     df = _drop_null_data(
         df,

py-polars/tests/unit/dataframe/test_df.py (3 additions, 3 deletions)

@@ -71,7 +71,7 @@ def test_special_char_colname_init() -> None:
     cols = [(c, pl.Int8) for c in punctuation]
     df = pl.DataFrame(schema=cols)
 
-    assert len(cols) == len(df.columns)
+    assert len(cols) == df.width
     assert len(df.rows()) == 0
     assert df.is_empty()
 
@@ -226,7 +226,7 @@ def test_from_arrow(monkeypatch: Any) -> None:
     override_schema = expected_schema.copy()
     override_schema["e"] = pl.Int8
     assert df.schema == override_schema
-    assert df.rows() == expected_data[: (len(df))]
+    assert df.rows() == expected_data[: (df.height)]
 
     # init from record batches with overrides
     df = pl.DataFrame(
@@ -1471,7 +1471,7 @@ def test_join_dates() -> None:
         }
     )
     out = df.join(df, on="datetime")
-    assert len(out) == len(df)
+    assert out.height == df.height
 
 
 def test_asof_cross_join() -> None:

py-polars/tests/unit/dataframe/test_from_dict.py (1 addition, 1 deletion)

@@ -186,7 +186,7 @@ def test_from_dict_with_values_mixed() -> None:
     )
     dfx = df.select(pl.exclude("idx"))
 
-    assert len(df) == n_range
+    assert df.height == n_range
     assert dfx[:5].rows() == dfx[5:10].rows()
     assert dfx[-10:-5].rows() == dfx[-5:].rows()
     assert dfx.row(n_range // 2, named=True) == mixed_dtype_data

py-polars/tests/unit/dataframe/test_getitem.py (1 addition, 1 deletion)

@@ -68,7 +68,7 @@ def test_df_getitem_row_slice(df: pl.DataFrame) -> None:
 
     assert (
         sliced_py_data == sliced_df_data
-    ), f"slice [{start}:{stop}:{step}] failed on df w/len={len(df)}"
+    ), f"slice [{start}:{stop}:{step}] failed on df w/len={df.height}"
 
 
 def test_df_getitem_col_single_name() -> None:

py-polars/tests/unit/dataframe/test_null_count.py (1 addition, 1 deletion)

@@ -22,7 +22,7 @@
 @example(df=pl.DataFrame())
 def test_null_count(df: pl.DataFrame) -> None:
     # note: the zero-row and zero-col cases are always passed as explicit examples
-    null_count, ncols = df.null_count(), len(df.columns)
+    null_count, ncols = df.null_count(), df.width
     assert null_count.shape == (1, ncols)
     for idx, count in enumerate(null_count.rows()[0]):
         assert count == sum(v is None for v in df.to_series(idx).to_list())

py-polars/tests/unit/dataframe/test_to_dict.py (1 addition, 1 deletion)

@@ -48,4 +48,4 @@ def test_to_dict_misc(as_series: bool, inner_dtype: Any) -> None:
         assert isinstance(s, dict)
         for v in s.values():
             assert isinstance(v, inner_dtype)
-            assert len(v) == len(df)
+            assert len(v) == df.height

py-polars/tests/unit/datatypes/test_temporal.py (1 addition, 1 deletion)

@@ -395,7 +395,7 @@ def test_to_dicts() -> None:
     df = pl.DataFrame(
         data, schema_overrides={"a": pl.Datetime("ns"), "d": pl.Duration("ns")}
     )
-    assert len(df) == 1
+    assert df.height == 1
 
     d = df.to_dicts()[0]
     for col in data:

py-polars/tests/unit/functions/test_lit.py (1 addition, 1 deletion)

@@ -79,7 +79,7 @@ def test_lit_ambiguous_datetimes_11379() -> None:
             )
         }
     )
-    for i in range(len(df)):
+    for i in range(df.height):
         result = df.filter(pl.col("ts") >= df["ts"][i])
         expected = df[i:]
         assert_frame_equal(result, expected)

py-polars/tests/unit/io/test_csv.py (1 addition, 1 deletion)

@@ -1767,7 +1767,7 @@ def test_read_csv_comments_on_top_with_schema_11667() -> None:
     }
 
     df = pl.read_csv(io.StringIO(csv), comment_prefix="#", schema=schema)
-    assert len(df) == 2
+    assert df.height == 2
     assert df.schema == schema
 
 

(changed test file whose path is not shown in this capture: 1 addition, 1 deletion)

@@ -936,7 +936,7 @@ def test_offset_by_expressions() -> None:
     }
 
     # Check single-row cases
-    for i in range(len(df)):
+    for i in range(df.height):
         df_slice = df[i : i + 1]
         result = df_slice.select(
             c=pl.col("a").dt.offset_by(pl.col("b")),

py-polars/tests/unit/operations/test_explode.py (2 additions, 2 deletions)

@@ -386,7 +386,7 @@ def test_fast_explode_merge_right_16923() -> None:
         rechunk=True,
     ).explode("foo")
 
-    assert len(df) == 4
+    assert df.height == 4
 
 
 def test_fast_explode_merge_left_16923() -> None:
@@ -399,7 +399,7 @@ def test_fast_explode_merge_left_16923() -> None:
         rechunk=True,
     ).explode("foo")
 
-    assert len(df) == 4
+    assert df.height == 4
 
 
 @pytest.mark.parametrize(

py-polars/tests/unit/streaming/test_streaming.py (1 addition, 1 deletion)

@@ -302,7 +302,7 @@ def test_streaming_csv_headers_but_no_data_13770(tmp_path: Path) -> None:
         .head()
         .collect(streaming=True)
     )
-    assert len(df) == 0
+    assert df.height == 0
     assert df.schema == schema
 
 
