perf(python): Micro-optimise internal DataFrame height and width checks (#21071)

alexander-beedie authored Feb 4, 2025
1 parent aeead20 commit 7f4dc50
Showing 16 changed files with 39 additions and 38 deletions.
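
The change is mechanical: every internal len(df) becomes df.height and every len(df.columns) becomes df.width, including in the test suite. The results are identical, but len(df) dispatches through DataFrame.__len__ (which, in py-polars, simply returns the height), and df.columns builds a fresh Python list of column-name strings that len() then merely counts, whereas df.width reads the column count straight from the underlying Rust frame. On hot internal paths this saves a dunder call and, for the width case, a list allocation per check.

A minimal sketch of how one might measure the difference; the frame shape, call count and any resulting timings are illustrative, not taken from the PR:

    import timeit

    import polars as pl

    df = pl.DataFrame({f"col_{i}": range(1_000) for i in range(50)})

    for label, stmt in [
        ("len(df)        ", "len(df)"),
        ("df.height      ", "df.height"),
        ("len(df.columns)", "len(df.columns)"),
        ("df.width       ", "df.width"),
    ]:
        elapsed = timeit.timeit(stmt, globals={"df": df}, number=100_000)
        print(f"{label}  {elapsed:.3f}s per 100k calls")

The per-call difference is tiny, which is why the optimisation only pays off on internal code paths that run very frequently.
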
py-polars/polars/_utils/construction/dataframe.py (2 additions, 2 deletions)

@@ -366,7 +366,7 @@ def _expand_dict_values(
         if isinstance(val, dict) and dtype != Struct:
             vdf = pl.DataFrame(val, strict=strict)
             if (
-                len(vdf) == 1
+                vdf.height == 1
                 and array_len > 1
                 and all(not d.is_nested() for d in vdf.schema.values())
             ):
@@ -1019,7 +1019,7 @@ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFrame:
             if not original_schema:
                 original_schema = list(df.schema.items())
             if chunk_size != adaptive_chunk_size:
-                if (n_columns := len(df.columns)) > 0:
+                if (n_columns := df.width) > 0:
                     chunk_size = adaptive_chunk_size = n_chunk_elems // n_columns
         else:
             df.vstack(frame_chunk, in_place=True)

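In the to_frame_chunk branch above, the adaptive chunk size is just the per-chunk element budget divided by the column count. A worked example with assumed numbers (n_chunk_elems is defined elsewhere in the module and is not part of this hunk):

    # hypothetical values, purely for illustration
    n_chunk_elems = 10_000_000
    n_columns = 25
    chunk_size = n_chunk_elems // n_columns
    print(chunk_size)  # 400000 rows per chunk

Guarding on n_columns > 0 avoids a ZeroDivisionError for zero-width frames; df.width simply makes that guard cheaper.
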
py-polars/polars/_utils/getitem.py (2 additions, 2 deletions)

@@ -291,9 +291,9 @@ def _select_rows(
 ) -> DataFrame | Series:
     """Select one or more rows from the DataFrame."""
     if isinstance(key, int):
-        num_rows = len(df)
+        num_rows = df.height
         if (key >= num_rows) or (key < -num_rows):
-            msg = f"index {key} is out of bounds for DataFrame of height {len(df)}"
+            msg = f"index {key} is out of bounds for DataFrame of height {num_rows}"
             raise IndexError(msg)
         return df.slice(key, 1)
 

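The guard in _select_rows accepts integer keys in the range [-height, height). A quick illustration of the user-facing behaviour this code backs (the error message text is taken from the diff above; the frame itself is made up):

    import polars as pl

    df = pl.DataFrame({"a": [1, 2, 3]})

    df[0]   # first row, returned as a one-row DataFrame via df.slice(0, 1)
    df[-3]  # negative indices count back from the end, so this is also the first row
    try:
        df[3]  # out of range: valid indices are -3 through 2
    except IndexError as exc:
        print(exc)  # index 3 is out of bounds for DataFrame of height 3
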
py-polars/polars/dataframe/frame.py (18 additions, 17 deletions)

@@ -1097,8 +1097,8 @@ def _div(self, other: Any, *, floordiv: bool) -> DataFrame:
             return self.select(F.all() / lit(other))
 
         elif not isinstance(other, DataFrame):
-            s = _prepare_other_arg(other, length=len(self))
-            other = DataFrame([s.alias(f"n{i}") for i in range(len(self.columns))])
+            s = _prepare_other_arg(other, length=self.height)
+            other = DataFrame([s.alias(f"n{i}") for i in range(self.width)])
 
         orig_dtypes = other.dtypes
         # TODO: Dispatch to a native floordiv
@@ -3316,7 +3316,7 @@ def write_excel(
         ...     df.write_excel(
         ...         workbook=wb,
         ...         worksheet="data",
-        ...         position=(len(df) + 7, 1),
+        ...         position=(df.height + 7, 1),
         ...         table_style={
         ...             "style": "Table Style Light 4",
         ...             "first_column": True,
@@ -3352,7 +3352,9 @@ def write_excel(
         ...         }
         ...     )
         ...     ws.write(2, 1, "Basic/default conditional formatting", fmt_title)
-        ...     ws.write(len(df) + 6, 1, "Customised conditional formatting", fmt_title)
+        ...     ws.write(
+        ...         df.height + 6, 1, "Customised conditional formatting", fmt_title
+        ...     )
 
         Export a table containing two different types of sparklines. Use default
         options for the "trend" sparkline and customized options (and positioning)
@@ -3475,13 +3477,12 @@ def write_excel(
 
         # setup workbook/worksheet
         wb, ws, can_close = _xl_setup_workbook(workbook, worksheet)
-        df, is_empty = self, not len(self)
+        df, is_empty = self, self.is_empty()
 
-        # The _xl_setup_table_columns function in the below section
-        # converts all collection types (e.g. List, Struct, Object) to strings
-        # Hence, we need to store the original schema so that it can be used
-        # when selecting columns using column selectors based on datatypes
-        df_original = df.clear()
+        # note: `_xl_setup_table_columns` converts nested data (List, Struct, etc.) to
+        # string, so we keep a reference to the original so that column selection with
+        # selectors that target such types remains correct
+        df_original = df
 
         # setup table format/columns
         fmt_cache = _XLFormatCache(wb)
@@ -3509,11 +3510,11 @@
         )
         table_finish = (
             table_start[0]
-            + len(df)
+            + df.height
            + int(is_empty)
             - int(not include_header)
             + int(bool(column_totals)),
-            table_start[1] + len(df.columns) - 1,
+            table_start[1] + df.width - 1,
         )
 
         # write table structure and formats into the target sheet

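The table_finish tuple computes the bottom-right cell of the written Excel table from the frame's shape plus the header/totals options. A worked example with assumed inputs (none of these values come from the diff itself):

    table_start = (2, 1)      # anchor cell: row 2, column B
    height, width = 3, 4      # 3 data rows, 4 columns
    is_empty = False
    include_header = True
    column_totals = True

    table_finish = (
        table_start[0]
        + height
        + int(is_empty)           # an empty frame still reserves one blank row
        - int(not include_header)
        + int(bool(column_totals)),
        table_start[1] + width - 1,
    )
    print(table_finish)  # (6, 4): rows 2..6 (header, 3 data rows, totals), columns B..E
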
@@ -4864,12 +4865,12 @@ def insert_column(self, index: int, column: IntoExprColumn) -> DataFrame:
         └─────┴──────┴───────┴──────┘
         """
         if (original_index := index) < 0:
-            index = len(self.columns) + index
+            index = self.width + index
             if index < 0:
-                msg = f"column index {original_index} is out of range (frame has {len(self.columns)} columns)"
+                msg = f"column index {original_index} is out of range (frame has {self.width} columns)"
                 raise IndexError(msg)
-        elif index > len(self.columns):
-            msg = f"column index {original_index} is out of range (frame has {len(self.columns)} columns)"
+        elif index > self.width:
+            msg = f"column index {original_index} is out of range (frame has {self.width} columns)"
             raise IndexError(msg)
 
         if isinstance(column, pl.Series):
@@ -5316,7 +5317,7 @@ def replace_column(self, index: int, column: Series) -> DataFrame:
         └───────┴─────┴─────┘
         """
         if index < 0:
-            index = len(self.columns) + index
+            index = self.width + index
         self._df.replace_column(index, column._s)
         return self
 

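Both insert_column and replace_column normalise a negative index by adding the frame width, mirroring Python list semantics. A small sketch of the effect (the frame and column names are made up for illustration):

    import polars as pl

    df = pl.DataFrame({"a": [1], "b": [2], "c": [3]})  # width == 3

    # -1 normalises to 3 + (-1) == 2, so the new column lands just before
    # the original last column
    df = df.insert_column(-1, pl.Series("x", [9]))
    print(df.columns)  # ['a', 'b', 'x', 'c']

    # indices below -width (or above width) raise the IndexError shown in the diff
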
py-polars/polars/io/spreadsheet/_write_utils.py (2 additions, 2 deletions)

@@ -318,7 +318,7 @@ def _xl_inject_sparklines(
     if "negative_points" not in options:
         options["negative_points"] = options.get("type") in ("column", "win_loss")
 
-    for _ in range(len(df)):
+    for _ in range(df.height):
         data_start = xl_rowcol_to_cell(spk_row, data_start_col)
         data_end = xl_rowcol_to_cell(spk_row, data_end_col)
         options["range"] = f"{data_start}:{data_end}"
@@ -389,7 +389,7 @@ def _map_str(s: Series) -> Series:
                 )
             )
             n_ucase = sum((c[0] if c else "").isupper() for c in df.columns)
-            total = f"{'T' if (n_ucase > len(df.columns) // 2) else 't'}otal"
+            total = f"{'T' if (n_ucase > df.width // 2) else 't'}otal"
             row_total_funcs = {total: _xl_table_formula(df, sum_cols, "sum")}
             row_totals = [total]
         else:

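The quoted expression decides whether the totals row is labelled "Total" or "total" by following the dominant capitalisation of the column names. Restated as a standalone helper (this function is an illustration, not part of the library; df.width // 2 in the source is equivalent to len(columns) // 2 here):

    def total_label(columns: list[str]) -> str:
        # capitalise the label when most column names start with an uppercase letter
        n_ucase = sum((c[0] if c else "").isupper() for c in columns)
        return f"{'T' if (n_ucase > len(columns) // 2) else 't'}otal"

    print(total_label(["Sales", "Costs", "margin"]))  # Total
    print(total_label(["sales", "costs", "Margin"]))  # total
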
py-polars/polars/io/spreadsheet/functions.py (1 addition, 1 deletion)

@@ -1059,7 +1059,7 @@ def _read_spreadsheet_calamine(
     if read_options.get("header_row", False) is None and not read_options.get(
         "column_names"
     ):
-        df.columns = [f"column_{i}" for i in range(1, len(df.columns) + 1)]
+        df.columns = [f"column_{i}" for i in range(1, df.width + 1)]
 
     df = _drop_null_data(
         df,

py-polars/tests/unit/dataframe/test_df.py (3 additions, 3 deletions)

@@ -71,7 +71,7 @@ def test_special_char_colname_init() -> None:
     cols = [(c, pl.Int8) for c in punctuation]
     df = pl.DataFrame(schema=cols)
 
-    assert len(cols) == len(df.columns)
+    assert len(cols) == df.width
     assert len(df.rows()) == 0
     assert df.is_empty()
 
@@ -226,7 +226,7 @@ def test_from_arrow(monkeypatch: Any) -> None:
     override_schema = expected_schema.copy()
     override_schema["e"] = pl.Int8
     assert df.schema == override_schema
-    assert df.rows() == expected_data[: (len(df))]
+    assert df.rows() == expected_data[: (df.height)]
 
     # init from record batches with overrides
     df = pl.DataFrame(
@@ -1471,7 +1471,7 @@ def test_join_dates() -> None:
         }
     )
     out = df.join(df, on="datetime")
-    assert len(out) == len(df)
+    assert out.height == df.height
 
 
 def test_asof_cross_join() -> None:

py-polars/tests/unit/dataframe/test_from_dict.py (1 addition, 1 deletion)

@@ -186,7 +186,7 @@ def test_from_dict_with_values_mixed() -> None:
     )
     dfx = df.select(pl.exclude("idx"))
 
-    assert len(df) == n_range
+    assert df.height == n_range
     assert dfx[:5].rows() == dfx[5:10].rows()
     assert dfx[-10:-5].rows() == dfx[-5:].rows()
     assert dfx.row(n_range // 2, named=True) == mixed_dtype_data

py-polars/tests/unit/dataframe/test_getitem.py (1 addition, 1 deletion)

@@ -68,7 +68,7 @@ def test_df_getitem_row_slice(df: pl.DataFrame) -> None:
 
     assert (
         sliced_py_data == sliced_df_data
-    ), f"slice [{start}:{stop}:{step}] failed on df w/len={len(df)}"
+    ), f"slice [{start}:{stop}:{step}] failed on df w/len={df.height}"
 
 
 def test_df_getitem_col_single_name() -> None:

py-polars/tests/unit/dataframe/test_null_count.py (1 addition, 1 deletion)

@@ -22,7 +22,7 @@
 @example(df=pl.DataFrame())
 def test_null_count(df: pl.DataFrame) -> None:
     # note: the zero-row and zero-col cases are always passed as explicit examples
-    null_count, ncols = df.null_count(), len(df.columns)
+    null_count, ncols = df.null_count(), df.width
     assert null_count.shape == (1, ncols)
     for idx, count in enumerate(null_count.rows()[0]):
         assert count == sum(v is None for v in df.to_series(idx).to_list())

py-polars/tests/unit/dataframe/test_to_dict.py (1 addition, 1 deletion)

@@ -48,4 +48,4 @@ def test_to_dict_misc(as_series: bool, inner_dtype: Any) -> None:
         assert isinstance(s, dict)
         for v in s.values():
             assert isinstance(v, inner_dtype)
-            assert len(v) == len(df)
+            assert len(v) == df.height

py-polars/tests/unit/datatypes/test_temporal.py (1 addition, 1 deletion)

@@ -395,7 +395,7 @@ def test_to_dicts() -> None:
     df = pl.DataFrame(
         data, schema_overrides={"a": pl.Datetime("ns"), "d": pl.Duration("ns")}
     )
-    assert len(df) == 1
+    assert df.height == 1
 
     d = df.to_dicts()[0]
     for col in data:

py-polars/tests/unit/functions/test_lit.py (1 addition, 1 deletion)

@@ -79,7 +79,7 @@ def test_lit_ambiguous_datetimes_11379() -> None:
             )
         }
     )
-    for i in range(len(df)):
+    for i in range(df.height):
         result = df.filter(pl.col("ts") >= df["ts"][i])
         expected = df[i:]
         assert_frame_equal(result, expected)

py-polars/tests/unit/io/test_csv.py (1 addition, 1 deletion)

@@ -1767,7 +1767,7 @@ def test_read_csv_comments_on_top_with_schema_11667() -> None:
     }
 
     df = pl.read_csv(io.StringIO(csv), comment_prefix="#", schema=schema)
-    assert len(df) == 2
+    assert df.height == 2
     assert df.schema == schema
 
 

(changed test file whose path is not shown in this capture: 1 addition, 1 deletion)

@@ -936,7 +936,7 @@ def test_offset_by_expressions() -> None:
     }
 
     # Check single-row cases
-    for i in range(len(df)):
+    for i in range(df.height):
         df_slice = df[i : i + 1]
         result = df_slice.select(
             c=pl.col("a").dt.offset_by(pl.col("b")),

py-polars/tests/unit/operations/test_explode.py (2 additions, 2 deletions)

@@ -386,7 +386,7 @@ def test_fast_explode_merge_right_16923() -> None:
         rechunk=True,
     ).explode("foo")
 
-    assert len(df) == 4
+    assert df.height == 4
 
 
 def test_fast_explode_merge_left_16923() -> None:
@@ -399,7 +399,7 @@ def test_fast_explode_merge_left_16923() -> None:
         rechunk=True,
     ).explode("foo")
 
-    assert len(df) == 4
+    assert df.height == 4
 
 
 @pytest.mark.parametrize(

py-polars/tests/unit/streaming/test_streaming.py (1 addition, 1 deletion)

@@ -302,7 +302,7 @@ def test_streaming_csv_headers_but_no_data_13770(tmp_path: Path) -> None:
         .head()
         .collect(streaming=True)
     )
-    assert len(df) == 0
+    assert df.height == 0
     assert df.schema == schema
 
 
