Skip to content

Commit

Permalink
fix(python): Don't trigger row limit in array construction (#19215)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Oct 13, 2024
1 parent 1997293 commit cca31b3
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 22 deletions.
4 changes: 2 additions & 2 deletions crates/polars-error/src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub static FALSE: &str = "false";

#[cfg(not(feature = "python"))]
pub static LENGTH_LIMIT_MSG: &str =
"polars' maximum length reached. Consider compiling with 'bigidx' feature.";
"Polars' maximum length reached. Consider compiling with 'bigidx' feature.";
#[cfg(feature = "python")]
pub static LENGTH_LIMIT_MSG: &str =
"polars' maximum length reached. Consider installing 'polars-u64-idx'.";
"Polars' maximum length reached. Consider installing 'polars-u64-idx'.";
63 changes: 43 additions & 20 deletions py-polars/polars/_utils/construction/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
Object,
Struct,
Time,
UInt32,
Unknown,
dtype_to_py_type,
is_polars_dtype,
Expand All @@ -57,9 +58,10 @@
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
from polars.functions.eager import concat

with contextlib.suppress(ImportError): # Module not available when building docs
from polars.polars import PySeries
from polars.polars import PySeries, get_index_type

if TYPE_CHECKING:
from collections.abc import Iterable, Sequence
Expand Down Expand Up @@ -454,27 +456,48 @@ def numpy_to_pyseries(
return constructor(
name, values, nan_to_null if dtype in (np.float32, np.float64) else strict
)
elif sum(values.shape) == 0:
# Optimize by ingesting 1D and reshaping in Rust
original_shape = values.shape
values = values.reshape(-1)
py_s = numpy_to_pyseries(
name,
values,
strict=strict,
nan_to_null=nan_to_null,
)
return wrap_s(py_s).reshape(original_shape)._s
else:
original_shape = values.shape
values = values.reshape(-1)
py_s = numpy_to_pyseries(
name,
values,
strict=strict,
nan_to_null=nan_to_null,
)
return wrap_s(py_s).reshape(original_shape)._s
values_1d = values.reshape(-1)

if get_index_type() == UInt32:
limit = 2**32 - 1
else:
limit = 2**64 - 1

if values.size <= limit:
py_s = numpy_to_pyseries(
name,
values_1d,
strict=strict,
nan_to_null=nan_to_null,
)
return wrap_s(py_s).reshape(original_shape)._s
else:
# Process in chunks, so we don't trigger the row limit (LENGTH_LIMIT_MSG)
offset = 0
chunks = []

# Tuples are immutable, so convert the shape to a list first
original_shape_chunk = list(original_shape)
# The number of rows per chunk differs from the original, so let reshape infer it
original_shape_chunk[0] = -1
original_shape_chunk_t = tuple(original_shape_chunk)
while True:
chunk = values_1d[offset : offset + limit]
offset += limit
if chunk.shape[0] == 0:
break

py_s = numpy_to_pyseries(
name,
chunk,
strict=strict,
nan_to_null=nan_to_null,
)
chunks.append(wrap_s(py_s).reshape(original_shape_chunk_t))

return concat(chunks)._s


def series_to_pyseries(
Expand Down

0 comments on commit cca31b3

Please sign in to comment.