Skip to content
Draft
68 changes: 34 additions & 34 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@
from narwhals._compliant import EagerDataFrame
from narwhals._utils import (
Implementation,
NullableSchema,
Version,
check_column_names_are_unique,
convert_str_slice_to_int_slice,
generate_temporary_column_name,
is_sequence_of,
not_implemented,
parse_columns_to_drop,
scale_bytes,
Expand Down Expand Up @@ -53,6 +55,7 @@
from narwhals._utils import Version, _LimitedContext
from narwhals.dtypes import DType
from narwhals.typing import (
IntoNullableSchema,
IntoSchema,
JoinStrategy,
SizedMultiIndexSelector,
Expand Down Expand Up @@ -119,36 +122,37 @@ def from_dict(
/,
*,
context: _LimitedContext,
schema: IntoSchema | Mapping[str, DType | None] | None,
schema: IntoSchema | IntoNullableSchema | None = None,
) -> Self:
if not schema and not data:
return cls.from_native(pa.table({}), context=context)
if not schema:
return cls.from_native(pa.table(data), context=context) # type: ignore[arg-type]
if not any(dtype is None for dtype in schema.values()):
from narwhals.schema import Schema
nullable_schema = NullableSchema(schema)

if nullable_schema.is_nullable:
if context._implementation._backend_version() < (14,):
msg = "Passing `None` dtype in `from_dict` requires PyArrow>=14"
raise NotImplementedError(msg)
res = pa.table(
{
name: pa.chunked_array( # type: ignore[misc]
[data[name] if data else []],
type=narwhals_to_native_dtype(nw_dtype, version=context._version)
if nw_dtype is not None
else None,
)
for name, nw_dtype in nullable_schema.items()
}
)
return cls.from_native(pa.table(res), context=context)

pa_schema = Schema(cast("IntoSchema", schema)).to_arrow()
if pa_schema and not data:
native = pa_schema.empty_table()
else:
native = pa.Table.from_pydict(data, schema=pa_schema)
return cls.from_native(native, context=context)
if context._implementation._backend_version() < (14,):
msg = "Passing `None` dtype in `from_dict` requires PyArrow>=14"
raise NotImplementedError(msg)
res = pa.table(
{
name: pa.chunked_array( # type: ignore[misc]
[data[name] if data else []],
type=narwhals_to_native_dtype(nw_dtype, version=context._version)
if nw_dtype is not None
else None,
)
for name, nw_dtype in schema.items()
}
)
return cls.from_native(pa.table(res), context=context)
pa_schema = nullable_schema.to_schema().to_arrow()
if pa_schema and not data:
native = pa_schema.empty_table()
else:
native = pa.Table.from_pydict(data, schema=pa_schema)
return cls.from_native(native, context=context)

@classmethod
def from_dicts(
Expand All @@ -157,17 +161,13 @@ def from_dicts(
/,
*,
context: _LimitedContext,
schema: IntoSchema | Mapping[str, DType | None] | None,
schema: IntoSchema | IntoNullableSchema | None = None,
) -> Self:
from narwhals.schema import Schema

if schema and any(dtype is None for dtype in schema.values()):
if schema and (nullable_schema := NullableSchema(schema)).is_nullable:
msg = "`from_dicts` with `schema` where any dtype is `None` is not supported for PyArrow."
raise NotImplementedError(msg)
pa_schema = (
Schema(cast("IntoSchema", schema)).to_arrow()
if schema is not None
else schema
nullable_schema.to_schema().to_arrow() if schema is not None else schema
)
if pa_schema and not data:
native = pa_schema.empty_table()
Expand Down Expand Up @@ -195,10 +195,10 @@ def from_numpy(
from narwhals.schema import Schema

arrays = [pa.array(val) for val in data.T]
if isinstance(schema, (Mapping, Schema)):
native = pa.Table.from_arrays(arrays, schema=Schema(schema).to_arrow())
else:
if is_sequence_of(schema, str) or schema is None:
native = pa.Table.from_arrays(arrays, cls._numpy_column_names(data, schema))
else:
native = pa.Table.from_arrays(arrays, schema=Schema(schema).to_arrow())
return cls.from_native(native, context=context)

def __narwhals_namespace__(self) -> ArrowNamespace:
Expand Down
5 changes: 3 additions & 2 deletions narwhals/_compliant/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
from narwhals.exceptions import ColumnNotFoundError
from narwhals.typing import (
AsofJoinStrategy,
IntoNullableSchema,
IntoSchema,
JoinStrategy,
MultiColSelector,
Expand Down Expand Up @@ -191,7 +192,7 @@ def from_dict(
/,
*,
context: CompliantNamespaceAny,
schema: IntoSchema | Mapping[str, DType | None] | None,
schema: IntoSchema | IntoNullableSchema | None,
) -> Self: ...
@classmethod
def from_dicts(
Expand All @@ -200,7 +201,7 @@ def from_dicts(
/,
*,
context: _LimitedContext,
schema: IntoSchema | Mapping[str, DType | None] | None,
schema: IntoSchema | IntoNullableSchema | None,
) -> Self: ...
@classmethod
def from_numpy(
Expand Down
28 changes: 17 additions & 11 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

from collections.abc import Iterable, Iterator, Mapping, Sequence
from itertools import chain, product
from typing import TYPE_CHECKING, Any, Callable, Literal, cast, overload

Expand All @@ -23,11 +22,13 @@
from narwhals._typing_compat import assert_never
from narwhals._utils import (
Implementation,
NullableSchema,
_into_arrow_table,
_remap_full_join_keys,
check_column_names_are_unique,
exclude_column_names,
generate_temporary_column_name,
is_sequence_of,
parse_columns_to_drop,
scale_bytes,
zip_strict,
Expand All @@ -36,6 +37,7 @@
from narwhals.exceptions import InvalidOperationError, ShapeError

if TYPE_CHECKING:
from collections.abc import Iterable, Iterator, Mapping, Sequence
from io import BytesIO
from pathlib import Path
from types import ModuleType
Expand All @@ -56,6 +58,7 @@
from narwhals.typing import (
AsofJoinStrategy,
DTypeBackend,
IntoNullableSchema,
IntoSchema,
JoinStrategy,
PivotAgg,
Expand Down Expand Up @@ -148,7 +151,7 @@ def from_dict(
/,
*,
context: _LimitedContext,
schema: IntoSchema | Mapping[str, DType | None] | None,
schema: IntoSchema | IntoNullableSchema | None = None,
) -> Self:
implementation = context._implementation
pdx = implementation.to_native_namespace()
Expand Down Expand Up @@ -183,7 +186,9 @@ def from_dict(
implementation=context._implementation,
version=context._version,
)
for ((key, dtype), backend) in zip(schema.items(), backends)
for ((key, dtype), backend) in zip(
NullableSchema(schema).items(), backends
)
if dtype is not None
}
native = native.astype(native_schema)
Expand All @@ -196,7 +201,7 @@ def from_dicts(
/,
*,
context: _LimitedContext,
schema: IntoSchema | Mapping[str, DType | None] | None,
schema: IntoSchema | IntoNullableSchema | None = None,
) -> Self:
implementation = context._implementation
ns = implementation.to_native_namespace()
Expand All @@ -218,7 +223,9 @@ def from_dicts(
implementation=context._implementation,
version=context._version,
)
for ((key, dtype), backend) in zip(schema.items(), backends)
for ((key, dtype), backend) in zip(
NullableSchema(schema).items(), backends
)
if dtype is not None
}
native = native.astype(native_schema)
Expand Down Expand Up @@ -250,16 +257,15 @@ def from_numpy(

implementation = context._implementation
DataFrame: Constructor = implementation.to_native_namespace().DataFrame
if isinstance(schema, (Mapping, Schema)):
if is_sequence_of(schema, str) or schema is None:
native = DataFrame(data, columns=cls._numpy_column_names(data, schema))
else:
schema = Schema(schema)
it: Iterable[DTypeBackend] = (
get_dtype_backend(native_type, implementation)
for native_type in schema.values()
)
native = DataFrame(data, columns=schema.keys()).astype(
Schema(schema).to_pandas(it)
)
else:
native = DataFrame(data, columns=cls._numpy_column_names(data, schema))
native = DataFrame(data, columns=schema.keys()).astype(schema.to_pandas(it))
return cls.from_native(native, context=context)

def __narwhals_dataframe__(self) -> Self:
Expand Down
17 changes: 10 additions & 7 deletions narwhals/_polars/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@
)
from narwhals._utils import (
Implementation,
NullableSchema,
_into_arrow_table,
convert_str_slice_to_int_slice,
generate_temporary_column_name,
is_compliant_series,
is_index_selector,
is_range,
is_sequence_like,
is_sequence_of,
is_slice_index,
is_slice_none,
parse_columns_to_drop,
Expand Down Expand Up @@ -50,6 +52,7 @@
from narwhals.dataframe import DataFrame, LazyFrame
from narwhals.dtypes import DType
from narwhals.typing import (
IntoNullableSchema,
IntoSchema,
JoinStrategy,
MultiColSelector,
Expand Down Expand Up @@ -317,14 +320,14 @@ def from_dict(
/,
*,
context: _LimitedContext,
schema: IntoSchema | Mapping[str, DType | None] | None,
schema: IntoSchema | IntoNullableSchema | None = None,
) -> Self:
pl_schema = (
{
key: narwhals_to_native_dtype(dtype, context._version)
if dtype is not None
else None
for (key, dtype) in schema.items()
for (key, dtype) in NullableSchema(schema).items()
}
if schema
else None
Expand All @@ -338,14 +341,14 @@ def from_dicts(
/,
*,
context: _LimitedContext,
schema: IntoSchema | Mapping[str, DType | None] | None,
schema: IntoSchema | IntoNullableSchema | None = None,
) -> Self:
pl_schema = (
{
key: narwhals_to_native_dtype(dtype, context._version)
if dtype is not None
else None
for (key, dtype) in schema.items()
for (key, dtype) in NullableSchema(schema).items()
}
if schema
else None
Expand Down Expand Up @@ -378,9 +381,9 @@ def from_numpy(
from narwhals.schema import Schema

pl_schema = (
Schema(schema).to_polars()
if isinstance(schema, (Mapping, Schema))
else schema
schema
if is_sequence_of(schema, str) or schema is None
else Schema(schema).to_polars()
)
return cls.from_native(pl.from_numpy(data, pl_schema), context=context)

Expand Down
23 changes: 23 additions & 0 deletions narwhals/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import sys
from collections import OrderedDict
from collections.abc import Collection, Container, Iterable, Iterator, Mapping, Sequence
from datetime import timezone
from enum import Enum, auto
Expand Down Expand Up @@ -111,13 +112,17 @@
)
from narwhals.dataframe import DataFrame, LazyFrame
from narwhals.dtypes import DType
from narwhals.schema import Schema
from narwhals.series import Series
from narwhals.typing import (
CompliantDataFrame,
CompliantLazyFrame,
CompliantSeries,
DTypes,
FileSource,
IntoDType,
IntoNullableSchema,
IntoSchema,
IntoSeriesT,
MultiIndexSelector,
SingleIndexSelector,
Expand Down Expand Up @@ -2107,3 +2112,21 @@ def extend_bool(
Stolen from https://github.com/pola-rs/polars/blob/b8bfb07a4a37a8d449d6d1841e345817431142df/py-polars/polars/_utils/various.py#L580-L594
"""
return (value,) * n_match if isinstance(value, bool) else tuple(value)


class NullableSchema(OrderedDict[str, "IntoDType | None"]):
Copy link
Member Author

@FBruzzesi FBruzzesi Oct 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason for this class is mostly twofold:

  • Serve as a utility to convert a Sequence[tuple[str, DType | None]] into a mapping, which makes it easy to use the same API (for key, dtype in obj.items()).
  • Easily set a flag to know if any value passed is None

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name Nullable* is making me think this is related to #3176 (comment) again 🫣

I'm not opposed to having classes to make our internal API cleaner btw 👍

Copy link
Member Author

@FBruzzesi FBruzzesi Nov 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name Nullable* is making me think this is related to #3176 (comment) again 🫣

Not sure I see how it's related. Passing None seems more like a free pass, a "live and let live" kind of behavior.

I'm not opposed to having classes to make our internal API cleaner btw 👍

Any preference in having it prefixed by _?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any preference in having it prefixed by _?

It is already in _utils, so no need for a prefix 🙂

The current _ names are an artifact from when we had everything in utils.py

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I see how it's related. Passing None seems more like to be a free card, "live and let live" kind of behavior.

I'll try to give a more complete explanation later, but I was hinting at Nullable, Null, None being a bit overloaded.
Linking to (#3176 (comment)) was supposed to show that I made this mistake already 😂

def __init__(self, schema: IntoSchema | IntoNullableSchema | None = None) -> None:
    """Populate the ordered mapping and record whether any dtype is `None`.

    Arguments:
        schema: Mapping, or iterable of `(name, dtype)` pairs, where a dtype
            may be `None`. Any falsy value (`None`, empty mapping, empty
            sequence) produces an empty mapping.
    """
    super().__init__(schema or {})

    # Identity check on purpose: `None in self.values()` falls back to `==`
    # and would therefore call every dtype's `__eq__` with `None`.
    self.is_nullable = any(dtype is None for dtype in self.values())

def to_schema(self) -> Schema:
    """Convert to a `Schema`, refusing mappings that hold a `None` dtype.

    No entry is dropped or filtered: the conversion only succeeds when
    `is_nullable` is false.

    Returns:
        A `Schema` built from this mapping's `(name, dtype)` items.

    Raises:
        AssertionError: If `is_nullable` is true.
    """
    # Validate first so the failure path does not depend on the
    # function-scope import below.
    if self.is_nullable:
        msg = "Cannot convert nullable mapping into Schema"
        raise AssertionError(msg)

    from narwhals.schema import Schema

    return Schema(self.items())  # type: ignore[arg-type]
Loading
Loading