Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 62 additions & 10 deletions python/python/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,15 @@ def test_json_basic_write_read():
# Read back the dataset
dataset = lance.dataset(dataset_path)

# Verify storage schema
assert len(dataset.schema) == 2
assert dataset.schema.field("id").type == pa.int32()

# Check that JSON field is stored as JSONB internally
storage_field = dataset.schema.field("data")
assert storage_field.type == pa.large_binary()
assert storage_field.metadata is not None
assert b"ARROW:extension:name" in storage_field.metadata
assert storage_field.metadata[b"ARROW:extension:name"] == b"lance.json"
# Verify logical schema exposed to users
logical_schema = dataset.schema
assert len(logical_schema) == 2
assert logical_schema.field("id").type == pa.int32()
logical_field = logical_schema.field("data")
assert (
str(logical_field.type) == "extension<arrow.json>"
or logical_field.type == pa.utf8()
)

# Read data back
result_table = dataset.to_table()
Expand Down Expand Up @@ -355,3 +354,56 @@ def test_json_array_operations():
result = dataset.to_table(filter="json_array_length(data, '$.items') = 0")
assert result.num_rows == 1
assert result["id"][0].as_py() == 3


def test_json_filter_append_missing_json_cast(tmp_path: Path):
"""Ensure appending via dataset.schema keeps JSON columns valid."""

dataset_path = tmp_path / "json_append_issue.lance"

initial_table = pa.table(
{
"article_metadata": pa.array(
[json.dumps({"article_journal": "Cell"})], type=pa.json_()
),
"article_journal": pa.array(["Cell"], type=pa.string()),
}
)

lance.write_dataset(initial_table, dataset_path)
dataset = lance.dataset(dataset_path)
schema = dataset.schema
field = schema.field("article_metadata")
assert str(field.type) == "extension<arrow.json>" or field.type == pa.utf8()

append_table = pa.table(
{
"article_metadata": pa.array(
[
json.dumps({"article_journal": "PLoS One"}),
json.dumps({"article_journal": "Nature"}),
],
type=pa.json_(),
),
"article_journal": pa.array(["PLoS One", "Nature"], type=pa.string()),
}
)

append_cast = append_table.cast(schema)
first_value = append_cast.column("article_metadata").to_pylist()[0]
assert isinstance(first_value, str)

lance.write_dataset(append_cast, dataset_path, mode="append")
dataset = lance.dataset(dataset_path)
assert dataset.count_rows() == 3

result = dataset.to_table(
filter="json_get(article_metadata, 'article_journal') IS NOT NULL"
)

assert result.num_rows == 3
assert result.column("article_journal").to_pylist() == [
"Cell",
"PLoS One",
"Nature",
]
6 changes: 3 additions & 3 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ use crate::fragment::FileFragment;
use crate::indices::PyIndexConfig;
use crate::rt;
use crate::scanner::ScanStatistics;
use crate::schema::LanceSchema;
use crate::schema::{logical_schema_from_lance, LanceSchema};
use crate::session::Session;
use crate::utils::PyLance;
use crate::{LanceReader, Scanner};
Expand Down Expand Up @@ -578,8 +578,8 @@ impl Dataset {

#[getter(schema)]
fn schema(self_: PyRef<'_, Self>) -> PyResult<PyObject> {
let arrow_schema = ArrowSchema::from(self_.ds.schema());
arrow_schema.to_pyarrow(self_.py())
let logical_schema = logical_schema_from_lance(self_.ds.schema());
logical_schema.to_pyarrow(self_.py())
}

#[getter(lance_schema)]
Expand Down
7 changes: 3 additions & 4 deletions python/src/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ use std::sync::Arc;
use arrow::ffi_stream::ArrowArrayStreamReader;
use arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow};
use arrow_array::RecordBatchReader;
use arrow_schema::Schema as ArrowSchema;
use futures::TryFutureExt;
use lance::dataset::fragment::FileFragment as LanceFragment;
use lance::dataset::scanner::ColumnOrdering;
Expand All @@ -39,7 +38,7 @@ use snafu::location;

use crate::dataset::{get_write_params, transforms_from_python, PyWriteDest};
use crate::error::PythonErrorExt;
use crate::schema::LanceSchema;
use crate::schema::{logical_schema_from_lance, LanceSchema};
use crate::utils::{export_vec, extract_vec, PyLance};
use crate::{rt, Dataset, Scanner};

Expand Down Expand Up @@ -354,8 +353,8 @@ impl FileFragment {

fn schema(self_: PyRef<'_, Self>) -> PyResult<PyObject> {
let schema = self_.fragment.dataset().schema();
let arrow_schema: ArrowSchema = schema.into();
arrow_schema.to_pyarrow(self_.py())
let logical_schema = logical_schema_from_lance(schema);
logical_schema.to_pyarrow(self_.py())
}

/// Returns the data file objects associated with this fragment.
Expand Down
9 changes: 6 additions & 3 deletions python/src/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use pyo3::exceptions::PyValueError;

use crate::reader::LanceReader;
use crate::rt;
use crate::schema::logical_arrow_schema;

/// This will be wrapped by a python class to provide
/// additional functionality
Expand Down Expand Up @@ -99,9 +100,11 @@ impl Scanner {
#[getter(schema)]
fn schema(self_: PyRef<'_, Self>) -> PyResult<PyObject> {
let scanner = self_.scanner.clone();
rt().spawn(Some(self_.py()), async move { scanner.schema().await })?
.map(|s| s.to_pyarrow(self_.py()))
.map_err(|err| PyValueError::new_err(err.to_string()))?
let schema = rt()
.spawn(Some(self_.py()), async move { scanner.schema().await })?
.map_err(|err| PyValueError::new_err(err.to_string()))?;
let logical_schema = logical_arrow_schema(schema.as_ref());
logical_schema.to_pyarrow(self_.py())
}

#[pyo3(signature = (*, verbose = false))]
Expand Down
69 changes: 68 additions & 1 deletion python/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-FileCopyrightText: Copyright The Lance Authors

use arrow::pyarrow::PyArrowType;
use arrow_schema::Schema as ArrowSchema;
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
use lance::datatypes::{Field, Schema};
use lance_file::datatypes::{Fields, FieldsWithMeta};
use lance_file::format::pb;
Expand Down Expand Up @@ -165,3 +165,70 @@ impl LanceSchema {
Ok(self.0.field(name).map(|f| LanceField(f.clone())))
}
}

fn logical_data_type(data_type: &DataType) -> DataType {
use std::sync::Arc;

match data_type {
DataType::Struct(fields) => DataType::Struct(
fields
.iter()
.map(|f| Arc::new(logical_field(f.as_ref())))
.collect(),
),
DataType::List(field) => DataType::List(Arc::new(logical_field(field.as_ref()))),
DataType::LargeList(field) => DataType::LargeList(Arc::new(logical_field(field.as_ref()))),
DataType::FixedSizeList(field, len) => {
DataType::FixedSizeList(Arc::new(logical_field(field.as_ref())), *len)
}
DataType::Map(field, keys_sorted) => {
DataType::Map(Arc::new(logical_field(field.as_ref())), *keys_sorted)
}
DataType::Dictionary(index_type, value_type) => DataType::Dictionary(
index_type.clone(),
Box::new(logical_data_type(value_type.as_ref())),
),
_ => data_type.clone(),
}
}

fn logical_field(field: &ArrowField) -> ArrowField {
use lance_arrow::json::{is_json_field, ARROW_JSON_EXT_NAME};
use lance_arrow::ARROW_EXT_NAME_KEY;

if is_json_field(field) {
let mut metadata = field.metadata().clone();
metadata.insert(
ARROW_EXT_NAME_KEY.to_string(),
ARROW_JSON_EXT_NAME.to_string(),
);
let mut new_field = ArrowField::new(field.name(), DataType::Utf8, field.is_nullable());
new_field.set_metadata(metadata);
new_field
} else {
let new_data_type = logical_data_type(field.data_type());
if new_data_type == *field.data_type() {
field.clone()
} else {
let mut new_field = ArrowField::new(field.name(), new_data_type, field.is_nullable());
let metadata = field.metadata().clone();
if !metadata.is_empty() {
new_field.set_metadata(metadata);
}
new_field
}
}
}

pub(crate) fn logical_arrow_schema(schema: &ArrowSchema) -> ArrowSchema {
let fields: Vec<ArrowField> = schema
.fields()
.iter()
.map(|f| logical_field(f.as_ref()))
.collect();
ArrowSchema::new_with_metadata(fields, schema.metadata().clone())
}

pub(crate) fn logical_schema_from_lance(schema: &Schema) -> ArrowSchema {
logical_arrow_schema(&ArrowSchema::from(schema))
}
Loading