Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 62 additions & 10 deletions python/python/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,15 @@ def test_json_basic_write_read():
# Read back the dataset
dataset = lance.dataset(dataset_path)

# Verify storage schema
assert len(dataset.schema) == 2
assert dataset.schema.field("id").type == pa.int32()

# Check that JSON field is stored as JSONB internally
storage_field = dataset.schema.field("data")
assert storage_field.type == pa.large_binary()
assert storage_field.metadata is not None
assert b"ARROW:extension:name" in storage_field.metadata
assert storage_field.metadata[b"ARROW:extension:name"] == b"lance.json"
# Verify logical schema exposed to users
logical_schema = dataset.schema
assert len(logical_schema) == 2
assert logical_schema.field("id").type == pa.int32()
logical_field = logical_schema.field("data")
assert (
str(logical_field.type) == "extension<arrow.json>"
or logical_field.type == pa.utf8()
)

# Read data back
result_table = dataset.to_table()
Expand Down Expand Up @@ -355,3 +354,56 @@ def test_json_array_operations():
result = dataset.to_table(filter="json_array_length(data, '$.items') = 0")
assert result.num_rows == 1
assert result["id"][0].as_py() == 3


def test_json_filter_append_missing_json_cast(tmp_path: Path):
    """Ensure appending via dataset.schema keeps JSON columns valid."""

    ds_path = tmp_path / "json_append_issue.lance"

    # Seed the dataset with one row whose JSON column mirrors a plain column.
    seed_table = pa.table(
        {
            "article_metadata": pa.array(
                [json.dumps({"article_journal": "Cell"})], type=pa.json_()
            ),
            "article_journal": pa.array(["Cell"], type=pa.string()),
        }
    )
    lance.write_dataset(seed_table, ds_path)

    ds = lance.dataset(ds_path)
    logical_schema = ds.schema
    meta_field = logical_schema.field("article_metadata")
    # The logical schema should expose JSON either as the Arrow extension
    # type or as plain utf8, depending on the pyarrow version.
    assert (
        str(meta_field.type) == "extension<arrow.json>"
        or meta_field.type == pa.utf8()
    )

    # Two more rows, cast through the dataset's logical schema before append.
    extra_rows = pa.table(
        {
            "article_metadata": pa.array(
                [
                    json.dumps({"article_journal": "PLoS One"}),
                    json.dumps({"article_journal": "Nature"}),
                ],
                type=pa.json_(),
            ),
            "article_journal": pa.array(["PLoS One", "Nature"], type=pa.string()),
        }
    )
    cast_rows = extra_rows.cast(logical_schema)
    # After the cast, JSON values should materialize as Python strings.
    assert isinstance(cast_rows.column("article_metadata").to_pylist()[0], str)

    lance.write_dataset(cast_rows, ds_path, mode="append")
    ds = lance.dataset(ds_path)
    assert ds.count_rows() == 3

    # Filtering on a JSON path must still see every appended row.
    filtered = ds.to_table(
        filter="json_get(article_metadata, 'article_journal') IS NOT NULL"
    )
    assert filtered.num_rows == 3
    assert filtered.column("article_journal").to_pylist() == [
        "Cell",
        "PLoS One",
        "Nature",
    ]
6 changes: 3 additions & 3 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ use crate::fragment::FileFragment;
use crate::indices::PyIndexConfig;
use crate::rt;
use crate::scanner::ScanStatistics;
use crate::schema::LanceSchema;
use crate::schema::{logical_schema_from_lance, LanceSchema};
use crate::session::Session;
use crate::utils::PyLance;
use crate::{LanceReader, Scanner};
Expand Down Expand Up @@ -592,8 +592,8 @@ impl Dataset {

#[getter(schema)]
fn schema(self_: PyRef<'_, Self>) -> PyResult<PyObject> {
    // Defect fixed: this diff hunk contained both the removed physical-schema
    // statements and the added logical-schema statements, which is not valid
    // Rust; keep only the merged (post-change) version.
    //
    // Return the logical schema — JSON columns exposed with their Arrow
    // extension/user-facing type — rather than the physical storage schema.
    let logical_schema = logical_schema_from_lance(self_.ds.schema());
    logical_schema.to_pyarrow(self_.py())
}

#[getter(lance_schema)]
Expand Down
7 changes: 3 additions & 4 deletions python/src/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ use std::sync::Arc;
use arrow::ffi_stream::ArrowArrayStreamReader;
use arrow::pyarrow::{FromPyArrow, PyArrowType, ToPyArrow};
use arrow_array::RecordBatchReader;
use arrow_schema::Schema as ArrowSchema;
use futures::TryFutureExt;
use lance::dataset::fragment::FileFragment as LanceFragment;
use lance::dataset::scanner::ColumnOrdering;
Expand All @@ -39,7 +38,7 @@ use snafu::location;

use crate::dataset::{get_write_params, transforms_from_python, PyWriteDest};
use crate::error::PythonErrorExt;
use crate::schema::LanceSchema;
use crate::schema::{logical_schema_from_lance, LanceSchema};
use crate::utils::{export_vec, extract_vec, PyLance};
use crate::{rt, Dataset, Scanner};

Expand Down Expand Up @@ -354,8 +353,8 @@ impl FileFragment {

fn schema(self_: PyRef<'_, Self>) -> PyResult<PyObject> {
    // Defect fixed: the hunk mixed the removed `ArrowSchema::from` lines with
    // the added `logical_schema_from_lance` lines; keep the merged version.
    //
    // Expose the owning dataset's logical (JSON-aware) schema for this
    // fragment, matching Dataset::schema.
    let schema = self_.fragment.dataset().schema();
    let logical_schema = logical_schema_from_lance(schema);
    logical_schema.to_pyarrow(self_.py())
}

/// Returns the data file objects associated with this fragment.
Expand Down
9 changes: 6 additions & 3 deletions python/src/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use pyo3::exceptions::PyValueError;

use crate::reader::LanceReader;
use crate::rt;
use crate::schema::logical_arrow_schema;

/// This will be wrapped by a python class to provide
/// additional functionality
Expand Down Expand Up @@ -99,9 +100,11 @@ impl Scanner {
#[getter(schema)]
fn schema(self_: PyRef<'_, Self>) -> PyResult<PyObject> {
    // Defect fixed: the diff hunk carried both the old chained expression and
    // the new two-step version simultaneously; keep the merged version only.
    //
    // Resolve the scanner's output schema on the async runtime, then convert
    // it to the logical (JSON extension) form before handing it to Python.
    let scanner = self_.scanner.clone();
    let schema = rt()
        .spawn(Some(self_.py()), async move { scanner.schema().await })?
        .map_err(|err| PyValueError::new_err(err.to_string()))?;
    let logical_schema = logical_arrow_schema(schema.as_ref());
    logical_schema.to_pyarrow(self_.py())
}

#[pyo3(signature = (*, verbose = false))]
Expand Down
21 changes: 21 additions & 0 deletions python/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
// SPDX-FileCopyrightText: Copyright The Lance Authors

use arrow::pyarrow::PyArrowType;
use arrow_array::RecordBatch;
use arrow_schema::Schema as ArrowSchema;
use lance::datatypes::{Field, Schema};
use lance_arrow::json::{convert_lance_json_to_arrow, has_json_fields};
use lance_file::datatypes::{Fields, FieldsWithMeta};
use lance_file::format::pb;
use prost::Message;
Expand Down Expand Up @@ -165,3 +167,22 @@ impl LanceSchema {
Ok(self.0.field(name).map(|f| LanceField(f.clone())))
}
}

/// Map a physical Arrow schema to the logical form shown to users: fields
/// flagged as Lance JSON storage are rewritten via the batch converter.
///
/// Best effort: returns the input schema unchanged when it has no JSON
/// fields, or when the conversion fails.
pub(crate) fn logical_arrow_schema(schema: &ArrowSchema) -> ArrowSchema {
    use std::sync::Arc;

    let has_json = schema
        .fields()
        .iter()
        .any(|f| has_json_fields(f.as_ref()));
    if !has_json {
        return schema.clone();
    }

    // Run the batch-level converter over an empty batch so the field
    // rewriting logic stays in a single place.
    let empty_batch = RecordBatch::new_empty(Arc::new(schema.clone()));
    convert_lance_json_to_arrow(&empty_batch)
        .map(|converted| converted.schema().as_ref().clone())
        .unwrap_or_else(|_| schema.clone())
}

/// Logical (user-facing) Arrow schema for a Lance schema: converts to the
/// physical Arrow schema first, then rewrites JSON storage fields.
pub(crate) fn logical_schema_from_lance(schema: &Schema) -> ArrowSchema {
    let physical: ArrowSchema = ArrowSchema::from(schema);
    logical_arrow_schema(&physical)
}
Loading