Skip to content

Commit 2e2c139

Browse files
caching strings from JSON (#1240)
Co-authored-by: David Hewitt <[email protected]>
1 parent c607fd8 commit 2e2c139

27 files changed

+195
-59
lines changed

.mypy-stubtest-allowlist

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
# TODO: don't want to expose this staticmethod, requires https://github.com/PyO3/pyo3/issues/2384
22
pydantic_core._pydantic_core.PydanticUndefinedType.new
3+
# As per #1240, from_json has custom logic to cover the `cache_strings` kwarg
4+
pydantic_core._pydantic_core.from_json

Cargo.lock

Lines changed: 2 additions & 13 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ base64 = "0.21.7"
4444
num-bigint = "0.4.4"
4545
python3-dll-a = "0.2.7"
4646
uuid = "1.7.0"
47-
jiter = { version = "0.1.0", features = ["python"] }
47+
jiter = { version = "0.1.1", features = ["python"] }
4848

4949
[lib]
5050
name = "_pydantic_core"

python/pydantic_core/_pydantic_core.pyi

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -390,17 +390,26 @@ def to_json(
390390
JSON bytes.
391391
"""
392392

393-
def from_json(data: str | bytes | bytearray, *, allow_inf_nan: bool = True, cache_strings: bool = True) -> Any:
393+
def from_json(
394+
data: str | bytes | bytearray,
395+
*,
396+
allow_inf_nan: bool = True,
397+
cache_strings: bool | Literal['all', 'keys', 'none'] = True,
398+
allow_partial: bool = False,
399+
) -> Any:
394400
"""
395401
Deserialize JSON data to a Python object.
396402
397-
This is effectively a faster version of `json.loads()`.
403+
This is effectively a faster version of `json.loads()`, with some extra functionality.
398404
399405
Arguments:
400406
data: The JSON data to deserialize.
401407
allow_inf_nan: Whether to allow `Infinity`, `-Infinity` and `NaN` values as `json.loads()` does by default.
402408
cache_strings: Whether to cache strings to avoid constructing new Python objects,
403-
this should have a significant impact on performance while increasing memory usage slightly.
409+
this should have a significant impact on performance while increasing memory usage slightly,
410+
`'all'`/`True` means cache all strings, `'keys'` means cache only dict keys, `'none'`/`False` means no caching.
411+
allow_partial: Whether to allow partial deserialization; if `True`, the data parsed so far is returned when the end of the
412+
input is reached before the full object is deserialized, e.g. `["aa", "bb", "c` would return `['aa', 'bb']`.
404413
405414
Raises:
406415
ValueError: If deserialization fails.

python/pydantic_core/core_schema.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ class CoreConfig(TypedDict, total=False):
7575
Requires exceptiongroup backport pre Python 3.11.
7676
coerce_numbers_to_str: Whether to enable coercion of any `Number` type to `str` (not applicable in `strict` mode).
7777
regex_engine: The regex engine to use for regex pattern validation. Default is 'rust-regex'. See `StringSchema`.
78+
cache_strings: Whether to cache strings. Default is `True`; `True` or `'all'` is required to cache strings
79+
during general validation since validators don't know if they're in a key or a value.
7880
"""
7981

8082
title: str
@@ -110,6 +112,7 @@ class CoreConfig(TypedDict, total=False):
110112
validation_error_cause: bool # default: False
111113
coerce_numbers_to_str: bool # default: False
112114
regex_engine: Literal['rust-regex', 'python-re'] # default: 'rust-regex'
115+
cache_strings: Union[bool, Literal['all', 'keys', 'none']] # default: True
113116

114117

115118
IncExCall: TypeAlias = 'set[int | str] | dict[int | str, IncExCall] | None'

src/input/return_enums.rs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
33
use std::ops::Rem;
44
use std::str::FromStr;
55

6-
use jiter::{JsonArray, JsonValue};
6+
use jiter::{JsonArray, JsonValue, StringCacheMode};
77
use num_bigint::BigInt;
88

99
use pyo3::exceptions::PyTypeError;
@@ -435,9 +435,15 @@ impl<'a> EitherString<'a> {
435435
}
436436
}
437437

438-
pub fn as_py_string(&'a self, py: Python<'a>) -> Bound<'a, PyString> {
438+
pub fn as_py_string(&'a self, py: Python<'a>, cache_str: StringCacheMode) -> Bound<'a, PyString> {
439439
match self {
440-
Self::Cow(cow) => PyString::new_bound(py, cow),
440+
Self::Cow(cow) => {
441+
if matches!(cache_str, StringCacheMode::All) {
442+
jiter::cached_py_string(py, cow.as_ref())
443+
} else {
444+
PyString::new_bound(py, cow.as_ref())
445+
}
446+
}
441447
Self::Py(py_string) => py_string.clone(),
442448
}
443449
}
@@ -461,12 +467,6 @@ impl<'a> From<Bound<'a, PyString>> for EitherString<'a> {
461467
}
462468
}
463469

464-
impl<'a> IntoPy<PyObject> for EitherString<'a> {
465-
fn into_py(self, py: Python<'_>) -> PyObject {
466-
self.as_py_string(py).into_py(py)
467-
}
468-
}
469-
470470
pub fn py_string_str<'a>(py_str: &'a Bound<'_, PyString>) -> ValResult<&'a str> {
471471
py_str.to_str().map_err(|_| {
472472
ValError::new_custom_input(

src/lib.rs

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ extern crate core;
44

55
use std::sync::OnceLock;
66

7+
use jiter::StringCacheMode;
78
use pyo3::exceptions::PyTypeError;
89
use pyo3::{prelude::*, sync::GILOnceCell};
910

@@ -38,19 +39,31 @@ pub use validators::{validate_core_schema, PySome, SchemaValidator};
3839

3940
use crate::input::Input;
4041

41-
#[pyfunction(signature = (data, *, allow_inf_nan=true, cache_strings=true))]
42+
#[derive(FromPyObject)]
43+
pub enum CacheStringsArg {
44+
Bool(bool),
45+
Literal(StringCacheMode),
46+
}
47+
48+
#[pyfunction(signature = (data, *, allow_inf_nan=true, cache_strings=CacheStringsArg::Bool(true), allow_partial=false))]
4249
pub fn from_json<'py>(
4350
py: Python<'py>,
4451
data: &Bound<'_, PyAny>,
4552
allow_inf_nan: bool,
46-
cache_strings: bool,
53+
cache_strings: CacheStringsArg,
54+
allow_partial: bool,
4755
) -> PyResult<Bound<'py, PyAny>> {
4856
let v_match = data
4957
.validate_bytes(false)
5058
.map_err(|_| PyTypeError::new_err("Expected bytes, bytearray or str"))?;
5159
let json_either_bytes = v_match.into_inner();
5260
let json_bytes = json_either_bytes.as_slice();
53-
jiter::python_parse(py, json_bytes, allow_inf_nan, cache_strings).map_err(|e| jiter::map_json_error(json_bytes, &e))
61+
let cache_mode = match cache_strings {
62+
CacheStringsArg::Bool(b) => b.into(),
63+
CacheStringsArg::Literal(mode) => mode,
64+
};
65+
jiter::python_parse(py, json_bytes, allow_inf_nan, cache_mode, allow_partial)
66+
.map_err(|e| jiter::map_json_error(json_bytes, &e))
5467
}
5568

5669
pub fn get_pydantic_core_version() -> &'static str {

src/validators/arguments.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ impl BuildValidator for ArgumentsValidator {
5555
for (arg_index, arg) in arguments_schema.iter().enumerate() {
5656
let arg = arg.downcast::<PyDict>()?;
5757

58-
let name: String = arg.get_as_req(intern!(py, "name"))?;
58+
let py_name: Bound<PyString> = arg.get_as_req(intern!(py, "name"))?;
59+
let name = py_name.to_string();
5960
let mode = arg.get_as::<Bound<'_, PyString>>(intern!(py, "mode"))?;
6061
let mode = mode
6162
.as_ref()
@@ -77,7 +78,7 @@ impl BuildValidator for ArgumentsValidator {
7778
}
7879
None => Some(LookupKey::from_string(py, &name)),
7980
};
80-
kwarg_key = Some(PyString::new_bound(py, &name).into());
81+
kwarg_key = Some(py_name.into_py(py));
8182
}
8283

8384
let schema = arg.get_as_req(intern!(py, "schema"))?;
@@ -274,7 +275,9 @@ impl Validator for ArgumentsValidator {
274275
if !used_kwargs.contains(either_str.as_cow()?.as_ref()) {
275276
match self.var_kwargs_validator {
276277
Some(ref validator) => match validator.validate(py, value.borrow_input(), state) {
277-
Ok(value) => output_kwargs.set_item(either_str.as_py_string(py), value)?,
278+
Ok(value) => {
279+
output_kwargs.set_item(either_str.as_py_string(py, state.cache_str()), value)?;
280+
}
278281
Err(ValError::LineErrors(line_errors)) => {
279282
for err in line_errors {
280283
errors.push(err.with_outer_location(raw_key.clone()));

src/validators/dataclass.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,10 @@ impl Validator for DataclassArgsValidator {
302302
if let Some(ref validator) = self.extras_validator {
303303
match validator.validate(py, value.borrow_input(), state) {
304304
Ok(value) => {
305-
output_dict.set_item(either_str.as_py_string(py), value)?;
305+
output_dict.set_item(
306+
either_str.as_py_string(py, state.cache_str()),
307+
value,
308+
)?;
306309
}
307310
Err(ValError::LineErrors(line_errors)) => {
308311
for err in line_errors {
@@ -312,7 +315,8 @@ impl Validator for DataclassArgsValidator {
312315
Err(err) => return Err(err),
313316
}
314317
} else {
315-
output_dict.set_item(either_str.as_py_string(py), value)?;
318+
output_dict
319+
.set_item(either_str.as_py_string(py, state.cache_str()), value)?;
316320
}
317321
}
318322
}
@@ -455,7 +459,7 @@ impl BuildValidator for DataclassValidator {
455459
let validator = build_validator(&sub_schema, config, definitions)?;
456460

457461
let post_init = if schema.get_as::<bool>(intern!(py, "post_init"))?.unwrap_or(false) {
458-
Some(PyString::new_bound(py, "__post_init__").into())
462+
Some(intern!(py, "__post_init__").into_py(py))
459463
} else {
460464
None
461465
};

src/validators/generator.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ pub struct InternalValidator {
219219
validation_mode: InputType,
220220
hide_input_in_errors: bool,
221221
validation_error_cause: bool,
222+
cache_str: jiter::StringCacheMode,
222223
}
223224

224225
impl fmt::Debug for InternalValidator {
@@ -250,6 +251,7 @@ impl InternalValidator {
250251
validation_mode: extra.input_type,
251252
hide_input_in_errors,
252253
validation_error_cause,
254+
cache_str: extra.cache_str,
253255
}
254256
}
255257

@@ -268,6 +270,7 @@ impl InternalValidator {
268270
from_attributes: self.from_attributes,
269271
context: self.context.as_ref().map(|data| data.bind(py)),
270272
self_instance: self.self_instance.as_ref().map(|data| data.bind(py)),
273+
cache_str: self.cache_str,
271274
};
272275
let mut state = ValidationState::new(extra, &mut self.recursion_guard);
273276
state.exactness = self.exactness;
@@ -302,6 +305,7 @@ impl InternalValidator {
302305
from_attributes: self.from_attributes,
303306
context: self.context.as_ref().map(|data| data.bind(py)),
304307
self_instance: self.self_instance.as_ref().map(|data| data.bind(py)),
308+
cache_str: self.cache_str,
305309
};
306310
let mut state = ValidationState::new(extra, &mut self.recursion_guard);
307311
state.exactness = self.exactness;

0 commit comments

Comments
 (0)