From e6c3e8f4621dd1ffdacb6584e8f8d939f7323a28 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 18:48:33 -0600 Subject: [PATCH 1/6] use jiter/ascii-string-creation --- Cargo.lock | 10 ++++++++-- Cargo.toml | 4 +++- src/input/return_enums.rs | 3 ++- src/validators/validation_state.rs | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e120e74c4..79cf3b1b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -42,6 +42,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bytecount" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" + [[package]] name = "cc" version = "1.0.79" @@ -139,8 +145,7 @@ checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a" [[package]] name = "jiter" version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c0b7c896d2b1da897be13affb0bbf7bff95437e9c50823ede962addadae58d8" +source = "git+https://github.com/pydantic/jiter?branch=ascii-string-creation#85c6285458c5b14fa4840fed08ee0e7a641ed80a" dependencies = [ "ahash", "lexical-parse-float", @@ -297,6 +302,7 @@ version = "2.17.0" dependencies = [ "ahash", "base64", + "bytecount", "enum_dispatch", "idna", "jiter", diff --git a/Cargo.toml b/Cargo.toml index 420509290..055688d21 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,9 @@ base64 = "0.21.7" num-bigint = "0.4.4" python3-dll-a = "0.2.7" uuid = "1.7.0" -jiter = { version = "0.1.1", features = ["python"] } +bytecount = { version = "0.6.7", default_features = false, features = ["runtime-dispatch-simd"] } +#jiter = { version = "0.1.1", features = ["python"] } +jiter = { git = "https://github.com/pydantic/jiter", branch = "ascii-string-creation", features = ["python"] } [lib] name = "_pydantic_core" diff --git a/src/input/return_enums.rs b/src/input/return_enums.rs index 5d1019f05..719a2e373 100644 --- a/src/input/return_enums.rs +++ b/src/input/return_enums.rs @@ -439,7 +439,8 @@ impl<'a> EitherString<'a> { match self { Self::Cow(cow) => { if matches!(cache_str, StringCacheMode::All) { - jiter::cached_py_string(py, cow.as_ref()) + let s = cow.as_ref(); + jiter::cached_py_string(py, s, bytecount::num_chars(s.as_bytes()) == s.len()) } else { PyString::new_bound(py, cow.as_ref()) } diff --git a/src/validators/validation_state.rs b/src/validators/validation_state.rs index 8e68cd8d9..54b620036 100644 --- a/src/validators/validation_state.rs +++ b/src/validators/validation_state.rs @@ -73,7 +73,7 @@ impl<'a, 'py> ValidationState<'a, 'py> { pub fn maybe_cached_str(&self, py: Python<'py>, s: &str) -> Bound<'py, PyString> { if matches!(self.extra.cache_str, StringCacheMode::All) { - jiter::cached_py_string(py, s) + jiter::cached_py_string(py, s, bytecount::num_chars(s.as_bytes()) == s.len()) } else { PyString::new_bound(py, s) } From abcdbb74e91023f225d327ddbf08529aa4a31323 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Wed, 27 Mar 2024 18:59:50 -0600 Subject: [PATCH 2/6] simplify, use without caching --- src/input/return_enums.rs | 11 ++--------- src/tools.rs | 11 +++++++++++ src/validators/validation_state.rs | 7 ++----- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/input/return_enums.rs b/src/input/return_enums.rs index 719a2e373..d28bac2ec 100644 --- a/src/input/return_enums.rs +++ b/src/input/return_enums.rs @@ -19,7 +19,7 @@ use serde::{ser::Error, Serialize, Serializer}; use crate::errors::{ py_err_string, ErrorType, ErrorTypeDefaults, InputValue, ToErrorValue, ValError, ValLineError, ValResult, }; -use crate::tools::{extract_i64, py_err}; +use crate::tools::{extract_i64, new_py_string, py_err}; use crate::validators::{CombinedValidator, Exactness, ValidationState, Validator}; use super::{py_error_on_minusone, BorrowInput, Input}; @@ -437,14 +437,7 @@ impl<'a> EitherString<'a> { pub fn as_py_string(&'a self, py: Python<'a>, cache_str: StringCacheMode) -> Bound<'a, PyString> { match self { - Self::Cow(cow) => { - if matches!(cache_str, StringCacheMode::All) { - let s = cow.as_ref(); - jiter::cached_py_string(py, s, bytecount::num_chars(s.as_bytes()) == s.len()) - } else { - PyString::new_bound(py, cow.as_ref()) - } - } + Self::Cow(cow) => new_py_string(py, cow.as_ref(), cache_str), Self::Py(py_string) => py_string.clone(), } } diff --git a/src/tools.rs b/src/tools.rs index a823311a0..7bc113aea 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -5,6 +5,8 @@ use pyo3::prelude::*; use pyo3::types::{PyDict, PyString}; use pyo3::{ffi, intern, FromPyObject}; +use jiter::{cached_py_string, pystring_fast_new, StringCacheMode}; + pub trait SchemaDict<'py> { fn get_as(&self, key: &Bound<'_, PyString>) -> PyResult> where @@ -143,3 +145,12 @@ pub fn extract_i64(v: &Bound<'_, PyAny>) -> Option { None } } + +pub(crate) fn new_py_string<'py>(py: Python<'py>, s: &str, cache_str: StringCacheMode) -> Bound<'py, PyString> { + let ascii_only = bytecount::num_chars(s.as_bytes()) == s.len(); + if matches!(cache_str, StringCacheMode::All) { + cached_py_string(py, s, ascii_only) + } else { + pystring_fast_new(py, s, ascii_only) + } +} diff --git a/src/validators/validation_state.rs b/src/validators/validation_state.rs index 54b620036..ef6954618 100644 --- a/src/validators/validation_state.rs +++ b/src/validators/validation_state.rs @@ -4,6 +4,7 @@ use pyo3::types::PyString; use jiter::StringCacheMode; use crate::recursion_guard::{ContainsRecursionState, RecursionState}; +use crate::tools::new_py_string; use super::Extra; @@ -72,11 +73,7 @@ impl<'a, 'py> ValidationState<'a, 'py> { } pub fn maybe_cached_str(&self, py: Python<'py>, s: &str) -> Bound<'py, PyString> { - if matches!(self.extra.cache_str, StringCacheMode::All) { - jiter::cached_py_string(py, s, bytecount::num_chars(s.as_bytes()) == s.len()) - } else { - PyString::new_bound(py, s) - } + new_py_string(py, s, self.extra.cache_str) } } From 05ff9c14025af4755bcf7d9c6bd78b48f9f4ca2a Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Tue, 2 Apr 2024 13:19:59 +0100 Subject: [PATCH 3/6] adopt jiter main --- Cargo.lock | 9 +------- Cargo.toml | 3 +-- src/input/shared.rs | 55 +++++++++++++++++++++------------------------ src/tools.rs | 3 ++- 4 files changed, 30 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 79cf3b1b0..646897d09 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -42,12 +42,6 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" -[[package]] -name = "bytecount" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" - [[package]] name = "cc" version = "1.0.79" @@ -145,7 +139,7 @@ checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a" [[package]] name = "jiter" version = "0.1.1" -source = "git+https://github.com/pydantic/jiter?branch=ascii-string-creation#85c6285458c5b14fa4840fed08ee0e7a641ed80a" +source = "git+https://github.com/pydantic/jiter?branch=main#f6b698e8a8a17a8ea611591fce389eeba2338461" dependencies = [ "ahash", "lexical-parse-float", @@ -302,7 +296,6 @@ version = "2.17.0" dependencies = [ "ahash", "base64", - "bytecount", "enum_dispatch", "idna", "jiter", diff --git a/Cargo.toml b/Cargo.toml index 055688d21..d4e6818c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,9 +44,8 @@ base64 = "0.21.7" num-bigint = "0.4.4" python3-dll-a = "0.2.7" uuid = "1.7.0" -bytecount = { version = "0.6.7", default_features = false, features = ["runtime-dispatch-simd"] } #jiter = { version = "0.1.1", features = ["python"] } -jiter = { git = "https://github.com/pydantic/jiter", branch = "ascii-string-creation", features = ["python"] } +jiter = { git = "https://github.com/pydantic/jiter", branch = "main", features = ["python"] } [lib] name = "_pydantic_core" diff --git a/src/input/shared.rs b/src/input/shared.rs index 5f0040e3e..e99bfabcf 100644 --- a/src/input/shared.rs +++ b/src/input/shared.rs @@ -2,7 +2,7 @@ use pyo3::prelude::*; use pyo3::sync::GILOnceCell; use pyo3::{intern, Py, PyAny, Python}; -use num_bigint::BigInt; +use jiter::{JsonErrorType, NumberInt}; use crate::errors::{ErrorTypeDefaults, ValError, ValResult}; @@ -68,29 +68,24 @@ fn strip_underscores(s: &str) -> Option { } /// parse a string as an int -/// -/// max length of the input is 4300, see -/// https://docs.python.org/3/whatsnew/3.11.html#other-cpython-implementation-changes and -/// https://github.com/python/cpython/issues/95778 for more info in that length bound pub fn str_as_int<'py>(input: &(impl Input<'py> + ?Sized), str: &str) -> ValResult> { let str = str.trim(); - let len = str.len(); - if len > 4300 { - Err(ValError::new(ErrorTypeDefaults::IntParsingSize, input)) - } else if let Some(int) = _parse_str(input, str, len) { - Ok(int) - } else if let Some(str_stripped) = strip_decimal_zeros(str) { - if let Some(int) = _parse_str(input, str_stripped, len) { - Ok(int) - } else { - Err(ValError::new(ErrorTypeDefaults::IntParsing, input)) + + // we have to call `NumberInt::try_from` directly first so we fail fast if the string is too long + match NumberInt::try_from(str.as_bytes()) { + Ok(NumberInt::Int(i)) => return Ok(EitherInt::I64(i)), + Ok(NumberInt::BigInt(i)) => return Ok(EitherInt::BigInt(i)), + Err(e) => { + if e.error_type == JsonErrorType::NumberOutOfRange { + return Err(ValError::new(ErrorTypeDefaults::IntParsingSize, input)); + } } + } + + if let Some(str_stripped) = strip_decimal_zeros(str) { + _parse_str(input, str_stripped) } else if let Some(str_stripped) = strip_underscores(str) { - if let Some(int) = _parse_str(input, &str_stripped, len) { - Ok(int) - } else { - Err(ValError::new(ErrorTypeDefaults::IntParsing, input)) - } + _parse_str(input, &str_stripped) } else { Err(ValError::new(ErrorTypeDefaults::IntParsing, input)) } @@ -108,16 +103,18 @@ pub fn str_as_float<'py>(input: &(impl Input<'py> + ?Sized), str: &str) -> ValRe } /// parse a string as an int, `input` is required here to get lifetimes to match up -/// -fn _parse_str<'py>(_input: &(impl Input<'py> + ?Sized), str: &str, len: usize) -> Option> { - if len < 19 { - if let Ok(i) = str.parse::() { - return Some(EitherInt::I64(i)); - } - } else if let Ok(i) = str.parse::() { - return Some(EitherInt::BigInt(i)); +/// max length of the input is 4300 which is checked by jiter, see +/// https://docs.python.org/3/whatsnew/3.11.html#other-cpython-implementation-changes and +/// https://github.com/python/cpython/issues/95778 for more info in that length bound +fn _parse_str<'py>(input: &(impl Input<'py> + ?Sized), str: &str) -> ValResult> { + match NumberInt::try_from(str.as_bytes()) { + Ok(jiter::NumberInt::Int(i)) => Ok(EitherInt::I64(i)), + Ok(jiter::NumberInt::BigInt(i)) => Ok(EitherInt::BigInt(i)), + Err(e) => match e.error_type { + JsonErrorType::NumberOutOfRange => Err(ValError::new(ErrorTypeDefaults::IntParsingSize, input)), + _ => Err(ValError::new(ErrorTypeDefaults::IntParsing, input)), + }, } - None } /// we don't want to parse as f64 then call `float_as_int` as it can loose precision for large ints, therefore diff --git a/src/tools.rs b/src/tools.rs index 7bc113aea..28c661d8f 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -147,7 +147,8 @@ pub fn extract_i64(v: &Bound<'_, PyAny>) -> Option { } pub(crate) fn new_py_string<'py>(py: Python<'py>, s: &str, cache_str: StringCacheMode) -> Bound<'py, PyString> { - let ascii_only = bytecount::num_chars(s.as_bytes()) == s.len(); + // we could use `bytecount::num_chars(s.as_bytes()) == s.len()` as orjson does, but it doesn't appear to be faster + let ascii_only = false; if matches!(cache_str, StringCacheMode::All) { cached_py_string(py, s, ascii_only) } else { From d7e096fadba0cea82826e6feaccac65ac260c9c6 Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Tue, 2 Apr 2024 15:00:03 +0100 Subject: [PATCH 4/6] update jiter --- Cargo.lock | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 646897d09..b470e623f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -139,13 +139,14 @@ checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a" [[package]] name = "jiter" version = "0.1.1" -source = "git+https://github.com/pydantic/jiter?branch=main#f6b698e8a8a17a8ea611591fce389eeba2338461" +source = "git+https://github.com/pydantic/jiter?branch=main#a5f7dac3f4564f7e4f6c5c43ef0b3da0258c7f46" dependencies = [ "ahash", "lexical-parse-float", "num-bigint", "num-traits", "pyo3", + "pyo3-build-config", "smallvec", ] From 18bba32f4f7904c53688d4cb580fa3cc35644db4 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Tue, 2 Apr 2024 17:30:44 +0100 Subject: [PATCH 5/6] uprev jiter --- Cargo.lock | 5 +++-- Cargo.toml | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b470e623f..07b2aa47e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -138,8 +138,9 @@ checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a" [[package]] name = "jiter" -version = "0.1.1" -source = "git+https://github.com/pydantic/jiter?branch=main#a5f7dac3f4564f7e4f6c5c43ef0b3da0258c7f46" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21d939cf1a3ca43a001361bc0f405a506b1d5a68dcd812cdb74fd7191e747b65" dependencies = [ "ahash", "lexical-parse-float", diff --git a/Cargo.toml b/Cargo.toml index d4e6818c0..7a61f70bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,8 +44,7 @@ base64 = "0.21.7" num-bigint = "0.4.4" python3-dll-a = "0.2.7" uuid = "1.7.0" -#jiter = { version = "0.1.1", features = ["python"] } -jiter = { git = "https://github.com/pydantic/jiter", branch = "main", features = ["python"] } +jiter = { version = "0.2.0", features = ["python"] } [lib] name = "_pydantic_core" From 8a4deef33526b625cc117a8696ff88b4c54961eb Mon Sep 17 00:00:00 2001 From: David Hewitt Date: Tue, 2 Apr 2024 18:58:21 +0100 Subject: [PATCH 6/6] uprev jiter again --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 07b2aa47e..441840c77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -138,9 +138,9 @@ checksum = "62b02a5381cc465bd3041d84623d0fa3b66738b52b8e2fc3bab8ad63ab032f4a" [[package]] name = "jiter" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21d939cf1a3ca43a001361bc0f405a506b1d5a68dcd812cdb74fd7191e747b65" +checksum = "8e1177860adcf80c1ae7d7c1d41561f008c7530664caebbfa5ddd8a7f7316b98" dependencies = [ "ahash", "lexical-parse-float", diff --git a/Cargo.toml b/Cargo.toml index 7a61f70bb..69acef742 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ base64 = "0.21.7" num-bigint = "0.4.4" python3-dll-a = "0.2.7" uuid = "1.7.0" -jiter = { version = "0.2.0", features = ["python"] } +jiter = { version = "0.2.1", features = ["python"] } [lib] name = "_pydantic_core"