diff --git a/Cargo.lock b/Cargo.lock index daaf0aa9..506395df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,12 +15,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "allocator-api2" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" - [[package]] name = "autocfg" version = "1.1.0" @@ -98,10 +92,6 @@ name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" -dependencies = [ - "ahash", - "allocator-api2", -] [[package]] name = "indexmap" @@ -126,7 +116,6 @@ dependencies = [ "ahash", "bencher", "codspeed-bencher-compat", - "hashbrown", "lexical-parse-float", "num-bigint", "num-traits", diff --git a/Cargo.toml b/Cargo.toml index 85da9fa6..1d33b1d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,6 @@ ahash = "0.8.0" smallvec = "1.11.0" pyo3 = { version = "0.21.0-beta.0", default-features=false, features = ["num-bigint"], optional = true } lexical-parse-float = { version = "0.8.5", features = ["format"] } -hashbrown = "0.14.3" [features] python = ["dep:pyo3"] diff --git a/benches/python.rs b/benches/python.rs index 8e20bbb1..7f817aa6 100644 --- a/benches/python.rs +++ b/benches/python.rs @@ -5,16 +5,17 @@ use std::io::Read; use pyo3::Python; -use jiter::python_parse; +use jiter::{cache_clear, python_parse, StringCacheMode}; fn python_parse_numeric(bench: &mut Bencher) { Python::with_gil(|py| { + cache_clear(py); bench.iter(|| { python_parse( py, br#" { "int": 1, "bigint": 123456789012345678901234567890, "float": 1.2} "#, false, - true, + StringCacheMode::All, false, ) .unwrap() @@ -24,64 +25,75 @@ fn python_parse_numeric(bench: &mut Bencher) { fn python_parse_other(bench: &mut Bencher) { Python::with_gil(|py| { - bench.iter(|| python_parse(py, br#"["string", true, false, null]"#, false, true, false).unwrap()); + cache_clear(py); + bench.iter(|| { + python_parse( + py, + br#"["string", true, false, null]"#, + false, + StringCacheMode::All, + false, + ) + .unwrap() + }); }) } -fn _python_parse_file(path: &str, bench: &mut Bencher, cache_strings: bool) { +fn _python_parse_file(path: &str, bench: &mut Bencher, cache_mode: StringCacheMode) { let mut file = File::open(path).unwrap(); let mut contents = String::new(); file.read_to_string(&mut contents).unwrap(); let json_data = contents.as_bytes(); Python::with_gil(|py| { - bench.iter(|| python_parse(py, json_data, false, cache_strings, false).unwrap()); + cache_clear(py); + bench.iter(|| python_parse(py, json_data, false, cache_mode, false).unwrap()); }) } fn python_parse_medium_response_not_cached(bench: &mut Bencher) { - _python_parse_file("./benches/medium_response.json", bench, false); + _python_parse_file("./benches/medium_response.json", bench, StringCacheMode::None); } fn python_parse_medium_response(bench: &mut Bencher) { - _python_parse_file("./benches/medium_response.json", bench, true); + _python_parse_file("./benches/medium_response.json", bench, StringCacheMode::All); } fn python_parse_true_object_not_cached(bench: &mut Bencher) { - _python_parse_file("./benches/true_object.json", bench, false); + _python_parse_file("./benches/true_object.json", bench, StringCacheMode::None); } fn python_parse_string_array_not_cached(bench: &mut Bencher) { - _python_parse_file("./benches/string_array.json", bench, false); + _python_parse_file("./benches/string_array.json", bench, StringCacheMode::None); } fn python_parse_string_array(bench: &mut Bencher) { - _python_parse_file("./benches/string_array.json", bench, true); + _python_parse_file("./benches/string_array.json", bench, StringCacheMode::All); } fn python_parse_x100_not_cached(bench: &mut Bencher) { - _python_parse_file("./benches/x100.json", bench, false); + _python_parse_file("./benches/x100.json", bench, StringCacheMode::None); } fn python_parse_x100(bench: &mut Bencher) { - _python_parse_file("./benches/x100.json", bench, true); + _python_parse_file("./benches/x100.json", bench, StringCacheMode::All); } fn python_parse_string_array_unique_not_cached(bench: &mut Bencher) { - _python_parse_file("./benches/string_array_unique.json", bench, false); + _python_parse_file("./benches/string_array_unique.json", bench, StringCacheMode::None); } fn python_parse_string_array_unique(bench: &mut Bencher) { - _python_parse_file("./benches/string_array_unique.json", bench, true); + _python_parse_file("./benches/string_array_unique.json", bench, StringCacheMode::All); } fn python_parse_true_object(bench: &mut Bencher) { - _python_parse_file("./benches/true_object.json", bench, true); + _python_parse_file("./benches/true_object.json", bench, StringCacheMode::All); } /// Note - caching strings should make no difference here fn python_parse_true_array(bench: &mut Bencher) { - _python_parse_file("./benches/true_array.json", bench, true); + _python_parse_file("./benches/true_array.json", bench, StringCacheMode::All); } benchmark_group!( diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index b13132c8..2c829744 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -4,21 +4,22 @@ version = 3 [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] name = "arbitrary" -version = "1.3.0" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d098ff73c1ca148721f37baad5ea6a465a13f9573aba8641fbbbae8164a54e" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" [[package]] name = "autocfg" @@ -28,9 +29,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "cc" -version = "1.0.83" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" dependencies = [ "jobserver", "libc", @@ -50,9 +51,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", @@ -61,15 +62,15 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" [[package]] name = "indexmap" -version = "2.0.0" +version = "2.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" dependencies = [ "equivalent", "hashbrown", @@ -77,16 +78,16 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jiter" -version = "0.0.5" +version = "0.0.7" dependencies = [ "ahash", - "lexical-core", + "lexical-parse-float", "num-bigint", "num-traits", "smallvec", @@ -107,26 +108,13 @@ dependencies = [ [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" dependencies = [ "libc", ] -[[package]] -name = "lexical-core" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" -dependencies = [ - "lexical-parse-float", - "lexical-parse-integer", - "lexical-util", - "lexical-write-float", - "lexical-write-integer", -] - [[package]] name = "lexical-parse-float" version = "0.8.5" @@ -157,32 +145,11 @@ dependencies = [ "static_assertions", ] -[[package]] -name = "lexical-write-float" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" -dependencies = [ - "lexical-util", - "lexical-write-integer", - "static_assertions", -] - -[[package]] -name = "lexical-write-integer" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" -dependencies = [ - "lexical-util", - "static_assertions", -] - [[package]] name = "libc" -version = "0.2.147" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libfuzzer-sys" @@ -208,67 +175,66 @@ dependencies = [ [[package]] name = "num-integer" -version = "0.1.45" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", "num-traits", ] [[package]] name = "num-traits" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ "autocfg", ] [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "proc-macro2" -version = "1.0.66" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "serde" -version = "1.0.190" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91d3c334ca1ee894a2c6f6ad698fe8c435b76d504b13d436f0685d648d6d96f7" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.190" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67c5609f394e5c2bd7fc51efda478004ea80ef42fee983d5c67a65e34f32c0e3" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", @@ -277,9 +243,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.107" +version = "1.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b420ce6e3d8bd882e9b243c6eed35dbc9a6110c9769e74b584e0d68d1f20c65" +checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" dependencies = [ "indexmap", "itoa", @@ -289,9 +255,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "static_assertions" @@ -301,9 +267,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "syn" -version = "2.0.31" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "718fa2415bcb8d8bd775917a1bf12a7931b6dfa890753378538118181e0cb398" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -312,9 +278,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "version_check" @@ -327,3 +293,23 @@ name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/jiter-python/Cargo.toml b/jiter-python/Cargo.toml index 87240aa1..cb68896a 100644 --- a/jiter-python/Cargo.toml +++ b/jiter-python/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [dependencies] -pyo3 = { git = "https://github.com/davidhewitt/pyo3", branch="complete-py2", version = "0.21.0-dev", default-features=false, features = ["generate-import-lib", "auto-initialize", "num-bigint", "macros"] } +pyo3 = { version = "0.21.0-beta.0", features = ["num-bigint", "auto-initialize"] } jiter = { path = "..", features = ["python"] } [features] diff --git a/jiter-python/bench.py b/jiter-python/bench.py index 475bffec..516b31ec 100644 --- a/jiter-python/bench.py +++ b/jiter-python/bench.py @@ -14,6 +14,7 @@ ('array_short_arrays', '[{}]'.format(', '.join('["a", "b", "c", "d"]' for _ in range(10_000)))), ('one_long_string', json.dumps('x' * 100)), ('one_short_string', b'"foobar"'), + ('1m_strings', json.dumps([str(i) for i in range(1_000_000)])), ] diff --git a/jiter-python/src/lib.rs b/jiter-python/src/lib.rs index 5f875471..3346db4e 100644 --- a/jiter-python/src/lib.rs +++ b/jiter-python/src/lib.rs @@ -1,16 +1,21 @@ use pyo3::prelude::*; -use pyo3::pymodule; -use jiter::{python_parse, map_json_error}; +use jiter::{map_json_error, python_parse}; #[pyfunction(signature = (data, *, allow_inf_nan=true, cache_strings=true))] -pub fn from_json<'py>(py: Python<'py>, data: &'py [u8], allow_inf_nan: bool, cache_strings: bool) -> PyResult> { +pub fn from_json<'py>( + py: Python<'py>, + data: &[u8], + allow_inf_nan: bool, + cache_strings: bool, +) -> PyResult> { + let cache_mode = cache_strings.into(); let json_bytes = data; - python_parse(py, json_bytes, allow_inf_nan, cache_strings).map_err(|e| map_json_error(json_bytes, &e)) + python_parse(py, json_bytes, allow_inf_nan, cache_mode, false).map_err(|e| map_json_error(json_bytes, &e)) } #[pymodule] -fn jiter_python(_py: Python, m: &PyModule) -> PyResult<()> { +fn jiter_python(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(from_json, m)?)?; Ok(()) } diff --git a/src/lib.rs b/src/lib.rs index 374ac4c5..dc3080f0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,8 @@ mod lazy_index_map; mod number_decoder; mod parse; #[cfg(feature = "python")] +mod py_string_cache; +#[cfg(feature = "python")] mod python; mod string_decoder; mod value; @@ -17,5 +19,7 @@ pub use number_decoder::{NumberAny, NumberInt}; pub use parse::Peek; pub use value::{JsonArray, JsonObject, JsonValue}; +#[cfg(feature = "python")] +pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, StringCacheMode}; #[cfg(feature = "python")] pub use python::{map_json_error, python_parse}; diff --git a/src/py_string_cache.rs b/src/py_string_cache.rs new file mode 100644 index 00000000..92b70860 --- /dev/null +++ b/src/py_string_cache.rs @@ -0,0 +1,185 @@ +use std::cell::RefCell; + +use ahash::random_state::RandomState; +use pyo3::exceptions::{PyTypeError, PyValueError}; +use pyo3::prelude::*; +use pyo3::sync::{GILOnceCell, GILProtected}; +use pyo3::types::{PyBool, PyString}; + +#[derive(Debug, Clone, Copy)] +pub enum StringCacheMode { + All, + Keys, + None, +} + +impl<'py> FromPyObject<'py> for StringCacheMode { + fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult { + if let Ok(bool_mode) = ob.downcast::() { + Ok(bool_mode.is_true().into()) + } else if let Ok(str_mode) = ob.extract::<&str>() { + match str_mode { + "all" => Ok(Self::All), + "keys" => Ok(Self::Keys), + "none" => Ok(Self::None), + _ => Err(PyValueError::new_err( + "Invalid string cache mode, should be `'all'`, '`keys`', `'none`' or a `bool`", + )), + } + } else { + Err(PyTypeError::new_err( + "Invalid string cache mode, should be `'all'`, '`keys`', `'none`' or a `bool`", + )) + } + } +} + +impl From for StringCacheMode { + fn from(mode: bool) -> Self { + if mode { + Self::All + } else { + Self::None + } + } +} + +pub trait StringMaybeCache { + fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString>; + + fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + Self::get_key(py, json_str) + } +} + +pub struct StringCacheAll; + +impl StringMaybeCache for StringCacheAll { + fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + cached_py_string(py, json_str) + } +} + +pub struct StringCacheKeys; + +impl StringMaybeCache for StringCacheKeys { + fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + cached_py_string(py, json_str) + } + + fn get_value<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + PyString::new_bound(py, json_str) + } +} + +pub struct StringNoCache; + +impl StringMaybeCache for StringNoCache { + fn get_key<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyString> { + PyString::new_bound(py, json_str) + } +} + +static STRING_CACHE: GILOnceCell>> = GILOnceCell::new(); + +macro_rules! get_string_cache { + ($py:ident) => { + STRING_CACHE + .get_or_init($py, || GILProtected::new(RefCell::new(PyStringCache::default()))) + .get($py) + }; +} + +pub fn cache_usage(py: Python) -> usize { + get_string_cache!(py).borrow().usage() +} + +pub fn cache_clear(py: Python) { + get_string_cache!(py).borrow_mut().clear() +} + +pub fn cached_py_string<'py>(py: Python<'py>, raw_str: &str) -> Bound<'py, PyString> { + // from tests, 0 and 1 character strings are faster not cached + if (2..64).contains(&raw_str.len()) { + get_string_cache!(py).borrow_mut().get_or_insert(py, raw_str) + } else { + PyString::new_bound(py, raw_str) + } +} + +// capacity should be a power of 2 so the compiler can convert `%` to a right shift below +// Using a smaller number here (e.g. 1024) seems to be faster in many cases than a larger number (like 65536) +// and also avoids stack overflow risks +const CAPACITY: usize = 16_384; +type Entry = Option<(u64, Py)>; + +/// This is a Fully associative cache with LRU replacement policy. +/// See https://en.wikipedia.org/wiki/Cache_placement_policies#Fully_associative_cache +#[derive(Debug)] +struct PyStringCache { + entries: Box<[Entry; CAPACITY]>, + hash_builder: RandomState, +} + +const ARRAY_REPEAT_VALUE: Entry = None; + +impl Default for PyStringCache { + fn default() -> Self { + Self { + entries: Box::new([ARRAY_REPEAT_VALUE; CAPACITY]), + hash_builder: RandomState::default(), + } + } +} + +impl PyStringCache { + /// Lookup the cache for an entry with the given string. If it exists, return it. + /// If it is not set or has a different string, insert it and return it. + fn get_or_insert<'py>(&mut self, py: Python<'py>, s: &str) -> Bound<'py, PyString> { + let hash = self.hash_builder.hash_one(s); + + let hash_index = hash as usize % CAPACITY; + + let set_entry = |entry: &mut Entry| { + let py_str = PyString::new_bound(py, s); + *entry = Some((hash, py_str.to_owned().unbind())); + py_str + }; + + // we try up to 5 contiguous slots to find a match or an empty slot + for index in hash_index..hash_index.wrapping_add(5) { + if let Some(entry) = self.entries.get_mut(index) { + if let Some((entry_hash, ref py_str_ob)) = entry { + // to avoid a string comparison, we first compare the hashes + if *entry_hash == hash { + // if the hashes match, we compare the strings to be absolutely sure - as a hashmap would do + if py_str_ob.bind(py).to_str().ok() == Some(s) { + // the strings matched, return the cached string object + return py_str_ob.bind(py).to_owned(); + } + } + } else { + // we got to an empty entry, use it + return set_entry(entry); + } + } else { + // we reached the end of entries, break + break; + } + } + // we tried all 5 slots (or got to the end of entries) without finding a match + // or an empty slot, make this LRU by replacing the first entry + let entry = self.entries.get_mut(hash_index).unwrap(); + set_entry(entry) + } + + /// get the number of entries in the cache that are set + fn usage(&self) -> usize { + self.entries.iter().filter(|e| e.is_some()).count() + } + + /// clear the cache by resetting all entries to `None` + fn clear(&mut self) { + self.entries.fill(None); + } +} diff --git a/src/python.rs b/src/python.rs index 415f4e59..23bf0c67 100644 --- a/src/python.rs +++ b/src/python.rs @@ -1,17 +1,14 @@ -use std::cell::RefCell; - use pyo3::exceptions::PyValueError; use pyo3::ffi; use pyo3::prelude::*; -use pyo3::sync::{GILOnceCell, GILProtected}; use pyo3::types::{PyDict, PyList, PyString}; -use hashbrown::hash_map::{HashMap, RawEntryMut}; use smallvec::SmallVec; use crate::errors::{json_err, json_error, JsonError, JsonResult, DEFAULT_RECURSION_LIMIT}; use crate::number_decoder::{NumberAny, NumberInt}; use crate::parse::{Parser, Peek}; +use crate::py_string_cache::{StringCacheAll, StringCacheKeys, StringCacheMode, StringMaybeCache, StringNoCache}; use crate::string_decoder::{StringDecoder, Tape}; use crate::JsonErrorType; @@ -31,7 +28,7 @@ pub fn python_parse<'py>( py: Python<'py>, json_data: &[u8], allow_inf_nan: bool, - cache_strings: bool, + cache_mode: StringCacheMode, allow_partial: bool, ) -> JsonResult> { let mut python_parser = PythonParser { @@ -43,10 +40,10 @@ pub fn python_parse<'py>( }; let peek = python_parser.parser.peek()?; - let v = if cache_strings { - python_parser.py_take_value::(py, peek)? - } else { - python_parser.py_take_value::(py, peek)? + let v = match cache_mode { + StringCacheMode::All => python_parser.py_take_value::(py, peek)?, + StringCacheMode::Keys => python_parser.py_take_value::(py, peek)?, + StringCacheMode::None => python_parser.py_take_value::(py, peek)?, }; if !allow_partial { python_parser.parser.finish()?; @@ -103,7 +100,7 @@ impl<'j> PythonParser<'j> { } Peek::String => { let s = self.parser.consume_string::(&mut self.tape)?; - Ok(StringCache::get(py, s.as_str())) + Ok(StringCache::get_value(py, s.as_str()).into_any()) } Peek::Array => { let list = if let Some(peek_first) = tri!(self.parser.array_first(), PyList::empty_bound(py)) { @@ -168,7 +165,7 @@ impl<'j> PythonParser<'j> { py: Python<'py>, dict: &Bound<'py, PyDict>, ) -> JsonResult<()> { - let set_item = |key: Bound<'py, PyAny>, value: Bound<'py, PyAny>| { + let set_item = |key: Bound<'py, PyString>, value: Bound<'py, PyAny>| { let r = unsafe { ffi::PyDict_SetItem(dict.as_ptr(), key.as_ptr(), value.as_ptr()) }; // AFAIK this shouldn't happen since the key will always be a string which is hashable // we panic here rather than returning a result and using `?` below as it's up to 14% faster @@ -178,12 +175,12 @@ impl<'j> PythonParser<'j> { } }; if let Some(first_key) = self.parser.object_first::(&mut self.tape)? { - let first_key = StringCache::get(py, first_key.as_str()); + let first_key = StringCache::get_key(py, first_key.as_str()); let peek = self.parser.peek()?; let first_value = self._check_take_value::(py, peek)?; set_item(first_key, first_value); while let Some(key) = self.parser.object_step::(&mut self.tape)? { - let key = StringCache::get(py, key.as_str()); + let key = StringCache::get_key(py, key.as_str()); let peek = self.parser.peek()?; let value = self._check_take_value::(py, peek)?; set_item(key, value); @@ -224,52 +221,3 @@ impl<'j> PythonParser<'j> { r } } - -trait StringMaybeCache { - fn get<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyAny>; -} - -struct StringCache; - -impl StringMaybeCache for StringCache { - fn get<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyAny> { - static STRINGS_CACHE: GILOnceCell>>> = GILOnceCell::new(); - - // from tests, 0 and 1 character strings are faster not cached - if (2..64).contains(&json_str.len()) { - let cache = STRINGS_CACHE - .get_or_init(py, || GILProtected::new(RefCell::new(HashMap::new()))) - .get(py); - - let mut map = cache.borrow_mut(); - let entry = map.raw_entry_mut().from_key(json_str); - - let (py_string, inserted) = match entry { - RawEntryMut::Vacant(view) => { - let py_string = PyString::new_bound(py, json_str).into_any(); - view.insert(json_str.to_owned(), py_string.clone().into()); - (py_string, true) - } - RawEntryMut::Occupied(view) => (view.get().bind(py).clone(), false), - }; - if inserted { - // 500k limit means 1m keys + values, 1m 64 byte strings is ~64mb - if map.len() > 500_000 { - // TODO is there a fast way to keep (say) half the cache? - map.clear(); - } - } - py_string - } else { - PyString::new_bound(py, json_str).into_any() - } - } -} - -struct StringNoCache; - -impl StringMaybeCache for StringNoCache { - fn get<'py>(py: Python<'py>, json_str: &str) -> Bound<'py, PyAny> { - PyString::new_bound(py, json_str).into_any() - } -} diff --git a/tests/python.rs b/tests/python.rs index 13c8dacc..d69cde03 100644 --- a/tests/python.rs +++ b/tests/python.rs @@ -1,8 +1,8 @@ use pyo3::prelude::*; -use pyo3::types::{PyDict, PyList}; +use pyo3::types::{PyDict, PyList, PyString}; use pyo3::ToPyObject; -use jiter::{map_json_error, python_parse, JsonValue}; +use jiter::{cache_clear, cache_usage, map_json_error, python_parse, JsonValue, StringCacheMode}; #[test] fn test_to_py_object_numeric() { @@ -42,7 +42,7 @@ fn test_python_parse_numeric() { py, br#" { "int": 1, "bigint": 123456789012345678901234567890, "float": 1.2} "#, false, - true, + StringCacheMode::All, false, ) .unwrap(); @@ -60,7 +60,7 @@ fn test_python_parse_other_cached() { py, br#"["string", true, false, null, NaN, Infinity, -Infinity]"#, true, - true, + StringCacheMode::All, false, ) .unwrap(); @@ -71,7 +71,14 @@ fn test_python_parse_other_cached() { #[test] fn test_python_parse_other_no_cache() { Python::with_gil(|py| { - let obj = python_parse(py, br#"["string", true, false, null]"#, false, false, false).unwrap(); + let obj = python_parse( + py, + br#"["string", true, false, null]"#, + false, + StringCacheMode::None, + false, + ) + .unwrap(); assert_eq!(obj.to_string(), "['string', True, False, None]"); }) } @@ -79,7 +86,7 @@ fn test_python_parse_other_no_cache() { #[test] fn test_python_disallow_nan() { Python::with_gil(|py| { - let r = python_parse(py, br#"[NaN]"#, false, true, false); + let r = python_parse(py, br#"[NaN]"#, false, StringCacheMode::All, false); let e = r.map_err(|e| map_json_error(br#"[NaN]"#, &e)).unwrap_err(); assert_eq!(e.to_string(), "ValueError: expected value at line 1 column 2"); }) @@ -89,7 +96,7 @@ fn test_python_disallow_nan() { fn test_error() { Python::with_gil(|py| { let bytes = br#"["string""#; - let r = python_parse(py, bytes, false, true, false); + let r = python_parse(py, bytes, false, StringCacheMode::All, false); let e = r.map_err(|e| map_json_error(bytes, &e)).unwrap_err(); assert_eq!(e.to_string(), "ValueError: EOF while parsing a list at line 1 column 9"); }) @@ -101,7 +108,7 @@ fn test_recursion_limit() { let bytes = json.as_bytes(); Python::with_gil(|py| { - let r = python_parse(py, bytes, false, true, false); + let r = python_parse(py, bytes, false, StringCacheMode::All, false); let e = r.map_err(|e| map_json_error(bytes, &e)).unwrap_err(); assert_eq!( e.to_string(), @@ -117,12 +124,12 @@ fn test_recursion_limit_incr() { let bytes = json.as_bytes(); Python::with_gil(|py| { - let v = python_parse(py, bytes, false, true, false).unwrap(); + let v = python_parse(py, bytes, false, StringCacheMode::All, false).unwrap(); assert_eq!(v.len().unwrap(), 2000); }); Python::with_gil(|py| { - let v = python_parse(py, bytes, false, true, false).unwrap(); + let v = python_parse(py, bytes, false, StringCacheMode::All, false).unwrap(); assert_eq!(v.len().unwrap(), 2000); }); } @@ -133,7 +140,7 @@ fn test_extracted_value_error() { let bytes = json.as_bytes(); Python::with_gil(|py| { - let r = python_parse(py, bytes, false, true, false); + let r = python_parse(py, bytes, false, StringCacheMode::All, false); let e = r.map_err(|e| map_json_error(bytes, &e)).unwrap_err(); assert_eq!(e.to_string(), "ValueError: expected value at line 1 column 1"); }) @@ -143,13 +150,13 @@ fn test_extracted_value_error() { fn test_partial_array() { Python::with_gil(|py| { let bytes = br#"["string", true, null, 1, "foo"#; - let py_value = python_parse(py, bytes, false, true, true).unwrap(); + let py_value = python_parse(py, bytes, false, StringCacheMode::All, true).unwrap(); let string = py_value.to_string(); assert_eq!(string, "['string', True, None, 1]"); // test that stopping at every points is ok for i in 1..bytes.len() { - let py_value = python_parse(py, &bytes[..i], false, true, true).unwrap(); + let py_value = python_parse(py, &bytes[..i], false, StringCacheMode::All, true).unwrap(); assert!(py_value.is_instance_of::()); } }) @@ -159,13 +166,13 @@ fn test_partial_array() { fn test_partial_object() { Python::with_gil(|py| { let bytes = br#"{"a": 1, "b": 2, "c"#; - let py_value = python_parse(py, bytes, false, true, true).unwrap(); + let py_value = python_parse(py, bytes, false, StringCacheMode::All, true).unwrap(); let string = py_value.to_string(); assert_eq!(string, "{'a': 1, 'b': 2}"); // test that stopping at every points is ok for i in 1..bytes.len() { - let py_value = python_parse(py, &bytes[..i], false, true, true).unwrap(); + let py_value = python_parse(py, &bytes[..i], false, StringCacheMode::All, true).unwrap(); assert!(py_value.is_instance_of::()); } }) @@ -175,14 +182,77 @@ fn test_partial_object() { fn test_partial_nested() { Python::with_gil(|py| { let bytes = br#"{"a": 1, "b": 2, "c": [1, 2, {"d": 1, "#; - let py_value = python_parse(py, bytes, false, true, true).unwrap(); + let py_value = python_parse(py, bytes, false, true.into(), true).unwrap(); let string = py_value.to_string(); assert_eq!(string, "{'a': 1, 'b': 2, 'c': [1, 2, {'d': 1}]}"); // test that stopping at every points is ok for i in 1..bytes.len() { - let py_value = python_parse(py, &bytes[..i], false, true, true).unwrap(); + let py_value = python_parse(py, &bytes[..i], false, true.into(), true).unwrap(); assert!(py_value.is_instance_of::()); } }) } + +#[test] +fn test_python_cache_usage_all() { + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, br#"{"foo": "bar", "spam": 3}"#, true, StringCacheMode::All, false).unwrap(); + assert_eq!(obj.to_string(), "{'foo': 'bar', 'spam': 3}"); + assert_eq!(cache_usage(py), 3); + }) +} + +#[test] +fn test_python_cache_usage_keys() { + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, br#"{"foo": "bar", "spam": 3}"#, false, StringCacheMode::Keys, false).unwrap(); + assert_eq!(obj.to_string(), "{'foo': 'bar', 'spam': 3}"); + assert_eq!(cache_usage(py), 2); + }) +} + +#[test] +fn test_python_cache_usage_none() { + Python::with_gil(|py| { + cache_clear(py); + let obj = python_parse(py, br#"{"foo": "bar", "spam": 3}"#, false, StringCacheMode::None, false).unwrap(); + assert_eq!(obj.to_string(), "{'foo': 'bar', 'spam': 3}"); + assert_eq!(cache_usage(py), 0); + }) +} + +#[test] +fn test_cache_into() { + Python::with_gil(|py| { + let c: StringCacheMode = true.to_object(py).extract(py).unwrap(); + assert!(matches!(c, StringCacheMode::All)); + + let c: StringCacheMode = false.to_object(py).extract(py).unwrap(); + assert!(matches!(c, StringCacheMode::None)); + + let c: StringCacheMode = PyString::new_bound(py, "all").extract().unwrap(); + assert!(matches!(c, StringCacheMode::All)); + + let c: StringCacheMode = PyString::new_bound(py, "keys").extract().unwrap(); + assert!(matches!(c, StringCacheMode::Keys)); + + let c: StringCacheMode = PyString::new_bound(py, "none").extract().unwrap(); + assert!(matches!(c, StringCacheMode::None)); + + let e = PyString::new_bound(py, "wrong") + .extract::() + .unwrap_err(); + assert_eq!( + e.to_string(), + "ValueError: Invalid string cache mode, should be `'all'`, '`keys`', `'none`' or a `bool`" + ); + let e = 123.to_object(py).extract::(py).unwrap_err(); + assert_eq!( + e.to_string(), + "TypeError: Invalid string cache mode, should be `'all'`, '`keys`', `'none`' or a `bool`" + ); + }) +}