Skip to content

Commit 5a43132

Browse files
authored
Merge pull request #42 from akoshchiy/11270-json-concat
feat: add concat & improve strip_nulls
2 parents 582c139 + 2a68027 commit 5a43132

File tree

7 files changed

+687
-165
lines changed

7 files changed

+687
-165
lines changed

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,6 @@ harness = false
5151
name = "get_path"
5252
harness = false
5353

54+
[[bench]]
55+
name = "strip_nulls"
56+
harness = false

benches/strip_nulls.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// Copyright 2024 Datafuse Labs.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use std::{fs, io::Read};
16+
17+
use criterion::{criterion_group, criterion_main, Criterion};
18+
use jsonb::{from_slice, strip_nulls, Value};
19+
20+
/// Loads the full contents of the file at `file` into a byte vector.
///
/// # Panics
///
/// Panics if the file cannot be opened or if reading it fails.
fn read(file: &str) -> Vec<u8> {
    let mut contents = Vec::new();
    fs::File::open(file)
        .unwrap()
        .read_to_end(&mut contents)
        .unwrap();
    contents
}
26+
27+
fn strip_nulls_deser(data: &[u8]) {
28+
let mut buf = Vec::new();
29+
let mut json = from_slice(data).unwrap();
30+
strip_value_nulls(&mut json);
31+
json.write_to_vec(&mut buf);
32+
assert!(!buf.is_empty());
33+
}
34+
35+
fn strip_value_nulls(val: &mut Value<'_>) {
36+
match val {
37+
Value::Array(arr) => {
38+
for v in arr {
39+
strip_value_nulls(v);
40+
}
41+
}
42+
Value::Object(ref mut obj) => {
43+
for (_, v) in obj.iter_mut() {
44+
strip_value_nulls(v);
45+
}
46+
obj.retain(|_, v| !matches!(v, Value::Null));
47+
}
48+
_ => {}
49+
}
50+
}
51+
52+
fn strip_nulls_fast(data: &[u8]) {
53+
let mut buf = Vec::new();
54+
strip_nulls(data, &mut buf).unwrap();
55+
assert!(!buf.is_empty());
56+
}
57+
58+
fn add_benchmark(c: &mut Criterion) {
59+
let paths = fs::read_dir("./data/").unwrap();
60+
for path in paths {
61+
let file = format!("{}", path.unwrap().path().display());
62+
let bytes = read(&file);
63+
let json = from_slice(&bytes).unwrap().to_vec();
64+
65+
c.bench_function(&format!("strip_nulls_deser[{}]", file), |b| {
66+
b.iter(|| strip_nulls_deser(&json));
67+
});
68+
69+
c.bench_function(&format!("strip_nulls_fast[{}]", file), |b| {
70+
b.iter(|| strip_nulls_fast(&json));
71+
});
72+
}
73+
}
74+
75+
// Declare a benchmark group named `benches` whose only target is `add_benchmark`.
criterion_group!(benches, add_benchmark);
76+
// Hand the `benches` group to Criterion's entry-point macro; presumably this
// supplies `main` for the bench target, which is why Cargo.toml sets `harness = false`.
criterion_main!(benches);

src/builder.rs

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
// Copyright 2024 Datafuse Labs.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use std::collections::BTreeMap;
16+
17+
use byteorder::{BigEndian, WriteBytesExt};
18+
19+
use crate::{
20+
constants::{ARRAY_CONTAINER_TAG, OBJECT_CONTAINER_TAG},
21+
jentry::JEntry,
22+
};
23+
24+
enum Entry<'a> {
25+
ArrayBuilder(ArrayBuilder<'a>),
26+
ObjectBuilder(ObjectBuilder<'a>),
27+
Raw(JEntry, &'a [u8]),
28+
}
29+
30+
pub(crate) struct ArrayBuilder<'a> {
31+
entries: Vec<Entry<'a>>,
32+
}
33+
34+
impl<'a> ArrayBuilder<'a> {
35+
pub(crate) fn new(capacity: usize) -> Self {
36+
Self {
37+
entries: Vec::with_capacity(capacity),
38+
}
39+
}
40+
41+
pub(crate) fn push_raw(&mut self, jentry: JEntry, data: &'a [u8]) {
42+
self.entries.push(Entry::Raw(jentry, data));
43+
}
44+
45+
pub(crate) fn push_array(&mut self, builder: ArrayBuilder<'a>) {
46+
self.entries.push(Entry::ArrayBuilder(builder));
47+
}
48+
49+
pub(crate) fn push_object(&mut self, builder: ObjectBuilder<'a>) {
50+
self.entries.push(Entry::ObjectBuilder(builder));
51+
}
52+
53+
pub(crate) fn len(&self) -> usize {
54+
self.entries.len()
55+
}
56+
57+
pub(crate) fn build_into(self, buf: &mut Vec<u8>) {
58+
let header = ARRAY_CONTAINER_TAG | self.entries.len() as u32;
59+
buf.write_u32::<BigEndian>(header).unwrap();
60+
61+
let mut jentry_index = reserve_jentries(buf, self.entries.len() * 4);
62+
63+
for entry in self.entries.into_iter() {
64+
let jentry = write_entry(buf, entry);
65+
replace_jentry(buf, jentry, &mut jentry_index);
66+
}
67+
}
68+
}
69+
70+
pub(crate) struct ObjectBuilder<'a> {
71+
entries: BTreeMap<&'a str, Entry<'a>>,
72+
}
73+
74+
impl<'a> ObjectBuilder<'a> {
75+
pub(crate) fn new() -> Self {
76+
Self {
77+
entries: BTreeMap::new(),
78+
}
79+
}
80+
81+
pub(crate) fn push_raw(&mut self, key: &'a str, jentry: JEntry, data: &'a [u8]) {
82+
self.entries.insert(key, Entry::Raw(jentry, data));
83+
}
84+
85+
pub(crate) fn push_array(&mut self, key: &'a str, builder: ArrayBuilder<'a>) {
86+
self.entries.insert(key, Entry::ArrayBuilder(builder));
87+
}
88+
89+
pub(crate) fn push_object(&mut self, key: &'a str, builder: ObjectBuilder<'a>) {
90+
self.entries.insert(key, Entry::ObjectBuilder(builder));
91+
}
92+
93+
pub(crate) fn len(&self) -> usize {
94+
self.entries.len()
95+
}
96+
97+
pub(crate) fn build_into(self, buf: &mut Vec<u8>) {
98+
let header = OBJECT_CONTAINER_TAG | self.entries.len() as u32;
99+
buf.write_u32::<BigEndian>(header).unwrap();
100+
101+
let mut jentry_index = reserve_jentries(buf, self.entries.len() * 8);
102+
103+
for (key, _) in self.entries.iter() {
104+
let key_len = key.len();
105+
buf.extend_from_slice(key.as_bytes());
106+
let jentry = JEntry::make_string_jentry(key_len);
107+
replace_jentry(buf, jentry, &mut jentry_index)
108+
}
109+
110+
for (_, entry) in self.entries.into_iter() {
111+
let jentry = write_entry(buf, entry);
112+
replace_jentry(buf, jentry, &mut jentry_index);
113+
}
114+
}
115+
}
116+
117+
fn write_entry(buf: &mut Vec<u8>, entry: Entry<'_>) -> JEntry {
118+
match entry {
119+
Entry::ArrayBuilder(builder) => {
120+
let jentry = JEntry::make_container_jentry(builder.len());
121+
builder.build_into(buf);
122+
jentry
123+
}
124+
Entry::ObjectBuilder(builder) => {
125+
let jentry = JEntry::make_container_jentry(builder.len());
126+
builder.build_into(buf);
127+
jentry
128+
}
129+
Entry::Raw(jentry, data) => {
130+
buf.extend_from_slice(data);
131+
jentry
132+
}
133+
}
134+
}
135+
136+
/// Extends `buf` with `len` zero bytes to hold jentries that are filled in
/// later, and returns the offset at which the reserved region starts.
fn reserve_jentries(buf: &mut Vec<u8>, len: usize) -> usize {
    let start = buf.len();
    buf.resize(start + len, 0);
    start
}
142+
143+
fn replace_jentry(buf: &mut [u8], jentry: JEntry, jentry_index: &mut usize) {
144+
let jentry_bytes = jentry.encoded().to_be_bytes();
145+
for (i, b) in jentry_bytes.iter().enumerate() {
146+
buf[*jentry_index + i] = *b;
147+
}
148+
*jentry_index += 4;
149+
}

0 commit comments

Comments
 (0)