diff --git a/src/error.rs b/src/error.rs index 98b1dbb..a999cb4 100644 --- a/src/error.rs +++ b/src/error.rs @@ -80,6 +80,7 @@ pub enum Error { InvalidJsonbJEntry, InvalidJsonPath, + InvalidKeyPath, Syntax(ParseErrorCode, usize), } diff --git a/src/functions.rs b/src/functions.rs index 8ed377b..49cb824 100644 --- a/src/functions.rs +++ b/src/functions.rs @@ -16,7 +16,6 @@ use core::convert::TryInto; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::VecDeque; -use std::str::from_utf8; use crate::constants::*; use crate::error::*; @@ -25,6 +24,7 @@ use crate::jentry::JEntry; use crate::jsonpath::JsonPath; use crate::jsonpath::Mode; use crate::jsonpath::Selector; +use crate::keypath::KeyPath; use crate::number::Number; use crate::parser::parse_value; use crate::value::Object; @@ -271,32 +271,41 @@ pub fn get_by_name(value: &[u8], name: &str, ignore_case: bool) -> Option>( +pub fn get_by_keypath<'a, I: Iterator>>( value: &[u8], - keypath: I, + keypaths: I, ) -> Option> { if !is_jsonb(value) { return match parse_value(value) { Ok(val) => { let mut current_val = &val; - for key in keypath { - match from_utf8(key) { - Ok(k) => { - let res = match current_val { - Value::Array(arr) => match k.parse::() { - Ok(idx) => arr.get(idx), - Err(_) => None, - }, - Value::Object(obj) => obj.get(k), - _ => None, - }; - match res { - Some(v) => current_val = v, - None => return None, - }; - } - Err(_) => return None, - } + for path in keypaths { + let res = match path { + KeyPath::Index(idx) => match current_val { + Value::Array(arr) => { + let length = arr.len() as i32; + if *idx > length || length + *idx < 0 { + None + } else { + let idx = if *idx >= 0 { + *idx as usize + } else { + (length + *idx) as usize + }; + arr.get(idx) + } + } + _ => None, + }, + KeyPath::QuotedName(name) | KeyPath::Name(name) => match current_val { + Value::Object(obj) => obj.get(name.as_ref()), + _ => None, + }, + }; + match res { + Some(v) => current_val = v, + None => return None, + }; } 
Some(current_val.to_vec()) } @@ -308,43 +317,51 @@ pub fn get_by_keypath<'a, I: Iterator>( let mut curr_jentry_encoded = 0; let mut curr_jentry: Option = None; - for key in keypath { - match from_utf8(key) { - Ok(k) => { - if let Some(ref jentry) = curr_jentry { - if jentry.type_code != CONTAINER_TAG { - return None; + for path in keypaths { + if let Some(ref jentry) = curr_jentry { + if jentry.type_code != CONTAINER_TAG { + return None; + } + } + let header = read_u32(value, curr_val_offset).unwrap(); + let length = (header & CONTAINER_HEADER_LEN_MASK) as i32; + match (path, header & CONTAINER_HEADER_TYPE_MASK) { + (KeyPath::QuotedName(name) | KeyPath::Name(name), OBJECT_CONTAINER_TAG) => { + match get_jentry_by_name(value, curr_val_offset, header, name, false) { + Some((jentry, encoded, value_offset)) => { + curr_jentry_encoded = encoded; + curr_jentry = Some(jentry); + curr_val_offset = value_offset; } + None => return None, }; - let header = read_u32(value, curr_val_offset).unwrap(); - match header & CONTAINER_HEADER_TYPE_MASK { - OBJECT_CONTAINER_TAG => { - match get_jentry_by_name(value, curr_val_offset, header, k, false) { - Some((jentry, encoded, value_offset)) => { - curr_jentry_encoded = encoded; - curr_jentry = Some(jentry); - curr_val_offset = value_offset; - } - None => return None, - }; + } + (KeyPath::Index(idx), ARRAY_CONTAINER_TAG) => { + if *idx > length || length + *idx < 0 { + return None; + } else { + let idx = if *idx >= 0 { + *idx as usize + } else { + (length + *idx) as usize + }; + match get_jentry_by_index(value, curr_val_offset, header, idx) { + Some((jentry, encoded, value_offset)) => { + curr_jentry_encoded = encoded; + curr_jentry = Some(jentry); + curr_val_offset = value_offset; + } + None => return None, } - ARRAY_CONTAINER_TAG => match k.parse::() { - Ok(idx) => match get_jentry_by_index(value, curr_val_offset, header, idx) { - Some((jentry, encoded, value_offset)) => { - curr_jentry_encoded = encoded; - curr_jentry = Some(jentry); 
- curr_val_offset = value_offset; - } - None => return None, - }, - Err(_) => return None, - }, - _ => return None, } } - Err(_) => return None, + (_, _) => return None, } } + // The key paths are empty iff no JEntry was selected, so return the original value; do not test `curr_jentry_encoded == 0`, because a matched JEntry (presumably a null one - confirm against the tag constants) can itself encode to 0. + if curr_jentry.is_none() { + return Some(value.to_vec()); + } curr_jentry .map(|jentry| extract_by_jentry(&jentry, curr_jentry_encoded, curr_val_offset, value)) } diff --git a/src/jsonpath/mod.rs b/src/jsonpath/mod.rs index 6861639..12cf5cc 100644 --- a/src/jsonpath/mod.rs +++ b/src/jsonpath/mod.rs @@ -17,5 +17,7 @@ mod path; mod selector; pub use parser::parse_json_path; +pub(crate) use parser::raw_string; +pub(crate) use parser::string; pub use path::*; pub use selector::*; diff --git a/src/jsonpath/parser.rs b/src/jsonpath/parser.rs index 66f7528..aed2f30 100644 --- a/src/jsonpath/parser.rs +++ b/src/jsonpath/parser.rs @@ -40,7 +40,7 @@ pub fn parse_json_path(input: &[u8]) -> Result, Error> { } Ok(json_path) } - Err(nom::Err::Error(_err) | nom::Err::Failure(_err)) => Err(Error::InvalidJsonb), + Err(nom::Err::Error(_) | nom::Err::Failure(_)) => Err(Error::InvalidJsonPath), Err(nom::Err::Incomplete(_)) => unreachable!(), } } @@ -73,7 +73,7 @@ fn check_escaped(input: &[u8], i: &mut usize) -> bool { true } -fn raw_string(input: &[u8]) -> IResult<&[u8], Cow<'_, str>> { +pub(crate) fn raw_string(input: &[u8]) -> IResult<&[u8], Cow<'_, str>> { let mut i = 0; let mut escapes = 0; while i < input.len() { @@ -85,8 +85,9 @@ fn raw_string(input: &[u8]) -> IResult<&[u8], Cow<'_, str>> { return Err(nom::Err::Error(NomError::new(input, ErrorKind::Char))); } } - b' ' | b'.' | b':' | b'[' | b']' | b'(' | b')' | b'?' | b'@' | b'$' | b'|' | b'<' - | b'>' | b'!' | b'=' | b'+' | b'-' | b'*' | b'/' | b'%' | b'"' | b'\'' => { + b' ' | b',' | b'.' | b':' | b'{' | b'}' | b'[' | b']' | b'(' | b')' | b'?' | b'@' + | b'$' | b'|' | b'<' | b'>' | b'!' 
| b'=' | b'+' | b'-' | b'*' | b'/' | b'%' | b'"' + | b'\'' => { break; } _ => { @@ -111,7 +112,7 @@ fn raw_string(input: &[u8]) -> IResult<&[u8], Cow<'_, str>> { Err(nom::Err::Error(NomError::new(input, ErrorKind::Char))) } -fn string(input: &[u8]) -> IResult<&[u8], Cow<'_, str>> { +pub(crate) fn string(input: &[u8]) -> IResult<&[u8], Cow<'_, str>> { if input.is_empty() || input[0] != b'"' { return Err(nom::Err::Error(NomError::new(input, ErrorKind::Char))); } @@ -215,9 +216,9 @@ fn array_index(input: &[u8]) -> IResult<&[u8], ArrayIndex> { fn array_indices(input: &[u8]) -> IResult<&[u8], Vec> { delimited( - terminated(char('['), multispace0), - separated_list1(delimited(multispace0, char(','), multispace0), array_index), - preceded(multispace0, char(']')), + char('['), + separated_list1(char(','), delimited(multispace0, array_index, multispace0)), + char(']'), )(input) } diff --git a/src/keypath.rs b/src/keypath.rs new file mode 100644 index 0000000..05e468c --- /dev/null +++ b/src/keypath.rs @@ -0,0 +1,120 @@ +// Copyright 2023 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use nom::{ + branch::alt, + character::complete::{char, i32, multispace0}, + combinator::map, + multi::separated_list1, + sequence::{delimited, preceded, terminated}, + IResult, +}; + +use std::borrow::Cow; +use std::fmt::Display; +use std::fmt::Formatter; + +use crate::jsonpath::raw_string; +use crate::jsonpath::string; +use crate::Error; + +/// Represents a set of key path chains. +/// Compatible with the PostgreSQL syntax for extracting JSON sub-objects by path, e.g. `{a,1,"b"}`. +#[derive(Debug, Clone, PartialEq)] +pub struct KeyPaths<'a> { + pub paths: Vec>, +} + +/// Represents a valid key path. +#[derive(Debug, Clone, PartialEq)] +pub enum KeyPath<'a> { + /// Represents the index of an Array; negative indexes are allowed. + Index(i32), + /// Represents the double-quoted field name of an Object. + QuotedName(Cow<'a, str>), + /// Represents the unquoted field name of an Object. + Name(Cow<'a, str>), +} + +impl<'a> Display for KeyPaths<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{{")?; + for (i, path) in self.paths.iter().enumerate() { + if i > 0 { + write!(f, ",")?; + } + write!(f, "{path}")?; + } + write!(f, "}}")?; + Ok(()) + } +} + +impl<'a> Display for KeyPath<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + KeyPath::Index(idx) => { + write!(f, "{idx}")?; + } + KeyPath::QuotedName(name) => { + write!(f, "\"{name}\"")?; + } + KeyPath::Name(name) => { + write!(f, "{name}")?; + } + } + Ok(()) + } +} + +/// Parses the input bytes into key paths, returning `Error::InvalidKeyPath` on malformed input. 
+pub fn parse_key_paths(input: &[u8]) -> Result, Error> { + match key_paths(input) { + Ok((rest, paths)) => { + if !rest.is_empty() { + return Err(Error::InvalidKeyPath); + } + let key_paths = KeyPaths { paths }; + Ok(key_paths) + } + Err(nom::Err::Error(_) | nom::Err::Failure(_)) => Err(Error::InvalidKeyPath), + Err(nom::Err::Incomplete(_)) => unreachable!(), + } +} + +fn key_path(input: &[u8]) -> IResult<&[u8], KeyPath<'_>> { + alt(( + map(i32, KeyPath::Index), + map(string, KeyPath::QuotedName), + map(raw_string, KeyPath::Name), + ))(input) +} + +fn key_paths(input: &[u8]) -> IResult<&[u8], Vec>> { + alt(( + delimited( + preceded(multispace0, char('{')), + separated_list1(char(','), delimited(multispace0, key_path, multispace0)), + terminated(char('}'), multispace0), + ), + map( + delimited( + preceded(multispace0, char('{')), + multispace0, + terminated(char('}'), multispace0), + ), + |_| vec![], + ), + ))(input) +} diff --git a/src/lib.rs b/src/lib.rs index 2c2db4b..79e4638 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -70,6 +70,7 @@ mod from; mod functions; mod jentry; pub mod jsonpath; +pub mod keypath; mod number; mod parser; mod ser; diff --git a/tests/it/functions.rs b/tests/it/functions.rs index b81c85d..bbf4100 100644 --- a/tests/it/functions.rs +++ b/tests/it/functions.rs @@ -19,9 +19,9 @@ use std::collections::BTreeMap; use jsonb::{ array_length, array_values, as_bool, as_null, as_number, as_str, build_array, build_object, compare, convert_to_comparable, from_slice, get_by_index, get_by_keypath, get_by_name, - get_by_path, is_array, is_object, object_each, object_keys, parse_value, path_exists, - strip_nulls, to_bool, to_f64, to_i64, to_pretty_string, to_str, to_string, to_u64, - traverse_check_string, type_of, Number, Object, Value, + get_by_path, is_array, is_object, keypath::parse_key_paths, object_each, object_keys, + parse_value, path_exists, strip_nulls, to_bool, to_f64, to_i64, to_pretty_string, to_str, + to_string, to_u64, traverse_check_string, 
type_of, Number, Object, Value, }; use jsonb::jsonpath::parse_json_path; @@ -1042,23 +1042,25 @@ fn test_object_each() { #[test] fn test_get_by_keypath() { let sources = vec![ - ("null", vec!["a", "b"], None), - ("true", vec!["a", "b"], None), - (r#""sdasd""#, vec!["1"], None), + ("null", " { } ", Some(Value::Null)), + ("null", " { a , b } ", None), + ("true", "{a,b}", None), + (r#""sdasd""#, "{1}", None), + ("[10,20,30]", "{1}", Some(Value::Number(Number::UInt64(20)))), ( "[10,20,30]", - vec!["1"], - Some(Value::Number(Number::UInt64(20))), + "{-1}", + Some(Value::Number(Number::UInt64(30))), ), ( r#"[10,20,["a","b","c"]]"#, - vec!["2", "0"], + "{2,0}", Some(Value::String(Cow::from("a"))), ), - (r#"[10,20,["a","b","c"]]"#, vec!["2", "a"], None), + (r#"[10,20,["a","b","c"]]"#, "{2,a}", None), ( r#"[10,20,[{"k1":[1,2,3],"k2":{"w":1,"z":2}},"b","c"]]"#, - vec!["2", "0", "k2"], + "{2,0,k2}", Some(init_object(vec![ ("w", Value::Number(Number::UInt64(1))), ("z", Value::Number(Number::UInt64(2))), @@ -1066,10 +1068,10 @@ fn test_get_by_keypath() { ), ]; for (json_str, path_str, expected) in sources { - let path = path_str.into_iter().map(|p| p.as_bytes()); + let key_paths = parse_key_paths(path_str.as_bytes()).unwrap(); { let json = parse_value(json_str.as_bytes()).unwrap().to_vec(); - let result = get_by_keypath(&json, path.clone()); + let result = get_by_keypath(&json, key_paths.paths.iter()); match expected.clone() { Some(e) => assert_eq!(e, from_slice(&result.unwrap()).unwrap()), None => assert_eq!(result, None), @@ -1077,7 +1079,7 @@ fn test_get_by_keypath() { } { let json = json_str.as_bytes(); - let result = get_by_keypath(json, path); + let result = get_by_keypath(json, key_paths.paths.iter()); match expected { Some(e) => assert_eq!(e, from_slice(&result.unwrap()).unwrap()), None => assert_eq!(result, None), diff --git a/tests/it/keypath_parser.rs b/tests/it/keypath_parser.rs new file mode 100644 index 0000000..02a0fd1 --- /dev/null +++ 
b/tests/it/keypath_parser.rs @@ -0,0 +1,47 @@ +// Copyright 2023 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::io::Write; + +use goldenfile::Mint; +use jsonb::keypath::parse_key_paths; + +#[test] +fn test_json_path() { + let mut mint = Mint::new("tests/it/testdata"); + let mut file = mint.new_goldenfile("key_path.txt").unwrap(); + let cases = &[" { } ", " { 1, a } ", "{1,a,-2}", r#"{a,"b","c"} "#]; + + for case in cases { + let key_paths = parse_key_paths(case.as_bytes()).unwrap(); + + writeln!(file, "---------- Input ----------").unwrap(); + writeln!(file, "{}", case).unwrap(); + writeln!(file, "---------- Output ---------").unwrap(); + writeln!(file, "{}", key_paths).unwrap(); + writeln!(file, "---------- AST ------------").unwrap(); + writeln!(file, "{:#?}", key_paths).unwrap(); + writeln!(file, "\n").unwrap(); + } +} + +#[test] +fn test_json_path_error() { + let cases = &[r#"{"#, r#"ab"#]; + + for case in cases { + let res = parse_key_paths(case.as_bytes()); + assert!(res.is_err()); + } +} diff --git a/tests/it/main.rs b/tests/it/main.rs index a1cab19..3a733c8 100644 --- a/tests/it/main.rs +++ b/tests/it/main.rs @@ -16,4 +16,5 @@ mod decode; mod encode; mod functions; mod jsonpath_parser; +mod keypath_parser; mod parser; diff --git a/tests/it/testdata/key_path.txt b/tests/it/testdata/key_path.txt new file mode 100644 index 0000000..8798151 --- /dev/null +++ b/tests/it/testdata/key_path.txt @@ -0,0 +1,67 @@ +---------- Input 
---------- + { } +---------- Output --------- +{} +---------- AST ------------ +KeyPaths { + paths: [], +} + + +---------- Input ---------- + { 1, a } +---------- Output --------- +{1,a} +---------- AST ------------ +KeyPaths { + paths: [ + Index( + 1, + ), + Name( + "a", + ), + ], +} + + +---------- Input ---------- +{1,a,-2} +---------- Output --------- +{1,a,-2} +---------- AST ------------ +KeyPaths { + paths: [ + Index( + 1, + ), + Name( + "a", + ), + Index( + -2, + ), + ], +} + + +---------- Input ---------- +{a,"b","c"} +---------- Output --------- +{a,"b","c"} +---------- AST ------------ +KeyPaths { + paths: [ + Name( + "a", + ), + QuotedName( + "b", + ), + QuotedName( + "c", + ), + ], +} + +