diff --git a/DESCRIPTION b/DESCRIPTION index 569bfea..f9692c8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rjsoncons Title: 'C++' Header-Only 'jsoncons' Library for 'JSON' Queries -Version: 1.2.0.9602 +Version: 1.2.0.9703 Authors@R: c( person( "Martin", "Morgan", role = c("aut", "cre"), diff --git a/NAMESPACE b/NAMESPACE index 2c9a8e6..e7e9515 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,11 @@ S3method(j_patch_op,j_patch_op) S3method(print,j_patch_op) export(as_r) export(j_data_type) +export(j_find_keys) +export(j_find_keys_grep) +export(j_find_values) +export(j_find_values_grep) +export(j_flatten) export(j_patch_apply) export(j_patch_from) export(j_patch_op) diff --git a/NEWS.md b/NEWS.md index 42678d2..f637c22 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # rjsoncons 1.3.0 +- (1.2.0.9703) add key and value search with `j_flatten()`, `j_find_*()` - (1.2.0.9602) compile on Ubuntu 18.04 - (1.2.0.9503) add JSON patch support with `j_patch_apply()`, diff --git a/R/cpp11.R b/R/cpp11.R index a1c38ae..bbd8fb3 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -1,5 +1,13 @@ # Generated by cpp11: do not edit by hand +cpp_j_flatten <- function(data, data_type, object_names, as, path, path_type) { + .Call(`_rjsoncons_cpp_j_flatten`, data, data_type, object_names, as, path, path_type) +} + +cpp_j_flatten_con <- function(con, data_type, object_names, as, path, path_type, n_records, verbose) { + .Call(`_rjsoncons_cpp_j_flatten_con`, con, data_type, object_names, as, path, path_type, n_records, verbose) +} + cpp_j_patch_apply <- function(data, data_type, patch, as) { .Call(`_rjsoncons_cpp_j_patch_apply`, data, data_type, patch, as) } diff --git a/R/flatten.R b/R/flatten.R new file mode 100644 index 0000000..73038b1 --- /dev/null +++ b/R/flatten.R @@ -0,0 +1,315 @@ +## internal implementation of .j_flatten, always returns a list to +## simplify j_find_*() processing of both JSON & NDJSON +.j_flatten <- + function(data, object_names, as, ..., n_records, 
verbose, data_type) +{ + ## initialize constants to enable code re-use + path <- "" + path_type <- j_path_type(path) + + ## validity + .j_valid(data_type, object_names, path, path_type, n_records, verbose) + + data <- .as_json_string(data, data_type, ...) + result <- do_cpp( + cpp_j_flatten, cpp_j_flatten_con, + data, data_type, object_names, as, path, path_type, + n_records = n_records, verbose = verbose + ) +} + +## internal function calling grepl with argument list +.j_find_grepl <- + function(pattern, x, grep_args) +{ + stopifnot( + is.list(grep_args), + all( + names(grep_args) %in% + setdiff(names(formals(grepl)), c("pattern", "x")) + ) + ) + args <- c(list(pattern = pattern, x = x), grep_args) + do.call(grepl, args) +} + +## internal function to format j_find_*() result +.j_find_format <- + function(flattened, as, data_type) +{ + result <- lapply(flattened, function(json_record, as) { + if (identical(as, "R")) { + json_record + } else { + paths <- names(json_record) + values <- unlist(json_record, use.names = FALSE) + switch( + as, + data.frame = data.frame(path = paths, value = values), + tibble = tibble::tibble(path = paths, value = values) + ) + } + }, as) + + if (data_type[[1]] %in% c("json", "R")) # not NDJSON + result <- result[[1]] + + result +} + +#' @rdname flatten +#' +#' @title Flatten and find keys or values in JSON or NDJSON documents +#' +#' @description `j_flatten()` transforms a JSON document into a list +#' where names are JSONpointer 'paths' and elements are the +#' corresponding 'values' from the JSON document. +#' +#' @inheritParams j_query +#' +#' @param as character(1) describing the return type. For +#' `j_flatten()`, either "string" or "R". For other functions on +#' this page, one of "R", "data.frame", or "tibble". +#' +#' @details Functions documented on this page expand `data` into all +#' path / value pairs. This is not suitable for very large JSON +#' documents. 
+#' +#' @return +#' +#' `j_flatten(as = "string")` (default) returns a JSON string +#' representation of the flattened document, i.e., an object with keys +#' the JSONpointer paths and values the value at the corresponding +#' path in the original document. +#' +#' `j_flatten(as = "R")` returns a named list, where `names()` are the +#' JSONpointer paths to each element in the JSON document and list +#' elements are the corresponding values. +#' +#' @examples +#' json <- '{ +#' "discards": { +#' "1000": "Record does not exist", +#' "1004": "Queue limit exceeded", +#' "1010": "Discarding timed-out partial msg" +#' }, +#' "warnings": { +#' "0": "Phone number missing country code", +#' "1": "State code missing", +#' "2": "Zip code missing" +#' } +#' }' +#' +#' j_flatten(json) |> +#' str() +#' +#' @export +j_flatten <- + function( + data, object_names = "asis", as = "string", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data) + ) +{ + stopifnot(.is_scalar_character(as), as %in% c("string", "R")) + result <- .j_flatten( + data, object_names, as, ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + if (data_type[[1]] %in% c("json", "R")) + result <- result[[1]] + + result +} + +#' @rdname flatten +#' +#' @description `j_find_values()` finds paths to exactly matching +#' values. +#' +#' @param values vector of one or more values to be matched exactly to +#' values in the JSON document. +#' +#' @return `j_find_values()` and `j_find_values_grep()` return a list +#' with names as JSONpointer paths and list elements the matching +#' values, or a `data.frame` or `tibble` with columns `path` and +#' `value`. Values are coerced to a common type when `as` is +#' `data.frame` or `tibble`. 
+#' +#' @examples +#' j_find_values(json, "Zip code missing", as = "tibble") +#' j_find_values( +#' json, +#' c("Queue limit exceeded", "Zip code missing"), +#' as = "tibble" +#' ) +#' +#' @export +j_find_values <- + function( + data, values, object_names = "asis", as = "R", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data) + ) +{ + stopifnot( + .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") + ) + + result <- .j_flatten( + data, object_names, "R", ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + flattened <- lapply(result, function(json_record) { + Filter(\(x) x %in% values, json_record) + }) + + .j_find_format(flattened, as, data_type) +} + +#' @rdname flatten +#' +#' @description `j_find_values_grep()` finds paths to values matching +#' a regular expression. +#' +#' @param pattern character(1) regular expression to match values or +#' paths. +#' +#' @param grep_args list() additional arguments passed to `grepl()` +#' when searching on values or paths. +#' +#' @examples +#' j_find_values_grep(json, "missing", as = "tibble") +#' +#' @export +j_find_values_grep <- + function( + data, pattern, object_names = "asis", as = "R", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data), + grep_args = list() + ) +{ + stopifnot( + .is_scalar_character(pattern), + .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") + ## FIXME: validate grep_args + ) + + result <- .j_flatten( + data, object_names, "R", ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + flattened <- lapply(result, function(json_record, grep_args) { + values <- unlist(json_record, use.names = FALSE) + idx <- .j_find_grepl(pattern, values, grep_args) + json_record[idx] + }, grep_args) + + .j_find_format(flattened, as, data_type) +} + +#' @rdname flatten +#' +#' @description `j_find_keys()` finds paths to exactly matching keys. 
+#' +#' @param keys character() vector of one or more keys to be matched +#' exactly to path elements. +#' +#' @details For `j_find_keys()`, each element of `keys` must exactly +#' match a single key (path element) in the JSONpointer path returned by +#' `j_flatten()`. +#' +#' @return `j_find_keys()` and `j_find_keys_grep()` return a list, +#' data.frame, or tibble similar to `j_find_values()` and +#' `j_find_values_grep()`. +#' +#' @examples +#' j_find_keys(json, "discards", as = "tibble") +#' j_find_keys(json, "1", as = "tibble") +#' j_find_keys(json, c("discards", "warnings"), as = "tibble") +#' +#' @export +j_find_keys <- + function( + data, keys, object_names = "asis", as = "R", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data) + ) +{ + stopifnot( + is.character(keys), !anyNA(keys), + .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") + ) + + result <- .j_flatten( + data, object_names, "R", ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + flattened <- lapply(result, function(json_record) { + paths <- names(json_record) + keys0 <- strsplit(paths, "/") + idx0 <- unlist(keys0) %in% keys + idx <- unique(rep(seq_along(keys0), lengths(keys0))[idx0]) + json_record[idx] + }) + + .j_find_format(flattened, as, data_type) +} + +#' @rdname flatten +#' +#' @description `j_find_keys_grep()` finds paths to keys matching a +#' regular expression. +#' +#' @details For `j_find_keys_grep()`, the `pattern` can match text +#' that spans across JSONpointer path elements. 
+#' +#' @examples +#' j_find_keys_grep(json, "discard", as = "tibble") +#' j_find_keys_grep(json, "1", as = "tibble") +#' j_find_keys_grep(json, "car.*/101", as = "tibble") +#' +#' @export +j_find_keys_grep <- + function( + data, pattern, object_names = "asis", as = "R", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data), + grep_args = list() + ) +{ + stopifnot( + .is_scalar_character(pattern), + .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") + ) + + result <- .j_flatten( + data, object_names, "R", ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + flattened <- lapply(result, function(json_record, grep_args) { + idx <- .j_find_grepl(pattern, names(json_record), grep_args) + json_record[idx] + }, grep_args) + + .j_find_format(flattened, as, data_type) +} + +#' @rdname flatten +#' +#' @name flatten_NDJSON +#' +#' @description For NDJSON documents, the result is either a character +#' vector (for `as = "string"`) or list of *R* objects, one +#' element for each NDJSON record. +#' +#' @return For NDJSON documents, the result is a vector paralleling +#' the NDJSON document, with `j_flatten()` applied to each element +#' of the NDJSON document. +#' +#' @examples +#' ## NDJSON +#' +#' ndjson_file <- +#' system.file(package = "rjsoncons", "extdata", "example.ndjson") +#' j_flatten(ndjson_file) |> +#' noquote() +#' j_find_values_grep(ndjson_file, "e") |> +#' str() +NULL diff --git a/R/rquerypivot.R b/R/rquerypivot.R index 84aa005..001aea4 100644 --- a/R/rquerypivot.R +++ b/R/rquerypivot.R @@ -71,7 +71,7 @@ j_query <- ) { .j_valid(data_type, object_names, path, path_type, n_records, verbose) - stopifnot(as %in% c("string", "R")) + stopifnot(.is_scalar_character(as), as %in% c("string", "R")) data <- .as_json_string(data, data_type, ...) 
result <- do_cpp( diff --git a/inst/extdata/flatten_data.json b/inst/extdata/flatten_data.json new file mode 100644 index 0000000..1640161 --- /dev/null +++ b/inst/extdata/flatten_data.json @@ -0,0 +1,12 @@ +{ + "discards": { + "1000": "Record does not exist", + "1004": "Queue limit exceeded", + "1010": "Discarding timed-out partial msg" + }, + "warnings": { + "0": "Phone number missing country code", + "1": "State code missing", + "2": "Zip code missing" + } +} diff --git a/inst/tinytest/test_flatten.R b/inst/tinytest/test_flatten.R new file mode 100644 index 0000000..4784dc4 --- /dev/null +++ b/inst/tinytest/test_flatten.R @@ -0,0 +1,186 @@ +## +## JSON +## + +json_file <- system.file(package = "rjsoncons", "extdata", "flatten_data.json") +json <- paste0(trimws(readLines(json_file, warn = FALSE)), collapse = "") +ojson <- paste0( +'{', + '"warnings":{', + '"0":"Phone number missing country code",', + '"1":"State code missing",', + '"2":"Zip code missing"', + '},', + '"discards":{', + '"1000":"Record does not exist",', + '"1004":"Queue limit exceeded",', + '"1010":"Discarding timed-out partial msg"', + '}', +'}') +flat <- paste0( + '{', + '"/discards/1000":"Record does not exist",', + '"/discards/1004":"Queue limit exceeded",', + '"/discards/1010":"Discarding timed-out partial msg",', + '"/warnings/0":"Phone number missing country code",', + '"/warnings/1":"State code missing",', + '"/warnings/2":"Zip code missing"', + '}' +) +oflat <- paste0( + '{', + '"/warnings/0":"Phone number missing country code",', + '"/warnings/1":"State code missing",', + '"/warnings/2":"Zip code missing",', + '"/discards/1000":"Record does not exist",', + '"/discards/1004":"Queue limit exceeded",', + '"/discards/1010":"Discarding timed-out partial msg"', + '}' +) +flat_r <- list( + `/discards/1000` = "Record does not exist", + `/discards/1004` = "Queue limit exceeded", + `/discards/1010` = "Discarding timed-out partial msg", + `/warnings/0` = "Phone number missing country code", + 
`/warnings/1` = "State code missing", + `/warnings/2` = "Zip code missing" +) +named_list <- structure(list(), names = character(0)) + +## j_flatten + +expect_identical(j_flatten(json), flat) +expect_identical(j_flatten(json, as = "R"), flat_r) + +expect_identical(j_flatten(json_file, "asis"), flat) +expect_identical(j_flatten(json_file, "asis", as = "R"), flat_r) + +expect_identical(j_flatten(ojson), oflat) +expect_identical(j_flatten(ojson, "sort"), flat) + +## j_find_values + +expect_identical(j_find_values(json, "State code missing"), flat_r[5]) +expect_identical( + j_find_values(json, c("State code missing", "Queue limit exceeded")), + flat_r[c(2, 5)] +) + +expect_identical( + j_find_values( + json, c("State code missing", "Queue limit exceeded"), + as = "data.frame" + ), + data.frame( + path = names(flat_r[c(2, 5)]), + value = unlist(flat_r[c(2, 5)], use.names = FALSE) + ), + info = "as = 'data.frame'" +) +expect_identical( # as = "tibble" + j_find_values( + json, c("State code missing", "Queue limit exceeded"), + as = "tibble" + ), + tibble::tibble( + path = names(flat_r[c(2, 5)]), + value = unlist(flat_r[c(2, 5)], use.names = FALSE) + ), + info = "as = 'tibble'" +) + +expect_identical(j_find_values(json, "foo"), named_list) + +## j_find_values_grep + +expect_identical(j_find_values_grep(json, "missing"), flat_r[4:6]) + +## j_find_keys + +expect_identical(j_find_keys(json, "warnings"), flat_r[4:6]) +expect_identical(j_find_keys(json, c("1000", "1")), flat_r[c(1, 5)]) + +## j_find_keys_grep + +expect_identical(j_find_keys_grep(json, "warn"), flat_r[4:6]) +expect_identical(j_find_keys_grep(json, "ard.*10$"), flat_r[3]) + +## +## NDJSON +## + +ndjson_file <- system.file(package = "rjsoncons", "extdata", "example.ndjson") +flat_ndjson <- c( + '{"/name":"Seattle","/state":"WA"}', '{"/name":"New York","/state":"NY"}', + '{"/name":"Bellevue","/state":"WA"}', '{"/name":"Olympia","/state":"WA"}' +) + +## j_flatten + +expect_identical(j_flatten(ndjson_file), 
flat_ndjson) +expect_identical(j_flatten(ndjson_file, n_records = 2), flat_ndjson[1:2]) + +## j_find_values*() + +expect_identical( + j_find_values(ndjson_file, "WA"), + list( + list(`/state` = "WA"), named_list, + list(`/state` = "WA"), list(`/state` = "WA") + ) +) +expect_identical( + j_find_values(ndjson_file, "WA", n_records = 2), + list(list(`/state` = "WA"), named_list) +) +expect_identical( + j_find_values_grep(ndjson_file, "e"), + list( + list(`/name` = "Seattle"), list(`/name` = "New York"), + list(`/name` = "Bellevue"), named_list + ) +) + +expect_identical( + j_find_values(ndjson_file, "WA"), + list( + list(`/state` = "WA"), named_list, + list(`/state` = "WA"), list(`/state` = "WA") + ) +) +expect_identical( + j_find_values(ndjson_file, "WA", n_records = 2), + list( + list(`/state` = "WA"), named_list + ) +) + +expect_identical( + j_find_values_grep(ndjson_file, "e", n_records = 2), + list( + list(`/name` = "Seattle"), list(`/name` = "New York") + ) +) + +## j_find_keys*() + +expect_identical( + j_find_keys(ndjson_file, "name"), + list( + list(`/name` = "Seattle"), list(`/name` = "New York"), + list(`/name` = "Bellevue"), list(`/name` = "Olympia") + ) +) +expect_identical( + j_find_keys(ndjson_file, "name", n_records = 2), + list( + list(`/name` = "Seattle"), list(`/name` = "New York") + ) +) + +expect_identical( + j_find_keys_grep(ndjson_file, "ame", n_records = 2), + list( + list(`/name` = "Seattle"), list(`/name` = "New York") + ) +) diff --git a/man/flatten.Rd b/man/flatten.Rd new file mode 100644 index 0000000..be7fed4 --- /dev/null +++ b/man/flatten.Rd @@ -0,0 +1,200 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flatten.R +\name{j_flatten} +\alias{j_flatten} +\alias{j_find_values} +\alias{j_find_values_grep} +\alias{j_find_keys} +\alias{j_find_keys_grep} +\alias{flatten_NDJSON} +\title{Flatten and find keys or values in JSON or NDJSON documents} +\usage{ +j_flatten( + data, + object_names = "asis", + as = 
"string", + ..., + n_records = Inf, + verbose = FALSE, + data_type = j_data_type(data) +) + +j_find_values( + data, + values, + object_names = "asis", + as = "R", + ..., + n_records = Inf, + verbose = FALSE, + data_type = j_data_type(data) +) + +j_find_values_grep( + data, + pattern, + object_names = "asis", + as = "R", + ..., + n_records = Inf, + verbose = FALSE, + data_type = j_data_type(data), + grep_args = list() +) + +j_find_keys( + data, + keys, + object_names = "asis", + as = "R", + ..., + n_records = Inf, + verbose = FALSE, + data_type = j_data_type(data) +) + +j_find_keys_grep( + data, + pattern, + object_names = "asis", + as = "R", + ..., + n_records = Inf, + verbose = FALSE, + data_type = j_data_type(data), + grep_args = list() +) +} +\arguments{ +\item{data}{a character() JSON string or NDJSON records, or the +name of a file or URL containing JSON or NDJSON, or an \emph{R} +object parsed to a JSON string using \code{jsonlite::toJSON()}.} + +\item{object_names}{character(1) order \code{data} object elements +\code{"asis"} (default) or \code{"sort"} before filtering on \code{path}.} + +\item{as}{character(1) describing the return type. For +\code{j_flatten()}, either "string" or "R". For other functions on +this page, one of "R", "data.frame", or "tibble".} + +\item{...}{passed to \code{jsonlite::toJSON} when \code{data} is an \emph{R} object.} + +\item{n_records}{numeric(1) maximum number of NDJSON records parsed.} + +\item{verbose}{logical(1) report progress when parsing large NDJSON +files.} + +\item{data_type}{character(1) type of \code{data}; one of \code{"json"}, +\code{"ndjson"}. 
Inferred from \code{data} using \code{j_data_type()}.} + +\item{values}{vector of one or more values to be matched exactly to +values in the JSON document.} + +\item{pattern}{character(1) regular expression to match values or +keys.} + +\item{grep_args}{list() additional arguments passed to \code{grepl()} +when searching on values or paths.} + +\item{keys}{character() vector of one or more keys to be matched +exactly to path elements.} +} +\value{ +\code{j_flatten(as = "string")} (default) returns a JSON string +representation of the flattened document, i.e., an object with keys +the JSONpointer paths and values the values at the corresponding +path in the original document. + +\code{j_flatten(as = "R")} returns a named list, where \code{names()} are the +JSONpointer paths to each element in the JSON document and list +elements are the corresponding values. + +\code{j_find_values()} and \code{j_find_values_grep()} return a list +with names as JSONpointer paths and list elements the matching +values, or a \code{data.frame} or \code{tibble} with columns \code{path} and +\code{value}. Values are coerced to a common type when \code{as} is +\code{data.frame} or \code{tibble}. + +\code{j_find_keys()} and \code{j_find_keys_grep()} returns a list, +data.frame, or tibble similar to \code{j_find_values()} and +\code{j_find_values_grep()}. + +For NDJSON documents, the result is a vector paralleling +the NDJSON document, with \code{j_flatten()} applied to each element +of the NDJSON document. +} +\description{ +\code{j_flatten()} transforms a JSON document into a list +where names are JSONpointer 'keys' and elements are the +corresponding 'values' from the JSON document. + +\code{j_find_values()} finds paths to exactly matching +values. + +\code{j_find_values_grep()} finds paths to values matching +a regular expression. + +\code{j_find_keys()} finds paths to exactly matching keys. + +\code{j_find_keys_grep()} finds paths to keys matching a +regular expression. 
+ +For NDJSON documents, the result is either a character +vector (for \code{as = "string"}) or list of \emph{R} objects, one +element for each NDJSON record. +} +\details{ +Functions documented on this page expand \code{data} into all +key / value pairs. This is not suitable for very large JSON +documents. + +For \code{j_find_keys()}, the \code{key} must exactly match one or +more consecutive keys in the JSONpointer path returned by +\code{j_flatten()}. + +For \code{j_find_keys_grep()}, the \code{key} can define a pattern +that spans across JSONpointer path elements. +} +\examples{ +json <- '{ + "discards": { + "1000": "Record does not exist", + "1004": "Queue limit exceeded", + "1010": "Discarding timed-out partial msg" + }, + "warnings": { + "0": "Phone number missing country code", + "1": "State code missing", + "2": "Zip code missing" + } +}' + +j_flatten(json) |> + str() + +j_find_values(json, "Zip code missing", as = "tibble") +j_find_values( + json, + c("Queue limit exceeded", "Zip code missing"), + as = "tibble" +) + +j_find_values_grep(json, "missing", as = "tibble") + +j_find_keys(json, "discards", as = "tibble") +j_find_keys(json, "1", as = "tibble") +j_find_keys(json, c("discards", "warnings"), as = "tibble") + +j_find_keys_grep(json, "discard", as = "tibble") +j_find_keys_grep(json, "1", as = "tibble") +j_find_keys_grep(json, "car.*/101", as = "tibble") + +## NDJSON + +ndjson_file <- + system.file(package = "rjsoncons", "extdata", "example.ndjson") +j_flatten(ndjson_file) |> + noquote() +j_find_values_grep(ndjson_file, "e") |> + str() +} diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 6a3e736..e660900 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -5,6 +5,20 @@ #include "cpp11/declarations.hpp" #include +// flatten.cpp +sexp cpp_j_flatten(const std::vector& data, const std::string& data_type, const std::string& object_names, const std::string& as, const std::string& path, const std::string& path_type); +extern "C" SEXP 
_rjsoncons_cpp_j_flatten(SEXP data, SEXP data_type, SEXP object_names, SEXP as, SEXP path, SEXP path_type) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_j_flatten(cpp11::as_cpp&>>(data), cpp11::as_cpp>(data_type), cpp11::as_cpp>(object_names), cpp11::as_cpp>(as), cpp11::as_cpp>(path), cpp11::as_cpp>(path_type))); + END_CPP11 +} +// flatten.cpp +sexp cpp_j_flatten_con(const sexp& con, const std::string& data_type, const std::string& object_names, const std::string& as, const std::string& path, const std::string& path_type, const double n_records, const bool verbose); +extern "C" SEXP _rjsoncons_cpp_j_flatten_con(SEXP con, SEXP data_type, SEXP object_names, SEXP as, SEXP path, SEXP path_type, SEXP n_records, SEXP verbose) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_j_flatten_con(cpp11::as_cpp>(con), cpp11::as_cpp>(data_type), cpp11::as_cpp>(object_names), cpp11::as_cpp>(as), cpp11::as_cpp>(path), cpp11::as_cpp>(path_type), cpp11::as_cpp>(n_records), cpp11::as_cpp>(verbose))); + END_CPP11 +} // patch.cpp sexp cpp_j_patch_apply(const std::string& data, const std::string& data_type, const std::string& patch, const std::string& as); extern "C" SEXP _rjsoncons_cpp_j_patch_apply(SEXP data, SEXP data_type, SEXP patch, SEXP as) { @@ -80,6 +94,8 @@ extern "C" { static const R_CallMethodDef CallEntries[] = { {"_rjsoncons_cpp_as_r", (DL_FUNC) &_rjsoncons_cpp_as_r, 3}, {"_rjsoncons_cpp_as_r_con", (DL_FUNC) &_rjsoncons_cpp_as_r_con, 5}, + {"_rjsoncons_cpp_j_flatten", (DL_FUNC) &_rjsoncons_cpp_j_flatten, 6}, + {"_rjsoncons_cpp_j_flatten_con", (DL_FUNC) &_rjsoncons_cpp_j_flatten_con, 8}, {"_rjsoncons_cpp_j_patch_apply", (DL_FUNC) &_rjsoncons_cpp_j_patch_apply, 4}, {"_rjsoncons_cpp_j_patch_from", (DL_FUNC) &_rjsoncons_cpp_j_patch_from, 5}, {"_rjsoncons_cpp_j_patch_print", (DL_FUNC) &_rjsoncons_cpp_j_patch_print, 3}, diff --git a/src/flatten.cpp b/src/flatten.cpp new file mode 100644 index 0000000..392cf3b --- /dev/null +++ b/src/flatten.cpp @@ -0,0 +1,65 @@ +#include +#include + 
+#include "enum_index.h" +#include "rquerypivot.h" + +#include +#include // 'stop' + +using namespace jsoncons; + +[[cpp11::register]] +sexp cpp_j_flatten( + const std::vector& data, const std::string& data_type, + const std::string& object_names, const std::string& as, + const std::string& path, const std::string& path_type) +{ + sexp result; + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: { + result = + rquerypivot(path, as, data_type, path_type, false). + flatten(data); + break; + } + case object_names::sort: { + result = + rquerypivot(path, as, data_type, path_type, false). + flatten(data); + break; + } + default: { + cpp11::stop("unknown `object_names = '" + object_names + "'`"); + }} + + return result; +} + +[[cpp11::register]] +sexp cpp_j_flatten_con( + const sexp& con, const std::string& data_type, + const std::string& object_names, const std::string& as, + const std::string& path, const std::string& path_type, + const double n_records, const bool verbose) +{ + sexp result; + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: { + result = + rquerypivot(path, as, data_type, path_type, verbose). + flatten(con, n_records); + break; + } + case object_names::sort: { + result = + rquerypivot(path, as, data_type, path_type, verbose). 
+ flatten(con, n_records); + break; + } + default: { + cpp11::stop("unknown `object_names = '" + object_names + "'`"); + }} + + return result; +} diff --git a/src/rquerypivot.h b/src/rquerypivot.h index 6211fa6..3f80ca1 100644 --- a/src/rquerypivot.h +++ b/src/rquerypivot.h @@ -177,6 +177,11 @@ class rquerypivot pivot_append_result(p); } + void flatten_transform(Json j) + { + result_.push_back(jsonpointer::flatten(j)); + } + // do_strings() / do_connection() sexp do_strings( const std::vector& data, @@ -297,6 +302,19 @@ class rquerypivot return do_connection(con, n_records, &rquerypivot::pivot_transform); } + // flatten + + sexp flatten(const std::vector& data) + { + return do_strings(data, &rquerypivot::flatten_transform); + } + + sexp flatten(const sexp& con, double n_records) + { + return + do_connection(con, n_records, &rquerypivot::flatten_transform); + } + // as sexp as() const diff --git a/vignettes/a_rjsoncons.Rmd b/vignettes/a_rjsoncons.Rmd index 3b9cbb3..6210bb9 100644 --- a/vignettes/a_rjsoncons.Rmd +++ b/vignettes/a_rjsoncons.Rmd @@ -343,6 +343,86 @@ j_patch_from(j_patch_apply(json, patch), json) [JSON Patch]: https://jsonpatch.com/ [RFC6902]: https://datatracker.ietf.org/doc/html/rfc6902/#section-4 +# Flatten and find + +It can sometimes be helpful to explore JSON documents by 'flattening' +the JSON to an object of path / value pairs, where the path is the +[JSONpointer][] path to the corresponding value. It is then +straight-forward to search this flattened object for, e.g., the path +to a known field or value. 
As an example, consider the object + +```{r} +codes <- '{ + "discards": { + "1000": "Record does not exist", + "1004": "Queue limit exceeded", + "1010": "Discarding timed-out partial msg" + }, + "warnings": { + "0": "Phone number missing country code", + "1": "State code missing", + "2": "Zip code missing" + } +}' +``` + +The 'flat' JSON of this can be represented as a named list (using +`str()` to provide a compact visual representation) + +```{r} +j_flatten(codes, as = "R") |> + str() +``` + +The names of the list are JSONpointer paths, so can be used in +`j_query()` and `j_pivot()` as appropriate + +```{r} +j_query(codes, "/discards/1010") +``` + +There are two ways to find known keys and values. The first is to use +exact matching to one or more keys or values, e.g., + +```{r} +j_find_values( + codes, c("Record does not exist", "State code missing"), + as = "tibble" +) +j_find_keys(codes, "warnings", as = "tibble") +``` + +It is also possible to match using a regular expression. + +```{r} +j_find_values_grep(codes, "missing", as = "tibble") +j_find_keys_grep(codes, "card.*/100", as = "tibble") # span key delimiters +``` + +Keys are always character vectors, but values can be of different +type; `j_find_values()` supports searches on these. + +```{r} +j <- '{"x":[1,[2, 3]],"y":{"a":4}}' +j_flatten(j, as = "R") |> str() +j_find_values(j, c(2, 4), as = "tibble") +``` + +A common operation might be to find the path to a known value, and then +to query the original JSON to find the object in which the value is +contained. + +```{r} +j_find_values(j, 3, as = "tibble") +## path to '3' is '/x/1/1', so containing object is at '/x/1' +j_query(j, "/x/1") +j_query(j, "/x/1", as = "R") +``` + +The first argument to `j_find_*()` can be a JSON object, file, or +URL. NDJSON files are flattened into a character vector, with each +element the flattened version of the corresponding NDJSON record. + # The JSON parser The package includes a JSON parser, used with the argument `as = "R"`