From 60b51dfe2f5692d8b7ee0207c29c0495ae4c2f6b Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Tue, 19 Mar 2024 01:23:19 -0400 Subject: [PATCH 1/7] add j_flatten(), j_find_*() --- NAMESPACE | 5 + R/cpp11.R | 8 ++ R/flatten.R | 233 +++++++++++++++++++++++++++++++++ R/rquerypivot.R | 2 +- inst/extdata/flatten_data.json | 12 ++ inst/tinytest/test_flatten.R | 103 +++++++++++++++ man/flatten.Rd | 162 +++++++++++++++++++++++ src/cpp11.cpp | 16 +++ src/flatten.cpp | 63 +++++++++ src/rquerypivot.h | 18 +++ 10 files changed, 621 insertions(+), 1 deletion(-) create mode 100644 R/flatten.R create mode 100644 inst/extdata/flatten_data.json create mode 100644 inst/tinytest/test_flatten.R create mode 100644 man/flatten.Rd create mode 100644 src/flatten.cpp diff --git a/NAMESPACE b/NAMESPACE index 2c9a8e6..e7e9515 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,11 @@ S3method(j_patch_op,j_patch_op) S3method(print,j_patch_op) export(as_r) export(j_data_type) +export(j_find_keys) +export(j_find_keys_grep) +export(j_find_values) +export(j_find_values_grep) +export(j_flatten) export(j_patch_apply) export(j_patch_from) export(j_patch_op) diff --git a/R/cpp11.R b/R/cpp11.R index a1c38ae..bbd8fb3 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -1,5 +1,13 @@ # Generated by cpp11: do not edit by hand +cpp_j_flatten <- function(data, data_type, object_names, as, path, path_type) { + .Call(`_rjsoncons_cpp_j_flatten`, data, data_type, object_names, as, path, path_type) +} + +cpp_j_flatten_con <- function(con, data_type, object_names, as, path, path_type, n_records, verbose) { + .Call(`_rjsoncons_cpp_j_flatten_con`, con, data_type, object_names, as, path, path_type, n_records, verbose) +} + cpp_j_patch_apply <- function(data, data_type, patch, as) { .Call(`_rjsoncons_cpp_j_patch_apply`, data, data_type, patch, as) } diff --git a/R/flatten.R b/R/flatten.R new file mode 100644 index 0000000..6bf10d2 --- /dev/null +++ b/R/flatten.R @@ -0,0 +1,233 @@ +#' @rdname flatten +#' +#' @title Flatten and 
find keys or values +#' +#' @description `j_flatten()` transforms a JSON document into a list +#' where names are JSONpointer 'keys' and elements are the +#' corresponding 'values' from the JSON document. +#' +#' @inheritParams j_query +#' +#' @param as character(1) describing the return type. For +#' `j_flatten()`, either "string" or "R". For other functions on +#' this page, one of "list", "data.frame", or "tibble". +#' +#' @details +#' +#' Functions documented on this page expand `data` into all key / +#' value pairs. This is not suitable for very large JSON documents. +#' +#' @return `j_flatten()` returns a named list, where `names()` are the +#' JSONpointer paths to each element in the JSON document and list +#' elements are the corresponding values. +#' +#' @examples +#' json <- '{ +#' "discards": { +#' "1000": "Record does not exist", +#' "1004": "Queue limit exceeded", +#' "1010": "Discarding timed-out partial msg" +#' }, +#' "warnings": { +#' "0": "Phone number missing country code", +#' "1": "State code missing", +#' "2": "Zip code missing" +#' } +#' }' +#' +#' j_flatten(json) |> +#' str() +#' +#' @export +j_flatten <- + function( + data, object_names = "asis", as = "string", ..., + n_records = Inf, verbose = FALSE, + data_type = j_data_type(data) + ) +{ + ## initialize constants to enable code re-use + path <- "" + path_type <- j_path_type(path) + + ## validity + .j_valid(data_type, object_names, path, path_type, n_records, verbose) + stopifnot(.is_scalar_character(as), as %in% c("string", "R")) + + data <- .as_json_string(data, data_type, ...) 
+ result <- do_cpp( + cpp_j_flatten, cpp_j_flatten_con, + data, data_type, object_names, as, path, path_type, + n_records = n_records, verbose = verbose + ) + + if (data_type[[1]] %in% c("json", "R")) + result <- result[[1]] + + result +} + +j_find_format <- + function(flattened, as) +{ + if (identical(as, "list")) { + flattened + } else { + keys <- names(flattened) + values <- unlist(flattened, use.names = FALSE) + switch( + as, + data.frame = data.frame(key = keys, value = values), + tibble = tibble::tibble(key = keys, value = values) + ) + } +} + +#' @rdname flatten +#' +#' @description `j_find_values()` finds paths to exactly matching +#' values. +#' +#' @param values vector of one or more values, all of the same type +#' (e.g., double, integer, character). +#' +#' @return `j_find_values()` and `j_find_values_grep()` return a list +#' with names as JSONpointer paths and list elements the matching +#' values, or a `data.frame` or `tibble` with columns `path` and +#' `value`. Values are coerced to a common type when `as` is +#' `data.frame` or `tibble`. +#' +#' @examples +#' j_find_values(json, "Zip code missing", as = "tibble") +#' j_find_values( +#' json, +#' c("Queue limit exceeded", "Zip code missing"), +#' as = "tibble" +#' ) +#' +#' @export +j_find_values <- + function( + data, values, object_names = "asis", as = "list", + data_type = j_data_type(data) + ) +{ + types <- unique(vapply(values, typeof, character(1))) + stopifnot( + length(types) == 1L, + .is_scalar_character(as), as %in% c("list", "data.frame", "tibble") + ) + + flattened0 <- j_flatten(data, object_names, "R") + flattened <- Filter(\(x) x %in% values, flattened0) + + j_find_format(flattened, as) +} + +#' @rdname flatten +#' +#' @description `j_find_values_grep()` finds paths to values matching +#' a regular expression. +#' +#' @param pattern character(1) regular expression to match values or +#' keys. +#' +#' @param ... 
for `j_find_values_grep()` and `j_find_keys_grep()`, +#' additional arguments passed to `grepl()`. +#' +#' @examples +#' j_find_values_grep(json, "missing", as = "tibble") +#' +#' @export +j_find_values_grep <- + function( + data, pattern, ..., object_names = "asis", as = "list", + data_type = j_data_type(data) + ) +{ + stopifnot( + .is_scalar_character(pattern), + .is_scalar_character(as), as %in% c("list", "data.frame", "tibble") + ) + + flattened <- j_flatten(data, object_names, "R") + values <- unlist(flattened, use.names = FALSE) + idx <- grepl(pattern, values, ...) + + j_find_format(flattened[idx], as) +} + +#' @rdname flatten +#' +#' @description `j_find_keys()` finds paths to exactly matching keys. +#' +#' @param keys character() vector of one or more keys to be matched +#' exactly to path elements. +#' +#' @details +#' +#' For `j_find_keys()`, the `key` must exactly match one or more +#' consecutive keys in the JSONpointer path returned by `j_flatten()`. +#' +#' @return `j_find_keys()` and `j_find_keys_grep()` returns a list, +#' data.frame, or tibble similar to `j_find_values()` and +#' `j_find_values_grep()`. +#' +#' @examples +#' j_find_keys(json, "discards", as = "tibble") +#' j_find_keys(json, "1", as = "tibble") +#' j_find_keys(json, c("discards", "warnings"), as = "tibble") +#' +#' @export +j_find_keys <- + function( + data, keys, object_names = "asis", as = "list", + data_type = j_data_type(data) + ) +{ + stopifnot( + is.character(keys), !anyNA(keys), + .is_scalar_character(as), as %in% c("list", "data.frame", "tibble") + ) + + flattened <- j_flatten(data, object_names, "R") + keys0 <- names(flattened) + keys1 <- strsplit(keys0, "/") + idx1 <- unlist(keys1) %in% keys + idx <- unique(rep(seq_along(keys1), lengths(keys1))[idx1]) + + j_find_format(flattened[idx], as) +} + +#' @rdname flatten +#' +#' @description `j_find_keys_grep()` finds paths to keys matching a +#' regular expression. 
+#' +#' @details +#' +#' For `j_find_keys_grep()`, the `key` can define a pattern that spans +#' across JSONpointer path elements. +#' +#' @examples +#' j_find_keys_grep(json, "discard", as = "tibble") +#' j_find_keys_grep(json, "1", as = "tibble") +#' j_find_keys_grep(json, "car.*/101", as = "tibble") +#' +#' @export +j_find_keys_grep <- + function( + data, pattern, ..., object_names = "asis", as = "list", + data_type = j_data_type(data) + ) +{ + stopifnot( + .is_scalar_character(pattern), + .is_scalar_character(as), as %in% c("list", "data.frame", "tibble") + ) + + flattened <- j_flatten(data, object_names, "R") + idx <- grepl(pattern, names(flattened), ...) + + j_find_format(flattened[idx], as) +} diff --git a/R/rquerypivot.R b/R/rquerypivot.R index 84aa005..001aea4 100644 --- a/R/rquerypivot.R +++ b/R/rquerypivot.R @@ -71,7 +71,7 @@ j_query <- ) { .j_valid(data_type, object_names, path, path_type, n_records, verbose) - stopifnot(as %in% c("string", "R")) + stopifnot(.is_scalar_character(as), as %in% c("string", "R")) data <- .as_json_string(data, data_type, ...) 
result <- do_cpp( diff --git a/inst/extdata/flatten_data.json b/inst/extdata/flatten_data.json new file mode 100644 index 0000000..1640161 --- /dev/null +++ b/inst/extdata/flatten_data.json @@ -0,0 +1,12 @@ +{ + "discards": { + "1000": "Record does not exist", + "1004": "Queue limit exceeded", + "1010": "Discarding timed-out partial msg" + }, + "warnings": { + "0": "Phone number missing country code", + "1": "State code missing", + "2": "Zip code missing" + } +} diff --git a/inst/tinytest/test_flatten.R b/inst/tinytest/test_flatten.R new file mode 100644 index 0000000..0ca2424 --- /dev/null +++ b/inst/tinytest/test_flatten.R @@ -0,0 +1,103 @@ +json_file <- system.file(package = "rjsoncons", "extdata", "flatten_data.json") +json <- paste0(trimws(readLines(json_file, warn = FALSE)), collapse = "") +ojson <- paste0( +'{', + '"warnings":{', + '"0":"Phone number missing country code",', + '"1":"State code missing",', + '"2":"Zip code missing"', + '},', + '"discards":{', + '"1000":"Record does not exist",', + '"1004":"Queue limit exceeded",', + '"1010":"Discarding timed-out partial msg"', + '}', +'}') +flat <- paste0( + '{', + '"/discards/1000":"Record does not exist",', + '"/discards/1004":"Queue limit exceeded",', + '"/discards/1010":"Discarding timed-out partial msg",', + '"/warnings/0":"Phone number missing country code",', + '"/warnings/1":"State code missing",', + '"/warnings/2":"Zip code missing"', + '}' +) +oflat <- paste0( + '{', + '"/warnings/0":"Phone number missing country code",', + '"/warnings/1":"State code missing",', + '"/warnings/2":"Zip code missing",', + '"/discards/1000":"Record does not exist",', + '"/discards/1004":"Queue limit exceeded",', + '"/discards/1010":"Discarding timed-out partial msg"', + '}' +) +flat_r <- list( + `/discards/1000` = "Record does not exist", + `/discards/1004` = "Queue limit exceeded", + `/discards/1010` = "Discarding timed-out partial msg", + `/warnings/0` = "Phone number missing country code", + `/warnings/1` = "State 
code missing", + `/warnings/2` = "Zip code missing" +) +named_list <- structure(list(), names = character(0)) + +## j_flatten + +expect_identical(j_flatten(json), flat) +expect_identical(j_flatten(json, as = "R"), flat_r) + +expect_identical(j_flatten(json_file, "asis"), flat) +expect_identical(j_flatten(json_file, "asis", as = "R"), flat_r) + +expect_identical(j_flatten(ojson), oflat) +expect_identical(j_flatten(ojson, "sort"), flat) + + +## j_find_values + +expect_identical(j_find_values(json, "State code missing"), flat_r[5]) +expect_identical( + j_find_values(json, c("State code missing", "Queue limit exceeded")), + flat_r[c(2, 5)] +) + +expect_identical( + j_find_values( + json, c("State code missing", "Queue limit exceeded"), + as = "data.frame" + ), + data.frame( + key = names(flat_r[c(2, 5)]), + value = unlist(flat_r[c(2, 5)], use.names = FALSE) + ), + info = "as = 'data.frame'" +) +expect_identical( # as = "tibble" + j_find_values( + json, c("State code missing", "Queue limit exceeded"), + as = "tibble" + ), + tibble::tibble( + key = names(flat_r[c(2, 5)]), + value = unlist(flat_r[c(2, 5)], use.names = FALSE) + ), + info = "as = 'tibble'" +) + +expect_identical(j_find_values(json, "foo"), named_list) + +## j_find_values_grep + +expect_identical(j_find_values_grep(json, "missing"), flat_r[4:6]) + +## j_find_keys + +expect_identical(j_find_keys(json, "warnings"), flat_r[4:6]) +expect_identical(j_find_keys(json, c("1000", "1")), flat_r[c(1, 5)]) + +## j_find_keys_grep + +expect_identical(j_find_keys_grep(json, "warn"), flat_r[4:6]) +expect_identical(j_find_keys_grep(json, "ard.*10$"), flat_r[3]) diff --git a/man/flatten.Rd b/man/flatten.Rd new file mode 100644 index 0000000..ee5826c --- /dev/null +++ b/man/flatten.Rd @@ -0,0 +1,162 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flatten.R +\name{j_flatten} +\alias{j_flatten} +\alias{j_find_values} +\alias{j_find_values_grep} +\alias{j_find_keys} +\alias{j_find_keys_grep} 
+\title{Flatten and find keys or values} +\usage{ +j_flatten( + data, + object_names = "asis", + as = "string", + ..., + n_records = Inf, + verbose = FALSE, + data_type = j_data_type(data) +) + +j_find_values( + data, + values, + object_names = "asis", + as = "list", + data_type = j_data_type(data) +) + +j_find_values_grep( + data, + pattern, + ..., + object_names = "asis", + as = "list", + data_type = j_data_type(data) +) + +j_find_keys( + data, + keys, + object_names = "asis", + as = "list", + data_type = j_data_type(data) +) + +j_find_keys_grep( + data, + pattern, + ..., + object_names = "asis", + as = "list", + data_type = j_data_type(data) +) +} +\arguments{ +\item{data}{a character() JSON string or NDJSON records, or the +name of a file or URL containing JSON or NDJSON, or an \emph{R} +object parsed to a JSON string using \code{jsonlite::toJSON()}.} + +\item{object_names}{character(1) order \code{data} object elements +\code{"asis"} (default) or \code{"sort"} before filtering on \code{path}.} + +\item{as}{character(1) describing the return type. For +\code{j_flatten()}, either "string" or "R". For other functions on +this page, one of "list", "data.frame", or "tibble".} + +\item{...}{for \code{j_find_values_grep()} and \code{j_find_keys_grep()}, +additional arguments passed to \code{grepl()}.} + +\item{n_records}{numeric(1) maximum number of NDJSON records parsed.} + +\item{verbose}{logical(1) report progress when parsing large NDJSON +files.} + +\item{data_type}{character(1) type of \code{data}; one of \code{"json"}, +\code{"ndjson"}. 
Inferred from \code{data} using \code{j_data_type()}.} + +\item{values}{vector of one or more values, all of the same type +(e.g., double, integer, character).} + +\item{pattern}{character(1) regular expression to match values or +keys.} + +\item{keys}{character() vector of one or more keys to be matched +exactly to path elements.} +} +\value{ +\code{j_flatten()} returns a named list, where \code{names()} are the +JSONpointer paths to each element in the JSON document and list +elements are the corresponding values. + +\code{j_find_values()} and \code{j_find_values_grep()} return a list +with names as JSONpointer paths and list elements the matching +values, or a \code{data.frame} or \code{tibble} with columns \code{path} and +\code{value}. Values are coerced to a common type when \code{as} is +\code{data.frame} or \code{tibble}. + +\code{j_find_keys()} and \code{j_find_keys_grep()} returns a list, +data.frame, or tibble similar to \code{j_find_values()} and +\code{j_find_values_grep()}. +} +\description{ +\code{j_flatten()} transforms a JSON document into a list +where names are JSONpointer 'keys' and elements are the +corresponding 'values' from the JSON document. + +\code{j_find_values()} finds paths to exactly matching +values. + +\code{j_find_values_grep()} finds paths to values matching +a regular expression. + +\code{j_find_keys()} finds paths to exactly matching keys. + +\code{j_find_keys_grep()} finds paths to keys matching a +regular expression. +} +\details{ +Functions documented on this page expand \code{data} into all key / +value pairs. This is not suitable for very large JSON documents. + +For \code{j_find_keys()}, the \code{key} must exactly match one or more +consecutive keys in the JSONpointer path returned by \code{j_flatten()}. + +For \code{j_find_keys_grep()}, the \code{key} can define a pattern that spans +across JSONpointer path elements. 
+} +\examples{ +json <- '{ + "discards": { + "1000": "Record does not exist", + "1004": "Queue limit exceeded", + "1010": "Discarding timed-out partial msg" + }, + "warnings": { + "0": "Phone number missing country code", + "1": "State code missing", + "2": "Zip code missing" + } +}' + +j_flatten(json) |> + str() + +j_find_values(json, "Zip code missing", as = "tibble") +j_find_values( + json, + c("Queue limit exceeded", "Zip code missing"), + as = "tibble" +) + +j_find_values_grep(json, "missing", as = "tibble") + +j_find_keys(json, "discards", as = "tibble") +j_find_keys(json, "1", as = "tibble") +j_find_keys(json, c("discards", "warnings"), as = "tibble") + +j_find_keys_grep(json, "discard", as = "tibble") +j_find_keys_grep(json, "1", as = "tibble") +j_find_keys_grep(json, "car.*/101", as = "tibble") + +} diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 6a3e736..e660900 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -5,6 +5,20 @@ #include "cpp11/declarations.hpp" #include +// flatten.cpp +sexp cpp_j_flatten(const std::vector& data, const std::string& data_type, const std::string& object_names, const std::string& as, const std::string& path, const std::string& path_type); +extern "C" SEXP _rjsoncons_cpp_j_flatten(SEXP data, SEXP data_type, SEXP object_names, SEXP as, SEXP path, SEXP path_type) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_j_flatten(cpp11::as_cpp&>>(data), cpp11::as_cpp>(data_type), cpp11::as_cpp>(object_names), cpp11::as_cpp>(as), cpp11::as_cpp>(path), cpp11::as_cpp>(path_type))); + END_CPP11 +} +// flatten.cpp +sexp cpp_j_flatten_con(const sexp& con, const std::string& data_type, const std::string& object_names, const std::string& as, const std::string& path, const std::string& path_type, const double n_records, const bool verbose); +extern "C" SEXP _rjsoncons_cpp_j_flatten_con(SEXP con, SEXP data_type, SEXP object_names, SEXP as, SEXP path, SEXP path_type, SEXP n_records, SEXP verbose) { + BEGIN_CPP11 + return 
cpp11::as_sexp(cpp_j_flatten_con(cpp11::as_cpp>(con), cpp11::as_cpp>(data_type), cpp11::as_cpp>(object_names), cpp11::as_cpp>(as), cpp11::as_cpp>(path), cpp11::as_cpp>(path_type), cpp11::as_cpp>(n_records), cpp11::as_cpp>(verbose))); + END_CPP11 +} // patch.cpp sexp cpp_j_patch_apply(const std::string& data, const std::string& data_type, const std::string& patch, const std::string& as); extern "C" SEXP _rjsoncons_cpp_j_patch_apply(SEXP data, SEXP data_type, SEXP patch, SEXP as) { @@ -80,6 +94,8 @@ extern "C" { static const R_CallMethodDef CallEntries[] = { {"_rjsoncons_cpp_as_r", (DL_FUNC) &_rjsoncons_cpp_as_r, 3}, {"_rjsoncons_cpp_as_r_con", (DL_FUNC) &_rjsoncons_cpp_as_r_con, 5}, + {"_rjsoncons_cpp_j_flatten", (DL_FUNC) &_rjsoncons_cpp_j_flatten, 6}, + {"_rjsoncons_cpp_j_flatten_con", (DL_FUNC) &_rjsoncons_cpp_j_flatten_con, 8}, {"_rjsoncons_cpp_j_patch_apply", (DL_FUNC) &_rjsoncons_cpp_j_patch_apply, 4}, {"_rjsoncons_cpp_j_patch_from", (DL_FUNC) &_rjsoncons_cpp_j_patch_from, 5}, {"_rjsoncons_cpp_j_patch_print", (DL_FUNC) &_rjsoncons_cpp_j_patch_print, 3}, diff --git a/src/flatten.cpp b/src/flatten.cpp new file mode 100644 index 0000000..4a9e083 --- /dev/null +++ b/src/flatten.cpp @@ -0,0 +1,63 @@ +#include +#include + +#include +#include "enum_index.h" +#include "rquerypivot.h" + +using namespace jsoncons; + +[[cpp11::register]] +sexp cpp_j_flatten( + const std::vector& data, const std::string& data_type, + const std::string& object_names, const std::string& as, + const std::string& path, const std::string& path_type) +{ + sexp result; + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: { + result = + rquerypivot(path, as, data_type, path_type, false). + flatten(data); + break; + } + case object_names::sort: { + result = + rquerypivot(path, as, data_type, path_type, false). 
+ flatten(data); + break; + } + default: { + cpp11::stop("unknown `object_names = '" + object_names + "'`"); + }} + + return result; +} + +[[cpp11::register]] +sexp cpp_j_flatten_con( + const sexp& con, const std::string& data_type, + const std::string& object_names, const std::string& as, + const std::string& path, const std::string& path_type, + const double n_records, const bool verbose) +{ + sexp result; + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: { + result = + rquerypivot(path, as, data_type, path_type, verbose). + flatten(con, n_records); + break; + } + case object_names::sort: { + result = + rquerypivot(path, as, data_type, path_type, verbose). + flatten(con, n_records); + break; + } + default: { + cpp11::stop("unknown `object_names = '" + object_names + "'`"); + }} + + return result; +} diff --git a/src/rquerypivot.h b/src/rquerypivot.h index 6211fa6..3f80ca1 100644 --- a/src/rquerypivot.h +++ b/src/rquerypivot.h @@ -177,6 +177,11 @@ class rquerypivot pivot_append_result(p); } + void flatten_transform(Json j) + { + result_.push_back(jsonpointer::flatten(j)); + } + // do_strings() / do_connection() sexp do_strings( const std::vector& data, @@ -297,6 +302,19 @@ class rquerypivot return do_connection(con, n_records, &rquerypivot::pivot_transform); } + // flatten + + sexp flatten(const std::vector& data) + { + return do_strings(data, &rquerypivot::flatten_transform); + } + + sexp flatten(const sexp& con, double n_records) + { + return + do_connection(con, n_records, &rquerypivot::flatten_transform); + } + // as sexp as() const From 62e8ed05dda5ecbeed39b38de8b92723e1b2c78a Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Thu, 21 Mar 2024 11:03:54 -0400 Subject: [PATCH 2/7] version bump and NEWS entry --- DESCRIPTION | 2 +- NEWS.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 569bfea..ff26239 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rjsoncons 
Title: 'C++' Header-Only 'jsoncons' Library for 'JSON' Queries -Version: 1.2.0.9602 +Version: 1.2.0.9700 Authors@R: c( person( "Martin", "Morgan", role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 42678d2..ac4d112 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # rjsoncons 1.3.0 +- (1.2.0.9700) add key and value search with `j_flatten()`, `j_find_*()` - (1.2.0.9602) compile on Ubuntu 18.04 - (1.2.0.9503) add JSON patch support with `j_patch_apply()`, From 929115695f554f19a545f58c2afa0a9086f4e451 Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Tue, 19 Mar 2024 14:10:29 -0400 Subject: [PATCH 3/7] minor updates to `j_flatten()` / `j_find_*()` - use `as = "R"` for default in `j_find_*()`, consistent with other functions - add section to a_rjsoncons.Rmd vignette - internal changes --- R/flatten.R | 32 +++++++------- inst/tinytest/test_flatten.R | 5 +++ vignettes/a_rjsoncons.Rmd | 81 ++++++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 17 deletions(-) diff --git a/R/flatten.R b/R/flatten.R index 6bf10d2..4a27e0e 100644 --- a/R/flatten.R +++ b/R/flatten.R @@ -10,7 +10,7 @@ #' #' @param as character(1) describing the return type. For #' `j_flatten()`, either "string" or "R". For other functions on -#' this page, one of "list", "data.frame", or "tibble". +#' this page, one of "R", "data.frame", or "tibble". 
#' #' @details #' @@ -67,10 +67,10 @@ j_flatten <- result } -j_find_format <- +.j_find_format <- function(flattened, as) { - if (identical(as, "list")) { + if (identical(as, "R")) { flattened } else { keys <- names(flattened) @@ -108,20 +108,18 @@ j_find_format <- #' @export j_find_values <- function( - data, values, object_names = "asis", as = "list", + data, values, object_names = "asis", as = "R", data_type = j_data_type(data) ) { - types <- unique(vapply(values, typeof, character(1))) stopifnot( - length(types) == 1L, - .is_scalar_character(as), as %in% c("list", "data.frame", "tibble") + .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") ) flattened0 <- j_flatten(data, object_names, "R") flattened <- Filter(\(x) x %in% values, flattened0) - j_find_format(flattened, as) + .j_find_format(flattened, as) } #' @rdname flatten @@ -141,20 +139,20 @@ j_find_values <- #' @export j_find_values_grep <- function( - data, pattern, ..., object_names = "asis", as = "list", + data, pattern, ..., object_names = "asis", as = "R", data_type = j_data_type(data) ) { stopifnot( .is_scalar_character(pattern), - .is_scalar_character(as), as %in% c("list", "data.frame", "tibble") + .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") ) flattened <- j_flatten(data, object_names, "R") values <- unlist(flattened, use.names = FALSE) idx <- grepl(pattern, values, ...) 
- j_find_format(flattened[idx], as) + .j_find_format(flattened[idx], as) } #' @rdname flatten @@ -181,13 +179,13 @@ j_find_values_grep <- #' @export j_find_keys <- function( - data, keys, object_names = "asis", as = "list", + data, keys, object_names = "asis", as = "R", data_type = j_data_type(data) ) { stopifnot( is.character(keys), !anyNA(keys), - .is_scalar_character(as), as %in% c("list", "data.frame", "tibble") + .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") ) flattened <- j_flatten(data, object_names, "R") @@ -196,7 +194,7 @@ j_find_keys <- idx1 <- unlist(keys1) %in% keys idx <- unique(rep(seq_along(keys1), lengths(keys1))[idx1]) - j_find_format(flattened[idx], as) + .j_find_format(flattened[idx], as) } #' @rdname flatten @@ -217,17 +215,17 @@ j_find_keys <- #' @export j_find_keys_grep <- function( - data, pattern, ..., object_names = "asis", as = "list", + data, pattern, ..., object_names = "asis", as = "R", data_type = j_data_type(data) ) { stopifnot( .is_scalar_character(pattern), - .is_scalar_character(as), as %in% c("list", "data.frame", "tibble") + .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") ) flattened <- j_flatten(data, object_names, "R") idx <- grepl(pattern, names(flattened), ...) 
- j_find_format(flattened[idx], as) + .j_find_format(flattened[idx], as) } diff --git a/inst/tinytest/test_flatten.R b/inst/tinytest/test_flatten.R index 0ca2424..87d88ed 100644 --- a/inst/tinytest/test_flatten.R +++ b/inst/tinytest/test_flatten.R @@ -1,3 +1,4 @@ +ndjson_file <- system.file(package = "rjsoncons", "extdata", "example.ndjson") json_file <- system.file(package = "rjsoncons", "extdata", "flatten_data.json") json <- paste0(trimws(readLines(json_file, warn = FALSE)), collapse = "") ojson <- paste0( @@ -54,6 +55,8 @@ expect_identical(j_flatten(json_file, "asis", as = "R"), flat_r) expect_identical(j_flatten(ojson), oflat) expect_identical(j_flatten(ojson, "sort"), flat) +expect_identical(length(j_flatten(ndjson_file)), 4L) +expect_identical(length(j_flatten(ndjson_file, n_records = 2)), 2L) ## j_find_values @@ -86,6 +89,8 @@ expect_identical( # as = "tibble" info = "as = 'tibble'" ) +j_find_values(ndjson_file, "WA") |> str() + expect_identical(j_find_values(json, "foo"), named_list) ## j_find_values_grep diff --git a/vignettes/a_rjsoncons.Rmd b/vignettes/a_rjsoncons.Rmd index 3b9cbb3..5a838cf 100644 --- a/vignettes/a_rjsoncons.Rmd +++ b/vignettes/a_rjsoncons.Rmd @@ -343,6 +343,87 @@ j_patch_from(j_patch_apply(json, patch), json) [JSON Patch]: https://jsonpatch.com/ [RFC6902]: https://datatracker.ietf.org/doc/html/rfc6902/#section-4 +# Flatten and find + +It can sometimes be helpful to explore JSON documents by 'flattening' +the JSON to an object of path / value pairs, where the path is the +[JSONpointer][] path to the corresponding value. It is then +straight-forward to search this flattened object for, e.g., the path +to a known field or value. 
As an example, consider the object + +```{r} +codes <- '{ + "discards": { + "1000": "Record does not exist", + "1004": "Queue limit exceeded", + "1010": "Discarding timed-out partial msg" + }, + "warnings": { + "0": "Phone number missing country code", + "1": "State code missing", + "2": "Zip code missing" + } +}' +``` + +The 'flat' JSON of this can be represented as a named list (using +`str()` to provide a compact visual representation) + +```{r} +j_flatten(codes, as = "R") |> + str() +``` + +The names of the list are JSONpointer paths, so can be used in +`j_query()` and `j_pivot()` as appropriate + +```{r} +j_query(codes, "/discards/1010") +``` + +There are two ways to find known keys and values. The first is to use +exact matching to one or more keys or values, e.g., + +```{r} +j_find_values( + codes, c("Record does not exist", "State code missing"), + as = "tibble" +) +j_find_keys(codes, "warnings", as = "tibble") +``` + +It is also possible to match using a regular expression. + +```{r} +j_find_values_grep(codes, "missing", as = "tibble") +j_find_keys_grep(codes, "card.*/100", as = "tibble") # span key delimiters +``` + +Keys are always character vectors, but values can be of different +type; `j_find_values()` supports searches on these, provided the +search values are of the same type. + +```{r} +j <- '{"x":[1,[2, 3]],"y":{"a":4}}' +j_flatten(j, as = "R") |> str() +j_find_values(j, c(2, 4), as = "tibble") +``` + +A common operation might be to find the path to a known value, and then +to query the original JSON to find the object in which the value is +contained. + +```{r} +j_find_values(j, 3, as = "tibble") +## path to '3' is '/x/1/1', so containing object is at '/x/1' +j_query(j, "/x/1") +j_query(j, "/x/1", as = "R") +``` + +The first argument to `j_find_*()` can be a JSON object, file, or +URL. NDJSON files are flattened into a character vector, with each +element the flattened version of the corresponding NDJSON record. 
+ # The JSON parser The package includes a JSON parser, used with the argument `as = "R"` From d03e808e0b96143fe4ca6c74628a94df4878e83f Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Wed, 20 Mar 2024 15:51:11 -0400 Subject: [PATCH 4/7] improve `j_flatten()` and `j_find_*()` support for NDJSON --- R/flatten.R | 232 ++++++++++++++++++++++++----------- inst/tinytest/test_flatten.R | 94 ++++++++++++-- man/flatten.Rd | 80 ++++++++---- 3 files changed, 303 insertions(+), 103 deletions(-) diff --git a/R/flatten.R b/R/flatten.R index 4a27e0e..e21336a 100644 --- a/R/flatten.R +++ b/R/flatten.R @@ -1,6 +1,65 @@ +## internal implementation of .j_flatten, always returns a list to +## simplify j_find_*() processing of both JSON & NDJSON +.j_flatten <- + function(data, object_names, as, ..., n_records, verbose, data_type) +{ + ## initialize constants to enable code re-use + path <- "" + path_type <- j_path_type(path) + + ## validity + .j_valid(data_type, object_names, path, path_type, n_records, verbose) + + data <- .as_json_string(data, data_type, ...) 
+ result <- do_cpp( + cpp_j_flatten, cpp_j_flatten_con, + data, data_type, object_names, as, path, path_type, + n_records = n_records, verbose = verbose + ) +} + +## internal function calling grepl with argument list +.j_find_grepl <- + function(pattern, x, grep_args) +{ + stopifnot( + is.list(grep_args), + all( + names(grep_args) %in% + setdiff(names(formals(grepl)), c("pattern", "x")) + ) + ) + args <- c(list(pattern = pattern, x = x), grep_args) + do.call(grepl, args) +} + +## internal function to format j_find_*() result +.j_find_format <- + function(flattened, as, data_type) +{ + result <- lapply(flattened, function(json_record, as) { + if (identical(as, "R")) { + json_record + } else { + keys <- names(json_record) + values <- unlist(json_record, use.names = FALSE) + switch( + as, + data.frame = data.frame(key = keys, value = values), + tibble = tibble::tibble(key = keys, value = values) + ) + } + }, as) + + if (data_type[[1]] %in% c("json", "R")) # not NDJSON + result <- result[[1]] + + result +} + #' @rdname flatten #' -#' @title Flatten and find keys or values +#' @title Flatten and find keys or values in JSON or NDJSON documents #' #' @description `j_flatten()` transforms a JSON document into a list #' where names are JSONpointer 'keys' and elements are the @@ -12,14 +71,20 @@ #' `j_flatten()`, either "string" or "R". For other functions on #' this page, one of "R", "data.frame", or "tibble". #' -#' @details +#' @details Functions documented on this page expand `data` into all +#' key / value pairs. This is not suitable for very large JSON +#' documents. #' -#' Functions documented on this page expand `data` into all key / -#' value pairs. This is not suitable for very large JSON documents. +#' @return #' -#' @return `j_flatten()` returns a named list, where `names()` are the -#' JSONpointer paths to each element in the JSON document and list -#' elements are the corresponding values. 
+#' `j_flatten(as = "string")` (default) returns a JSON string +#' representation of the flattened document, i.e., an object with keys +#' the JSONpointer paths and values the values at the corresponding +#' path in the original document. +#' +#' `j_flatten(as = "R")` returns a named list, where `names()` are the +#' JSONpointer paths to each element in the JSON document and list +#' elements are the corresponding values. #' #' @examples #' json <- '{ @@ -42,54 +107,27 @@ j_flatten <- function( data, object_names = "asis", as = "string", ..., - n_records = Inf, verbose = FALSE, - data_type = j_data_type(data) + n_records = Inf, verbose = FALSE, data_type = j_data_type(data) ) { - ## initialize constants to enable code re-use - path <- "" - path_type <- j_path_type(path) - - ## validity - .j_valid(data_type, object_names, path, path_type, n_records, verbose) stopifnot(.is_scalar_character(as), as %in% c("string", "R")) - - data <- .as_json_string(data, data_type, ...) - result <- do_cpp( - cpp_j_flatten, cpp_j_flatten_con, - data, data_type, object_names, as, path, path_type, - n_records = n_records, verbose = verbose + result <- .j_flatten( + data, object_names, as, ..., + n_records = n_records, verbose = verbose, data_type = data_type ) - if (data_type[[1]] %in% c("json", "R")) result <- result[[1]] result } -.j_find_format <- - function(flattened, as) -{ - if (identical(as, "R")) { - flattened - } else { - keys <- names(flattened) - values <- unlist(flattened, use.names = FALSE) - switch( - as, - data.frame = data.frame(key = keys, value = values), - tibble = tibble::tibble(key = keys, value = values) - ) - } -} - #' @rdname flatten #' #' @description `j_find_values()` finds paths to exactly matching #' values. #' -#' @param values vector of one or more values, all of the same type -#' (e.g., double, integer, character). +#' @param values vector of one or more values to be matched exactly to +#' values in the JSON document. 
#' #' @return `j_find_values()` and `j_find_values_grep()` return a list #' with names as JSONpointer paths and list elements the matching @@ -108,18 +146,23 @@ j_flatten <- #' @export j_find_values <- function( - data, values, object_names = "asis", as = "R", - data_type = j_data_type(data) + data, values, object_names = "asis", as = "R", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data) ) { stopifnot( .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") ) - flattened0 <- j_flatten(data, object_names, "R") - flattened <- Filter(\(x) x %in% values, flattened0) + result <- .j_flatten( + data, object_names, "R", ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + flattened <- lapply(result, function(json_record) { + Filter(\(x) x %in% values, json_record) + }) - .j_find_format(flattened, as) + .j_find_format(flattened, as, data_type) } #' @rdname flatten @@ -130,8 +173,8 @@ j_find_values <- #' @param pattern character(1) regular expression to match values or #' keys. #' -#' @param ... for `j_find_values_grep()` and `j_find_keys_grep()`, -#' additional arguments passed to `grepl()`. +#' @param grep_args list() additional arguments passed to `grepl()` +#' when searching on values or paths. #' #' @examples #' j_find_values_grep(json, "missing", as = "tibble") @@ -139,20 +182,28 @@ j_find_values <- #' @export j_find_values_grep <- function( - data, pattern, ..., object_names = "asis", as = "R", - data_type = j_data_type(data) + data, pattern, object_names = "asis", as = "R", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data), + grep_args = list() ) { stopifnot( .is_scalar_character(pattern), .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") + ## FIXME: validate grep_args ) - flattened <- j_flatten(data, object_names, "R") - values <- unlist(flattened, use.names = FALSE) - idx <- grepl(pattern, values, ...) 
+ result <- .j_flatten( + data, object_names, "R", ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + flattened <- lapply(result, function(json_record, grep_args) { + values <- unlist(json_record, use.names = FALSE) + idx <- .j_find_grepl(pattern, values, grep_args) + json_record[idx] + }, grep_args) - .j_find_format(flattened[idx], as) + .j_find_format(flattened, as, data_type) } #' @rdname flatten @@ -162,10 +213,9 @@ j_find_values_grep <- #' @param keys character() vector of one or more keys to be matched #' exactly to path elements. #' -#' @details -#' -#' For `j_find_keys()`, the `key` must exactly match one or more -#' consecutive keys in the JSONpointer path returned by `j_flatten()`. +#' @details For `j_find_keys()`, the `key` must exactly match one or +#' more consecutive keys in the JSONpointer path returned by +#' `j_flatten()`. #' #' @return `j_find_keys()` and `j_find_keys_grep()` returns a list, #' data.frame, or tibble similar to `j_find_values()` and @@ -179,8 +229,8 @@ j_find_values_grep <- #' @export j_find_keys <- function( - data, keys, object_names = "asis", as = "R", - data_type = j_data_type(data) + data, keys, object_names = "asis", as = "R", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data) ) { stopifnot( @@ -188,13 +238,19 @@ j_find_keys <- .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") ) - flattened <- j_flatten(data, object_names, "R") - keys0 <- names(flattened) - keys1 <- strsplit(keys0, "/") - idx1 <- unlist(keys1) %in% keys - idx <- unique(rep(seq_along(keys1), lengths(keys1))[idx1]) + result <- .j_flatten( + data, object_names, "R", ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + flattened <- lapply(result, function(json_record) { + keys0 <- names(json_record) + keys1 <- strsplit(keys0, "/") + idx1 <- unlist(keys1) %in% keys + idx <- unique(rep(seq_along(keys1), lengths(keys1))[idx1]) + json_record[idx] + }) - .j_find_format(flattened[idx], 
as) + .j_find_format(flattened, as, data_type) } #' @rdname flatten @@ -202,10 +258,8 @@ j_find_keys <- #' @description `j_find_keys_grep()` finds paths to keys matching a #' regular expression. #' -#' @details -#' -#' For `j_find_keys_grep()`, the `key` can define a pattern that spans -#' across JSONpointer path elements. +#' @details For `j_find_keys_grep()`, the `key` can define a pattern +#' that spans across JSONpointer path elements. #' #' @examples #' j_find_keys_grep(json, "discard", as = "tibble") @@ -215,8 +269,9 @@ j_find_keys <- #' @export j_find_keys_grep <- function( - data, pattern, ..., object_names = "asis", as = "R", - data_type = j_data_type(data) + data, pattern, object_names = "asis", as = "R", ..., + n_records = Inf, verbose = FALSE, data_type = j_data_type(data), + grep_args = list() ) { stopifnot( @@ -224,8 +279,37 @@ j_find_keys_grep <- .is_scalar_character(as), as %in% c("R", "data.frame", "tibble") ) - flattened <- j_flatten(data, object_names, "R") - idx <- grepl(pattern, names(flattened), ...) + result <- .j_flatten( + data, object_names, "R", ..., + n_records = n_records, verbose = verbose, data_type = data_type + ) + flattened <- lapply(result, function(json_record, grep_args) { + idx <- .j_find_grepl(pattern, names(json_record), grep_args) + json_record[idx] + }, grep_args) - .j_find_format(flattened[idx], as) + .j_find_format(flattened, as, data_type) } + +#' @rdname flatten +#' +#' @name flatten_NDJSON +#' +#' @description For NDJSON documents, the result is either a character +#' vector (for `as = "string"`) or list of *R* objects, one +#' element for each NDJSON record. +#' +#' @return For NDJSON documents, the result is a vector paralleling +#' the NDJSON document, with `j_flatten()` applied to each element +#' of the NDJSON document. 
+#' +#' @examples +#' ## NDJSON +#' +#' ndjson_file <- +#' system.file(package = "rjsoncons", "extdata", "example.ndjson") +#' j_flatten(ndjson_file) |> +#' noquote() +#' j_find_values_grep(ndjson_file, "e") |> +#' str() +NULL diff --git a/inst/tinytest/test_flatten.R b/inst/tinytest/test_flatten.R index 87d88ed..f11a25e 100644 --- a/inst/tinytest/test_flatten.R +++ b/inst/tinytest/test_flatten.R @@ -1,4 +1,7 @@ -ndjson_file <- system.file(package = "rjsoncons", "extdata", "example.ndjson") +## +## JSON +## + json_file <- system.file(package = "rjsoncons", "extdata", "flatten_data.json") json <- paste0(trimws(readLines(json_file, warn = FALSE)), collapse = "") ojson <- paste0( @@ -36,9 +39,9 @@ oflat <- paste0( ) flat_r <- list( `/discards/1000` = "Record does not exist", - `/discards/1004` = "Queue limit exceeded", + `/discards/1004` = "Queue limit exceeded", `/discards/1010` = "Discarding timed-out partial msg", - `/warnings/0` = "Phone number missing country code", + `/warnings/0` = "Phone number missing country code", `/warnings/1` = "State code missing", `/warnings/2` = "Zip code missing" ) @@ -55,9 +58,6 @@ expect_identical(j_flatten(json_file, "asis", as = "R"), flat_r) expect_identical(j_flatten(ojson), oflat) expect_identical(j_flatten(ojson, "sort"), flat) -expect_identical(length(j_flatten(ndjson_file)), 4L) -expect_identical(length(j_flatten(ndjson_file, n_records = 2)), 2L) - ## j_find_values expect_identical(j_find_values(json, "State code missing"), flat_r[5]) @@ -89,8 +89,6 @@ expect_identical( # as = "tibble" info = "as = 'tibble'" ) -j_find_values(ndjson_file, "WA") |> str() - expect_identical(j_find_values(json, "foo"), named_list) ## j_find_values_grep @@ -106,3 +104,83 @@ expect_identical(j_find_keys(json, c("1000", "1")), flat_r[c(1, 5)]) expect_identical(j_find_keys_grep(json, "warn"), flat_r[4:6]) expect_identical(j_find_keys_grep(json, "ard.*10$"), flat_r[3]) + +## +## NDJSON +## + +ndjson_file <- system.file(package = "rjsoncons", 
"extdata", "example.ndjson") +flat_ndjson <- c( + '{"/name":"Seattle","/state":"WA"}', '{"/name":"New York","/state":"NY"}', + '{"/name":"Bellevue","/state":"WA"}', '{"/name":"Olympia","/state":"WA"}' +) + +## j_flatten + +expect_identical(j_flatten(ndjson_file), flat_ndjson) +expect_identical(j_flatten(ndjson_file, n_records = 2), flat_ndjson[1:2]) + +## j_find_values*() + +expect_identical( + j_find_values(ndjson_file, "WA"), + list( + list(`/state` = "WA"), named_list, + list(`/state` = "WA"), list(`/state` = "WA") + ) +) +expect_identical( + j_find_values(ndjson_file, "WA", n_records = 2), + list(list(`/state` = "WA"), named_list) +) +expect_identical( + j_find_values_grep(ndjson_file, "e"), + list( + list(`/name` = "Seattle"), list(`/name` = "New York"), + list(`/name` = "Bellevue"), named_list + ) +) + +expect_identical( + j_find_values(ndjson_file, "WA"), + list( + list(`/state` = "WA"), named_list, + list(`/state` = "WA"), list(`/state` = "WA") + ) +) +expect_identical( + j_find_values(ndjson_file, "WA", n_records = 2), + list( + list(`/state` = "WA"), named_list + ) +) + +expect_identical( + j_find_values_grep(ndjson_file, "e", n_records = 2), + list( + list(`/name` = "Seattle"), list(`/name` = "New York") + ) +) + +## j_find_keys*() + +expect_identical( + j_find_keys(ndjson_file, "name"), + list( + list(`/name` = "Seattle"), list(`/name` = "New York"), + list(`/name` = "Bellevue"), list(`/name` = "Olympia") + ) +) +expect_identical( + j_find_keys(ndjson_file, "name", n_records = 2), + list( + list(`/name` = "Seattle"), list(`/name` = "New York") + ) +) + +expect_identical( + j_find_keys_grep(ndjson_file, "ame", n_records = 2), + list( + list(`/name` = "Seattle"), list(`/name` = "New York") + ) +) diff --git a/man/flatten.Rd b/man/flatten.Rd index ee5826c..be7fed4 100644 --- a/man/flatten.Rd +++ b/man/flatten.Rd @@ -6,7 +6,8 @@ \alias{j_find_values_grep} \alias{j_find_keys} \alias{j_find_keys_grep} -\title{Flatten and find keys or values} 
+\alias{flatten_NDJSON} +\title{Flatten and find keys or values in JSON or NDJSON documents} \usage{ j_flatten( data, @@ -22,34 +23,46 @@ j_find_values( data, values, object_names = "asis", - as = "list", + as = "R", + ..., + n_records = Inf, + verbose = FALSE, data_type = j_data_type(data) ) j_find_values_grep( data, pattern, - ..., object_names = "asis", - as = "list", - data_type = j_data_type(data) + as = "R", + ..., + n_records = Inf, + verbose = FALSE, + data_type = j_data_type(data), + grep_args = list() ) j_find_keys( data, keys, object_names = "asis", - as = "list", + as = "R", + ..., + n_records = Inf, + verbose = FALSE, data_type = j_data_type(data) ) j_find_keys_grep( data, pattern, - ..., object_names = "asis", - as = "list", - data_type = j_data_type(data) + as = "R", + ..., + n_records = Inf, + verbose = FALSE, + data_type = j_data_type(data), + grep_args = list() ) } \arguments{ @@ -62,10 +75,9 @@ object parsed to a JSON string using \code{jsonlite::toJSON()}.} \item{as}{character(1) describing the return type. For \code{j_flatten()}, either "string" or "R". For other functions on -this page, one of "list", "data.frame", or "tibble".} +this page, one of "R", "data.frame", or "tibble".} -\item{...}{for \code{j_find_values_grep()} and \code{j_find_keys_grep()}, -additional arguments passed to \code{grepl()}.} +\item{...}{passed to \code{jsonlite::toJSON} when \code{data} is an \emph{R} object.} \item{n_records}{numeric(1) maximum number of NDJSON records parsed.} @@ -75,17 +87,25 @@ files.} \item{data_type}{character(1) type of \code{data}; one of \code{"json"}, \code{"ndjson"}. 
Inferred from \code{data} using \code{j_data_type()}.} -\item{values}{vector of one or more values, all of the same type -(e.g., double, integer, character).} +\item{values}{vector of one or more values to be matched exactly to +values in the JSON document.} \item{pattern}{character(1) regular expression to match values or keys.} +\item{grep_args}{list() additional arguments passed to \code{grepl()} +when searching on values or paths.} + \item{keys}{character() vector of one or more keys to be matched exactly to path elements.} } \value{ -\code{j_flatten()} returns a named list, where \code{names()} are the +\code{j_flatten(as = "string")} (default) returns a JSON string +representation of the flattened document, i.e., an object with keys +the JSONpointer paths and values the values at the corresponding +path in the original document. + +\code{j_flatten(as = "R")} returns a named list, where \code{names()} are the JSONpointer paths to each element in the JSON document and list elements are the corresponding values. @@ -98,6 +118,10 @@ values, or a \code{data.frame} or \code{tibble} with columns \code{path} and \code{j_find_keys()} and \code{j_find_keys_grep()} returns a list, data.frame, or tibble similar to \code{j_find_values()} and \code{j_find_values_grep()}. + +For NDJSON documents, the result is a vector paralleling +the NDJSON document, with \code{j_flatten()} applied to each element +of the NDJSON document. } \description{ \code{j_flatten()} transforms a JSON document into a list @@ -114,16 +138,22 @@ a regular expression. \code{j_find_keys_grep()} finds paths to keys matching a regular expression. + +For NDJSON documents, the result is either a character +vector (for \code{as = "string"}) or list of \emph{R} objects, one +element for each NDJSON record. } \details{ -Functions documented on this page expand \code{data} into all key / -value pairs. This is not suitable for very large JSON documents. 
+Functions documented on this page expand \code{data} into all +key / value pairs. This is not suitable for very large JSON +documents. -For \code{j_find_keys()}, the \code{key} must exactly match one or more -consecutive keys in the JSONpointer path returned by \code{j_flatten()}. +For \code{j_find_keys()}, the \code{key} must exactly match one or +more consecutive keys in the JSONpointer path returned by +\code{j_flatten()}. -For \code{j_find_keys_grep()}, the \code{key} can define a pattern that spans -across JSONpointer path elements. +For \code{j_find_keys_grep()}, the \code{key} can define a pattern +that spans across JSONpointer path elements. } \examples{ json <- '{ @@ -159,4 +189,12 @@ j_find_keys_grep(json, "discard", as = "tibble") j_find_keys_grep(json, "1", as = "tibble") j_find_keys_grep(json, "car.*/101", as = "tibble") +## NDJSON + +ndjson_file <- + system.file(package = "rjsoncons", "extdata", "example.ndjson") +j_flatten(ndjson_file) |> + noquote() +j_find_values_grep(ndjson_file, "e") |> + str() } From a08d5f1fe0c8074b903a04af526c97223e7e0165 Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Thu, 21 Mar 2024 11:04:23 -0400 Subject: [PATCH 5/7] version bump and NEWS update --- DESCRIPTION | 2 +- NEWS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ff26239..32c862e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rjsoncons Title: 'C++' Header-Only 'jsoncons' Library for 'JSON' Queries -Version: 1.2.0.9700 +Version: 1.2.0.9701 Authors@R: c( person( "Martin", "Morgan", role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index ac4d112..d008726 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # rjsoncons 1.3.0 -- (1.2.0.9700) add key and value search with `j_flatten()`, `j_find_*()` +- (1.2.0.9701) add key and value search with `j_flatten()`, `j_find_*()` - (1.2.0.9602) compile on Ubuntu 18.04 - (1.2.0.9503) add JSON patch support with `j_patch_apply()`, From 
cc217fb0a880969ea6d6a91e0319276b2c7d41b7 Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Thu, 21 Mar 2024 11:04:43 -0400 Subject: [PATCH 6/7] rename `j_find_*()` column 'key' as 'path' --- DESCRIPTION | 2 +- NEWS.md | 2 +- R/flatten.R | 22 +++++++++++----------- inst/tinytest/test_flatten.R | 4 ++-- vignettes/a_rjsoncons.Rmd | 5 ++--- 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 32c862e..8e965a6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rjsoncons Title: 'C++' Header-Only 'jsoncons' Library for 'JSON' Queries -Version: 1.2.0.9701 +Version: 1.2.0.9702 Authors@R: c( person( "Martin", "Morgan", role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index d008726..74ed63f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # rjsoncons 1.3.0 -- (1.2.0.9701) add key and value search with `j_flatten()`, `j_find_*()` +- (1.2.0.9702) add key and value search with `j_flatten()`, `j_find_*()` - (1.2.0.9602) compile on Ubuntu 18.04 - (1.2.0.9503) add JSON patch support with `j_patch_apply()`, diff --git a/R/flatten.R b/R/flatten.R index e21336a..73038b1 100644 --- a/R/flatten.R +++ b/R/flatten.R @@ -41,12 +41,12 @@ if (identical(as, "R")) { json_record } else { - keys <- names(json_record) + paths <- names(json_record) values <- unlist(json_record, use.names = FALSE) switch( as, - data.frame = data.frame(key = keys, value = values), - tibble = tibble::tibble(key = keys, value = values) + data.frame = data.frame(path = paths, value = values), + tibble = tibble::tibble(path = paths, value = values) ) } }, as) @@ -62,7 +62,7 @@ #' @title Flatten and find keys or values in JSON or NDJSON documents #' #' @description `j_flatten()` transforms a JSON document into a list -#' where names are JSONpointer 'keys' and elements are the +#' where names are JSONpointer 'paths' and elements are the #' corresponding 'values' from the JSON document. 
#' #' @inheritParams j_query @@ -72,14 +72,14 @@ #' this page, one of "R", "data.frame", or "tibble". #' #' @details Functions documented on this page expand `data` into all -#' key / value pairs. This is not suitable for very large JSON +#' path / value pairs. This is not suitable for very large JSON #' documents. #' #' @return #' #' `j_flatten(as = "string")` (default) returns a JSON string #' representation of the flattened document, i.e., an object with keys -#' the JSONpointer paths and values the values at the corresponding +#' the JSONpointer paths and values the value at the corresponding #' path in the original document. #' #' `j_flatten(as = "R")` returns a named list, where `names()` are the @@ -171,7 +171,7 @@ j_find_values <- #' a regular expression. #' #' @param pattern character(1) regular expression to match values or -#' keys. +#' paths. #' #' @param grep_args list() additional arguments passed to `grepl()` #' when searching on values or paths. @@ -243,10 +243,10 @@ j_find_keys <- n_records = n_records, verbose = verbose, data_type = data_type ) flattened <- lapply(result, function(json_record) { - keys0 <- names(json_record) - keys1 <- strsplit(keys0, "/") - idx1 <- unlist(keys1) %in% keys - idx <- unique(rep(seq_along(keys1), lengths(keys1))[idx1]) + paths <- names(json_record) + keys0 <- strsplit(paths, "/") + idx0 <- unlist(keys0) %in% keys + idx <- unique(rep(seq_along(keys0), lengths(keys0))[idx0]) json_record[idx] }) diff --git a/inst/tinytest/test_flatten.R b/inst/tinytest/test_flatten.R index f11a25e..4784dc4 100644 --- a/inst/tinytest/test_flatten.R +++ b/inst/tinytest/test_flatten.R @@ -72,7 +72,7 @@ expect_identical( as = "data.frame" ), data.frame( - key = names(flat_r[c(2, 5)]), + path = names(flat_r[c(2, 5)]), value = unlist(flat_r[c(2, 5)], use.names = FALSE) ), info = "as = 'data.frame'" @@ -83,7 +83,7 @@ expect_identical( # as = "tibble" as = "tibble" ), tibble::tibble( - key = names(flat_r[c(2, 5)]), + path = names(flat_r[c(2, 
5)]), value = unlist(flat_r[c(2, 5)], use.names = FALSE) ), info = "as = 'tibble'" diff --git a/vignettes/a_rjsoncons.Rmd b/vignettes/a_rjsoncons.Rmd index 5a838cf..6210bb9 100644 --- a/vignettes/a_rjsoncons.Rmd +++ b/vignettes/a_rjsoncons.Rmd @@ -400,8 +400,7 @@ j_find_keys_grep(codes, "card.*/100", as = "tibble") # span key delimiters ``` Keys are always character vectors, but values can be of different -type; `j_find_values()` supports searches on these, provided the -search values are of the same type. +type; `j_find_values()` supports searches on these. ```{r} j <- '{"x":[1,[2, 3]],"y":{"a":4}}' @@ -415,7 +414,7 @@ contained. ```{r} j_find_values(j, 3, as = "tibble") -## path to '3' is '/x/1/1', so containing object is at 'x/1' +## path to '3' is '/x/1/1', so containing object is at '/x/1' j_query(j, "/x/1") j_query(j, "/x/1", as = "R") ``` From 322da040b4b67aced21376e9d76798607db8940d Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Thu, 21 Mar 2024 11:07:04 -0400 Subject: [PATCH 7/7] version bump and NEWS update - update src/flatten to avoid cpp11/declarations.hpp (issue 3) --- DESCRIPTION | 2 +- NEWS.md | 2 +- src/flatten.cpp | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8e965a6..f9692c8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rjsoncons Title: 'C++' Header-Only 'jsoncons' Library for 'JSON' Queries -Version: 1.2.0.9702 +Version: 1.2.0.9703 Authors@R: c( person( "Martin", "Morgan", role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 74ed63f..f637c22 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # rjsoncons 1.3.0 -- (1.2.0.9702) add key and value search with `j_flatten()`, `j_find_*()` +- (1.2.0.9703) add key and value search with `j_flatten()`, `j_find_*()` - (1.2.0.9602) compile on Ubuntu 18.04 - (1.2.0.9503) add JSON patch support with `j_patch_apply()`, diff --git a/src/flatten.cpp b/src/flatten.cpp index 4a9e083..392cf3b 100644 --- a/src/flatten.cpp +++ 
b/src/flatten.cpp @@ -1,10 +1,12 @@ #include #include -#include #include "enum_index.h" #include "rquerypivot.h" +#include +#include // 'stop' + using namespace jsoncons; [[cpp11::register]]