From 4e8e63e55f520fed9b7910efa33831b4f763d2b8 Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Mon, 22 Jan 2024 12:07:30 -0500 Subject: [PATCH 1/5] use enum to represent R options, and to map from string to enum value - object_names, as, path_type --- R/cpp11.R | 4 ++-- inst/tinytest/test_utilities.R | 9 ++++++++ src/cpp11.cpp | 6 ++--- src/j_as.h | 7 +++--- src/j_query.h | 8 +++---- src/rjsoncons.cpp | 36 +++++++++++++++-------------- src/utilities.h | 41 +++++++++++++++++++++++++++++----- 7 files changed, 77 insertions(+), 34 deletions(-) diff --git a/R/cpp11.R b/R/cpp11.R index 25a17df..1599a72 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -4,8 +4,8 @@ cpp_version <- function() { .Call(`_rjsoncons_cpp_version`) } -cpp_as_r <- function(data, jtype) { - .Call(`_rjsoncons_cpp_as_r`, data, jtype) +cpp_as_r <- function(data, object_names) { + .Call(`_rjsoncons_cpp_as_r`, data, object_names) } cpp_r_json_init <- function(object_names, path, as, data_type, path_type) { diff --git a/inst/tinytest/test_utilities.R b/inst/tinytest/test_utilities.R index cb09233..af5558b 100644 --- a/inst/tinytest/test_utilities.R +++ b/inst/tinytest/test_utilities.R @@ -6,3 +6,12 @@ expect_false(.is_scalar_character(c("a", "b"))) expect_false(.is_scalar_character(NA_character_)) expect_false(.is_scalar_character("")) expect_true(.is_scalar_character("", z.ok = TRUE)) + +## C++ utilities.h +## 'object_names' should be in c("asis", "sort") +json <- '{"a": 1, "c": 3, "b": 2}' +expect_identical( + rjsoncons:::cpp_as_r(json, "asis"), + list(a = 1L, c = 3L, b = 2L) +) +expect_error(rjsoncons:::cpp_as_r(json, "foo"), "'foo' unknown") diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 7d5a55f..b5ae648 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -13,10 +13,10 @@ extern "C" SEXP _rjsoncons_cpp_version() { END_CPP11 } // rjsoncons.cpp -sexp cpp_as_r(std::string data, std::string jtype); -extern "C" SEXP _rjsoncons_cpp_as_r(SEXP data, SEXP jtype) { +sexp cpp_as_r(std::string data, const std::string object_names); +extern "C" SEXP _rjsoncons_cpp_as_r(SEXP data, SEXP object_names) { BEGIN_CPP11 - return cpp11::as_sexp(cpp_as_r(cpp11::as_cpp>(data), cpp11::as_cpp>(jtype))); + return cpp11::as_sexp(cpp_as_r(cpp11::as_cpp>(data), cpp11::as_cpp>(object_names))); END_CPP11 } // rjsoncons.cpp diff --git a/src/j_as.h b/src/j_as.h index dad4678..136c448 100644 --- a/src/j_as.h +++ b/src/j_as.h @@ -8,6 +8,7 @@ #include "utilities.h" using namespace jsoncons; +using namespace rjsoncons; enum class r_type : uint8_t { @@ -215,9 +216,9 @@ sexp as_r(const Json j) template cpp11::sexp j_as(Json j, std::string as) { - switch(hash(as.c_str())) { - case hash("string"): return as_sexp( j.template as() ); - case hash("R"): return as_r(j); + switch(enum_index(as_map, as)) { + case as::string: return as_sexp( j.template as() ); + case as::R: return as_r(j); default: cpp11::stop("unknown `as = '" + as + "'`"); } } diff --git a/src/j_query.h b/src/j_query.h index 56d7996..3c77a52 100644 --- a/src/j_query.h +++ b/src/j_query.h @@ -10,10 +10,10 @@ template Json j_query(Json j, const std::string path, const std::string path_type) { // evaluate path - switch(hash(path_type.c_str())) { - case hash("JSONpointer"): return jsonpointer::get(j, path); - case hash("JSONpath"): return jsonpath::json_query(j, path); - case hash("JMESpath"): return jmespath::search(j, path); + switch(enum_index(path_type_map, path_type)) { + case path_type::JSONpointer: return jsonpointer::get(j, path); + case path_type::JSONpath: return jsonpath::json_query(j, path); + case path_type::JMESpath: return jmespath::search(j, path); default: cpp11::stop("unknown `path_type` = '" + path_type + "'"); } } diff --git a/src/rjsoncons.cpp b/src/rjsoncons.cpp index 27730be..483641f 100644 --- a/src/rjsoncons.cpp +++ b/src/rjsoncons.cpp @@ -24,12 +24,12 @@ std::string cpp_version() // as_r [[cpp11::register]] -sexp cpp_as_r(std::string data, std::string jtype) +sexp cpp_as_r(std::string data, const std::string object_names) { - switch(hash(jtype.c_str())) { - case hash("asis"): return as_r_impl(data); - case hash("sort"): return as_r_impl(data); - default: cpp11::stop("unknown `object_names = '" + jtype + "'`"); + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: return as_r_impl(data); + case object_names::sort: return as_r_impl(data); + default: cpp11::stop("unknown `object_names = '" + object_names + "'`"); } } @@ -44,9 +44,11 @@ sexp cpp_r_json_init( const std::string path_type ) { - switch(hash(object_names.c_str())) { - case hash("asis"): return r_json_init(path, as, data_type, path_type); - case hash("sort"): return r_json_init(path, as, data_type, path_type); + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: + return r_json_init(path, as, data_type, path_type); + case object_names::sort: + return r_json_init(path, as, data_type, path_type); default: cpp11::stop("unknown `object_names = '" + object_names + "'`"); } } @@ -57,9 +59,9 @@ void cpp_r_json_query( const std::vector data, const std::string object_names) { - switch(hash(object_names.c_str())) { - case hash("asis"): { r_json_query(ext, data); break; } - case hash("sort"): { r_json_query(ext, data); break; } + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: { r_json_query(ext, data); break; } + case object_names::sort: { r_json_query(ext, data); break; } default: cpp11::stop("unknown `object_names = '" + object_names + "'`"); } } @@ -70,9 +72,9 @@ void cpp_r_json_pivot( const std::vector data, const std::string object_names) { - switch(hash(object_names.c_str())) { - case hash("asis"): { r_json_pivot(ext, data); break; } - case hash("sort"): { r_json_pivot(ext, data); break; } + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: { r_json_pivot(ext, data); break; } + case object_names::sort: { r_json_pivot(ext, data); break; } default: cpp11::stop("unknown `object_names = '" + object_names + "'`"); } } @@ -80,9 +82,9 @@ void cpp_r_json_pivot( [[cpp11::register]] cpp11::sexp cpp_r_json_finish(sexp ext, const std::string object_names) { - switch(hash(object_names.c_str())) { - case hash("asis"): return r_json_finish(ext); - case hash("sort"): return r_json_finish(ext); + switch(enum_index(object_names_map, object_names)) { + case object_names::asis: return r_json_finish(ext); + case object_names::sort: return r_json_finish(ext); default: cpp11::stop("unknown `object_names = '" + object_names + "'`"); } } diff --git a/src/utilities.h b/src/utilities.h index c05f09a..3633fd8 100644 --- a/src/utilities.h +++ b/src/utilities.h @@ -1,11 +1,42 @@ #ifndef RJSONCONS_UTILITIES_H #define RJSONCONS_UTILITIES_H -// use this to switch() on string values -// https://stackoverflow.com/a/46711735/547331 -constexpr unsigned int hash(const char *s, int off = 0) -{ - return !s[off] ? 5381 : (hash(s, off+1)*33) ^ s[off]; +#include +#include +#include + +namespace rjsoncons { // enums + + enum object_names { asis, sort }; + enum as { string, R }; + enum path_type { JSONpointer, JSONpath, JMESpath }; + + static std::map object_names_map { + {"asis", asis}, {"sort", sort} + }; + + static std::map as_map { + {"string", string}, {"R", R} + }; + + static std::map path_type_map { + {"JSONpointer", JSONpointer}, {"JSONpath", JSONpath}, + {"JMESpath", JMESpath} + }; + + // look up 'key' in 'enum_map', returning index; used to translate + // R string to enum value. + template + T enum_index( + const std::map& enum_map, const std::string key) + { + auto value = enum_map.find(key); + if (value == std::end(enum_map)) + cpp11::stop("'" + key + "' unknown"); + + return value->second; + } + } #endif From 660253c5b61b79440acca892f035df7753473a4e Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Mon, 22 Jan 2024 16:07:29 -0500 Subject: [PATCH 2/5] use compiled JSONpath, JMESpath expressions - fold j_query, j_pivot into class defintion - use enums in the class --- R/json.R | 4 ++ src/j_as.h | 11 +++- src/j_pivot.h | 79 ------------------------ src/j_query.h | 21 ------- src/r_json.h | 154 ++++++++++++++++++++++++++++++++++++++-------- src/rjsoncons.cpp | 2 - src/utilities.h | 5 ++ 7 files changed, 147 insertions(+), 129 deletions(-) delete mode 100644 src/j_pivot.h delete mode 100644 src/j_query.h diff --git a/R/json.R b/R/json.R index 7208cce..73501e6 100644 --- a/R/json.R +++ b/R/json.R @@ -14,6 +14,8 @@ json_query <- if (any(c("file", "url") %in% data_type)) data <- readLines(data, warn = FALSE) data <- .as_json_string(data, ..., data_type = data_type[[1]]) + if (identical(data_type, "R")) + data_type <- "json" data_type <- head(data_type, 1L) ex <- cpp_r_json_init(object_names, path, as, data_type, path_type) @@ -27,6 +29,8 @@ json_pivot <- if (any(c("file", "url") %in% data_type)) data <- readLines(data, warn = FALSE) data <- .as_json_string(data, ..., data_type = data_type[[1]]) + if (identical(data_type, "R")) + data_type <- "json" as0 <- ifelse(identical(as, "string"), as, "R") ex <- cpp_r_json_init(object_names, path, as0, data_type, path_type) diff --git a/src/j_as.h b/src/j_as.h index 136c448..90d21f2 100644 --- a/src/j_as.h +++ b/src/j_as.h @@ -214,15 +214,20 @@ sexp as_r(const Json j) // json to R template -cpp11::sexp j_as(Json j, std::string as) +cpp11::sexp j_as(Json j, rjsoncons::as as) { - switch(enum_index(as_map, as)) { + switch(as) { case as::string: return as_sexp( j.template as() ); case as::R: return as_r(j); - default: cpp11::stop("unknown `as = '" + as + "'`"); } } +template +cpp11::sexp j_as(Json j, std::string as) +{ + j_as(j, enum_index(as_map, as)); +} + template sexp as_r_impl(const std::string data) { diff --git a/src/j_pivot.h b/src/j_pivot.h deleted file mode 100644 index 74cacd3..0000000 --- a/src/j_pivot.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef RJSONCONS_JSONPIVOT_H -#define RJSONCONS_JSONPIVOT_H - -#include -#include - -template -std::vector object_all_keys(const Json j) -{ - // 'keys' returns keys in the order they are discoverd, 'seen' is - // used as a filter to only insert unseen keys - std::vector keys; - std::unordered_set seen; - - // visit each element in the array... - for (const auto& elt : j.array_range()) { - // if it's an object... - if (elt.type() != json_type::object_value) - continue; - // ...collect member (key) names that have not yet been seen - for (const auto& member : elt.object_range()) - if (seen.insert(member.key()).second) - keys.push_back(member.key()); - } - - return keys; -} - -template -Json pivot_array_as_object(const Json j) -{ - Json object(json_object_arg); - std::vector keys = object_all_keys(j); - - // initialize - for (const auto& key : keys) - object[key] = Json(json_array_arg); - - // pivot - for (const auto& elt : j.array_range()) { - for (const auto& key : keys) { - // non-object values or missing elements are assigned 'null' - Json value = Json::null(); - if (elt.type() == json_type::object_value) - value = elt.at_or_null(key); - object[key].push_back(value); - } - } - - return object; -} - -template -Json j_pivot(const Json j) -{ - Json value; - - switch(j.type()) { - case json_type::null_value: { - value = j; - break; - } - case json_type::object_value: { - // optimistically assuming that this is already an object-of-arrays - value = j; - break; - } - case json_type::array_value: { - value = pivot_array_as_object(j); - break; - } - default: cpp11::stop("`j_pivot()` 'path' must yield an object or array"); - }; - - // a Json object-of-arrays - return value; -} - -#endif diff --git a/src/j_query.h b/src/j_query.h deleted file mode 100644 index 3c77a52..0000000 --- a/src/j_query.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef RJSONCONS_J_QUERY_H -#define RJSONCONS_J_QUERY_H - -#include -#include -#include -#include - -template -Json j_query(Json j, const std::string path, const std::string path_type) -{ - // evaluate path - switch(enum_index(path_type_map, path_type)) { - case path_type::JSONpointer: return jsonpointer::get(j, path); - case path_type::JSONpath: return jsonpath::json_query(j, path); - case path_type::JMESpath: return jmespath::search(j, path); - default: cpp11::stop("unknown `path_type` = '" + path_type + "'"); - } -} - -#endif diff --git a/src/r_json.h b/src/r_json.h index 63d361a..186e4bc 100644 --- a/src/r_json.h +++ b/src/r_json.h @@ -2,42 +2,102 @@ #define RJSONCONS_R_JSON_HPP #include +#include +#include +#include #include #include "utilities.h" #include "j_as.h" -#include "j_query.h" using namespace cpp11; using namespace jsoncons; +using namespace rjsoncons; template class r_json { // FIXME: as_, data_type_, path_type_ should be enums - const std::string path_, - as_, // string, R - data_type_, // json, ndjson - path_type_; // JSONpointer, JSONpath, JMESpath + rjsoncons::data_type data_type_; + rjsoncons::path_type path_type_; + rjsoncons::as as_; std::vector result_; + // only one of the following will be valid per instance + jmespath::jmespath_expression jmespath_; + jsonpath::jsonpath_expression jsonpath_; + const std::string jsonpointer_; -public: - r_json() noexcept = default; + // pivot implementation - r_json(std::string path, std::string as, std::string data_type, - std::string path_type) - : path_(path), as_(as), data_type_(data_type), path_type_(path_type) - {}; + std::vector all_keys(const Json j) + { + // 'keys' returns keys in the order they are discoverd, 'seen' is + // used as a filter to only insert unseen keys + std::vector keys; + std::unordered_set seen; + + // visit each element in the array... + for (const auto& elt : j.array_range()) { + // if it's an object... + if (elt.type() != json_type::object_value) + continue; + // ...collect member (key) names that have not yet been seen + for (const auto& member : elt.object_range()) + if (seen.insert(member.key()).second) + keys.push_back(member.key()); + } - void query(const std::vector data) + return keys; + } + + Json pivot_array_as_object(const Json j) { - result_.reserve(result_.size() + data.size()); - std::transform( - data.begin(), data.end(), std::back_inserter(result_), - [&](const std::string datum) { - Json j = Json::parse(datum); - return j_query(j, path_, path_type_); - }); + Json object(json_object_arg); + std::vector keys = all_keys(j); + + // initialize + for (const auto& key : keys) + object[key] = Json(json_array_arg); + + // pivot + for (const auto& elt : j.array_range()) { + for (const auto& key : keys) { + // non-object values or missing elements are assigned 'null' + Json value = Json::null(); + if (elt.type() == json_type::object_value) + value = elt.at_or_null(key); + object[key].push_back(value); + } + } + + return object; + } + + Json pivot(const Json j) + { + Json value; + + switch(j.type()) { + case json_type::null_value: { + value = j; + break; + } + case json_type::object_value: { + // optimistically assuming that this is already an + // object-of-arrays + value = j; + break; + } + case json_type::array_value: { + value = pivot_array_as_object(j); + break; + } + default: + cpp11::stop("`j_pivot()` 'path' must yield an object or array"); + }; + + // a Json object-of-arrays + return value; } void pivot_append_result(Json j) @@ -78,19 +138,66 @@ class r_json } } +public: + r_json() noexcept = default; + + r_json(std::string path, std::string as, std::string data_type, + std::string path_type) + : data_type_(enum_index(data_type_map, data_type)), + path_type_(enum_index(path_type_map, path_type)), + as_(enum_index(as_map, as)), + // only one 'path' is used; initialize others to a default + jmespath_( + path_type_ == path_type::JMESpath ? + jmespath::make_expression(path) : + jmespath::make_expression("@")), + jsonpath_( + path_type_ == path_type::JSONpath ? + jsonpath::make_expression(path) : + jsonpath::make_expression("$")), + jsonpointer_(path_type_ == path_type::JSONpointer ? path : "/") + {} + + // query + + Json query(Json j) + { + switch(path_type_) { + case path_type::JSONpointer: + return jsonpointer::get(j, jsonpointer_); + case path_type::JSONpath: return jsonpath_.evaluate(j); + case path_type::JMESpath: return jmespath_.evaluate(j); + } + } + + void query(const std::vector data) + { + result_.reserve(result_.size() + data.size()); + std::transform( + data.begin(), data.end(), std::back_inserter(result_), + [&](const std::string datum) { + Json j = Json::parse(datum); + return query(j); + }); + } + + // pivot + void pivot(const std::vector data) { // collect queries across all data for (const auto& datum: data) { Json j = Json::parse(datum); // query and pivot - Json q = j_query(j, path_, path_type_); - Json p = j_pivot(q); + Json q = query(j); + Json p = pivot(q); // append to result pivot_append_result(p); } } + // as + cpp11::sexp as() const { cpp11::writable::list result(result_.size()); @@ -99,9 +206,8 @@ class r_json [&](Json j) { return j_as(j, as_); }); // FIXME: should be able to create cpp11::strings directly - return as_ == "string" ? - package("base")["unlist"](result) : - result; + return + as_ == as::string ? package("base")["unlist"](result) : result; } }; diff --git a/src/rjsoncons.cpp b/src/rjsoncons.cpp index 483641f..98b9f0e 100644 --- a/src/rjsoncons.cpp +++ b/src/rjsoncons.cpp @@ -5,8 +5,6 @@ #include "utilities.h" #include "j_as.h" -#include "j_query.h" -#include "j_pivot.h" #include "r_json.h" using namespace jsoncons; // for convenience diff --git a/src/utilities.h b/src/utilities.h index 3633fd8..82084d4 100644 --- a/src/utilities.h +++ b/src/utilities.h @@ -7,10 +7,15 @@ namespace rjsoncons { // enums + enum data_type { json_data_type, ndjson_data_type }; enum object_names { asis, sort }; enum as { string, R }; enum path_type { JSONpointer, JSONpath, JMESpath }; + static std::map data_type_map { + {"json", json_data_type}, {"ndjson", ndjson_data_type} + }; + static std::map object_names_map { {"asis", asis}, {"sort", sort} }; From c6c82dbb0051bf05c36ee2dd662e6858380d779a Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Tue, 23 Jan 2024 20:46:23 -0500 Subject: [PATCH 3/5] simplify pivot to tibble - more robust that `"data.frame"` --- R/ndjson.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/ndjson.R b/R/ndjson.R index a20e369..3a98411 100644 --- a/R/ndjson.R +++ b/R/ndjson.R @@ -122,8 +122,6 @@ ndjson_pivot <- string = result, R = result, data.frame = as.data.frame(result), - tibble = - as.data.frame(result) |> - tibble::as_tibble() + tibble = tibble::as_tibble(result) ) } From 6eb454fa61c14aa35cb0c511c3be68c5299ed89c Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Thu, 25 Jan 2024 06:18:49 -0500 Subject: [PATCH 4/5] improve performance of connection code - read as binary (avoiding R characters) & parse in raw_buffer() --- R/cpp11.R | 8 +++++++ R/ndjson.R | 34 ++++++++++++++++++------------ src/cpp11.cpp | 28 +++++++++++++++++++------ src/raw_buffer.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++ src/rjsoncons.cpp | 38 ++++++++++++++++++++++++++++++++- 5 files changed, 141 insertions(+), 20 deletions(-) create mode 100644 src/raw_buffer.h diff --git a/R/cpp11.R b/R/cpp11.R index 1599a72..268e6ed 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -20,6 +20,14 @@ cpp_r_json_pivot <- function(ext, data, object_names) { invisible(.Call(`_rjsoncons_cpp_r_json_pivot`, ext, data, object_names)) } +cpp_r_json_query_raw <- function(ext, prefix, bin, n_records, object_names) { + .Call(`_rjsoncons_cpp_r_json_query_raw`, ext, prefix, bin, n_records, object_names) +} + +cpp_r_json_pivot_raw <- function(ext, prefix, bin, n_records, object_names) { + .Call(`_rjsoncons_cpp_r_json_pivot_raw`, ext, prefix, bin, n_records, object_names) +} + cpp_r_json_finish <- function(ext, object_names) { .Call(`_rjsoncons_cpp_r_json_finish`, ext, object_names) } diff --git a/R/ndjson.R b/R/ndjson.R index 3a98411..f68368f 100644 --- a/R/ndjson.R +++ b/R/ndjson.R @@ -27,23 +27,25 @@ ndjson_connection <- on.exit(close(fl)) } data_type <- head(data_type, 1L) - chunk_size <- 1024L * 8L - + chunk_size <- as.integer(2^20) # 1 Mb chunks ex <- cpp_r_json_init(object_names, path, as, data_type, path_type) - i <- lines <- 0L + n_lines <- 0L + prefix <- raw() if (verbose) - cli::cli_progress_message("{lines} ndjson records processed") + cli::cli_progress_message("{n_lines} records processed") repeat { - chunk_size <- min(chunk_size, n_records) - ndjson <- readLines(fl, chunk_size) - if (!length(ndjson)) + if (n_records <= 0L) break - i <- i + 1L - lines <- lines + length(ndjson) - n_records <- max(n_records - chunk_size, 0L) if (verbose) cli::cli_progress_update() - cpp_function(ex, ndjson, object_names) + + bin <- readBin(fl, raw(), chunk_size) + if (!length(bin)) + break + result <- cpp_function(ex, prefix, bin, n_records, object_names) + prefix <- result$prefix + n_lines <- n_lines + result$n_lines + n_records <- n_records - result$n_lines } if (verbose) cli::cli_progress_done() @@ -64,14 +66,17 @@ ndjson_query <- .is_scalar_logical(verbose) ) + n_records <- as.integer(min(n_records, .Machine$integer.max)) if (.is_j_data_type_connection(data_type)) { r_function <- ndjson_connection + cpp_function <- cpp_r_json_query_raw } else { r_function <- ndjson_character + cpp_function <- cpp_r_json_query } r_function( - cpp_r_json_query, + cpp_function, data, path, object_names, as, n_records, verbose, path_type, data_type ) @@ -90,15 +95,18 @@ ndjson_pivot <- .is_scalar_logical(verbose) ) + n_records <- as.integer(min(n_records, .Machine$integer.max)) if (.is_j_data_type_connection(data_type)) { r_function <- ndjson_connection + cpp_function <- cpp_r_json_pivot_raw } else { r_function <- ndjson_character + cpp_function <- cpp_r_json_pivot } as0 <- ifelse(identical(as, "string"), "string", "R") pivot <- r_function( - cpp_r_json_pivot, + cpp_function, data, path, object_names, as0, n_records, verbose, path_type, data_type ) diff --git a/src/cpp11.cpp b/src/cpp11.cpp index b5ae648..9ed868b 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -43,6 +43,20 @@ extern "C" SEXP _rjsoncons_cpp_r_json_pivot(SEXP ext, SEXP data, SEXP object_nam END_CPP11 } // rjsoncons.cpp +cpp11::list cpp_r_json_query_raw(sexp ext, raws prefix, raws bin, int n_records, const std::string object_names); +extern "C" SEXP _rjsoncons_cpp_r_json_query_raw(SEXP ext, SEXP prefix, SEXP bin, SEXP n_records, SEXP object_names) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_r_json_query_raw(cpp11::as_cpp>(ext), cpp11::as_cpp>(prefix), cpp11::as_cpp>(bin), cpp11::as_cpp>(n_records), cpp11::as_cpp>(object_names))); + END_CPP11 +} +// rjsoncons.cpp +cpp11::list cpp_r_json_pivot_raw(sexp ext, const raws prefix, const raws bin, int n_records, const std::string object_names); +extern "C" SEXP _rjsoncons_cpp_r_json_pivot_raw(SEXP ext, SEXP prefix, SEXP bin, SEXP n_records, SEXP object_names) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_r_json_pivot_raw(cpp11::as_cpp>(ext), cpp11::as_cpp>(prefix), cpp11::as_cpp>(bin), cpp11::as_cpp>(n_records), cpp11::as_cpp>(object_names))); + END_CPP11 +} +// rjsoncons.cpp cpp11::sexp cpp_r_json_finish(sexp ext, const std::string object_names); extern "C" SEXP _rjsoncons_cpp_r_json_finish(SEXP ext, SEXP object_names) { BEGIN_CPP11 @@ -52,12 +66,14 @@ extern "C" SEXP _rjsoncons_cpp_r_json_finish(SEXP ext, SEXP object_names) { extern "C" { static const R_CallMethodDef CallEntries[] = { - {"_rjsoncons_cpp_as_r", (DL_FUNC) &_rjsoncons_cpp_as_r, 2}, - {"_rjsoncons_cpp_r_json_finish", (DL_FUNC) &_rjsoncons_cpp_r_json_finish, 2}, - {"_rjsoncons_cpp_r_json_init", (DL_FUNC) &_rjsoncons_cpp_r_json_init, 5}, - {"_rjsoncons_cpp_r_json_pivot", (DL_FUNC) &_rjsoncons_cpp_r_json_pivot, 3}, - {"_rjsoncons_cpp_r_json_query", (DL_FUNC) &_rjsoncons_cpp_r_json_query, 3}, - {"_rjsoncons_cpp_version", (DL_FUNC) &_rjsoncons_cpp_version, 0}, + {"_rjsoncons_cpp_as_r", (DL_FUNC) &_rjsoncons_cpp_as_r, 2}, + {"_rjsoncons_cpp_r_json_finish", (DL_FUNC) &_rjsoncons_cpp_r_json_finish, 2}, + {"_rjsoncons_cpp_r_json_init", (DL_FUNC) &_rjsoncons_cpp_r_json_init, 5}, + {"_rjsoncons_cpp_r_json_pivot", (DL_FUNC) &_rjsoncons_cpp_r_json_pivot, 3}, + {"_rjsoncons_cpp_r_json_pivot_raw", (DL_FUNC) &_rjsoncons_cpp_r_json_pivot_raw, 5}, + {"_rjsoncons_cpp_r_json_query", (DL_FUNC) &_rjsoncons_cpp_r_json_query, 3}, + {"_rjsoncons_cpp_r_json_query_raw", (DL_FUNC) &_rjsoncons_cpp_r_json_query_raw, 5}, + {"_rjsoncons_cpp_version", (DL_FUNC) &_rjsoncons_cpp_version, 0}, {NULL, NULL, 0} }; } diff --git a/src/raw_buffer.h b/src/raw_buffer.h new file mode 100644 index 0000000..f48fee8 --- /dev/null +++ b/src/raw_buffer.h @@ -0,0 +1,53 @@ +#ifndef RJSONCONS_RAW_BUFFER_H +#define RJSONCONS_RAW_BUFFER_H + +#include +#include + +namespace rjsoncons { + + // Parse an R 'raw' vector into a std::vector based + // on newline delimiters '\n'. The raw vector may contain partial + // records, so remember the 'remainder' to prefix to the next + // instance. + class raw_buffer { + int n_records_; + std::vector raw_; + std::vector::iterator buf; + + public: + raw_buffer(const raws prefix, const raws bin, int n_records) + : n_records_(n_records) + { + raw_.reserve(prefix.size() + bin.size()); + // copy prefix + bin to raw_ + std::copy(prefix.begin(), prefix.end(), std::back_inserter(raw_)); + std::copy(bin.begin(), bin.end(), std::back_inserter(raw_)); + } + + std::vector to_strings() + { + std::vector result; + buf = raw_.begin(); + do { + auto eol = std::find(buf, raw_.end(), (uint8_t) '\n'); + if (eol == raw_.end()) // not found + break; + result.push_back(std::string(buf, eol)); + buf = eol + 1; + n_records_ -= 1; + } while ((n_records_ > 0) && (buf != raw_.end())); + + return result; + } + + cpp11::raws remainder() + { + cpp11::writable::raws raw(raw_.end() - buf); + std::copy(buf, raw_.end(), raw.begin()); + return raw; + } + }; +} + +#endif diff --git a/src/rjsoncons.cpp b/src/rjsoncons.cpp index 98b9f0e..dbde93c 100644 --- a/src/rjsoncons.cpp +++ b/src/rjsoncons.cpp @@ -4,10 +4,12 @@ #include #include "utilities.h" +#include "raw_buffer.h" #include "j_as.h" #include "r_json.h" -using namespace jsoncons; // for convenience +using namespace jsoncons; // convenience +using namespace cpp11::literals; // _nm [[cpp11::register]] std::string cpp_version() @@ -77,6 +79,40 @@ void cpp_r_json_pivot( } } +// 'raw' versions of query and pivot + +[[cpp11::register]] +cpp11::list cpp_r_json_query_raw( + sexp ext, + raws prefix, raws bin, int n_records, + const std::string object_names) +{ + rjsoncons::raw_buffer buffer(prefix, bin, n_records); + const std::vector data = buffer.to_strings(); + cpp_r_json_query(ext, data, object_names); + + return cpp11::list({ + "prefix"_nm = buffer.remainder(), + "n_lines"_nm = data.size() + }); +} + +[[cpp11::register]] +cpp11::list cpp_r_json_pivot_raw( + sexp ext, + const raws prefix, const raws bin, int n_records, + const std::string object_names) +{ + rjsoncons::raw_buffer buffer(prefix, bin, n_records); + const std::vector data = buffer.to_strings(); + cpp_r_json_pivot(ext, data, object_names); + + return cpp11::list({ + "prefix"_nm = buffer.remainder(), + "n_lines"_nm = data.size() + }); +} + [[cpp11::register]] cpp11::sexp cpp_r_json_finish(sexp ext, const std::string object_names) { From 9b2c28d0c11a826d69be9bc538d2803b63860a2a Mon Sep 17 00:00:00 2001 From: Martin Morgan Date: Thu, 25 Jan 2024 06:33:00 -0500 Subject: [PATCH 5/5] version bump --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 322f1ee..d36e55c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: rjsoncons Title: 'C++' Header-Only 'jsoncons' Library for 'JSON' Queries -Version: 1.1.0.9400 +Version: 1.1.0.9401 Authors@R: c( person( "Martin", "Morgan", role = c("aut", "cre"),