Skip to content

Commit

Permalink
Use a simpler version to read bson data
Browse files Browse the repository at this point in the history
  • Loading branch information
jeroen committed Dec 2, 2024
1 parent 7b669e4 commit aa2fb37
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 77 deletions.
3 changes: 1 addition & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(jsonlite,validate)
useDynLib(mongolite,R_bigint_as_char)
useDynLib(mongolite,R_bson_reader_new)
useDynLib(mongolite,R_bson_reader_read)
useDynLib(mongolite,R_bson_reader_file)
useDynLib(mongolite,R_bson_to_json)
useDynLib(mongolite,R_bson_to_list)
useDynLib(mongolite,R_bson_to_raw)
Expand Down
54 changes: 17 additions & 37 deletions R/reader.R
Original file line number Diff line number Diff line change
@@ -1,44 +1,24 @@
#' Standalone BSON reader
#'
#' Utility to parse BSON data into R without using MongoDB. Useful to read data
#' from a `mongoexport` dump without mongodb if it fits in memory.
#' Utility to read BSON data into R without MongoDB. Useful to read a
#' `mongoexport` dump if it fits in memory. This utility does not attempt
#' to convert data into a data.frame: the output is a vector with length equal
#' to the number of documents in the collection.
#'
#' To import a bson dump into a local mongodb server, use the [mongo$import][mongo]
#' function instead. This requires less memory and once data is in mongo you can
#' easily query it.
#' Alternatively, to import a bson dump into your local mongodb server, use the
#' [mongo$import][mongo] function instead. This requires little memory and once
#' data is in mongo you can easily query it.
#'
#' @export
#' @useDynLib mongolite R_bson_reader_new R_bson_reader_read
#' @param con either a path to a file, a url, or a connection object
#' @param as_json return data as json strings instead of R lists
#' @param verbose print some output as we read
#' @return list with either data objects or json strings
#' @useDynLib mongolite R_bson_reader_file
#' @param file path to a bson file on disk
#' @param as_json read data into json strings instead of R lists.
#' @param verbose print some progress output while reading
#' @examples
#' diamonds <- read_bson("http://jeroen.github.io/data/diamonds.bson")
read_bson <- function(con, as_json = FALSE, verbose = interactive()){
if(length(con) && is.character(con)){
con <- if(grepl("^https?://", con)){
url(con)
} else {
file(normalizePath(con, mustWork = TRUE), raw = TRUE)
}
}
stopifnot(inherits(con, 'connection'))
open(con, 'rb')
on.exit(close(con))
reader <- .Call(R_bson_reader_new, con)
output <- new.env(parent = emptyenv())
i <- 0
one <- function(as_json = FALSE){
.Call(R_bson_reader_read, reader, as_json)
}
while(length(obj <- one(as_json))){
i <- i+1
if(isTRUE(verbose))
cat("\rRead", i, file = stderr())
output[[sprintf('%09d', i)]] <- obj
}
if(isTRUE(verbose))
cat("\rDone!\n", file = stderr())
unname(as.list(output, sorted = TRUE))
#' download.file("http://jeroen.github.io/data/diamonds.bson", "diamonds.bson")
#' diamonds <- read_bson("diamonds.bson")
#' unlink("diamonds.bson")
read_bson <- function(file, as_json = FALSE, verbose = interactive()){
  # The native reader only looks at the first element of 'file', so reject
  # anything that is not exactly one non-missing string instead of silently
  # ignoring the remaining paths.
  if(!is.character(file) || length(file) != 1L || is.na(file))
    stop("Argument 'file' must be a single path to a bson file", call. = FALSE)

  # Collapse the flags to a strict TRUE/FALSE: an NA passed down to C would be
  # converted to a C 'bool' and end up being treated as TRUE.
  as_json <- isTRUE(as.logical(as_json))
  verbose <- isTRUE(as.logical(verbose))

  # Fails with an R error if the file does not exist.
  file <- normalizePath(file, mustWork = TRUE)

  # Returns a list of documents, or a character vector of json strings when
  # 'as_json' is TRUE; one element per document in the file.
  .Call(R_bson_reader_file, file, as_json, verbose)
}
27 changes: 14 additions & 13 deletions man/read_bson.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 39 additions & 25 deletions src/reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,31 +72,45 @@ SEXP R_mongo_restore(SEXP con, SEXP ptr_col, SEXP verb) {
return Rf_ScalarInteger(count);
}

/*
 * Finalizer for the external pointer created by R_bson_reader_new().
 * Destroys the wrapped bson reader (if it has not been destroyed already),
 * then drops the protected value and clears the address so that a second
 * invocation is a no-op.
 */
static void fin_bson_reader(SEXP ptr){
  bson_reader_t *reader = R_ExternalPtrAddr(ptr);
  if(reader == NULL)
    return;
  bson_reader_destroy(reader);
  R_SetExternalPtrProtected(ptr, R_NilValue);
  R_ClearExternalPtr(ptr);
}

/*
 * Create a bson reader that pulls its bytes from 'con' through the
 * bson_reader_feed / bson_reader_finalize callbacks, and hand it back to R
 * as an external pointer of class "bson_reader".
 *
 * 'con' is stored as the protected value of the external pointer, and
 * fin_bson_reader is registered as its finalizer (also run on exit).
 */
SEXP R_bson_reader_new(SEXP con) {
  bson_reader_t *handle = bson_reader_new_from_handle(con, bson_reader_feed, bson_reader_finalize);
  SEXP extptr = PROTECT(R_MakeExternalPtr(handle, R_NilValue, con));
  /* Register the finalizer before allocating anything else */
  R_RegisterCFinalizerEx(extptr, fin_bson_reader, 1);
  SEXP classname = PROTECT(Rf_mkString("bson_reader"));
  Rf_setAttrib(extptr, R_ClassSymbol, classname);
  UNPROTECT(2);
  return extptr;
}

/*
 * Read the next document from a reader held in the external pointer 'ptr'.
 *
 * as_json: logical; when true the document is converted with bson_to_str(),
 *          otherwise with bson2list().
 *
 * Returns R_NilValue once the reader reports end-of-file. Raises an R error
 * if the pointer has been cleared or if a document cannot be read.
 */
SEXP R_bson_reader_read(SEXP ptr, SEXP as_json){
  bson_reader_t *reader = R_ExternalPtrAddr(ptr);
  if(!reader)
    Rf_error("This reader has been destroyed.");
  bool reached_eof = 0;
  const bson_t *doc = bson_reader_read (reader, &reached_eof);
  if(reached_eof)
    return R_NilValue;
  if(doc == NULL)
    Rf_error("Failed to read all documents");
  return Rf_asLogical(as_json) ? bson_to_str(doc) : bson2list(doc);
}

/*
 * Release the reader held by the guard pointer used in R_bson_reader_file().
 * Called explicitly on the normal path, and by the garbage collector when an
 * R error unwinds the stack before the explicit call was reached.
 */
static void fin_bson_file_reader(SEXP ptr){
  bson_reader_t *reader = R_ExternalPtrAddr(ptr);
  if(reader == NULL)
    return;
  bson_reader_destroy(reader);
  R_ClearExternalPtr(ptr);
}

/*
 * Read every document of a bson file into a single R vector.
 *
 * path:    character vector of length one with the file to read
 * as_json: logical; if true return a character vector of relaxed extended
 *          json strings, otherwise a list with one bson2list() result per
 *          document
 * verbose: logical; if true print progress to stderr
 *
 * The file is read twice: a first pass counts the documents so the output
 * can be allocated in one go, a second pass converts them.
 *
 * Raises an R error if the file cannot be opened, if a document cannot be
 * read, or if a document cannot be converted to json.
 */
SEXP R_bson_reader_file(SEXP path, SEXP as_json, SEXP verbose){
  if(!Rf_isString(path) || Rf_length(path) != 1 || STRING_ELT(path, 0) == NA_STRING)
    Rf_error("Argument 'path' must be a single file path");
  const char *filename = CHAR(STRING_ELT(path, 0));

  /* Rf_asLogical() may return NA_LOGICAL; compare explicitly so NA is false */
  bool json_output = Rf_asLogical(as_json) == TRUE;
  bool progress = Rf_asLogical(verbose) == TRUE;
  bson_error_t err = {0};

  /* Pass 1: count documents. No R API calls inside the loop, so the reader
   * can be destroyed by hand before any error is raised. */
  bson_reader_t *reader = bson_reader_new_from_file(filename, &err);
  if(!reader)
    Rf_error("Error opening file: %s", err.message);
  bool reached_eof = 0;
  bool read_failed = 0;
  size_t len = 0;
  while(1){
    const bson_t *doc = bson_reader_read (reader, &reached_eof);
    if(reached_eof)
      break;
    if(doc == NULL){
      read_failed = 1;
      break;
    }
    len++;
  }
  bson_reader_destroy(reader);
  if(read_failed)
    Rf_error("Failed to read all documents");

  /* Pass 2: allocate the output first, so a failed allocation cannot leak an
   * open reader, then re-open the file. */
  SEXP out = PROTECT(Rf_allocVector(json_output ? STRSXP : VECSXP, (R_xlen_t) len));
  reader = bson_reader_new_from_file(filename, &err);
  if(!reader)
    Rf_error("Error opening file: %s", err.message);

  /* Conversions below can raise R errors (longjmp). Tie the reader to an
   * external pointer with a finalizer so it still gets destroyed then. */
  SEXP guard = PROTECT(R_MakeExternalPtr(reader, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(guard, fin_bson_file_reader, TRUE);

  reached_eof = 0;
  for(size_t i = 0; i < len; i++){
    const bson_t *doc = bson_reader_read (reader, &reached_eof);
    if(reached_eof || doc == NULL){
      fin_bson_file_reader(guard);
      Rf_error("Failed to read all documents");
    }
    if(json_output){
      size_t jsonlength = 0;
      char *str = bson_as_relaxed_extended_json(doc, &jsonlength);
      /* The conversion can fail, and Rf_mkCharLenCE() takes an int length */
      if(str == NULL || jsonlength > INT_MAX){
        bson_free(str);
        fin_bson_file_reader(guard);
        Rf_error("Failed to convert document %zu to JSON", i + 1);
      }
      SET_STRING_ELT(out, i, Rf_mkCharLenCE(str, (int) jsonlength, CE_UTF8));
      bson_free(str);
    } else {
      SET_VECTOR_ELT(out, i, bson2list(doc));
    }
    if(progress && (i % 50 == 0))
      REprintf("\rReading %zu of %zu...", i, len);
  }
  if(progress)
    REprintf("\rDone reading %zu documents\n", len);

  /* Close the file now rather than waiting for the garbage collector */
  fin_bson_file_reader(guard);
  UNPROTECT(2);
  return out;
}

0 comments on commit aa2fb37

Please sign in to comment.