Skip to content

Commit

Permalink
Alternative approach to simplifying load_survey() and fixing out-of…
Browse files Browse the repository at this point in the history
…-memory issues (#71) (#75)

Co-authored-by: Sebastian Funk <[email protected]>
Co-authored-by: Hugo Gruson <[email protected]>
  • Loading branch information
sbfnk and Bisaloo authored Aug 14, 2023
1 parent 29dc178 commit f06b870
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 44 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
* [Cyclomatic complexity](https://en.wikipedia.org/wiki/Cyclomatic_complexity) of `download_survey()` has been reduced by externalising the `find_common_prefix()` function and failing early instead of relying on unnecessary if/else sequences
* More generous filename checks now pass files named e.g. "..._participants_common..." an not only "...participant_common..."
* The package now sets a custom user agent when downloading survey data (#82).
* A problem was fixed where attempted joins of files could lead to blowing up memeory use (#75).

# socialmixr 0.2.0

Expand Down
2 changes: 2 additions & 0 deletions R/globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ utils::globalVariables(c(
"identifier.1", # <list_surveys>
"title", # <list_surveys>
"creator", # <list_surveys>
"..main_id", # <load_survey>
"..merge_id", # <load_survey>
"..original.lower.age.limit", # <pop_age>
"..segment", # <pop_age>
"..upper.age.limit", # <pop_age>
Expand Down
80 changes: 36 additions & 44 deletions R/load_survey.r
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#' @return a survey in the correct format
#' @export
load_survey <- function(files, ...) {
exist <- vapply(files, file.exists, TRUE)
exist <- file.exists(files)
missing <- files[!exist]
if (length(missing) > 0) {
stop(
Expand All @@ -23,15 +23,14 @@ load_survey <- function(files, ...) {
)
}
survey_files <- grep("csv$", files, value = TRUE) # select csv files
reference_file <- grep("json$", files, value = TRUE) # select csv files
reference_file <- grep("json$", files, value = TRUE) # select json file
reference <- fromJSON(reference_file)

contact_data <- lapply(survey_files, fread)
names(contact_data) <- survey_files

main_types <- c("participant", "contact")
main_surveys <- list()
main_file <- NULL

## first, get the common files
for (type in main_types) {
Expand All @@ -43,6 +42,7 @@ load_survey <- function(files, ...) {
)
}
main_surveys[[type]] <- rbindlist(contact_data[main_file], fill = TRUE)
main_surveys[[type]] <- main_surveys[[type]][, ..main_id := seq_len(.N)]
survey_files <- setdiff(survey_files, main_file)
}

Expand Down Expand Up @@ -72,60 +72,51 @@ load_survey <- function(files, ...) {
can_merge <- vapply(survey_files, function(x) {
length(intersect(colnames(contact_data[[x]]), colnames(main_surveys[[type]]))) > 0
}, TRUE)
merge_files <- names(can_merge[which(can_merge)])
merge_files <- names(can_merge[can_merge])
while (length(merge_files) > 0) {

merged_files <- NULL
for (file in merge_files) {
contact_data[[file]] <-
contact_data[[file]][, ..merge_id := seq_len(.N)]
common_id <- intersect(colnames(contact_data[[file]]), colnames(main_surveys[[type]]))

## try a merge to see whether it can be done uniquely
test_merge <- merge(
main_surveys[[type]], contact_data[[file]],
by = common_id,
all.x = TRUE, allow.cartesian = TRUE
)

if (nrow(test_merge) == nrow(main_surveys[[type]])) {
## check if all IDs can be merged in
unique_main_survey_ids <-
unique(main_surveys[[type]][, common_id, with = FALSE])
unique_additional_survey_ids <-
unique(contact_data[[file]][, common_id, with = FALSE])

id_overlap <- merge(
unique_main_survey_ids, unique_additional_survey_ids,
by = common_id
merged <- tryCatch({
merge(
main_surveys[[type]], contact_data[[file]], by = common_id,
all.x = TRUE
)

if (nrow(id_overlap) < nrow(unique_main_survey_ids)) {
}, error = function(cond) {
if (!grepl("cartesian", cond$message, fixed = TRUE)) {
stop(cond$message)
}
NULL
})

## first if merge was unique - if not we're ditching the merge
if (!is.null(merged) &&
anyDuplicated(merged[, "..main_id", with = FALSE]) == 0) {
## we're keeping the merge; now check for any warnings to issue
matched_main <- sum(!is.na(merged[["..merge_id"]]))
unmatched_main <- nrow(merged) - matched_main
if (unmatched_main > 0) {
warning(
ifelse(nrow(id_overlap) == 0, "No matching value",
paste0(
"Only ", nrow(id_overlap), " matching value",
ifelse(nrow(id_overlap) > 1, "s", "")
)
), " in ",
"Only ", matched_main, " matching value",
ifelse(matched_main > 1, "s", ""), " in ",
paste0("'", common_id, "'", collapse = ", "),
" column", ifelse(length(common_id) > 1, "s", ""),
" when pulling ", basename(file), " into '", type, "' survey."
)
}

id_overlap_y <- merge(
unique_main_survey_ids, unique_additional_survey_ids,
by = common_id,
all.y = TRUE
)
if (nrow(id_overlap_y) > nrow(unique_main_survey_ids)) {
unmatched_merge <- nrow(contact_data[[file]]) - matched_main
if (unmatched_merge > 0) {
warning(
nrow(id_overlap_y) - nrow(unique_main_survey_ids),
" row(s) could not be matched",
" when pulling ", basename(file), " into '", type, "' survey."
)
unmatched_merge, " row(s) could not be matched when pulling ",
basename(file), " into '", type, "' survey.")
}

main_surveys[[type]] <- test_merge
main_surveys[[type]] <- merged[, !"..merge_id"]
merged_files <- c(merged_files, file)
} else {
anyDuplicated(merged[, "..main_id", with = FALSE])
}
}
survey_files <- setdiff(survey_files, merged_files)
Expand All @@ -135,9 +126,10 @@ load_survey <- function(files, ...) {
if (is.null(merged_files)) {
merge_files <- NULL
} else {
merge_files <- names(can_merge[which(can_merge)])
merge_files <- names(can_merge[can_merge])
}
}
main_surveys[[type]] <- main_surveys[[type]][, ..main_id := NULL]
}

if (length(survey_files) > 0) {
Expand Down

0 comments on commit f06b870

Please sign in to comment.