From 62abebfcccc3a7ddcb073f0fe1affa2614c12efc Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 4 Dec 2023 11:56:27 -0800 Subject: [PATCH] Handle empty input/output in deduplicate-kp2023 and match-kp2023 Handle empty input in deduplicate-kp2023. If no unmatched records are found by match-kp2023, do not write to the unmatched manifest. --- lib/seattleflu/id3c/cli/command/clinical.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index 83d35ef0..d9e6a2e5 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ b/lib/seattleflu/id3c/cli/command/clinical.py @@ -1268,7 +1268,7 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename records to the matched file. Removes any matches from before writing it to . - does not have to be an existing file, + does not have to be an existing file, but a filename must be provided. If the file does not exist, the newly matched records will be output to stdout without consolidating with previously matched records. @@ -1345,7 +1345,8 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename matched_clinical_records = pd.concat([matched_clinical_records, newly_matched_clinical_records]).reset_index(drop=True) LOG.info(f"A total of {len(matched_clinical_records)} records are matched to LIMS data with {len(unmatched_clinical_records)} still unmatched.") - unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True) + if not unmatched_clinical_records.empty: + unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True) if not matched_clinical_records.empty: dump_ndjson(matched_clinical_records) @@ -1370,6 +1371,10 @@ def deduplicate_kp2023(kp2023_master_manifest_filename: str) -> None: # read in ndjson as pandas df clinical_records = pd.read_json(kp2023_master_manifest_filename, orient='records', dtype={'census_tract': 'string', 'age': 'int64'}, lines=True) + if clinical_records.empty: + LOG.info("No clinical records provided, nothing to deduplicate.") + return + # sort by timestamp clinical_records = clinical_records.sort_values(by='spreadsheet_timestamp', ascending=True)