From 516aa71391086e3677382049901d948701c5b3aa Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 21 Nov 2023 16:28:29 -0800 Subject: [PATCH 1/4] Clinical ETL: Update ethnicity mapper --- lib/seattleflu/id3c/cli/command/etl/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/seattleflu/id3c/cli/command/etl/__init__.py b/lib/seattleflu/id3c/cli/command/etl/__init__.py index 8933f5a5..dab1a78d 100644 --- a/lib/seattleflu/id3c/cli/command/etl/__init__.py +++ b/lib/seattleflu/id3c/cli/command/etl/__init__.py @@ -196,6 +196,8 @@ def ethnicity(ethnicity: Optional[str]) -> Optional[bool]: "null": None, "declined to answer": None, "unable to collect": None, + "prefer not to answer": None, + "don't know": None, } if ethnicity not in mapper: From ff603d1ac6f0892aa0c2ed70d458da88423d3c0c Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 21 Nov 2023 16:29:23 -0800 Subject: [PATCH 2/4] Clinical ETL: Fix codeable concepts for KP2023 functions --- lib/seattleflu/id3c/cli/command/etl/clinical.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/etl/clinical.py b/lib/seattleflu/id3c/cli/command/etl/clinical.py index 0b6c0de1..f4d08016 100644 --- a/lib/seattleflu/id3c/cli/command/etl/clinical.py +++ b/lib/seattleflu/id3c/cli/command/etl/clinical.py @@ -492,10 +492,11 @@ def create_symptom_conditions(record: dict, patient_reference: dict, encounter_r for symptom in record['symptom']: mapped_symptom_name = map_symptom(symptom) onset_date = record['date_symptom_onset'] - symptom_code = { - "system": f"{SFS}/symptom", - "code": mapped_symptom_name - } + symptom_code = create_codeable_concept( + system = f"{SFS}/symptom", + code = mapped_symptom_name + ) + condition_resource = create_condition_resource(mapped_symptom_name, patient_reference, @@ -1047,7 +1048,10 @@ def create_icd10_conditions_kp2023(record:dict, patient_reference: dict) -> list condition_resource = create_condition_resource(icd10_code, patient_reference, None, - icd10_codes[icd10_code] + create_codeable_concept( + system = icd10_codes[icd10_code]["system"], + code = icd10_codes[icd10_code]["code"], + display = icd10_codes[icd10_code]["display"]) ) condition_entries.append(create_resource_entry( From 46ed98b7393038a9301f004778378e70ba921bab Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 28 Nov 2023 14:00:54 -0800 Subject: [PATCH 3/4] Cast census tract as string in clinical parse-kp2023 --- lib/seattleflu/id3c/cli/command/clinical.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index f29df671..83d35ef0 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ b/lib/seattleflu/id3c/cli/command/clinical.py @@ -1022,6 +1022,10 @@ def parse_kp2023(kp2023_filename: str) -> None: for col in date_cols: clinical_records[col] = pd.to_datetime(clinical_records[col]).dt.strftime('%Y-%m-%d') + # convert census_tract to string + # do this here rather than upon import with dtype, because the latter would require assuming capitalization of column name from KP + clinical_records['census_tract'] = clinical_records['census_tract'].astype('Int64').astype('str') + # ensure there are no unintended columns being kept columns_to_keep = [ '_provenance', From 62abebfcccc3a7ddcb073f0fe1affa2614c12efc Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Mon, 4 Dec 2023 11:56:27 -0800 Subject: [PATCH 4/4] Handle empty input/output in deduplicate-kp2023 and match-kp2023 Handle empty input in deduplicate-kp2023. If no unmatched records are found by match-kp2023, do not write to the unmatched manifest. --- lib/seattleflu/id3c/cli/command/clinical.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index 83d35ef0..d9e6a2e5 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ b/lib/seattleflu/id3c/cli/command/clinical.py @@ -1268,7 +1268,7 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename records to the matched file. Removes any matches from before writing it to . - does not have to be an existing file, + does not have to be an existing file, but a filename must be provided. If the file does not exist, the newly matched records will be output to stdout without consolidating with previously matched records. @@ -1345,7 +1345,8 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename matched_clinical_records = pd.concat([matched_clinical_records, newly_matched_clinical_records]).reset_index(drop=True) LOG.info(f"A total of {len(matched_clinical_records)} records are matched to LIMS data with {len(unmatched_clinical_records)} still unmatched.") - unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True) + if not unmatched_clinical_records.empty: + unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True) if not matched_clinical_records.empty: dump_ndjson(matched_clinical_records) @@ -1370,6 +1371,10 @@ def deduplicate_kp2023(kp2023_master_manifest_filename: str) -> None: # read in ndjson as pandas df clinical_records = pd.read_json(kp2023_master_manifest_filename, orient='records', dtype={'census_tract': 'string', 'age': 'int64'}, lines=True) + if clinical_records.empty: + LOG.info("No clinical records provided, nothing to deduplicate.") + return + # sort by timestamp clinical_records = clinical_records.sort_values(by='spreadsheet_timestamp', ascending=True)