seattleflu · sallybg · Dec 4, 2023 · Nov 22, 2023 · Nov 22, 2023 · Nov 28, 2023
@@ -1022,6 +1022,10 @@ def parse_kp2023(kp2023_filename: str) -> None:
     for col in date_cols:
         clinical_records[col] = pd.to_datetime(clinical_records[col]).dt.strftime('%Y-%m-%d')
 
+    # Convert census_tract to a string (via the nullable Int64 dtype).
+    # Done here rather than with dtype= at import, because dtype= would require assuming the capitalization of KP's column name.
+    clinical_records['census_tract'] = clinical_records['census_tract'].astype('Int64').astype('str')
+
     # ensure there are no unintended columns being kept
     columns_to_keep = [
         '_provenance',
@@ -1264,7 +1268,7 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename
     records to the matched file. Removes any matches from <KP2023 Clinical Manifest name>
     before writing it to <KP2023 Clinical Manifest Unmatched Data output filename>.
 
-    <KP2023 Clinical Manifest Matched Date filename> does not have to be an existing file,
+    <KP2023 Clinical Manifest Matched Data filename> does not have to be an existing file,
     but a filename must be provided. If the file does not exist, the newly matched records
     will be output to stdout without consolidating with previously matched records.
 
@@ -1341,7 +1345,8 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename
     matched_clinical_records = pd.concat([matched_clinical_records, newly_matched_clinical_records]).reset_index(drop=True)
     LOG.info(f"A total of {len(matched_clinical_records)} records are matched to LIMS data with {len(unmatched_clinical_records)} still unmatched.")
 
-    unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True)
+    if not unmatched_clinical_records.empty:
+        unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True)
     if not matched_clinical_records.empty:
         dump_ndjson(matched_clinical_records)
 
@@ -1366,6 +1371,10 @@ def deduplicate_kp2023(kp2023_master_manifest_filename: str) -> None:
     # read in ndjson as pandas df
     clinical_records = pd.read_json(kp2023_master_manifest_filename, orient='records', dtype={'census_tract': 'string', 'age': 'int64'}, lines=True)
 
+    if clinical_records.empty:
+        LOG.info("No clinical records provided, nothing to deduplicate.")
+        return
+
     # sort by timestamp
     clinical_records = clinical_records.sort_values(by='spreadsheet_timestamp', ascending=True)
 

@@ -196,6 +196,8 @@ def ethnicity(ethnicity: Optional[str]) -> Optional[bool]:
         "null":                               None,
         "declined to answer":                 None,
         "unable to collect":                  None,
+        "prefer not to answer":               None,
+        "don't know":                         None,
     }
 
     if ethnicity not in mapper:

@@ -492,10 +492,11 @@ def create_symptom_conditions(record: dict, patient_reference: dict, encounter_r
     for symptom in record['symptom']:
         mapped_symptom_name = map_symptom(symptom)
         onset_date = record['date_symptom_onset']
-        symptom_code = {
-            "system": f"{SFS}/symptom",
-            "code": mapped_symptom_name
-        }
+        symptom_code = create_codeable_concept(
+            system = f"{SFS}/symptom",
+            code = mapped_symptom_name
+        )
+
 
         condition_resource = create_condition_resource(mapped_symptom_name,
                                 patient_reference,
@@ -1047,7 +1048,10 @@ def create_icd10_conditions_kp2023(record:dict, patient_reference: dict) -> list
         condition_resource = create_condition_resource(icd10_code,
                                 patient_reference,
                                 None,
-                                icd10_codes[icd10_code]
+                                create_codeable_concept(
+                                    system = icd10_codes[icd10_code]["system"], 
+                                    code = icd10_codes[icd10_code]["code"], 
+                                    display = icd10_codes[icd10_code]["display"])
                             )
 
         condition_entries.append(create_resource_entry(