Skip to content

Commit

Permalink
Merge pull request #375 from seattleflu/kp2023-etl
Browse files: browse the repository at this point in the history
Clinical ETL: Update ethnicity mapper, fix codeable concepts, cast census tract as string, and handle empty inputs and outputs.
  • Loading branch information
sallybg authored Dec 4, 2023
2 parents d8dd52b + 62abebf commit 0bd566a
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 7 deletions.
13 changes: 11 additions & 2 deletions lib/seattleflu/id3c/cli/command/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,10 @@ def parse_kp2023(kp2023_filename: str) -> None:
for col in date_cols:
clinical_records[col] = pd.to_datetime(clinical_records[col]).dt.strftime('%Y-%m-%d')

# convert census_tract to string
# do this here rather than upon import with dtype, because the latter would require assuming capitalization of column name from KP
clinical_records['census_tract'] = clinical_records['census_tract'].astype('Int64').astype('str')

# ensure there are no unintended columns being kept
columns_to_keep = [
'_provenance',
Expand Down Expand Up @@ -1264,7 +1268,7 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename
records to the matched file. Removes any matches from <KP2023 Clinical Manifest name>
before writing it to <KP2023 Clinical Manifest Unmatched Data output filename>.
<KP2023 Clinical Manifest Matched Date filename> does not have to be an existing file,
<KP2023 Clinical Manifest Matched Data filename> does not have to be an existing file,
but a filename must be provided. If the file does not exist, the newly matched records
will be output to stdout without consolidating with previously matched records.
Expand Down Expand Up @@ -1341,7 +1345,8 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename
matched_clinical_records = pd.concat([matched_clinical_records, newly_matched_clinical_records]).reset_index(drop=True)
LOG.info(f"A total of {len(matched_clinical_records)} records are matched to LIMS data with {len(unmatched_clinical_records)} still unmatched.")

unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True)
if not unmatched_clinical_records.empty:
unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True)
if not matched_clinical_records.empty:
dump_ndjson(matched_clinical_records)

Expand All @@ -1366,6 +1371,10 @@ def deduplicate_kp2023(kp2023_master_manifest_filename: str) -> None:
# read in ndjson as pandas df
clinical_records = pd.read_json(kp2023_master_manifest_filename, orient='records', dtype={'census_tract': 'string', 'age': 'int64'}, lines=True)

if clinical_records.empty:
LOG.info("No clinical records provided, nothing to deduplicate.")
return

# sort by timestamp
clinical_records = clinical_records.sort_values(by='spreadsheet_timestamp', ascending=True)

Expand Down
2 changes: 2 additions & 0 deletions lib/seattleflu/id3c/cli/command/etl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ def ethnicity(ethnicity: Optional[str]) -> Optional[bool]:
"null": None,
"declined to answer": None,
"unable to collect": None,
"prefer not to answer": None,
"don't know": None,
}

if ethnicity not in mapper:
Expand Down
14 changes: 9 additions & 5 deletions lib/seattleflu/id3c/cli/command/etl/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,10 +492,11 @@ def create_symptom_conditions(record: dict, patient_reference: dict, encounter_r
for symptom in record['symptom']:
mapped_symptom_name = map_symptom(symptom)
onset_date = record['date_symptom_onset']
symptom_code = {
"system": f"{SFS}/symptom",
"code": mapped_symptom_name
}
symptom_code = create_codeable_concept(
system = f"{SFS}/symptom",
code = mapped_symptom_name
)


condition_resource = create_condition_resource(mapped_symptom_name,
patient_reference,
Expand Down Expand Up @@ -1047,7 +1048,10 @@ def create_icd10_conditions_kp2023(record:dict, patient_reference: dict) -> list
condition_resource = create_condition_resource(icd10_code,
patient_reference,
None,
icd10_codes[icd10_code]
create_codeable_concept(
system = icd10_codes[icd10_code]["system"],
code = icd10_codes[icd10_code]["code"],
display = icd10_codes[icd10_code]["display"])
)

condition_entries.append(create_resource_entry(
Expand Down

0 comments on commit 0bd566a

Please sign in to comment.