Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clinical ETL: Update ethnicity mapper and fix codeable concepts #375

Merged
merged 4 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions lib/seattleflu/id3c/cli/command/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,10 @@ def parse_kp2023(kp2023_filename: str) -> None:
for col in date_cols:
clinical_records[col] = pd.to_datetime(clinical_records[col]).dt.strftime('%Y-%m-%d')

# Convert census_tract to string.
# Do this here, after loading, rather than via `dtype` at import time, because the latter would require assuming the capitalization of the column name as supplied by KP.
clinical_records['census_tract'] = clinical_records['census_tract'].astype('Int64').astype('str')

# ensure there are no unintended columns being kept
columns_to_keep = [
'_provenance',
Expand Down Expand Up @@ -1264,7 +1268,7 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename
records to the matched file. Removes any matches from <KP2023 Clinical Manifest name>
before writing it to <KP2023 Clinical Manifest Unmatched Data output filename>.

<KP2023 Clinical Manifest Matched Date filename> does not have to be an existing file,
<KP2023 Clinical Manifest Matched Data filename> does not have to be an existing file,
but a filename must be provided. If the file does not exist, the newly matched records
will be output to stdout without consolidating with previously matched records.

Expand Down Expand Up @@ -1341,7 +1345,8 @@ def match_kp2023(kp2023_manifest_filename: str, kp2023_manifest_matched_filename
matched_clinical_records = pd.concat([matched_clinical_records, newly_matched_clinical_records]).reset_index(drop=True)
LOG.info(f"A total of {len(matched_clinical_records)} records are matched to LIMS data with {len(unmatched_clinical_records)} still unmatched.")

unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True)
if not unmatched_clinical_records.empty:
unmatched_clinical_records.to_json(kp2023_manifest_unmatched_output_filename, orient='records', lines=True)
if not matched_clinical_records.empty:
dump_ndjson(matched_clinical_records)

Expand All @@ -1366,6 +1371,10 @@ def deduplicate_kp2023(kp2023_master_manifest_filename: str) -> None:
# read in ndjson as pandas df
clinical_records = pd.read_json(kp2023_master_manifest_filename, orient='records', dtype={'census_tract': 'string', 'age': 'int64'}, lines=True)

if clinical_records.empty:
LOG.info("No clinical records provided, nothing to deduplicate.")
return

# sort by timestamp
clinical_records = clinical_records.sort_values(by='spreadsheet_timestamp', ascending=True)

Expand Down
2 changes: 2 additions & 0 deletions lib/seattleflu/id3c/cli/command/etl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ def ethnicity(ethnicity: Optional[str]) -> Optional[bool]:
"null": None,
"declined to answer": None,
"unable to collect": None,
"prefer not to answer": None,
"don't know": None,
}

if ethnicity not in mapper:
Expand Down
14 changes: 9 additions & 5 deletions lib/seattleflu/id3c/cli/command/etl/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,10 +492,11 @@ def create_symptom_conditions(record: dict, patient_reference: dict, encounter_r
for symptom in record['symptom']:
mapped_symptom_name = map_symptom(symptom)
onset_date = record['date_symptom_onset']
symptom_code = {
"system": f"{SFS}/symptom",
"code": mapped_symptom_name
}
symptom_code = create_codeable_concept(
system = f"{SFS}/symptom",
code = mapped_symptom_name
)


condition_resource = create_condition_resource(mapped_symptom_name,
patient_reference,
Expand Down Expand Up @@ -1047,7 +1048,10 @@ def create_icd10_conditions_kp2023(record:dict, patient_reference: dict) -> list
condition_resource = create_condition_resource(icd10_code,
patient_reference,
None,
icd10_codes[icd10_code]
create_codeable_concept(
system = icd10_codes[icd10_code]["system"],
code = icd10_codes[icd10_code]["code"],
display = icd10_codes[icd10_code]["display"])
)

condition_entries.append(create_resource_entry(
Expand Down
Loading