From 3205e0b2e5f24900e59548f30a3d63ef5d78af19 Mon Sep 17 00:00:00 2001 From: David Reinhart Date: Mon, 13 Jun 2022 16:40:46 -0700 Subject: [PATCH] Specify timezone when converting date to datetime When parsing clinical data, several date values are converted to datetime data type which by default are interpreted as midnight UTC time. Indicate the timezone when these conversions occur to ensure that date values are converted to midnight local time when a time component is being added. --- lib/seattleflu/id3c/cli/command/clinical.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/seattleflu/id3c/cli/command/clinical.py b/lib/seattleflu/id3c/cli/command/clinical.py index 8436d003..85080b4b 100644 --- a/lib/seattleflu/id3c/cli/command/clinical.py +++ b/lib/seattleflu/id3c/cli/command/clinical.py @@ -275,9 +275,11 @@ def parse_sch(sch_filename, manifest_format, output): clinical_records = clinical_records[columns_to_keep] # Convert dtypes - clinical_records["encountered"] = pd.to_datetime(clinical_records["encountered"]) + # Incoming `encountered` value is typically just date but is cast to datetime with timezone in postgres. Timezone is + # being specified here to ensure values are set to midnight local time instead of UTC. + clinical_records["encountered"] = pd.to_datetime(clinical_records["encountered"]).dt.tz_localize('America/Los_Angeles') - # Reformat vaccination dates + # Reformat vaccination dates. Values are immediately stripped of time component, so don't need timezone specified. 
if manifest_format in ['year3', 'year4']: clinical_records["FluShotDate"] = pd.to_datetime(clinical_records["FluShotDate"]).dt.strftime('%Y-%m-%d') clinical_records["CovidShot1Date"] = pd.to_datetime(clinical_records["CovidShot1Date"]).dt.strftime('%Y-%m-%d') @@ -387,7 +389,7 @@ def parse_kp(kp_filename, kp_specimen_manifest_filename, manifest_format, output clinical_records = clinical_records[column_map.values()] # Convert dtypes - clinical_records["encountered"] = pd.to_datetime(clinical_records["encountered"]) + clinical_records["encountered"] = pd.to_datetime(clinical_records["encountered"]).dt.tz_localize('America/Los_Angeles') # Insert static value columns clinical_records["site"] = "KP" @@ -491,8 +493,9 @@ def parse_phskc(phskc_filename: str, phskc_specimen_manifest_filename: str, geoc # localize encounter timestamps to pacific time clinical_records['encountered'] = clinical_records['collect_ts'].dt.tz_localize('America/Los_Angeles') - # calculate age based on sample collection date and birth day - clinical_records['birth_date'] = pd.to_datetime(clinical_records['birth_date']) + # calculate age based on sample collection date and birth day. Localize birth date datetime value to ensure accurate + # delta with local collection datetime. + clinical_records['birth_date'] = pd.to_datetime(clinical_records['birth_date']).dt.tz_localize('America/Los_Angeles') clinical_records['age'] = clinical_records.apply( lambda row: age_ceiling( relativedelta(