Skip to content

Commit f45f2c2

Browse files
committed
updated packages
1 parent 222c3df commit f45f2c2

File tree

6 files changed

+187
-12
lines changed

6 files changed

+187
-12
lines changed

.idea/misc.xml

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

omop2survey/__init__.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
from omop2survey.response_set import map_answers_chunk, process_answers, map_items, map_responses, create_dummies, scale, map_answers
1+
from omop2survey.response_set import (map_answers_chunk, process_answers, map_items, map_responses, create_dummies,
2+
create_dummy_variables, create_dummies_R, map_questions, scale, map_answers)
23
from omop2survey.codebooks import create_codebook, generate_codebook, print_codebook, codebook, codebook_html
34
from omop2survey.pivot_data import pivot, pivot_text, pivot_text_local, pivot_local
4-
from omop2survey.recode_missing import recode
5+
from omop2survey.recode_missing import recode, recode_items, recode_missing
56
from omop2survey.subset import show_survey_options, get_survey_map, import_survey_data

omop2survey/recode_missing.py

+56-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import pandas as pd
22

3-
43
def recode_items(input_data):
54
missing_values = [-999, -998, -997, -996, -995, -994, -993, -992, -991, -990,
65
-989, -988, -987, -986, -985, -984, -983, -982, -981, -980]
@@ -41,3 +40,59 @@ def recode(input_data):
4140

4241
return data
4342

43+
def recode_missing(input_data):
    """Load survey data and normalize sentinel missing-value codes to pandas NA.

    Accepts either a path to a .csv/.txt/.xlsx/.xls file or an existing
    DataFrame. Sentinel codes (-999 .. -980) become NA, single-element list
    cells are unwrapped to their first element, and the ``answer_numeric``
    column (when present) is coerced to a numeric dtype.

    Parameters
    ----------
    input_data : str or pandas.DataFrame
        File path or DataFrame to clean. A DataFrame is copied, never
        mutated in place.

    Returns
    -------
    pandas.DataFrame
        Cleaned data with sentinel codes replaced by NA.

    Raises
    ------
    ValueError
        If the file extension or the input type is unsupported.
    """
    sentinel_codes = [-999, -998, -997, -996, -995, -994, -993, -992, -991, -990,
                      -989, -988, -987, -986, -985, -984, -983, -982, -981, -980]

    if isinstance(input_data, pd.DataFrame):
        # Work on a copy so the caller's frame is untouched.
        data = input_data.copy()
        data.replace(sentinel_codes, pd.NA, inplace=True)
    elif isinstance(input_data, str):
        # File input: let the reader map sentinel codes to NA on load.
        if input_data.endswith(('.csv', '.txt')):
            data = pd.read_csv(input_data, na_values=sentinel_codes)
        elif input_data.endswith(('.xlsx', '.xls')):
            data = pd.read_excel(input_data, na_values=sentinel_codes)
        else:
            raise ValueError("Unsupported file type. Please provide a .csv, .txt, or .xlsx file.")
    else:
        raise ValueError("Unsupported data type. Please provide a file path or a pandas DataFrame.")

    def _flatten_cell(value):
        # Lists are collapsed to their first element; missing values are
        # normalized to pd.NA; everything else passes through unchanged.
        if isinstance(value, list):
            return value[0]
        return pd.NA if pd.isna(value) else value

    for column in data.columns:
        data[column] = data[column].apply(_flatten_cell)

    # answer_numeric must end up fully numeric: unwrap any remaining lists,
    # then coerce non-numeric leftovers to NaN.
    if 'answer_numeric' in data.columns:
        unwrapped = data['answer_numeric'].apply(
            lambda value: value[0] if isinstance(value, list) else value
        )
        data['answer_numeric'] = pd.to_numeric(unwrapped, errors='coerce')

    return data
77+
78+
def recode_values(input_data):
    """Load survey data, map sentinel missing codes to NA, and unwrap
    single-element list cells.

    Unlike ``recode_missing`` this only unwraps lists of length exactly one;
    longer lists are left intact and no numeric coercion is applied.

    Parameters
    ----------
    input_data : str or pandas.DataFrame
        Path to a .csv/.txt/.xlsx/.xls file, or a DataFrame (copied, not
        mutated).

    Returns
    -------
    pandas.DataFrame
        The cleaned data.

    Raises
    ------
    ValueError
        If the file extension or the input type is unsupported.
    """
    na_codes = [-999, -998, -997, -996, -995, -994, -993, -992, -991, -990,
                -989, -988, -987, -986, -985, -984, -983, -982, -981, -980]

    if isinstance(input_data, pd.DataFrame):
        # Copy so the caller's frame stays unchanged.
        data = input_data.copy()
        data.replace(na_codes, pd.NA, inplace=True)
    elif isinstance(input_data, str):
        if input_data.endswith(('.csv', '.txt')):
            data = pd.read_csv(input_data, na_values=na_codes)
        elif input_data.endswith(('.xlsx', '.xls')):
            data = pd.read_excel(input_data, na_values=na_codes)
        else:
            raise ValueError("Unsupported file type. Please provide a .csv, .txt, or .xlsx file.")
    else:
        raise ValueError("Unsupported data type. Please provide a file path or a pandas DataFrame.")

    def _unwrap_singleton(cell):
        # Only [x] -> x; multi-element lists and scalars pass through.
        return cell[0] if isinstance(cell, list) and len(cell) == 1 else cell

    for name in data.columns:
        data[name] = data[name].apply(_unwrap_singleton)

    return data

omop2survey/response_set.py

+122
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,74 @@ def map_responses(input_data):
7474
print(f"The number of unique person_ids in the dataset: {input_data['person_id'].nunique()}")
7575
return input_data
7676

77+
def map_questions(input_data):
    """Populate ``answer_numeric`` and ``answer_text`` on *input_data*.

    Resolution order per row:
    1. Hard-coded special-case answer concept IDs (skip/don't-know style
       responses) get fixed numeric sentinel codes and labels.
    2. Remaining rows are looked up in the survey key returned by
       ``load_survey_data()`` via (question_concept_id, answer_concept_id).
    3. Rows with no ``answer_concept_id`` whose raw ``answer`` is all digits
       are treated as free numeric entries.

    NOTE(review): mutates the caller's frame in step 1 (the ``.loc``
    assignments below run on *input_data* itself), but steps 2-3 operate on
    the new frame produced by ``apply``; callers should use the RETURN value,
    not rely on in-place updates.

    Parameters: input_data -- DataFrame; assumed to contain person_id,
    question_concept_id, answer_concept_id, and answer columns (TODO confirm
    against callers).
    Returns: a new DataFrame with answer_numeric/answer_text filled in.
    """
    # answer_concept_id -> (sentinel numeric code, display text).
    # Several distinct concept IDs intentionally share one code (e.g. the
    # -996 "Prefer Not To Answer" family).
    special_cases = {
        903087: (-999, "Don't Know"),
        903096: (-998, "Skip"),
        903072: (-997, "Does Not Apply To Me"),
        903079: (-996, "Prefer Not To Answer"),
        903070: (-995, "Other"),
        903092: (-994, "Not Sure"),
        903095: (-993, "None"),
        903103: (-992, "Unanswered"),
        40192432: (-991, "I am not religious"),
        40192487: (-990, "I do not believe in God (or a higher power)"),
        40192520: (-989, "Does not apply to my neighborhood"),
        903081: (-988, "Free Text"),
        596889: (998, "Text"),
        596883: (-994, "Not Sure"),
        1332844: (-994, "Not Sure"),
        903598: (-996, "Prefer Not To Answer"),
        903596: (-996, "Prefer Not To Answer"),
        903601: (-996, "Prefer Not To Answer"),
        903607: (-996, "Prefer Not To Answer"),
        903610: (-996, "Prefer Not To Answer"),
        903604: (-996, "Prefer Not To Answer"),
        43529089: (-997, "No Blood Related Daughters"),
        43529086: (-997, "No Blood Related Siblings"),
        43529092: (-997, "No Blood Related Sons"),
        43529090: (-997, "No Daughters Related")
    }

    # Survey key; presumably one row per (question, answer) pair with
    # answer_numeric/answer_text columns — verify against load_survey_data.
    survey_data = load_survey_data()

    # Nested lookup tables: question_concept_id -> {answer_concept_id -> value}.
    mapping_numeric = survey_data.groupby('question_concept_id').apply(
        lambda g: g.set_index('answer_concept_id')['answer_numeric'].to_dict()
    ).to_dict()

    mapping_text = survey_data.groupby('question_concept_id').apply(
        lambda g: g.set_index('answer_concept_id')['answer_text'].str.strip().to_dict()
    ).to_dict()

    # Start both target columns as NA; unresolved rows stay NA.
    input_data['answer_numeric'] = pd.NA
    input_data['answer_text'] = pd.NA

    # Step 1: special cases take precedence over the survey-key lookup.
    for answer_id, (num, text) in special_cases.items():
        mask = input_data['answer_concept_id'] == answer_id
        input_data.loc[mask, 'answer_numeric'] = num
        input_data.loc[mask, 'answer_text'] = text

    def apply_mappings(row):
        # Already resolved by the special-case pass — leave untouched.
        if pd.notna(row['answer_numeric']) and pd.notna(row['answer_text']):
            return row

        # Step 2: survey-key lookup; falls back to NA when either the
        # question or the answer is absent from the key.
        question_id = row['question_concept_id']
        answer_id = row['answer_concept_id']
        numeric = mapping_numeric.get(question_id, {}).get(answer_id, pd.NA)
        text = mapping_text.get(question_id, {}).get(answer_id, pd.NA)
        row['answer_numeric'] = numeric
        row['answer_text'] = text
        return row

    # Row-wise apply returns a new frame; rebinding here is why steps 2-3
    # do not propagate to the caller's original object.
    input_data = input_data.apply(apply_mappings, axis=1)

    # Step 3: free numeric entries (no concept ID, digit-only raw answer).
    # str(x).isdigit() rejects negatives and decimals — presumably intended;
    # TODO confirm.
    numeric_mask = pd.isna(input_data['answer_concept_id']) & input_data['answer'].apply(lambda x: str(x).isdigit())
    input_data.loc[numeric_mask, 'answer_numeric'] = input_data.loc[numeric_mask, 'answer'].astype(int)
    input_data.loc[numeric_mask, 'answer_text'] = input_data.loc[numeric_mask, 'answer'].astype(str)

    print(f"The number of unique person_ids in the dataset: {input_data['person_id'].nunique()}")
    return input_data
144+
77145

78146
def create_dummies(user_data):
79147
question_key = load_survey_data()
@@ -96,6 +164,33 @@ def create_dummies(user_data):
96164

97165
return result_data
98166

167+
def create_dummy_variables(user_data):
    """Expand "select all that apply" questions into one synthetic question
    per (question, answer) combination.

    Rows belonging to multi-select questions (``select_all == 1`` in the
    survey key) are duplicated with a fresh numeric ``question_concept_id``
    unique to each question/answer pair; all other rows pass through
    unchanged.

    Parameters
    ----------
    user_data : pandas.DataFrame
        Long-format responses with question_concept_id and
        answer_concept_id columns.

    Returns
    -------
    pandas.DataFrame
        New frame (ignore_index) combining untouched rows with the
        re-keyed multi-select rows.
    """
    key_df = load_survey_data()

    # Question IDs flagged as multi-select in the survey key.
    multiselect_ids = key_df[key_df['select_all'] == 1]['question_concept_id'].unique()

    # Synthetic IDs start just above the largest existing question ID so
    # they cannot collide with real ones.
    assigned_ids = {}
    next_id = user_data['question_concept_id'].max() + 1
    expanded = []

    for qid in multiselect_ids:
        subset = user_data[user_data['question_concept_id'] == qid]
        for _, record in subset.iterrows():
            pair_key = f"{qid}_{record['answer_concept_id']}"
            # First sighting of this question/answer pair claims a new ID.
            if pair_key not in assigned_ids:
                assigned_ids[pair_key] = next_id
                next_id += 1
            clone = record.copy()
            clone['question_concept_id'] = assigned_ids[pair_key]
            expanded.append(clone)

    expanded_df = pd.DataFrame(expanded)
    passthrough = user_data[~user_data['question_concept_id'].isin(multiselect_ids)]

    return pd.concat([passthrough, expanded_df], ignore_index=True)
193+
99194

100195
def scale(data, variables, scale_name, na=False, method='sum'):
101196
df = data[['person_id'] + variables]
@@ -339,4 +434,31 @@ def process_answers(input_data):
339434
result_df = pd.concat(results, ignore_index=True)
340435

341436
return result_df
437+
def create_dummies_R(user_data):
    """R-oriented variant: re-key "select all that apply" responses so each
    question/answer combination gets its own numeric question ID.

    Behaviorally identical to ``create_dummy_variables``: multi-select rows
    (per the survey key's ``select_all`` flag) are copied under fresh IDs
    allocated above the current maximum; single-select rows are kept as-is.

    Parameters
    ----------
    user_data : pandas.DataFrame
        Long-format responses with question_concept_id and
        answer_concept_id columns.

    Returns
    -------
    pandas.DataFrame
        Re-indexed concatenation of untouched rows and re-keyed rows.
    """
    survey_key = load_survey_data()
    flagged = survey_key['select_all'] == 1
    multi_ids = survey_key[flagged]['question_concept_id'].unique()

    remap = {}
    # Counter for fresh numeric IDs; begins one past the existing maximum.
    counter = user_data['question_concept_id'].max() + 1
    rewritten = []

    for concept in multi_ids:
        block = user_data[user_data['question_concept_id'] == concept]
        for _, entry in block.iterrows():
            label = f"{concept}_{entry['answer_concept_id']}"
            if label not in remap:
                remap[label] = counter
                counter += 1
            duplicate = entry.copy()
            duplicate['question_concept_id'] = remap[label]  # Assign new numeric ID
            rewritten.append(duplicate)

    kept = user_data[~user_data['question_concept_id'].isin(multi_ids)]
    result = pd.concat([kept, pd.DataFrame(rewritten)], ignore_index=True)

    return result
463+
342464

omop2survey/subset.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ def show_survey_options():
1515
survey_map = get_survey_map()
1616
for key, value in survey_map.items():
1717
print(f"{key}: {value}")
18-
print("\nExample usage: selecting 'Social Determinants of Health' (assuming it is the 3rd option)")
19-
print("# selected_survey_df = omop2survey.import_survey_data(3)")
18+
print("\nExample usage: selecting 'Social Determinants of Health' (assuming it is the 1st option)")
19+
print("# selected_survey_df = omop2survey.import_survey_data(1)")
2020
print("# print(selected_survey_df.head(5))")
2121

2222

vignettes/example.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
omop2.map_answers(sample_df_copy)
99
print(sample_df_copy)
1010

11-
1211
# Create a codebook and save it as an HTML file; the codebook contains only variables in the dataset.
1312
# Note: The codebook function can be used to save the file to the GC workspace bucket,
1413
# whereas codebook_html saves the file locally.
@@ -23,19 +22,16 @@
2322
omop2.recode(sample_df_copy)
2423
print(sample_df_copy.head(5))
2524

26-
2725
# Create dummy coded variables
2826
sample_dummy_df = omop2.create_dummies(sample_df_copy)
2927
print(sample_dummy_df.head(5))
3028

31-
3229
# Convert data from long format to wide format using numeric values.
3330
# The pivot function can be used in the cloud environment.
3431
# Use pivot_local to save files locally.
3532

3633
omop2.pivot_local(sample_df_copy)
3734

38-
3935
# Convert data from long format to wide format using text values.
4036
# The pivot_text function can be used in the cloud environment.
4137
# Use pivot_text_local to save files locally.
@@ -50,7 +46,7 @@
5046
variables = ['q43528662', 'q43528663', 'q43528664']
5147
scale_name = 'afford_healthcare'
5248

53-
pivot_scale = omop2.scale(pivot_df, variables, scale_name) # default na=False, and method='sum'
49+
pivot_scale = omop2.scale(pivot_df, variables, scale_name) # default na=False, and method='sum'
5450
print(pivot_scale['afford_healthcare'])
5551

5652
scale_name = 'mean_afford_healthcare'
@@ -68,5 +64,3 @@
6864
df = pd.DataFrame(data)
6965
processed_df = omop2.recode(df)
7066
print(processed_df)
71-
72-

0 commit comments

Comments
 (0)