@@ -74,6 +74,74 @@ def map_responses(input_data):
74
74
print (f"The number of unique person_ids in the dataset: { input_data ['person_id' ].nunique ()} " )
75
75
return input_data
76
76
77
def map_questions(input_data):
    """Attach ``answer_numeric`` and ``answer_text`` columns to survey responses.

    Each row is resolved in this order:

    1. If ``answer_concept_id`` is one of the special-case concepts listed
       below, the row gets that concept's fixed code and label, whatever the
       question is.
    2. Otherwise the ``(question_concept_id, answer_concept_id)`` pair is
       looked up in the table returned by ``load_survey_data()``.
    3. Finally, rows whose ``answer_concept_id`` is missing and whose
       ``answer`` consists only of decimal digits are treated as free numeric
       entries: the integer value goes to ``answer_numeric`` and its string
       form to ``answer_text``.

    Rows that match none of the above keep ``pd.NA`` in both columns.

    Args:
        input_data: DataFrame with at least the columns ``person_id``,
            ``question_concept_id``, ``answer_concept_id`` and ``answer``.

    Returns:
        A new DataFrame (the argument is not modified) with the two answer
        columns added.
    """
    # answer_concept_id -> (numeric code, label). Several concept ids
    # deliberately share a code (-994, -996, -997).
    # NOTE(review): 596889 -> 998 is the only positive code in this table;
    # confirm the sign is intentional.
    special_cases = {
        903087: (-999, "Don't Know"),
        903096: (-998, "Skip"),
        903072: (-997, "Does Not Apply To Me"),
        903079: (-996, "Prefer Not To Answer"),
        903070: (-995, "Other"),
        903092: (-994, "Not Sure"),
        903095: (-993, "None"),
        903103: (-992, "Unanswered"),
        40192432: (-991, "I am not religious"),
        40192487: (-990, "I do not believe in God (or a higher power)"),
        40192520: (-989, "Does not apply to my neighborhood"),
        903081: (-988, "Free Text"),
        596889: (998, "Text"),
        596883: (-994, "Not Sure"),
        1332844: (-994, "Not Sure"),
        903598: (-996, "Prefer Not To Answer"),
        903596: (-996, "Prefer Not To Answer"),
        903601: (-996, "Prefer Not To Answer"),
        903607: (-996, "Prefer Not To Answer"),
        903610: (-996, "Prefer Not To Answer"),
        903604: (-996, "Prefer Not To Answer"),
        43529089: (-997, "No Blood Related Daughters"),
        43529086: (-997, "No Blood Related Siblings"),
        43529092: (-997, "No Blood Related Sons"),
        43529090: (-997, "No Daughters Related"),
    }

    survey_data = load_survey_data()

    # Flatten the survey key into one dict keyed by (question, answer). Rows
    # with a missing id can never be matched reliably, so they are left out.
    # When a pair occurs more than once the last occurrence wins.
    keyed = survey_data.dropna(subset=['question_concept_id', 'answer_concept_id'])
    answer_lookup = dict(zip(
        zip(keyed['question_concept_id'], keyed['answer_concept_id']),
        zip(keyed['answer_numeric'], keyed['answer_text'].str.strip()),
    ))

    # Work on a copy so the caller's frame is not left half-updated.
    input_data = input_data.copy()
    input_data['answer_numeric'] = pd.NA
    input_data['answer_text'] = pd.NA

    for answer_id, (num, text) in special_cases.items():
        mask = input_data['answer_concept_id'] == answer_id
        input_data.loc[mask, 'answer_numeric'] = num
        input_data.loc[mask, 'answer_text'] = text

    # Rows not resolved by a special case and carrying both ids go through the
    # survey lookup. A plain dict lookup over the two id columns replaces the
    # row-wise DataFrame.apply, which was slow and could change the dtypes of
    # unrelated columns when the frame was rebuilt from per-row Series.
    pending = (
        input_data['answer_numeric'].isna()
        & input_data['question_concept_id'].notna()
        & input_data['answer_concept_id'].notna()
    )
    if pending.any():
        pairs = zip(
            input_data.loc[pending, 'question_concept_id'],
            input_data.loc[pending, 'answer_concept_id'],
        )
        unmatched = (pd.NA, pd.NA)
        resolved = [answer_lookup.get(pair, unmatched) for pair in pairs]
        input_data.loc[pending, 'answer_numeric'] = [num for num, _ in resolved]
        input_data.loc[pending, 'answer_text'] = [text for _, text in resolved]

    # Free numeric entries: no answer concept, answer made of decimal digits.
    # isdecimal() rather than isdigit(): the latter also accepts characters
    # such as superscript digits that int() cannot parse, which would make the
    # astype(int) below raise.
    numeric_mask = pd.isna(input_data['answer_concept_id']) & input_data['answer'].apply(
        lambda x: str(x).isdecimal()
    )
    input_data.loc[numeric_mask, 'answer_numeric'] = input_data.loc[numeric_mask, 'answer'].astype(int)
    input_data.loc[numeric_mask, 'answer_text'] = input_data.loc[numeric_mask, 'answer'].astype(str)

    print(f"The number of unique person_ids in the dataset: {input_data['person_id'].nunique()} ")
    return input_data
144
+
77
145
78
146
def create_dummies (user_data ):
79
147
question_key = load_survey_data ()
@@ -96,6 +164,33 @@ def create_dummies(user_data):
96
164
97
165
return result_data
98
166
167
def create_dummy_variables(user_data):
    """Recode select-all questions into one synthetic question per answer option.

    Questions flagged ``select_all == 1`` in the table returned by
    ``load_survey_data()`` are located in ``user_data``. Every distinct
    (question, answer) combination found for them is given a new
    ``question_concept_id``, and the affected rows are rewritten to carry that
    id. New ids are handed out in first-encounter order, starting one above
    the largest ``question_concept_id`` present in ``user_data``, so they
    depend on the data passed in.

    Args:
        user_data: DataFrame with ``question_concept_id`` and
            ``answer_concept_id`` columns.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: first the rows of all
        other questions in their original order, then the recoded rows,
        grouped by select-all question in the order those questions appear in
        the survey table.
    """
    question_key = load_survey_data()

    select_all_questions = question_key[question_key['select_all'] == 1]['question_concept_id'].unique()

    id_map = {}
    next_id = user_data['question_concept_id'].max() + 1
    recoded_parts = []

    for question_id in select_all_questions:
        # Recode one question's rows as a block instead of copying row by row
        # through iterrows(), which is slow and does not preserve dtypes.
        subset = user_data[user_data['question_concept_id'] == question_id].copy()
        if subset.empty:
            continue

        new_ids = []
        for answer_id in subset['answer_concept_id']:
            # The answer is keyed by its string form so that missing answers
            # (NaN never equals itself) still share a single id per question.
            key = (question_id, str(answer_id))
            if key not in id_map:
                id_map[key] = next_id
                next_id += 1
            new_ids.append(id_map[key])

        subset['question_concept_id'] = new_ids
        recoded_parts.append(subset)

    filtered_data = user_data[~user_data['question_concept_id'].isin(select_all_questions)]

    result_data = pd.concat([filtered_data, *recoded_parts], ignore_index=True)

    return result_data
193
+
99
194
100
195
def scale (data , variables , scale_name , na = False , method = 'sum' ):
101
196
df = data [['person_id' ] + variables ]
@@ -339,4 +434,31 @@ def process_answers(input_data):
339
434
result_df = pd .concat (results , ignore_index = True )
340
435
341
436
return result_df
437
def create_dummies_R(user_data):
    """Recode select-all questions into one synthetic question per answer option.

    The body of this function was identical, statement for statement, to
    ``create_dummy_variables``; it forwards to that function so the recoding
    logic is maintained in a single place.

    Args:
        user_data: DataFrame with ``question_concept_id`` and
            ``answer_concept_id`` columns.

    Returns:
        Whatever ``create_dummy_variables(user_data)`` returns: a DataFrame in
        which rows of select-all questions carry a new ``question_concept_id``
        unique to each (question, answer) combination.
    """
    return create_dummy_variables(user_data)
463
+
342
464
0 commit comments