Skip to content

Commit f45f2c2

Browse files
committed
updated packages
1 parent 222c3df commit f45f2c2

File tree

6 files changed

+187
-12
lines changed

6 files changed

+187
-12
lines changed

.idea/misc.xml

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

omop2survey/__init__.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
from omop2survey.response_set import map_answers_chunk, process_answers, map_items, map_responses, create_dummies, scale, map_answers
1+
from omop2survey.response_set import (map_answers_chunk, process_answers, map_items, map_responses, create_dummies,
2+
create_dummy_variables, create_dummies_R, map_questions, scale, map_answers)
23
from omop2survey.codebooks import create_codebook, generate_codebook, print_codebook, codebook, codebook_html
34
from omop2survey.pivot_data import pivot, pivot_text, pivot_text_local, pivot_local
4-
from omop2survey.recode_missing import recode
5+
from omop2survey.recode_missing import recode, recode_items, recode_missing
56
from omop2survey.subset import show_survey_options, get_survey_map, import_survey_data

omop2survey/recode_missing.py

+56-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import pandas as pd
22

3-
43
def recode_items(input_data):
54
missing_values = [-999, -998, -997, -996, -995, -994, -993, -992, -991, -990,
65
-989, -988, -987, -986, -985, -984, -983, -982, -981, -980]
@@ -41,3 +40,59 @@ def recode(input_data):
4140

4241
return data
4342

43+
def recode_missing(input_data):
    """Load survey data and normalize sentinel missing-value codes to pandas NA.

    Accepts either a path to a .csv/.txt/.xlsx/.xls file or an existing
    DataFrame. Sentinel codes (-999 .. -980) become NA, single-element list
    cells are unwrapped to their first element, and the ``answer_numeric``
    column (when present) is coerced to a numeric dtype.

    Parameters
    ----------
    input_data : str or pandas.DataFrame
        File path or DataFrame to clean. A DataFrame is copied, never
        mutated in place.

    Returns
    -------
    pandas.DataFrame
        Cleaned data with sentinel codes replaced by NA.

    Raises
    ------
    ValueError
        If the file extension or the input type is unsupported.
    """
    sentinel_codes = [-999, -998, -997, -996, -995, -994, -993, -992, -991, -990,
                      -989, -988, -987, -986, -985, -984, -983, -982, -981, -980]

    if isinstance(input_data, pd.DataFrame):
        # Work on a copy so the caller's frame is untouched.
        data = input_data.copy()
        data.replace(sentinel_codes, pd.NA, inplace=True)
    elif isinstance(input_data, str):
        # File input: let the reader map sentinel codes to NA on load.
        if input_data.endswith(('.csv', '.txt')):
            data = pd.read_csv(input_data, na_values=sentinel_codes)
        elif input_data.endswith(('.xlsx', '.xls')):
            data = pd.read_excel(input_data, na_values=sentinel_codes)
        else:
            raise ValueError("Unsupported file type. Please provide a .csv, .txt, or .xlsx file.")
    else:
        raise ValueError("Unsupported data type. Please provide a file path or a pandas DataFrame.")

    def _flatten_cell(value):
        # Lists are collapsed to their first element; missing values are
        # normalized to pd.NA; everything else passes through unchanged.
        if isinstance(value, list):
            return value[0]
        return pd.NA if pd.isna(value) else value

    for column in data.columns:
        data[column] = data[column].apply(_flatten_cell)

    # answer_numeric must end up fully numeric: unwrap any remaining lists,
    # then coerce non-numeric leftovers to NaN.
    if 'answer_numeric' in data.columns:
        unwrapped = data['answer_numeric'].apply(
            lambda value: value[0] if isinstance(value, list) else value
        )
        data['answer_numeric'] = pd.to_numeric(unwrapped, errors='coerce')

    return data
77+
78+
def recode_values(input_data):
    """Load survey data, map sentinel missing codes to NA, and unwrap
    single-element list cells.

    Unlike ``recode_missing`` this only unwraps lists of length exactly one;
    longer lists are left intact and no numeric coercion is applied.

    Parameters
    ----------
    input_data : str or pandas.DataFrame
        Path to a .csv/.txt/.xlsx/.xls file, or a DataFrame (copied, not
        mutated).

    Returns
    -------
    pandas.DataFrame
        The cleaned data.

    Raises
    ------
    ValueError
        If the file extension or the input type is unsupported.
    """
    na_codes = [-999, -998, -997, -996, -995, -994, -993, -992, -991, -990,
                -989, -988, -987, -986, -985, -984, -983, -982, -981, -980]

    if isinstance(input_data, pd.DataFrame):
        # Copy so the caller's frame stays unchanged.
        data = input_data.copy()
        data.replace(na_codes, pd.NA, inplace=True)
    elif isinstance(input_data, str):
        if input_data.endswith(('.csv', '.txt')):
            data = pd.read_csv(input_data, na_values=na_codes)
        elif input_data.endswith(('.xlsx', '.xls')):
            data = pd.read_excel(input_data, na_values=na_codes)
        else:
            raise ValueError("Unsupported file type. Please provide a .csv, .txt, or .xlsx file.")
    else:
        raise ValueError("Unsupported data type. Please provide a file path or a pandas DataFrame.")

    def _unwrap_singleton(cell):
        # Only [x] -> x; multi-element lists and scalars pass through.
        return cell[0] if isinstance(cell, list) and len(cell) == 1 else cell

    for name in data.columns:
        data[name] = data[name].apply(_unwrap_singleton)

    return data

omop2survey/response_set.py

+122
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,74 @@ def map_responses(input_data):
7474
print(f"The number of unique person_ids in the dataset: {input_data['person_id'].nunique()}")
7575
return input_data
7676

77+
def map_questions(input_data):
    """Populate ``answer_numeric`` and ``answer_text`` on *input_data*.

    Resolution order per row:
    1. Hard-coded special-case answer concept IDs (skip/don't-know style
       responses) get fixed numeric sentinel codes and labels.
    2. Remaining rows are looked up in the survey key returned by
       ``load_survey_data()`` via (question_concept_id, answer_concept_id).
    3. Rows with no ``answer_concept_id`` whose raw ``answer`` is all digits
       are treated as free numeric entries.

    NOTE(review): mutates the caller's frame in step 1 (the ``.loc``
    assignments below run on *input_data* itself), but steps 2-3 operate on
    the new frame produced by ``apply``; callers should use the RETURN value,
    not rely on in-place updates.

    Parameters: input_data -- DataFrame; assumed to contain person_id,
    question_concept_id, answer_concept_id, and answer columns (TODO confirm
    against callers).
    Returns: a new DataFrame with answer_numeric/answer_text filled in.
    """
    # answer_concept_id -> (sentinel numeric code, display text).
    # Several distinct concept IDs intentionally share one code (e.g. the
    # -996 "Prefer Not To Answer" family).
    special_cases = {
        903087: (-999, "Don't Know"),
        903096: (-998, "Skip"),
        903072: (-997, "Does Not Apply To Me"),
        903079: (-996, "Prefer Not To Answer"),
        903070: (-995, "Other"),
        903092: (-994, "Not Sure"),
        903095: (-993, "None"),
        903103: (-992, "Unanswered"),
        40192432: (-991, "I am not religious"),
        40192487: (-990, "I do not believe in God (or a higher power)"),
        40192520: (-989, "Does not apply to my neighborhood"),
        903081: (-988, "Free Text"),
        596889: (998, "Text"),
        596883: (-994, "Not Sure"),
        1332844: (-994, "Not Sure"),
        903598: (-996, "Prefer Not To Answer"),
        903596: (-996, "Prefer Not To Answer"),
        903601: (-996, "Prefer Not To Answer"),
        903607: (-996, "Prefer Not To Answer"),
        903610: (-996, "Prefer Not To Answer"),
        903604: (-996, "Prefer Not To Answer"),
        43529089: (-997, "No Blood Related Daughters"),
        43529086: (-997, "No Blood Related Siblings"),
        43529092: (-997, "No Blood Related Sons"),
        43529090: (-997, "No Daughters Related")
    }

    # Survey key; presumably one row per (question, answer) pair with
    # answer_numeric/answer_text columns — verify against load_survey_data.
    survey_data = load_survey_data()

    # Nested lookup tables: question_concept_id -> {answer_concept_id -> value}.
    mapping_numeric = survey_data.groupby('question_concept_id').apply(
        lambda g: g.set_index('answer_concept_id')['answer_numeric'].to_dict()
    ).to_dict()

    mapping_text = survey_data.groupby('question_concept_id').apply(
        lambda g: g.set_index('answer_concept_id')['answer_text'].str.strip().to_dict()
    ).to_dict()

    # Start both target columns as NA; unresolved rows stay NA.
    input_data['answer_numeric'] = pd.NA
    input_data['answer_text'] = pd.NA

    # Step 1: special cases take precedence over the survey-key lookup.
    for answer_id, (num, text) in special_cases.items():
        mask = input_data['answer_concept_id'] == answer_id
        input_data.loc[mask, 'answer_numeric'] = num
        input_data.loc[mask, 'answer_text'] = text

    def apply_mappings(row):
        # Already resolved by the special-case pass — leave untouched.
        if pd.notna(row['answer_numeric']) and pd.notna(row['answer_text']):
            return row

        # Step 2: survey-key lookup; falls back to NA when either the
        # question or the answer is absent from the key.
        question_id = row['question_concept_id']
        answer_id = row['answer_concept_id']
        numeric = mapping_numeric.get(question_id, {}).get(answer_id, pd.NA)
        text = mapping_text.get(question_id, {}).get(answer_id, pd.NA)
        row['answer_numeric'] = numeric
        row['answer_text'] = text
        return row

    # Row-wise apply returns a new frame; rebinding here is why steps 2-3
    # do not propagate to the caller's original object.
    input_data = input_data.apply(apply_mappings, axis=1)

    # Step 3: free numeric entries (no concept ID, digit-only raw answer).
    # str(x).isdigit() rejects negatives and decimals — presumably intended;
    # TODO confirm.
    numeric_mask = pd.isna(input_data['answer_concept_id']) & input_data['answer'].apply(lambda x: str(x).isdigit())
    input_data.loc[numeric_mask, 'answer_numeric'] = input_data.loc[numeric_mask, 'answer'].astype(int)
    input_data.loc[numeric_mask, 'answer_text'] = input_data.loc[numeric_mask, 'answer'].astype(str)

    print(f"The number of unique person_ids in the dataset: {input_data['person_id'].nunique()}")
    return input_data
144+
77145

78146
def create_dummies(user_data):
79147
question_key = load_survey_data()
@@ -96,6 +164,33 @@ def create_dummies(user_data):
96164

97165
return result_data
98166

167+
def create_dummy_variables(user_data):
    """Expand "select all that apply" questions into one synthetic question
    per (question, answer) combination.

    Rows belonging to multi-select questions (``select_all == 1`` in the
    survey key) are duplicated with a fresh numeric ``question_concept_id``
    unique to each question/answer pair; all other rows pass through
    unchanged.

    Parameters
    ----------
    user_data : pandas.DataFrame
        Long-format responses with question_concept_id and
        answer_concept_id columns.

    Returns
    -------
    pandas.DataFrame
        New frame (ignore_index) combining untouched rows with the
        re-keyed multi-select rows.
    """
    key_df = load_survey_data()

    # Question IDs flagged as multi-select in the survey key.
    multiselect_ids = key_df[key_df['select_all'] == 1]['question_concept_id'].unique()

    # Synthetic IDs start just above the largest existing question ID so
    # they cannot collide with real ones.
    assigned_ids = {}
    next_id = user_data['question_concept_id'].max() + 1
    expanded = []

    for qid in multiselect_ids:
        subset = user_data[user_data['question_concept_id'] == qid]
        for _, record in subset.iterrows():
            pair_key = f"{qid}_{record['answer_concept_id']}"
            # First sighting of this question/answer pair claims a new ID.
            if pair_key not in assigned_ids:
                assigned_ids[pair_key] = next_id
                next_id += 1
            clone = record.copy()
            clone['question_concept_id'] = assigned_ids[pair_key]
            expanded.append(clone)

    expanded_df = pd.DataFrame(expanded)
    passthrough = user_data[~user_data['question_concept_id'].isin(multiselect_ids)]

    return pd.concat([passthrough, expanded_df], ignore_index=True)
193+
99194

100195
def scale(data, variables, scale_name, na=False, method='sum'):
101196
df = data[['person_id'] + variables]
@@ -339,4 +434,31 @@ def process_answers(input_data):
339434
result_df = pd.concat(results, ignore_index=True)
340435

341436
return result_df
437+
def create_dummies_R(user_data):
    """R-oriented variant: re-key "select all that apply" responses so each
    question/answer combination gets its own numeric question ID.

    Behaviorally identical to ``create_dummy_variables``: multi-select rows
    (per the survey key's ``select_all`` flag) are copied under fresh IDs
    allocated above the current maximum; single-select rows are kept as-is.

    Parameters
    ----------
    user_data : pandas.DataFrame
        Long-format responses with question_concept_id and
        answer_concept_id columns.

    Returns
    -------
    pandas.DataFrame
        Re-indexed concatenation of untouched rows and re-keyed rows.
    """
    survey_key = load_survey_data()
    flagged = survey_key['select_all'] == 1
    multi_ids = survey_key[flagged]['question_concept_id'].unique()

    remap = {}
    # Counter for fresh numeric IDs; begins one past the existing maximum.
    counter = user_data['question_concept_id'].max() + 1
    rewritten = []

    for concept in multi_ids:
        block = user_data[user_data['question_concept_id'] == concept]
        for _, entry in block.iterrows():
            label = f"{concept}_{entry['answer_concept_id']}"
            if label not in remap:
                remap[label] = counter
                counter += 1
            duplicate = entry.copy()
            duplicate['question_concept_id'] = remap[label]  # Assign new numeric ID
            rewritten.append(duplicate)

    kept = user_data[~user_data['question_concept_id'].isin(multi_ids)]
    result = pd.concat([kept, pd.DataFrame(rewritten)], ignore_index=True)

    return result
463+
342464

omop2survey/subset.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ def show_survey_options():
1515
survey_map = get_survey_map()
1616
for key, value in survey_map.items():
1717
print(f"{key}: {value}")
18-
print("\nExample usage: selecting 'Social Determinants of Health' (assuming it is the 3rd option)")
19-
print("# selected_survey_df = omop2survey.import_survey_data(3)")
18+
print("\nExample usage: selecting 'Social Determinants of Health' (assuming it is the 1st option)")
19+
print("# selected_survey_df = omop2survey.import_survey_data(1)")
2020
print("# print(selected_survey_df.head(5))")
2121

2222

vignettes/example.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
omop2.map_answers(sample_df_copy)
99
print(sample_df_copy)
1010

11-
1211
# Create a codebook and save it as an HTML file; the codebook contains only variables in the dataset.
1312
# Note: The codebook function can be used to save the file to the GC workspace bucket,
1413
# whereas codebook_html saves the file locally.
@@ -23,19 +22,16 @@
2322
omop2.recode(sample_df_copy)
2423
print(sample_df_copy.head(5))
2524

26-
2725
# Create dummy coded variables
2826
sample_dummy_df = omop2.create_dummies(sample_df_copy)
2927
print(sample_dummy_df.head(5))
3028

31-
3229
# Convert data from long format to wide format using numeric values.
3330
# The pivot function can be used in the cloud environment.
3431
# Use pivot_local to save files locally.
3532

3633
omop2.pivot_local(sample_df_copy)
3734

38-
3935
# Convert data from long format to wide format using text values.
4036
# The pivot_text function can be used in the cloud environment.
4137
# Use pivot_text_local to save files locally.
@@ -50,7 +46,7 @@
5046
variables = ['q43528662', 'q43528663', 'q43528664']
5147
scale_name = 'afford_healthcare'
5248

53-
pivot_scale = omop2.scale(pivot_df, variables, scale_name) # default na=False, and method='sum'
49+
pivot_scale = omop2.scale(pivot_df, variables, scale_name) # default na=False, and method='sum'
5450
print(pivot_scale['afford_healthcare'])
5551

5652
scale_name = 'mean_afford_healthcare'
@@ -68,5 +64,3 @@
6864
df = pd.DataFrame(data)
6965
processed_df = omop2.recode(df)
7066
print(processed_df)
71-
72-

0 commit comments

Comments
 (0)