-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #764 from vaccineontology/add-scripts
Add scripts
- Loading branch information
Showing
12 changed files
with
319 additions
and
127 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
Binary file not shown.
File renamed without changes.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import pandas as pd | ||
import os | ||
import glob | ||
|
||
""" | ||
This script is used to clean up and capitalize cell values in specific columns of one or more CSV files. | ||
You can specify a folder containing CSV files and the columns you want to clean up. | ||
The cleaning process: | ||
- Strips leading/trailing whitespace | ||
- Capitalizes the first letter of the string | ||
- Removes 'A' or 'An' from the start of the string, if applicable | ||
Contributor: Yuping Zheng 2024-10-30 | ||
""" | ||
|
||
|
||
# Clean up and capitalize a single cell value.
# Arguments:
# - cell: The individual cell value to be processed; non-string values
#   (e.g. NaN from pandas) are returned unchanged.
# Returns:
# - The cleaned value: whitespace stripped, first letter capitalized
#   (remaining letters lowercased, as str.capitalize does), and a leading
#   article 'A ' or 'An ' removed if present.
def clean_up(cell):
    if isinstance(cell, str):  # Only strings are processed; everything else passes through
        cell = cell.strip()       # Remove leading and trailing whitespace
        cell = cell.capitalize()  # Uppercase first letter, lowercase the rest
        # Remove a leading article by slicing off the exact prefix.
        # BUG FIX: the original used str.lstrip('A') / lstrip('An'), which
        # strips a *character set* rather than a prefix (the classic lstrip
        # pitfall, flagged by linters as B005).
        if cell.startswith('A '):
            cell = cell[2:]
        elif cell.startswith('An '):
            cell = cell[3:]
        # Re-strip and capitalize so the result is clean after removal
        cell = cell.strip()
        cell = cell.capitalize()
    return cell
|
||
|
||
# Clean up the given columns across every CSV file found in a folder.
# Arguments:
# - columns_to_clean: list of column names (strings) whose cells get cleaned.
# - folder_path: directory containing CSV files, or the path to one CSV file.
# Each matching column is run through `clean_up`; the result is written next
# to the input file as '<name>_processed<ext>'.
def strip_capitalize(columns_to_clean, folder_path):
    # Collect every *.csv under the folder; if nothing matches, treat the
    # argument as a direct path to a single CSV file.
    csv_paths = glob.glob(os.path.join(folder_path, "*.csv")) or [folder_path]

    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path)  # Load one CSV into a DataFrame

        # Run the cleaner over each requested column that actually exists
        for col in columns_to_clean:
            if col in frame.columns:
                frame[col] = frame[col].apply(clean_up)

        # Derive the '<name>_processed<ext>' path alongside the original file
        parent, filename = os.path.split(csv_path)
        stem, extension = os.path.splitext(filename)
        destination = os.path.join(parent, f"{stem}_processed{extension}")

        # Write the cleaned DataFrame out as UTF-8 without the index column
        frame.to_csv(destination, index=False, encoding='utf-8')
        print(f"Modified data saved to {destination}")
|
||
|
||
# Example Usage: | ||
# To clean up and capitalize columns in all CSV files within a folder: | ||
# | ||
# 1. Define the path to the folder containing the CSV files. | ||
# 2. Specify the columns you want to clean up in the CSV files. | ||
# 3. Call the `strip_capitalize` function. | ||
# | ||
# Example: | ||
''' | ||
folder_path = 'path/to/your/csv_folder' | ||
columns_to_clean = ['definition', 'LABEL'] # Replace with the actual column names you want to clean | ||
strip_capitalize(columns_to_clean, folder_path) | ||
''' | ||
|
||
# This will: | ||
# - Process all CSV files in the specified folder. | ||
# - Clean up the specified columns by stripping whitespace, capitalizing the first letter, | ||
# and removing 'A'/'An' where applicable. | ||
# - Save the processed files with '_processed' appended to the original file name. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import pandas as pd | ||
import os | ||
import glob | ||
""" | ||
This script is used to extract and process information with specific prefixes (e.g., 'CVX', 'CXX code', 'CVX code') | ||
from a specified column in a CSV file, and outputs a processed file with the extracted identifiers. | ||
Contributor: Yuping Zheng 2024-10-30 | ||
""" | ||
# Extract identifier-tagged parts (e.g. 'CVX', 'PMID') from a delimited string.
# Arguments:
# - x: The string to process (usually one cell from a CSV column).
# - *identifiers: Prefix labels to detect (e.g. 'CVX', 'PMID').
# Returns:
# - A tuple (non_identifier_string, combined_string):
#   - non_identifier_string: the parts matching no identifier, re-joined with '|'.
#   - combined_string: all identifier-matching parts joined with '|'
#     (multiple values for the same identifier are joined with ';' first).
# - (x, None) when x is not a string (e.g. NaN).
def extract_identifiers(x, *identifiers):
    if isinstance(x, str):
        # Normalize '|' to ';' and split into individual parts
        parts = x.replace('|', ';').split(';')
        identifier_values = {identifier: [] for identifier in identifiers}  # identifier -> matching parts
        non_identifier_parts = []  # Parts that match no identifier
        # Assign each part to the first identifier it contains, if any
        for part in parts:
            for identifier in identifiers:
                if identifier in part:
                    # Keep the whole matching part. BUG FIX: the original
                    # appended "".join(part) — a no-op on a string — under a
                    # comment falsely claiming digit extraction.
                    identifier_values[identifier].append(part)
                    break
            else:
                non_identifier_parts.append(part)  # for/else: no identifier matched
        # Join each identifier's values with ';' ('' when none were found)
        identifier_strings = {identifier: ';'.join(values) if values else ''
                              for identifier, values in identifier_values.items()}
        # Rebuild the leftover parts with '|' as separator
        non_identifier_string = '|'.join(non_identifier_parts)
        # Concatenate every non-empty identifier string with '|'
        combined_string = '|'.join([value for value in identifier_strings.values() if value])
        return non_identifier_string, combined_string
    else:
        return x, None  # Return original value if the input is not a string
""" | ||
Function to process a CSV file by extracting identifiers (based on specified prefixes) from a given column, | ||
and creating new columns with the extracted information. | ||
Arguments: | ||
- file_path: The path to the input CSV file. | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import csv | ||
import glob | ||
import os | ||
|
||
# Read one CSV file and collect the unique values found in a given column.
# Cell values may hold multiple entries separated by ',' or '|'.
# Arguments:
# - filename: Path to the CSV file to be processed.
# - column_name: Name of the column from which to extract unique values.
# Returns:
# - (set_of_unique_values, None) on success.
# - (None, filename) when the file is not UTF-8 decodable or the column
#   is missing from the header.
# Contributor: Yuping Zheng 2024-10-30
def process_file(filename, column_name):
    column_values = []  # Accumulates every entry seen in the column
    try:
        with open(filename, 'r', encoding='utf-8') as f_in:
            reader = csv.DictReader(f_in)
            # BUG FIX: the original called next(reader) to "skip the header",
            # but DictReader has already consumed the header line — that call
            # silently dropped the FIRST DATA ROW (and raised an uncaught
            # StopIteration on a header-only file).
            # row.get() silently returns a default for a missing column, so the
            # original KeyError handler was dead code; validate the header
            # up front instead, which matches the documented None return.
            if reader.fieldnames is None or column_name not in reader.fieldnames:
                print(f"Column '{column_name}' not found in {filename}")
                return None, filename
            for row in reader:
                # 'or ""' guards rows shorter than the header (DictReader
                # fills missing trailing fields with None)
                column_data = row.get(column_name) or ""
                column_data = column_data.replace('|', ',').split(',')  # Unify separators, then split
                column_data = [item.strip() for item in column_data]    # Strip whitespace from values
                column_values.extend(column_data)
        return set(column_values), None  # Return unique values as a set
    except UnicodeDecodeError:
        return None, filename  # Handle non-UTF-8 encoded files
|
||
|
||
# Walk every CSV file in a folder and report the unique values of one column.
# Arguments:
# - folder_path: Directory holding CSV files, or a path to a single CSV file.
# - column_name: Column whose unique values should be collected.
# Aggregates unique values across all files via `process_file` and reports
# any files that could not be read as UTF-8.
def replace_unique_name_in_column(folder_path, column_name):
    # Gather *.csv files; fall back to treating the argument as one file
    csv_files = glob.glob(os.path.join(folder_path, "*.csv")) or [folder_path]

    all_values = set()   # Unique values aggregated across every file
    non_utf8_files = []  # Files that could not be processed

    # Pull the column's unique values out of each file in turn
    for csv_file in csv_files:
        values, failed_file = process_file(csv_file, column_name)
        if values is not None:
            all_values.update(values)
        if failed_file is not None:
            non_utf8_files.append(failed_file)

    # Report problem files, then the aggregated results
    print("Files not encoded in UTF-8:", non_utf8_files)
    print(f"Unique values in column '{column_name}' across all files:", all_values)
    print(f"Total unique values in column '{column_name}':", len(all_values))
|
||
|
||
# Example Usage:
# Call `replace_unique_name_in_column` with the following arguments:
# - folder_path: Path to the directory containing CSV files or a specific CSV file.
# - column_name: The column in which to find unique values.
#
# Example:
# replace_unique_name_in_column('path/to/your/csv/folder', 'column_name_to_process')

# BUG FIX: the original ran this example unconditionally at import time,
# which tried to open the placeholder path and raised an uncaught
# FileNotFoundError for anyone importing this module. Guard it so the
# example only runs when the file is executed as a script.
if __name__ == "__main__":
    # Processes a single CSV file and extracts unique values from the "term editor" column
    replace_unique_name_in_column('path/to/your/csv_file.csv', 'term editor')
Oops, something went wrong.