Skip to content

Commit

Permalink
Merge pull request #764 from vaccineontology/add-scripts
Browse files Browse the repository at this point in the history
Add scripts
  • Loading branch information
zhengj2007 authored Oct 31, 2024
2 parents dadf4cd + 572aad4 commit fb241bb
Show file tree
Hide file tree
Showing 12 changed files with 319 additions and 127 deletions.
File renamed without changes.
Binary file not shown.
33 changes: 0 additions & 33 deletions docs/VO-cleanup/clean_up_definition_field.py

This file was deleted.

61 changes: 0 additions & 61 deletions docs/VO-cleanup/comments and seeAlso columns.py

This file was deleted.

33 changes: 0 additions & 33 deletions docs/VO-cleanup/term_editor/improved_seperate_columns.py

This file was deleted.

Binary file removed docs/VO-cleanup/term_editor/term_editor_names.xlsx
Binary file not shown.
88 changes: 88 additions & 0 deletions scripts/clean_up_definition_field.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import pandas as pd
import os
import glob

"""
This script is used to clean up and capitalize cell values in specific columns of one or more CSV files.
You can specify a folder containing CSV files and the columns you want to clean up.
The cleaning process:
- Strips leading/trailing whitespace
- Capitalizes the first letter of the string
- Removes 'A' or 'An' from the start of the string, if applicable
Contributor: Yuping Zheng 2024-10-30
"""


def clean_up(cell):
    """Normalize a single cell value.

    Strips surrounding whitespace, capitalizes the string (note that
    str.capitalize also lowercases every character after the first), and
    removes a leading article 'A ' or 'An '.  Non-string values (e.g. NaN
    from pandas) are returned unchanged.

    Arguments:
    - cell: the individual cell value to be processed.
    Returns:
    - the cleaned-up string, or the original value if it is not a string.
    """
    if not isinstance(cell, str):  # Leave NaN / numeric cells untouched
        return cell
    cell = cell.strip().capitalize()
    # Drop a leading article.  Explicit slicing replaces the original
    # str.lstrip('A') / str.lstrip('An') calls: lstrip strips a *character
    # set*, not a prefix, and only worked here by accident of the guards.
    if cell.startswith('An '):
        cell = cell[3:]
    elif cell.startswith('A '):
        cell = cell[2:]
    # Re-strip and re-capitalize so the remainder is clean after removal.
    return cell.strip().capitalize()


def strip_capitalize(columns_to_clean, folder_path):
    """Apply clean_up to the given columns of every CSV under folder_path.

    Arguments:
    - columns_to_clean: list of column names (strings) to normalize.
    - folder_path: folder containing CSV files, or the path to a single CSV.

    Each processed file is written next to its input with '_processed'
    appended to the base name; the input file is left untouched.
    """
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    # No matches: treat the argument as the path to a single CSV file.
    if not csv_files:
        csv_files = [folder_path]

    for csv_path in csv_files:
        frame = pd.read_csv(csv_path)

        # Normalize every requested column that actually exists in this file.
        for col in columns_to_clean:
            if col in frame.columns:
                frame[col] = frame[col].apply(clean_up)

        # Derive '<name>_processed<ext>' alongside the input file.
        directory, filename = os.path.split(csv_path)
        stem, extension = os.path.splitext(filename)
        destination = os.path.join(directory, f"{stem}_processed{extension}")

        frame.to_csv(destination, index=False, encoding='utf-8')
        print(f"Modified data saved to {destination}")


# Example Usage:
# To clean up and capitalize columns in all CSV files within a folder:
#
# 1. Define the path to the folder containing the CSV files.
# 2. Specify the columns you want to clean up in the CSV files.
# 3. Call the `strip_capitalize` function.
#
# Example:
'''
folder_path = 'path/to/your/csv_folder'
columns_to_clean = ['definition', 'LABEL'] # Replace with the actual column names you want to clean
strip_capitalize(columns_to_clean, folder_path)
'''

# This will:
# - Process all CSV files in the specified folder.
# - Clean up the specified columns by stripping whitespace, capitalizing the first letter,
# and removing 'A'/'An' where applicable.
# - Save the processed files with '_processed' appended to the original file name.
48 changes: 48 additions & 0 deletions scripts/prefix_extracter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pandas as pd
import os
import glob
"""
This script is used to extract and process information with specific prefixes (e.g., 'CVX', 'CXX code', 'CVX code')
from a specified column in a CSV file, and outputs a processed file with the extracted identifiers.
Contributor: Yuping Zheng 2024-10-30
"""
def extract_identifiers(x, *identifiers):
    """Split a delimited string into identifier and non-identifier parts.

    The input is split on ';' (after normalizing '|' to ';').  A part that
    contains one of the given prefixes is treated as an identifier entry;
    all others are kept as plain text.

    Arguments:
    - x: the string to process (usually a cell from a CSV column).
    - *identifiers: the prefixes to detect (e.g. 'CVX', 'PMID').
    Returns a tuple of:
    - non_identifier_string: parts that matched no identifier, rejoined
      with '|'.
    - combined_string: digits extracted from the identifier parts, joined
      with ';' within an identifier group and '|' across groups.
      Non-string input is returned as (x, None).
    """
    if not isinstance(x, str):
        return x, None  # e.g. NaN cells from pandas

    # Normalize the two separators to ';' and split into parts.
    parts = x.replace('|', ';').split(';')
    identifier_values = {identifier: [] for identifier in identifiers}
    non_identifier_parts = []

    # Assign each part to the first identifier it contains, if any.
    for part in parts:
        for identifier in identifiers:
            if identifier in part:
                # BUG FIX: the original used "".join(part), which joins the
                # characters of `part` back into the same string (a no-op),
                # despite the stated intent to "extract the digits".  Keep
                # only the digits, i.e. the numeric identifier value.
                digits = ''.join(ch for ch in part if ch.isdigit())
                identifier_values[identifier].append(digits)
                break
        else:
            non_identifier_parts.append(part)  # No identifier matched

    # One ';'-joined string per identifier, empty when nothing matched.
    identifier_strings = {identifier: ';'.join(values) if values else ''
                          for identifier, values in identifier_values.items()}
    # Re-combine the plain-text parts, using '|' as the separator.
    non_identifier_string = '|'.join(non_identifier_parts)
    # Combine the non-empty identifier groups into a single string.
    combined_string = '|'.join(value for value in identifier_strings.values() if value)
    return non_identifier_string, combined_string
"""
Function to process a CSV file by extracting identifiers (based on specified prefixes) from a given column,
and creating new columns with the extracted information.
Arguments:
- file_path: The path to the input CSV file.
"""
74 changes: 74 additions & 0 deletions scripts/unique_name_of_term_editor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import csv
import glob
import os

def process_file(filename, column_name):
    """Collect the unique values of one column of a CSV file.

    Cell values may hold several entries separated by '|' or ','; each
    entry is split out and whitespace-stripped individually.

    Arguments:
    - filename: path to the CSV file to be processed.
    - column_name: name of the column from which to extract unique values.
    Returns:
    - (set_of_unique_values, None) on success.
    - (None, filename) when the file is not UTF-8 decodable or the column
      is missing.
    """
    column_values = []  # Values gathered from the target column
    try:
        with open(filename, 'r', encoding='utf-8') as f_in:
            reader = csv.DictReader(f_in)
            # BUG FIX: DictReader consumes the header line itself; the
            # original `next(reader)` call silently dropped the first DATA
            # row of every file.
            # BUG FIX: row.get(column_name, "") never raises KeyError, so
            # the original `except KeyError` handler was dead code; detect
            # a missing column explicitly, and name the offending file.
            if reader.fieldnames is None or column_name not in reader.fieldnames:
                print(f"Column '{column_name}' not found in {filename}")
                return None, filename
            for row in reader:
                # Short rows yield None for trailing columns; treat as empty.
                column_data = row.get(column_name) or ""
                # Treat '|' and ',' both as entry separators.
                entries = column_data.replace('|', ',').split(',')
                column_values.extend(item.strip() for item in entries)
        return set(column_values), None  # Return unique values as a set
    except UnicodeDecodeError:
        return None, filename  # e.g. a non-UTF-8 encoded file


def replace_unique_name_in_column(folder_path, column_name):
    """Report the unique values of a column across a set of CSV files.

    Arguments:
    - folder_path: directory containing CSV files, or a single CSV path.
    - column_name: the column whose unique values are collected.

    Prints the files that could not be read as UTF-8, the set of unique
    values found across all processed files, and the size of that set.
    NOTE(review): despite its name, this function only reads and prints;
    it does not modify any file.
    """
    # Gather the CSVs in the directory; fall back to treating the argument
    # as a single file when the glob finds nothing.
    targets = glob.glob(os.path.join(folder_path, "*.csv")) or [folder_path]

    unique_values = set()   # Unique values aggregated across all files
    problem_files = []      # Files that failed UTF-8 decoding / processing

    for target in targets:
        extracted, failed = process_file(target, column_name)
        if extracted is not None:
            unique_values.update(extracted)
        if failed is not None:
            problem_files.append(failed)

    # Report problem files and the aggregated results.
    print("Files not encoded in UTF-8:", problem_files)
    print(f"Unique values in column '{column_name}' across all files:", unique_values)
    print(f"Total unique values in column '{column_name}':", len(unique_values))


# Example Usage:
# Call the `replace_unique_name_in_column` function with the following arguments:
# - folder_path: Path to the directory containing CSV files or a specific CSV file.
# - column_name: The column in which to find unique values.
#
# Example:
# replace_unique_name_in_column('path/to/your/csv/folder', 'column_name_to_process')

# The following example processes a single CSV file and extracts unique values
# from the "term editor" column.
# BUG FIX: the call is now guarded so that importing this module does not crash
# with FileNotFoundError on the placeholder path; edit the path before running
# this file as a script.
if __name__ == "__main__":
    replace_unique_name_in_column('path/to/your/csv_file.csv', 'term editor')
Loading

0 comments on commit fb241bb

Please sign in to comment.