-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #764 from vaccineontology/add-scripts
Add scripts
- Loading branch information
Showing
12 changed files
with
319 additions
and
127 deletions.
There are no files selected for viewing
File renamed without changes.
File renamed without changes.
Binary file not shown.
File renamed without changes.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import pandas as pd | ||
import os | ||
import glob | ||
|
||
""" | ||
This script is used to clean up and capitalize cell values in specific columns of one or more CSV files. | ||
You can specify a folder containing CSV files and the columns you want to clean up. | ||
The cleaning process: | ||
- Strips leading/trailing whitespace | ||
- Capitalizes the first letter of the string | ||
- Removes 'A' or 'An' from the start of the string, if applicable | ||
Contributor: Yuping Zheng 2024-10-30 | ||
""" | ||
|
||
|
||
# Clean up and capitalize a single cell value.
# Arguments:
# - cell: The individual cell value to be processed; non-string values
#   (e.g. NaN from pandas) are returned unchanged.
# Returns:
# - The cleaned value: whitespace stripped, first letter capitalized
#   (remaining letters lowercased, as str.capitalize does), and a leading
#   article 'A ' or 'An ' removed if present.
def clean_up(cell):
    if isinstance(cell, str):  # Only strings are processed; everything else passes through
        cell = cell.strip()       # Remove leading and trailing whitespace
        cell = cell.capitalize()  # Uppercase first letter, lowercase the rest
        # Remove a leading article by slicing off the exact prefix.
        # BUG FIX: the original used str.lstrip('A') / lstrip('An'), which
        # strips a *character set* rather than a prefix (the classic lstrip
        # pitfall, flagged by linters as B005).
        if cell.startswith('A '):
            cell = cell[2:]
        elif cell.startswith('An '):
            cell = cell[3:]
        # Re-strip and capitalize so the result is clean after removal
        cell = cell.strip()
        cell = cell.capitalize()
    return cell
|
||
|
||
# Clean up the given columns across every CSV file found in a folder.
# Arguments:
# - columns_to_clean: list of column names (strings) whose cells get cleaned.
# - folder_path: directory containing CSV files, or the path to one CSV file.
# Each matching column is run through `clean_up`; the result is written next
# to the input file as '<name>_processed<ext>'.
def strip_capitalize(columns_to_clean, folder_path):
    # Collect every *.csv under the folder; if nothing matches, treat the
    # argument as a direct path to a single CSV file.
    csv_paths = glob.glob(os.path.join(folder_path, "*.csv")) or [folder_path]

    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path)  # Load one CSV into a DataFrame

        # Run the cleaner over each requested column that actually exists
        for col in columns_to_clean:
            if col in frame.columns:
                frame[col] = frame[col].apply(clean_up)

        # Derive the '<name>_processed<ext>' path alongside the original file
        parent, filename = os.path.split(csv_path)
        stem, extension = os.path.splitext(filename)
        destination = os.path.join(parent, f"{stem}_processed{extension}")

        # Write the cleaned DataFrame out as UTF-8 without the index column
        frame.to_csv(destination, index=False, encoding='utf-8')
        print(f"Modified data saved to {destination}")
|
||
|
||
# Example Usage: | ||
# To clean up and capitalize columns in all CSV files within a folder: | ||
# | ||
# 1. Define the path to the folder containing the CSV files. | ||
# 2. Specify the columns you want to clean up in the CSV files. | ||
# 3. Call the `strip_capitalize` function. | ||
# | ||
# Example: | ||
''' | ||
folder_path = 'path/to/your/csv_folder' | ||
columns_to_clean = ['definition', 'LABEL'] # Replace with the actual column names you want to clean | ||
strip_capitalize(columns_to_clean, folder_path) | ||
''' | ||
|
||
# This will: | ||
# - Process all CSV files in the specified folder. | ||
# - Clean up the specified columns by stripping whitespace, capitalizing the first letter, | ||
# and removing 'A'/'An' where applicable. | ||
# - Save the processed files with '_processed' appended to the original file name. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import pandas as pd | ||
import os | ||
import glob | ||
""" | ||
This script is used to extract and process information with specific prefixes (e.g., 'CVX', 'CXX code', 'CVX code') | ||
from a specified column in a CSV file, and outputs a processed file with the extracted identifiers. | ||
Contributor: Yuping Zheng 2024-10-30 | ||
""" | ||
# Extract identifier-tagged parts (e.g. 'CVX', 'PMID') from a delimited string.
# Arguments:
# - x: The string to process (usually one cell from a CSV column).
# - *identifiers: Prefix labels to detect (e.g. 'CVX', 'PMID').
# Returns:
# - A tuple (non_identifier_string, combined_string):
#   - non_identifier_string: the parts matching no identifier, re-joined with '|'.
#   - combined_string: all identifier-matching parts joined with '|'
#     (multiple values for the same identifier are joined with ';' first).
# - (x, None) when x is not a string (e.g. NaN).
def extract_identifiers(x, *identifiers):
    if isinstance(x, str):
        # Normalize '|' to ';' and split into individual parts
        parts = x.replace('|', ';').split(';')
        identifier_values = {identifier: [] for identifier in identifiers}  # identifier -> matching parts
        non_identifier_parts = []  # Parts that match no identifier
        # Assign each part to the first identifier it contains, if any
        for part in parts:
            for identifier in identifiers:
                if identifier in part:
                    # Keep the whole matching part. BUG FIX: the original
                    # appended "".join(part) — a no-op on a string — under a
                    # comment falsely claiming digit extraction.
                    identifier_values[identifier].append(part)
                    break
            else:
                non_identifier_parts.append(part)  # for/else: no identifier matched
        # Join each identifier's values with ';' ('' when none were found)
        identifier_strings = {identifier: ';'.join(values) if values else ''
                              for identifier, values in identifier_values.items()}
        # Rebuild the leftover parts with '|' as separator
        non_identifier_string = '|'.join(non_identifier_parts)
        # Concatenate every non-empty identifier string with '|'
        combined_string = '|'.join([value for value in identifier_strings.values() if value])
        return non_identifier_string, combined_string
    else:
        return x, None  # Return original value if the input is not a string
""" | ||
Function to process a CSV file by extracting identifiers (based on specified prefixes) from a given column, | ||
and creating new columns with the extracted information. | ||
Arguments: | ||
- file_path: The path to the input CSV file. | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import csv | ||
import glob | ||
import os | ||
|
||
# Read one CSV file and collect the unique values found in a given column.
# Cell values may hold multiple entries separated by ',' or '|'.
# Arguments:
# - filename: Path to the CSV file to be processed.
# - column_name: Name of the column from which to extract unique values.
# Returns:
# - (set_of_unique_values, None) on success.
# - (None, filename) when the file is not UTF-8 decodable or the column
#   is missing from the header.
# Contributor: Yuping Zheng 2024-10-30
def process_file(filename, column_name):
    column_values = []  # Accumulates every entry seen in the column
    try:
        with open(filename, 'r', encoding='utf-8') as f_in:
            reader = csv.DictReader(f_in)
            # BUG FIX: the original called next(reader) to "skip the header",
            # but DictReader has already consumed the header line — that call
            # silently dropped the FIRST DATA ROW (and raised an uncaught
            # StopIteration on a header-only file).
            # row.get() silently returns a default for a missing column, so the
            # original KeyError handler was dead code; validate the header
            # up front instead, which matches the documented None return.
            if reader.fieldnames is None or column_name not in reader.fieldnames:
                print(f"Column '{column_name}' not found in {filename}")
                return None, filename
            for row in reader:
                # 'or ""' guards rows shorter than the header (DictReader
                # fills missing trailing fields with None)
                column_data = row.get(column_name) or ""
                column_data = column_data.replace('|', ',').split(',')  # Unify separators, then split
                column_data = [item.strip() for item in column_data]    # Strip whitespace from values
                column_values.extend(column_data)
        return set(column_values), None  # Return unique values as a set
    except UnicodeDecodeError:
        return None, filename  # Handle non-UTF-8 encoded files
|
||
|
||
# Walk every CSV file in a folder and report the unique values of one column.
# Arguments:
# - folder_path: Directory holding CSV files, or a path to a single CSV file.
# - column_name: Column whose unique values should be collected.
# Aggregates unique values across all files via `process_file` and reports
# any files that could not be read as UTF-8.
def replace_unique_name_in_column(folder_path, column_name):
    # Gather *.csv files; fall back to treating the argument as one file
    csv_files = glob.glob(os.path.join(folder_path, "*.csv")) or [folder_path]

    all_values = set()   # Unique values aggregated across every file
    non_utf8_files = []  # Files that could not be processed

    # Pull the column's unique values out of each file in turn
    for csv_file in csv_files:
        values, failed_file = process_file(csv_file, column_name)
        if values is not None:
            all_values.update(values)
        if failed_file is not None:
            non_utf8_files.append(failed_file)

    # Report problem files, then the aggregated results
    print("Files not encoded in UTF-8:", non_utf8_files)
    print(f"Unique values in column '{column_name}' across all files:", all_values)
    print(f"Total unique values in column '{column_name}':", len(all_values))
|
||
|
||
# Example Usage:
# Call `replace_unique_name_in_column` with the following arguments:
# - folder_path: Path to the directory containing CSV files or a specific CSV file.
# - column_name: The column in which to find unique values.
#
# Example:
# replace_unique_name_in_column('path/to/your/csv/folder', 'column_name_to_process')

# BUG FIX: the original ran this example unconditionally at import time,
# which tried to open the placeholder path and raised an uncaught
# FileNotFoundError for anyone importing this module. Guard it so the
# example only runs when the file is executed as a script.
if __name__ == "__main__":
    # Processes a single CSV file and extracts unique values from the "term editor" column
    replace_unique_name_in_column('path/to/your/csv_file.csv', 'term editor')
Oops, something went wrong.