diff --git a/scripts/create_ssf_from_gsa_project.py b/scripts/create_ssf_from_gsa_project.py new file mode 100755 index 0000000..801eae7 --- /dev/null +++ b/scripts/create_ssf_from_gsa_project.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 + +## Script originally made by Stephan Schiffels (@stschiff). Edited by Luca Thale-Bombien (@Kavlahkaff) for specific use in this repository (added empty poseidon_IDs, udg and library_built columns). + +import argparse +import io + +import requests +import re +import pandas as pd +import openpyxl + + +headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"} +VALID_PLATFORMS = [ + "NextSeq 1000", + "NextSeq 500", + "NextSeq 550", + "Illumina NextSeq 1000", + "Illumina NextSeq 500", + "Illumina NextSeq 550", + "Illumina NovaSeq 6000", + "Illumina MiniSeq", + "Illumina HiSeq 1000", + "Illumina HiSeq 1500", + "Illumina HiSeq 2000", + "Illumina HiSeq 2500", + "Illumina HiSeq 3000", + "Illumina HiSeq 4000", + "Illumina HiSeq X", + "HiSeq X Five", + "HiSeq X Ten", + "Illumina HiSeq X Five", + "Illumina HiSeq X Ten", + "Illumina Genome Analyzer", + "Illumina Genome Analyzer II", + "Illumina Genome Analyzer IIx", + "Illumina HiScanSQ", + "Illumina MiSeq", + ] + +SSF_COLUMNS = { + "poseidon_IDs": "Individual Name", + "udg": None, + "library_built": None, + "notes": None, + "run_accession": "Run accession", + "study_accession": None, + "sample_accession" : "Individual accession", + "sample_alias": "Individual Name", + "secondary_sample_accession": None, + "first_public": "first_public", + "last_updated": "last_updated", + "instrument_model": "Platform", + "library_layout": "Layout", + "library_source": "Source", + "instrument_platform": "Platform", + "library_name": "Experiment title", + "library_strategy": "Strategy", + "fastq_ftp": "DownLoad1;DownLoad2", + "fastq_aspera": None, + "fastq_bytes": None, + "fastq_md5": "MD5 checksum 1;MD5 checksum 2", + "read_count": None, + "submitted_ftp": "DownLoad1", + "submitted_md5": "MD5 checksum 1", +} + +def extract_release_date(accession_number: str) -> str: + requests_text = requests.get(f"https://ngdc.cncb.ac.cn/gsa-human/browse/{accession_number}", headers=headers).text + release_date_match = re.search(r'Release date:\s*\s*\s*
\s*([\d-]+)', requests_text) + date = release_date_match.group(1) if release_date_match else "n/a" + return date + + +def download_xlsx(accession_number: str) -> pd.ExcelFile: + requests_text = requests.get(f"https://ngdc.cncb.ac.cn/gsa-human/browse/{accession_number}", headers=headers).text + base_url = "https://ngdc.cncb.ac.cn" + action_match = re.search(r"f\.action\s*=\s*[\"']([^\"']+)[\"']", requests_text) + study_id_match = re.search(r"var study_id\s*=\s*'(\d+)'", requests_text) + request_flag_match = re.search(r"var requestFlag\s*=\s*'(\d+)'", requests_text) + file_path_match = re.search(r"downHumanExcel\('(.+?)'\)", requests_text) + + if file_path_match: + file_name = file_path_match.group(1) + else: + print("File path not found.") + + action = action_match.group(1) + "exportExcelFile" if action_match else None + study_id = study_id_match.group(1) if study_id_match else None + request_flag = request_flag_match.group(1) if request_flag_match else None + file_path = f"{action}?fileName={file_name}&study_id={study_id}&requestFlag={request_flag}" + # Construct the full URL + download_url = base_url + file_path + # Send GET request to download the file + print(f"attempting to download file with url: {download_url}") + response = requests.get(download_url, stream=True) + if response.status_code == 200: + print(f"Successfully downloaded file with url: {download_url}") + + # Load the content into pandas from memory (without saving to disk) + excel_data = pd.ExcelFile(io.BytesIO(response.content)) + + return excel_data # Return the ExcelFile object + else: + print(f"Failed to download file. Status code: {response.status_code}") + return None + +def merge_sheets_by_accessions(xls: pd.ExcelFile, accession_col_name: str) -> pd.DataFrame: + """Merges all sheets in the GSA Excel file into one large pandas dataframe, based on the accession number in each sheet. + + Args: + xls (pd.ExcelFile): The Excel file object containing the sheets to be merged. + accession_col_name (str): The name of the column containing the accession numbers in each sheet. + This should be the same for all sheets, and should be the accession of the row in the current sheet (i.e. the Run Accession for the Runs sheet). + + Returns: + pd.DataFrame: A merged pandas dataframe containing all the data from the sheets, merged by their corresponding accession numbers. + """ + ## Rename "Accession" column of each sheet to include sheet name. + dfs = {} + for sheet in xls.sheet_names: + df = xls.parse(sheet, dtype=str) + df.rename(columns={"Accession": f"{sheet} accession"}, inplace=True) # Standardized column name + dfs[sheet] = df + ## One additional rename for the Experiment sheet + dfs["Experiment"].rename(columns={"BioSample accession": "Sample accession"}, inplace=True) + ## One additional rename for the Sample sheet (seems not standardised capitalisation in the GSA file) + dfs["Sample"].rename(columns={"Individual Accession": "Individual accession"}, inplace=True) + + ## Merge dataframes in hierarchical order + merged_df = None + ## dfs.keys = ['Individual', 'Sample', 'Experiment', 'Run'], as is the order in the excel file. + for sheet_name, df in dfs.items(): + if merged_df is None: + merged_df = df # Initialize with the first sheet (Individual) + else: + merged_df = merged_df.merge(df, on=f"{last_sheet_name} accession", how="outer", suffixes=("", "_"+sheet_name)) + # This takes care of empty cells in original xlsx file + merged_df.replace('', 'n/a', inplace=True) + ## Keep track of the last sheet name for the next iteration + last_sheet_name = sheet_name + + ## This takes care of empty cells in original xlsx file + merged_df.fillna('n/a', inplace=True) + return merged_df + +## Helper formatter to combine values in the two provided columns if the file_type value DOES NOT match the type_matches value. +def combine_values(file_type:str, type_matches: str, value1: str, value2: str) -> str: + if file_type == type_matches: + result = 'n/a' + elif value2 == "n/a": + result = f"{value1}" + else: + result = f"{value1};{value2}" + return result + +def df_to_ssf_df(df: pd.DataFrame, cols_to_add: dict, accession_number: str) -> pd.DataFrame: + data = {} + try: + for key, value in cols_to_add.items(): + data[key] = ["n/a"] * len(df) + if key == "study_accession": + data[key] = [accession_number] * len(df) + elif key =="instrument_model": + for model in df[value].tolist(): + if model not in VALID_PLATFORMS: + print(f"Invalid instrument model: {model}, stopping ssf creation") + return + data[key] = df[value].tolist() + elif key == "instrument_platform": + data[key] = ["ILLUMINA"] * len(df) + elif key == "first_public" or key == "last_updated": + data[key] = [extract_release_date(accession_number)] * len(df) + elif key == "fastq_ftp": + data[key] = df.apply(lambda row: combine_values(row['Run data file type'], 'bam', row['DownLoad1'], row['DownLoad2']), axis=1) + elif key == "fastq_md5": + data[key] = df.apply(lambda row: combine_values(row['Run data file type'], 'bam', row['MD5 checksum 1'], row['MD5 checksum 2']), axis=1) + elif value and value in df.columns and not df[value].isnull().all(): + data[key] = df[value].tolist() + except Exception as e: + print(f"Failed to create ssf file due to: {e}") + result_df = pd.DataFrame(data) + print(f"Created ssf file with {len(result_df)} rows and {len(result_df.columns)} columns.") + return result_df + +def save_ssf_to_file(df: pd.DataFrame, output_file: str) -> None: + df.to_csv(output_file, index=False, sep='\t') + print(f"'{output_file}' created successfully.") + +def main(accession_number: str = None, output_file: str = None) -> None: + ## TODO: work out how to see if the data gets updated after initial release. + xls = download_xlsx(accession_number) + merged_df = merge_sheets_by_accessions(xls, "Accession") + ssf = df_to_ssf_df(merged_df, SSF_COLUMNS, accession_number) + + ## Save SSF to file. + save_ssf_to_file(ssf, output_file) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='get_gsa_table', + description='This script downloads a table with ' + 'links to the raw data and metadata provided by ' + 'GSA for a given project accession ID') + + parser.add_argument('accession_number', help="Example: HRA008755") + parser.add_argument('-o', '--output_file', required=True, help="The name of the output file") + + args = parser.parse_args() + main(args.accession_number, args.output_file) \ No newline at end of file diff --git a/scripts/ssf_validator.py b/scripts/ssf_validator.py index 24003cd..0e71ef4 100755 --- a/scripts/ssf_validator.py +++ b/scripts/ssf_validator.py @@ -2,7 +2,7 @@ # MIT License (c) 2023 Thiseas C. Lamnidis -VERSION = "0.3.0dev" +VERSION = "1.0.0" import os import sys @@ -36,6 +36,10 @@ def read_ssf_file(file_path, required_fields=None, error_counter=0): ) ) sys.exit(1) + if "submitted_md5" not in headers: + print( + f"[ssf_validator.py] [File: {file_name}] WARNING: submitted_md5 column not found in SSF file. Please use the latest version of the SSF file creation scripts. This warning can be ignored if you are validating older SSF files." + ) return map(lambda row: dict(zip(headers, row.strip().split("\t"))), l[1:]) @@ -162,6 +166,7 @@ def validate_instrument_model(instrument_model, error_counter, line_num, file_na "Illumina HiSeq 3000", "Illumina HiSeq 4000", "Illumina HiSeq X", + "Illumina HiSeq X Ten", "HiSeq X Five", "HiSeq X Ten", "Illumina Genome Analyzer", @@ -217,6 +222,7 @@ def validate_ssf(file_in): "fastq_md5", "read_count", "submitted_ftp", + "submitted_md5", ] REQUIRED_FIELDS = [ "poseidon_IDs", @@ -226,6 +232,7 @@ def validate_ssf(file_in): "instrument_platform", "library_name", "fastq_ftp", + "submitted_ftp", ] ## Check entries @@ -341,15 +348,9 @@ def validate_ssf(file_in): ## Validate fastq_ftp for reads in [ssf_entry["fastq_ftp"]]: - ## Can be empty string in some cases where input is a BAM, but then data won't be processes (atm) + ## Since v 1.0.0, fastq_ftp can be empty, since then the bam in submitted_ftp will be converted back to FastQ automatically. if isNAstr(reads): - error_counter = print_error( - "[Fastq_ftp is 'n/a'] fastq_ftp cannot be 'n/a'!", - "Line", - line_num, - error_counter, - file_name, - ) + pass elif reads.find(" ") != -1: error_counter = print_error( "[Spaces in FastQ name] File names cannot contain spaces! Please rename.", @@ -373,6 +374,29 @@ def validate_ssf(file_in): error_counter, file_name, ) + + ## Ensure that submitted_ftp and submitted_md5 are not empty (should never be the case, but still.) + if isNAstr(ssf_entry["submitted_ftp"]): + error_counter = print_error( + "[Submitted_ftp missing] submitted_ftp entry has not been specified!", + "Line", + line_num, + error_counter, + file_name, + ) + + try: + if isNAstr(ssf_entry["submitted_md5"]): + error_counter = print_error( + "[Submitted_md5 missing] submitted_md5 entry has not been specified!", + "Line", + line_num, + error_counter, + file_name, + ) + except KeyError: + ## This should only be validated if the column is present in the header, hence a KeyError is expected. + pass ## If formatting errors have occurred print their number and fail. if error_counter > 0: