Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions scripts/create_ssf_from_gsa_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
#!/usr/bin/env python3

## Script originally made by Stephan Schiffels (@stschiff). Edited by Luca Thale-Bombien (@Kavlahkaff) for specific use in this repository (added empty poseidon_IDs, udg and library_built columns).

import argparse
import io

import requests
import re
import pandas as pd
import openpyxl


# Browser-like User-Agent: the GSA website rejects the default python-requests UA.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
# Instrument models accepted for SSF creation; df_to_ssf_df aborts when a run
# reports a model outside this list. All entries are Illumina platforms, which
# is why instrument_platform is hard-coded to "ILLUMINA" downstream.
VALID_PLATFORMS = [
    "NextSeq 1000",
    "NextSeq 500",
    "NextSeq 550",
    "Illumina NextSeq 1000",
    "Illumina NextSeq 500",
    "Illumina NextSeq 550",
    "Illumina NovaSeq 6000",
    "Illumina MiniSeq",
    "Illumina HiSeq 1000",
    "Illumina HiSeq 1500",
    "Illumina HiSeq 2000",
    "Illumina HiSeq 2500",
    "Illumina HiSeq 3000",
    "Illumina HiSeq 4000",
    "Illumina HiSeq X",
    "HiSeq X Five",
    "HiSeq X Ten",
    "Illumina HiSeq X Five",
    "Illumina HiSeq X Ten",
    "Illumina Genome Analyzer",
    "Illumina Genome Analyzer II",
    "Illumina Genome Analyzer IIx",
    "Illumina HiScanSQ",
    "Illumina MiSeq",
]

# Mapping of SSF output column -> source column in the merged GSA table.
# None means the column is left as "n/a" unless df_to_ssf_df computes it
# specially (e.g. study_accession, first_public). Values of the form "A;B"
# describe two source columns joined with ';' (handled case-by-case below).
SSF_COLUMNS = {
    "poseidon_IDs": "Individual Name",
    "udg": None,
    "library_built": None,
    "notes": None,
    "run_accession": "Run accession",
    "study_accession": None,
    "sample_accession" : "Individual accession",
    "sample_alias": "Individual Name",
    "secondary_sample_accession": None,
    "first_public": "first_public",
    "last_updated": "last_updated",
    "instrument_model": "Platform",
    "library_layout": "Layout",
    "library_source": "Source",
    "instrument_platform": "Platform",
    "library_name": "Experiment title",
    "library_strategy": "Strategy",
    "fastq_ftp": "DownLoad1;DownLoad2",
    "fastq_aspera": None,
    "fastq_bytes": None,
    "fastq_md5": "MD5 checksum 1;MD5 checksum 2",
    "read_count": None,
    "submitted_ftp": "DownLoad1",
    "submitted_md5": "MD5 checksum 1",
}

def extract_release_date(accession_number: str, timeout: float = 30) -> str:
    """Scrape the public release date of a GSA-Human project from its browse page.

    Args:
        accession_number: GSA-Human project accession (e.g. "HRA008755").
        timeout: Seconds to wait for the HTTP response (new, backward-compatible
            parameter; the original request could hang indefinitely).

    Returns:
        The release date string (e.g. "2024-01-31"), or "n/a" if the page does
        not contain a recognisable "Release date:" entry.
    """
    page_html = requests.get(
        f"https://ngdc.cncb.ac.cn/gsa-human/browse/{accession_number}",
        headers=headers,
        timeout=timeout,
    ).text
    ## The date sits in a fixed HTML scaffold right after a "Release date:" label.
    release_date_match = re.search(
        r'<b>Release date:</b>\s*</span>\s*</div>\s*<div class="col-md-9">\s*([\d-]+)',
        page_html,
    )
    return release_date_match.group(1) if release_date_match else "n/a"


def download_xlsx(accession_number: str) -> pd.ExcelFile:
    """Download the metadata Excel workbook for a GSA-Human project into memory.

    Scrapes the project's browse page for the parameters of the
    "export Excel" endpoint (embedded in inline JavaScript), then fetches
    the spreadsheet without writing it to disk.

    Args:
        accession_number: GSA-Human project accession (e.g. "HRA008755").

    Returns:
        A pandas ExcelFile built from the downloaded bytes, or None when the
        page exposes no downloadable file or the download request fails.
    """
    base_url = "https://ngdc.cncb.ac.cn"
    page_html = requests.get(
        f"{base_url}/gsa-human/browse/{accession_number}", headers=headers, timeout=30
    ).text

    ## The export endpoint and its query parameters live in inline JavaScript.
    action_match = re.search(r"f\.action\s*=\s*[\"']([^\"']+)[\"']", page_html)
    study_id_match = re.search(r"var study_id\s*=\s*'(\d+)'", page_html)
    request_flag_match = re.search(r"var requestFlag\s*=\s*'(\d+)'", page_html)
    file_path_match = re.search(r"downHumanExcel\('(.+?)'\)", page_html)

    if not file_path_match:
        ## Bug fix: the original only printed this warning and then crashed with
        ## a NameError on the undefined file name; bail out explicitly instead.
        print("File path not found.")
        return None
    file_name = file_path_match.group(1)

    action = action_match.group(1) + "exportExcelFile" if action_match else None
    study_id = study_id_match.group(1) if study_id_match else None
    request_flag = request_flag_match.group(1) if request_flag_match else None
    file_path = f"{action}?fileName={file_name}&study_id={study_id}&requestFlag={request_flag}"
    # Construct the full URL
    download_url = base_url + file_path
    # Send GET request to download the file
    print(f"attempting to download file with url: {download_url}")
    response = requests.get(download_url, stream=True, timeout=60)
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

    print(f"Successfully downloaded file with url: {download_url}")
    # Load the content into pandas from memory (without saving to disk)
    return pd.ExcelFile(io.BytesIO(response.content))

def merge_sheets_by_accessions(xls: pd.ExcelFile, accession_col_name: str) -> pd.DataFrame:
    """Merges all sheets in the GSA Excel file into one large pandas dataframe, based on the accession number in each sheet.

    Args:
        xls (pd.ExcelFile): The Excel file object containing the sheets to be merged.
        accession_col_name (str): The name of the column containing the accession numbers in each sheet.
            This should be the same for all sheets, and should be the accession of the row in the current sheet (i.e. the Run Accession for the Runs sheet).

    Returns:
        pd.DataFrame: A merged pandas dataframe containing all the data from the sheets, merged by their corresponding accession numbers.
    """
    ## NOTE(review): accession_col_name is currently unused; the merge keys are
    ## derived from the sheet names instead ("<sheet> accession") — confirm intent.
    ## Rename "Accession" column of each sheet to include sheet name.
    dfs = {}
    for sheet in xls.sheet_names:
        ## Read everything as strings so accessions/checksums are not coerced to numbers.
        df = xls.parse(sheet, dtype=str)
        df.rename(columns={"Accession": f"{sheet} accession"}, inplace=True) # Standardized column name
        dfs[sheet] = df
    ## One additional rename for the Experiment sheet
    dfs["Experiment"].rename(columns={"BioSample accession": "Sample accession"}, inplace=True)
    ## One additional rename for the Sample sheet (seems not standardised capitalisation in the GSA file)
    dfs["Sample"].rename(columns={"Individual Accession": "Individual accession"}, inplace=True)

    ## Merge dataframes in hierarchical order
    merged_df = None
    ## dfs.keys = ['Individual', 'Sample', 'Experiment', 'Run'], as is the order in the excel file.
    for sheet_name, df in dfs.items():
        if merged_df is None:
            merged_df = df # Initialize with the first sheet (Individual)
        else:
            ## Join on the PREVIOUS sheet's accession column (e.g. "Individual accession"
            ## when merging the Sample sheet). last_sheet_name is always bound before
            ## this branch runs, because the first iteration takes the branch above.
            merged_df = merged_df.merge(df, on=f"{last_sheet_name} accession", how="outer", suffixes=("", "_"+sheet_name))
            # This takes care of empty cells in original xlsx file
            merged_df.replace('', 'n/a', inplace=True)
        ## Keep track of the last sheet name for the next iteration
        last_sheet_name = sheet_name

    ## This takes care of empty cells in original xlsx file
    merged_df.fillna('n/a', inplace=True)
    return merged_df

## Helper formatter: joins the two provided values with ';' unless the row's
## file_type matches type_matches (in which case the combined value is 'n/a'),
## or the second value is missing (then only the first value is returned).
def combine_values(file_type:str, type_matches: str, value1: str, value2: str) -> str:
    if file_type == type_matches:
        return 'n/a'
    return value1 if value2 == "n/a" else f"{value1};{value2}"

def df_to_ssf_df(df: pd.DataFrame, cols_to_add: dict, accession_number: str) -> pd.DataFrame:
    """Build an SSF-shaped dataframe from the merged GSA run table.

    Args:
        df: Merged GSA metadata (one row per run), as produced by
            merge_sheets_by_accessions.
        cols_to_add: Mapping of SSF column name -> source column in df
            (see SSF_COLUMNS). None values stay "n/a" unless computed below.
        accession_number: GSA project accession, used for study_accession and
            for scraping the release date.

    Returns:
        The SSF dataframe, or None when an invalid instrument model is found
        or an unexpected error occurs while assembling the columns.
    """
    data = {}
    try:
        for key, value in cols_to_add.items():
            ## Default fill; the branches below overwrite it for computed columns.
            data[key] = ["n/a"] * len(df)
            if key == "study_accession":
                data[key] = [accession_number] * len(df)
            elif key == "instrument_model":
                for model in df[value].tolist():
                    if model not in VALID_PLATFORMS:
                        print(f"Invalid instrument model: {model}, stopping ssf creation")
                        return None
                data[key] = df[value].tolist()
            elif key == "instrument_platform":
                ## Only Illumina models pass the VALID_PLATFORMS check above.
                data[key] = ["ILLUMINA"] * len(df)
            elif key in ("first_public", "last_updated"):
                ## Single scrape reused for every row of the project.
                data[key] = [extract_release_date(accession_number)] * len(df)
            elif key == "fastq_ftp":
                ## For bam submissions the fastq columns are "n/a"; otherwise join both links.
                data[key] = df.apply(lambda row: combine_values(row['Run data file type'], 'bam', row['DownLoad1'], row['DownLoad2']), axis=1).tolist()
            elif key == "fastq_md5":
                data[key] = df.apply(lambda row: combine_values(row['Run data file type'], 'bam', row['MD5 checksum 1'], row['MD5 checksum 2']), axis=1).tolist()
            elif value and value in df.columns and not df[value].isnull().all():
                data[key] = df[value].tolist()
    except Exception as e:
        ## Bug fix: the original swallowed the error, then still built a partial
        ## dataframe and printed a success message; bail out instead.
        print(f"Failed to create ssf file due to: {e}")
        return None
    result_df = pd.DataFrame(data)
    print(f"Created ssf file with {len(result_df)} rows and {len(result_df.columns)} columns.")
    return result_df

def save_ssf_to_file(df: pd.DataFrame, output_file: str) -> None:
    """Write the SSF dataframe to output_file as tab-separated text (no index)."""
    df.to_csv(output_file, sep='\t', index=False)
    print(f"'{output_file}' created successfully.")

def main(accession_number: str = None, output_file: str = None) -> None:
    """Download a GSA project's metadata, convert it to SSF, and save it."""
    ## TODO: work out how to see if the data gets updated after initial release.
    workbook = download_xlsx(accession_number)
    merged_table = merge_sheets_by_accessions(workbook, "Accession")
    ssf_table = df_to_ssf_df(merged_table, SSF_COLUMNS, accession_number)
    ## Save SSF to file.
    save_ssf_to_file(ssf_table, output_file)

if __name__ == "__main__":
    ## Command-line entry point: one positional accession and a required output path.
    cli = argparse.ArgumentParser(
        prog='get_gsa_table',
        description='This script downloads a table with '
                    'links to the raw data and metadata provided by '
                    'GSA for a given project accession ID',
    )
    cli.add_argument('accession_number', help="Example: HRA008755")
    cli.add_argument('-o', '--output_file', required=True, help="The name of the output file")
    cli_args = cli.parse_args()
    main(cli_args.accession_number, cli_args.output_file)
42 changes: 33 additions & 9 deletions scripts/ssf_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# MIT License (c) 2023 Thiseas C. Lamnidis

VERSION = "0.3.0dev"
VERSION = "1.0.0"

import os
import sys
Expand Down Expand Up @@ -36,6 +36,10 @@ def read_ssf_file(file_path, required_fields=None, error_counter=0):
)
)
sys.exit(1)
if "submitted_md5" not in headers:
print(
f"[ssf_validator.py] [File: {file_name}] WARNING: submitted_md5 column not found in SSF file. Please use the latest version of the SSF file creation scripts. This warning can be ignored if you are validating older SSF files."
)
return map(lambda row: dict(zip(headers, row.strip().split("\t"))), l[1:])


Expand Down Expand Up @@ -162,6 +166,7 @@ def validate_instrument_model(instrument_model, error_counter, line_num, file_na
"Illumina HiSeq 3000",
"Illumina HiSeq 4000",
"Illumina HiSeq X",
"Illumina HiSeq X Ten",
"HiSeq X Five",
"HiSeq X Ten",
"Illumina Genome Analyzer",
Expand Down Expand Up @@ -217,6 +222,7 @@ def validate_ssf(file_in):
"fastq_md5",
"read_count",
"submitted_ftp",
"submitted_md5",
]
REQUIRED_FIELDS = [
"poseidon_IDs",
Expand All @@ -226,6 +232,7 @@ def validate_ssf(file_in):
"instrument_platform",
"library_name",
"fastq_ftp",
"submitted_ftp",
]

## Check entries
Expand Down Expand Up @@ -341,15 +348,9 @@ def validate_ssf(file_in):

## Validate fastq_ftp
for reads in [ssf_entry["fastq_ftp"]]:
## Can be empty string in some cases where input is a BAM, but then data won't be processes (atm)
## Since v 1.0.0, fastq_ftp can be empty, since then the bam in submitted_ftp will be converted back to FastQ automatically.
if isNAstr(reads):
error_counter = print_error(
"[Fastq_ftp is 'n/a'] fastq_ftp cannot be 'n/a'!",
"Line",
line_num,
error_counter,
file_name,
)
pass
elif reads.find(" ") != -1:
error_counter = print_error(
"[Spaces in FastQ name] File names cannot contain spaces! Please rename.",
Expand All @@ -373,6 +374,29 @@ def validate_ssf(file_in):
error_counter,
file_name,
)

## Ensure that submitted_ftp and submitted_md5 are not empty (should never be the case, but still.)
if isNAstr(ssf_entry["submitted_ftp"]):
error_counter = print_error(
"[Submitted_ftp missing] submitted_ftp entry has not been specified!",
"Line",
line_num,
error_counter,
file_name,
)

try:
if isNAstr(ssf_entry["submitted_md5"]):
error_counter = print_error(
"[Submitted_md5 missing] submitted_md5 entry has not been specified!",
"Line",
line_num,
error_counter,
file_name,
)
except KeyError:
## This should only be validated if the column is present in the header, hence a KeyError is expected.
pass

## If formatting errors have occurred print their number and fail.
if error_counter > 0:
Expand Down