Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions scripts/create_ssf_from_gsa_project.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
#!/usr/bin/env python3

## Script originally made by Stephan Schiffels (@stschiff). Edited by Luca Thale-Bombien (@Kavlahkaff) for specific use in this repository (added empty poseidon_IDs, udg and library_built columns).

import argparse
import io

import requests
import re
import pandas as pd
import openpyxl


# Browser-like User-Agent: the GSA website rejects the default python-requests UA.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
# Instrument models accepted for SSF creation; df_to_ssf_df aborts when a run
# reports a model outside this list. All entries are Illumina platforms, which
# is why instrument_platform is hard-coded to "ILLUMINA" downstream.
VALID_PLATFORMS = [
    "NextSeq 1000",
    "NextSeq 500",
    "NextSeq 550",
    "Illumina NextSeq 1000",
    "Illumina NextSeq 500",
    "Illumina NextSeq 550",
    "Illumina NovaSeq 6000",
    "Illumina MiniSeq",
    "Illumina HiSeq 1000",
    "Illumina HiSeq 1500",
    "Illumina HiSeq 2000",
    "Illumina HiSeq 2500",
    "Illumina HiSeq 3000",
    "Illumina HiSeq 4000",
    "Illumina HiSeq X",
    "HiSeq X Five",
    "HiSeq X Ten",
    "Illumina HiSeq X Five",
    "Illumina HiSeq X Ten",
    "Illumina Genome Analyzer",
    "Illumina Genome Analyzer II",
    "Illumina Genome Analyzer IIx",
    "Illumina HiScanSQ",
    "Illumina MiSeq",
]

# Mapping of SSF output column -> source column in the merged GSA table.
# None means the column is left as "n/a" unless df_to_ssf_df computes it
# specially (e.g. study_accession, first_public). Values of the form "A;B"
# describe two source columns joined with ';' (handled case-by-case below).
SSF_COLUMNS = {
    "poseidon_IDs": "Individual Name",
    "udg": None,
    "library_built": None,
    "notes": None,
    "run_accession": "Run accession",
    "study_accession": None,
    "sample_accession" : "Individual accession",
    "sample_alias": "Individual Name",
    "secondary_sample_accession": None,
    "first_public": "first_public",
    "last_updated": "last_updated",
    "instrument_model": "Platform",
    "library_layout": "Layout",
    "library_source": "Source",
    "instrument_platform": "Platform",
    "library_name": "Experiment title",
    "library_strategy": "Strategy",
    "fastq_ftp": "DownLoad1;DownLoad2",
    "fastq_aspera": None,
    "fastq_bytes": None,
    "fastq_md5": "MD5 checksum 1;MD5 checksum 2",
    "read_count": None,
    "submitted_ftp": "DownLoad1",
    "submitted_md5": "MD5 checksum 1",
}

def extract_release_date(accession_number: str, timeout: float = 30) -> str:
    """Scrape the public release date of a GSA-Human project from its browse page.

    Args:
        accession_number: GSA-Human project accession (e.g. "HRA008755").
        timeout: Seconds to wait for the HTTP response (new, backward-compatible
            parameter; the original request could hang indefinitely).

    Returns:
        The release date string (e.g. "2024-01-31"), or "n/a" if the page does
        not contain a recognisable "Release date:" entry.
    """
    page_html = requests.get(
        f"https://ngdc.cncb.ac.cn/gsa-human/browse/{accession_number}",
        headers=headers,
        timeout=timeout,
    ).text
    ## The date sits in a fixed HTML scaffold right after a "Release date:" label.
    release_date_match = re.search(
        r'<b>Release date:</b>\s*</span>\s*</div>\s*<div class="col-md-9">\s*([\d-]+)',
        page_html,
    )
    return release_date_match.group(1) if release_date_match else "n/a"


def download_xlsx(accession_number: str) -> pd.ExcelFile:
    """Download the metadata Excel workbook for a GSA-Human project into memory.

    Scrapes the project's browse page for the parameters of the
    "export Excel" endpoint (embedded in inline JavaScript), then fetches
    the spreadsheet without writing it to disk.

    Args:
        accession_number: GSA-Human project accession (e.g. "HRA008755").

    Returns:
        A pandas ExcelFile built from the downloaded bytes, or None when the
        page exposes no downloadable file or the download request fails.
    """
    base_url = "https://ngdc.cncb.ac.cn"
    page_html = requests.get(
        f"{base_url}/gsa-human/browse/{accession_number}", headers=headers, timeout=30
    ).text

    ## The export endpoint and its query parameters live in inline JavaScript.
    action_match = re.search(r"f\.action\s*=\s*[\"']([^\"']+)[\"']", page_html)
    study_id_match = re.search(r"var study_id\s*=\s*'(\d+)'", page_html)
    request_flag_match = re.search(r"var requestFlag\s*=\s*'(\d+)'", page_html)
    file_path_match = re.search(r"downHumanExcel\('(.+?)'\)", page_html)

    if not file_path_match:
        ## Bug fix: the original only printed this warning and then crashed with
        ## a NameError on the undefined file name; bail out explicitly instead.
        print("File path not found.")
        return None
    file_name = file_path_match.group(1)

    action = action_match.group(1) + "exportExcelFile" if action_match else None
    study_id = study_id_match.group(1) if study_id_match else None
    request_flag = request_flag_match.group(1) if request_flag_match else None
    file_path = f"{action}?fileName={file_name}&study_id={study_id}&requestFlag={request_flag}"
    # Construct the full URL
    download_url = base_url + file_path
    # Send GET request to download the file
    print(f"attempting to download file with url: {download_url}")
    response = requests.get(download_url, stream=True, timeout=60)
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

    print(f"Successfully downloaded file with url: {download_url}")
    # Load the content into pandas from memory (without saving to disk)
    return pd.ExcelFile(io.BytesIO(response.content))

def merge_sheets_by_accessions(xls: pd.ExcelFile, accession_col_name: str) -> pd.DataFrame:
    """Merges all sheets in the GSA Excel file into one large pandas dataframe, based on the accession number in each sheet.

    Args:
        xls (pd.ExcelFile): The Excel file object containing the sheets to be merged.
        accession_col_name (str): The name of the column containing the accession numbers in each sheet.
            This should be the same for all sheets, and should be the accession of the row in the current sheet (i.e. the Run Accession for the Runs sheet).

    Returns:
        pd.DataFrame: A merged pandas dataframe containing all the data from the sheets, merged by their corresponding accession numbers.
    """
    ## NOTE(review): accession_col_name is currently unused; the merge keys are
    ## derived from the sheet names instead ("<sheet> accession") — confirm intent.
    ## Rename "Accession" column of each sheet to include sheet name.
    dfs = {}
    for sheet in xls.sheet_names:
        ## Read everything as strings so accessions/checksums are not coerced to numbers.
        df = xls.parse(sheet, dtype=str)
        df.rename(columns={"Accession": f"{sheet} accession"}, inplace=True) # Standardized column name
        dfs[sheet] = df
    ## One additional rename for the Experiment sheet
    dfs["Experiment"].rename(columns={"BioSample accession": "Sample accession"}, inplace=True)
    ## One additional rename for the Sample sheet (seems not standardised capitalisation in the GSA file)
    dfs["Sample"].rename(columns={"Individual Accession": "Individual accession"}, inplace=True)

    ## Merge dataframes in hierarchical order
    merged_df = None
    ## dfs.keys = ['Individual', 'Sample', 'Experiment', 'Run'], as is the order in the excel file.
    for sheet_name, df in dfs.items():
        if merged_df is None:
            merged_df = df # Initialize with the first sheet (Individual)
        else:
            ## Join on the PREVIOUS sheet's accession column (e.g. "Individual accession"
            ## when merging the Sample sheet). last_sheet_name is always bound before
            ## this branch runs, because the first iteration takes the branch above.
            merged_df = merged_df.merge(df, on=f"{last_sheet_name} accession", how="outer", suffixes=("", "_"+sheet_name))
            # This takes care of empty cells in original xlsx file
            merged_df.replace('', 'n/a', inplace=True)
        ## Keep track of the last sheet name for the next iteration
        last_sheet_name = sheet_name

    ## This takes care of empty cells in original xlsx file
    merged_df.fillna('n/a', inplace=True)
    return merged_df

## Helper formatter: joins the two provided values with ';' unless the row's
## file_type matches type_matches (in which case the combined value is 'n/a'),
## or the second value is missing (then only the first value is returned).
def combine_values(file_type:str, type_matches: str, value1: str, value2: str) -> str:
    if file_type == type_matches:
        return 'n/a'
    return value1 if value2 == "n/a" else f"{value1};{value2}"

def df_to_ssf_df(df: pd.DataFrame, cols_to_add: dict, accession_number: str) -> pd.DataFrame:
    """Build an SSF-shaped dataframe from the merged GSA run table.

    Args:
        df: Merged GSA metadata (one row per run), as produced by
            merge_sheets_by_accessions.
        cols_to_add: Mapping of SSF column name -> source column in df
            (see SSF_COLUMNS). None values stay "n/a" unless computed below.
        accession_number: GSA project accession, used for study_accession and
            for scraping the release date.

    Returns:
        The SSF dataframe, or None when an invalid instrument model is found
        or an unexpected error occurs while assembling the columns.
    """
    data = {}
    try:
        for key, value in cols_to_add.items():
            ## Default fill; the branches below overwrite it for computed columns.
            data[key] = ["n/a"] * len(df)
            if key == "study_accession":
                data[key] = [accession_number] * len(df)
            elif key == "instrument_model":
                for model in df[value].tolist():
                    if model not in VALID_PLATFORMS:
                        print(f"Invalid instrument model: {model}, stopping ssf creation")
                        return None
                data[key] = df[value].tolist()
            elif key == "instrument_platform":
                ## Only Illumina models pass the VALID_PLATFORMS check above.
                data[key] = ["ILLUMINA"] * len(df)
            elif key in ("first_public", "last_updated"):
                ## Single scrape reused for every row of the project.
                data[key] = [extract_release_date(accession_number)] * len(df)
            elif key == "fastq_ftp":
                ## For bam submissions the fastq columns are "n/a"; otherwise join both links.
                data[key] = df.apply(lambda row: combine_values(row['Run data file type'], 'bam', row['DownLoad1'], row['DownLoad2']), axis=1).tolist()
            elif key == "fastq_md5":
                data[key] = df.apply(lambda row: combine_values(row['Run data file type'], 'bam', row['MD5 checksum 1'], row['MD5 checksum 2']), axis=1).tolist()
            elif value and value in df.columns and not df[value].isnull().all():
                data[key] = df[value].tolist()
    except Exception as e:
        ## Bug fix: the original swallowed the error, then still built a partial
        ## dataframe and printed a success message; bail out instead.
        print(f"Failed to create ssf file due to: {e}")
        return None
    result_df = pd.DataFrame(data)
    print(f"Created ssf file with {len(result_df)} rows and {len(result_df.columns)} columns.")
    return result_df

def save_ssf_to_file(df: pd.DataFrame, output_file: str) -> None:
    """Write the SSF dataframe to output_file as tab-separated text (no index)."""
    df.to_csv(output_file, sep='\t', index=False)
    print(f"'{output_file}' created successfully.")

def main(accession_number: str = None, output_file: str = None) -> None:
    """Download a GSA project's metadata, convert it to SSF, and save it."""
    ## TODO: work out how to see if the data gets updated after initial release.
    workbook = download_xlsx(accession_number)
    merged_table = merge_sheets_by_accessions(workbook, "Accession")
    ssf_table = df_to_ssf_df(merged_table, SSF_COLUMNS, accession_number)
    ## Save SSF to file.
    save_ssf_to_file(ssf_table, output_file)

if __name__ == "__main__":
    ## Command-line entry point: one positional accession and a required output path.
    cli = argparse.ArgumentParser(
        prog='get_gsa_table',
        description='This script downloads a table with '
                    'links to the raw data and metadata provided by '
                    'GSA for a given project accession ID',
    )
    cli.add_argument('accession_number', help="Example: HRA008755")
    cli.add_argument('-o', '--output_file', required=True, help="The name of the output file")
    cli_args = cli.parse_args()
    main(cli_args.accession_number, cli_args.output_file)
42 changes: 33 additions & 9 deletions scripts/ssf_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# MIT License (c) 2023 Thiseas C. Lamnidis

VERSION = "0.3.0dev"
VERSION = "1.0.0"

import os
import sys
Expand Down Expand Up @@ -36,6 +36,10 @@ def read_ssf_file(file_path, required_fields=None, error_counter=0):
)
)
sys.exit(1)
if "submitted_md5" not in headers:
print(
f"[ssf_validator.py] [File: {file_name}] WARNING: submitted_md5 column not found in SSF file. Please use the latest version of the SSF file creation scripts. This warning can be ignored if you are validating older SSF files."
)
return map(lambda row: dict(zip(headers, row.strip().split("\t"))), l[1:])


Expand Down Expand Up @@ -162,6 +166,7 @@ def validate_instrument_model(instrument_model, error_counter, line_num, file_na
"Illumina HiSeq 3000",
"Illumina HiSeq 4000",
"Illumina HiSeq X",
"Illumina HiSeq X Ten",
"HiSeq X Five",
"HiSeq X Ten",
"Illumina Genome Analyzer",
Expand Down Expand Up @@ -217,6 +222,7 @@ def validate_ssf(file_in):
"fastq_md5",
"read_count",
"submitted_ftp",
"submitted_md5",
]
REQUIRED_FIELDS = [
"poseidon_IDs",
Expand All @@ -226,6 +232,7 @@ def validate_ssf(file_in):
"instrument_platform",
"library_name",
"fastq_ftp",
"submitted_ftp",
]

## Check entries
Expand Down Expand Up @@ -341,15 +348,9 @@ def validate_ssf(file_in):

## Validate fastq_ftp
for reads in [ssf_entry["fastq_ftp"]]:
## Can be empty string in some cases where input is a BAM, but then data won't be processes (atm)
## Since v 1.0.0, fastq_ftp can be empty, since then the bam in submitted_ftp will be converted back to FastQ automatically.
if isNAstr(reads):
error_counter = print_error(
"[Fastq_ftp is 'n/a'] fastq_ftp cannot be 'n/a'!",
"Line",
line_num,
error_counter,
file_name,
)
pass
elif reads.find(" ") != -1:
error_counter = print_error(
"[Spaces in FastQ name] File names cannot contain spaces! Please rename.",
Expand All @@ -373,6 +374,29 @@ def validate_ssf(file_in):
error_counter,
file_name,
)

## Ensure that submitted_ftp and submitted_md5 are not empty (should never be the case, but still.)
if isNAstr(ssf_entry["submitted_ftp"]):
error_counter = print_error(
"[Submitted_ftp missing] submitted_ftp entry has not been specified!",
"Line",
line_num,
error_counter,
file_name,
)

try:
if isNAstr(ssf_entry["submitted_md5"]):
error_counter = print_error(
"[Submitted_md5 missing] submitted_md5 entry has not been specified!",
"Line",
line_num,
error_counter,
file_name,
)
except KeyError:
## This should only be validated if the column is present in the header, hence a KeyError is expected.
pass

## If formatting errors have occurred print their number and fail.
if error_counter > 0:
Expand Down