From 8287974f69165edf73668cfba767afaf00748c14 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 3 Sep 2024 21:31:37 +0300 Subject: [PATCH 01/20] added url for COPO as metadata source --- assets/genome_metadata_template.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/assets/genome_metadata_template.csv b/assets/genome_metadata_template.csv index 242e512c..5a709c57 100644 --- a/assets/genome_metadata_template.csv +++ b/assets/genome_metadata_template.csv @@ -6,3 +6,4 @@ ENA,Taxonomy,https://www.ebi.ac.uk/ena/browser/api/xml/TAXONOMY_ID,xml NCBI,Assembly,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/ASSEMBLY_ACCESSION/dataset_report?filters.exclude_atypical=false&filters.assembly_version=current&chromosomes=1&chromosomes=2&chromosomes=3&chromosomes=X&chromosomes=Y&chromosomes=M,json NCBI,Taxonomy,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=TAXONOMY_ID,xml GOAT,Assembly,http://goat.genomehubs.org/api/v2/record?recordId=ASSEMBLY_ACCESSION&result=assembly&taxonomy=ncbi,json +COPO,Biosample,https://copo-project.org/api/sample/biosampleAccession/BIOSAMPLE_ACCESSION?standard=tol&return_type=json,json From bbcd7ab5228e963617a148a69a1ad37e4d4264eb Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 3 Sep 2024 22:21:34 +0300 Subject: [PATCH 02/20] added python script that parses json file to extract metadata from COPO data source --- bin/parse_json_copo_biosample.py | 130 +++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 bin/parse_json_copo_biosample.py diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py new file mode 100644 index 00000000..6808ae59 --- /dev/null +++ b/bin/parse_json_copo_biosample.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 + +import argparse +import os +import json +import sys +import string +import numbers + +fetch = [ + # Basic Information + ("SPECIMEN_ID", ("SPECIMEN_ID",)), + ("biosampleAccession", ("BIOSAMPLE_ACCESSION",)), + ("SCIENTIFIC_NAME", ("GENUS_SPECIES",)), + ("COMMON_NAME", ("COMMON_NAME",)), + + # Collection Details + ("COLLECTED_BY", ("COLLECTORS",)), + ("COLLECTOR_AFFILIATION", ("COLLECTOR_INSTITUTE",)), + ("DATE_OF_COLLECTION", ("COLLECTOR_DATE",)), + ("DESCRIPTION_OF_COLLECTION_METHOD", ("COLLECTION_METHOD",)), + + # Location Details + ("COLLECTION_LOCATION", ("COLLECTION_LOCATION",)), + ("DECIMAL_LATITUDE", ("LATITUDE",)), + ("DECIMAL_LONGITUDE", ("LONGITUDE",)), + ("HABITAT", ("HABITAT",)), + + # Identification + ("IDENTIFIED_BY", ("IDENTIFIER",)), + ("IDENTIFIER_AFFILIATION", ("IDENTIFIER_INSTITUTE",)), + + # Preservation and Symbiotics + ("PRESERVATION_APPROACH", ("PRESERVATION_METHOD",)), + ("SYMBIONT", ("SYMBIONT",)), + + # Additional attributes + ("TAXON_ID", ("NCBI_TAXID",)), + ("ORDER_OR_GROUP", ("ORDER",)), + ("FAMILY", ("FAMILY",)), + ("GENUS", ("GENUS",)), + ("SEX", ("SEX",)), + ("LIFESTAGE", ("LIFESTAGE",)), + ("ORGANISM_PART", ("ORGANISM_PART",)), + ("GAL", ("GAL",)), +] + +def parse_args(args=None): + Description = "Parse contents of a COPO json file report and pul out meta data required by a genome note." + Epilog = "Example usage: python parse_json_copo_biosample.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input JSON Assembly file.") + parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--version", action="version", version="%(prog)s 1.0") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + os.makedirs(path, exist_ok=True) + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check json file -> {}".format(error) + if context != "": + if context_str != "": + error_str = "ERROR: Please check json file -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + else: + error_str = "ERROR: Please check json file -> {}\n{}".format(error, context.strip()) + + print(error_str) + sys.exit(1) + + +def parse_json(file_in, file_out): + with open(file_in, 'r') as json_file: + data = json.load(json_file) + + param_list = [] + record = data["records"][0] # Get the single record + + if len(data["records"]) != 1: + print_error("More than one record found") + + for f in fetch: + param = find_element(record, f[1], index=0) + if param is not None: + if isinstance(param, numbers.Number): + param = str(param) + + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + + param_list.append([f[0], param]) + + if len(param_list) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + for param_pair in param_list: + fout.write(",".join(param_pair) + "\n") + + else: + print_error("No parameters found!", "File: {}".format(file_in)) + + +def find_element(data, fields, index=0): + if index < len(fields): + key = fields[index] + if key in data: + sub_data = data[key] + if type(sub_data) in [list, dict]: + return find_element(sub_data, fields, index + 1) + return sub_data + else: + return None + return None + + +def main(args=None): + args = parse_args(args) + parse_json(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) From 8d6e99959ce68e3696f7cf7aa115380d8e2d17e2 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 3 Sep 2024 22:22:18 +0300 Subject: [PATCH 03/20] added the copo files --- bin/combine_parsed_data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/combine_parsed_data.py b/bin/combine_parsed_data.py index 1f2757ef..5a7f4bad 100755 --- a/bin/combine_parsed_data.py +++ b/bin/combine_parsed_data.py @@ -17,6 +17,9 @@ ("NCBI_ASSEMBLY", "ncbi_assembly_file"), ("NCBI_TAXONOMY", "ncbi_taxonomy_file"), ("GOAT_ASSEMBLY", "goat_assembly_file"), + ("COPO_BIOSAMPLE", "copo_biosample_wgs_file"), + ("COPO_BIOSAMPLE_HIC", "copo_biosample_hic_file"), + ("COPO_BIOSAMPLE_RNA", "copo_biosample_rna_file"), ] From 65701448ea6140e82280053c64819c566041b718 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 3 Sep 2024 14:26:42 -0500 Subject: [PATCH 04/20] edited the file permissions for the python script --- bin/parse_json_copo_biosample.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/parse_json_copo_biosample.py diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py old mode 100644 new mode 100755 From dc2db138566b3520ab5f84803c58aad602557b98 Mon Sep 17 00:00:00 2001 From: Sandra Babirye <99010085+SandraBabirye@users.noreply.github.com> Date: Tue, 3 Sep 2024 22:29:48 +0300 Subject: [PATCH 05/20] Update parse_json_copo_biosample.py --- bin/parse_json_copo_biosample.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index 6808ae59..40139bf0 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -8,33 +8,22 @@ import numbers fetch = [ - # Basic Information ("SPECIMEN_ID", ("SPECIMEN_ID",)), ("biosampleAccession", ("BIOSAMPLE_ACCESSION",)), ("SCIENTIFIC_NAME", ("GENUS_SPECIES",)), ("COMMON_NAME", ("COMMON_NAME",)), - - # Collection Details ("COLLECTED_BY", ("COLLECTORS",)), ("COLLECTOR_AFFILIATION", ("COLLECTOR_INSTITUTE",)), ("DATE_OF_COLLECTION", ("COLLECTOR_DATE",)), ("DESCRIPTION_OF_COLLECTION_METHOD", ("COLLECTION_METHOD",)), - - # Location Details ("COLLECTION_LOCATION", ("COLLECTION_LOCATION",)), ("DECIMAL_LATITUDE", ("LATITUDE",)), ("DECIMAL_LONGITUDE", ("LONGITUDE",)), ("HABITAT", ("HABITAT",)), - - # Identification ("IDENTIFIED_BY", ("IDENTIFIER",)), ("IDENTIFIER_AFFILIATION", ("IDENTIFIER_INSTITUTE",)), - - # Preservation and Symbiotics ("PRESERVATION_APPROACH", ("PRESERVATION_METHOD",)), ("SYMBIONT", ("SYMBIONT",)), - - # Additional attributes ("TAXON_ID", ("NCBI_TAXID",)), ("ORDER_OR_GROUP", ("ORDER",)), ("FAMILY", ("FAMILY",)), From 53e1110a54cc6018e5399f473ed49be0267b9b3a Mon Sep 17 00:00:00 2001 From: Sandra Babirye <99010085+SandraBabirye@users.noreply.github.com> Date: Tue, 3 Sep 2024 22:41:15 +0300 Subject: [PATCH 06/20] Added COPO as biosample --- modules/local/parse_metadata.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/parse_metadata.nf b/modules/local/parse_metadata.nf index 0f895a86..2d0a2edc 100644 --- a/modules/local/parse_metadata.nf +++ b/modules/local/parse_metadata.nf @@ -22,7 +22,7 @@ process PARSE_METADATA { script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/ def prefix = task.ext.prefix ?: meta.id def script_name = "parse_${meta.ext.toLowerCase()}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.py" - def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA") ? "_${meta.biosample_type}" : "" + def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA" || meta.biosample_type == "COPO") ? "_${meta.biosample_type}" : "" def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}${is_biosample}.csv".strip('_') """ $script_name \\ From 3111b32aec69da2e8c24e90f1aa1acd874c5d116 Mon Sep 17 00:00:00 2001 From: Sandra Babirye <99010085+SandraBabirye@users.noreply.github.com> Date: Tue, 3 Sep 2024 22:58:10 +0300 Subject: [PATCH 07/20] Added the prefix of the biosample as COPO --- bin/parse_json_copo_biosample.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index 40139bf0..8638de88 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -65,6 +65,7 @@ def print_error(error, context="Line", context_str=""): def parse_json(file_in, file_out): + biosample_type = "COPO" with open(file_in, 'r') as json_file: data = json.load(json_file) @@ -78,12 +79,14 @@ def parse_json(file_in, file_out): param = find_element(record, f[1], index=0) if param is not None: if isinstance(param, numbers.Number): - param = str(param) - + param = str(param) if any(p in string.punctuation for p in param): - param = '"' + param + '"' - - param_list.append([f[0], param]) + param = '"' + param + '"' + # Prefix parameter name if biosample type is COPO + param_name = f[0] + if biosample_type == "COPO" : + param_name = f"{biosample_type}_{param_name}" + param_list.append([param_name, param]) if len(param_list) > 0: out_dir = os.path.dirname(file_out) From 51a8dca375bf4e31ad72999a5ec798f9a9375a1c Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 3 Sep 2024 15:04:15 -0500 Subject: [PATCH 08/20] Fix Python Black linting issues --- bin/parse_json_copo_biosample.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index 8638de88..0b6e97e5 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -16,9 +16,9 @@ ("COLLECTOR_AFFILIATION", ("COLLECTOR_INSTITUTE",)), ("DATE_OF_COLLECTION", ("COLLECTOR_DATE",)), ("DESCRIPTION_OF_COLLECTION_METHOD", ("COLLECTION_METHOD",)), - ("COLLECTION_LOCATION", ("COLLECTION_LOCATION",)), - ("DECIMAL_LATITUDE", ("LATITUDE",)), - ("DECIMAL_LONGITUDE", ("LONGITUDE",)), + ("COLLECTION_LOCATION", ("COLLECTION_LOCATION",)), + ("DECIMAL_LATITUDE", ("LATITUDE",)), + ("DECIMAL_LONGITUDE", ("LONGITUDE",)), ("HABITAT", ("HABITAT",)), ("IDENTIFIED_BY", ("IDENTIFIER",)), ("IDENTIFIER_AFFILIATION", ("IDENTIFIER_INSTITUTE",)), @@ -34,6 +34,7 @@ ("GAL", ("GAL",)), ] + def parse_args(args=None): Description = "Parse contents of a COPO json file report and pul out meta data required by a genome note." Epilog = "Example usage: python parse_json_copo_biosample.py " @@ -66,7 +67,7 @@ def print_error(error, context="Line", context_str=""): def parse_json(file_in, file_out): biosample_type = "COPO" - with open(file_in, 'r') as json_file: + with open(file_in, "r") as json_file: data = json.load(json_file) param_list = [] @@ -79,12 +80,12 @@ def parse_json(file_in, file_out): param = find_element(record, f[1], index=0) if param is not None: if isinstance(param, numbers.Number): - param = str(param) + param = str(param) if any(p in string.punctuation for p in param): - param = '"' + param + '"' + param = '"' + param + '"' # Prefix parameter name if biosample type is COPO param_name = f[0] - if biosample_type == "COPO" : + if biosample_type == "COPO": param_name = f"{biosample_type}_{param_name}" param_list.append([param_name, param]) From 80dffed85751f9d849ae03e49d77dc054a23f89b Mon Sep 17 00:00:00 2001 From: Sandra Babirye <99010085+SandraBabirye@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:21:55 +0300 Subject: [PATCH 09/20] Edited file ; remove COPO added a s biosample --- modules/local/parse_metadata.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/parse_metadata.nf b/modules/local/parse_metadata.nf index 2d0a2edc..0f895a86 100644 --- a/modules/local/parse_metadata.nf +++ b/modules/local/parse_metadata.nf @@ -22,7 +22,7 @@ process PARSE_METADATA { script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/ def prefix = task.ext.prefix ?: meta.id def script_name = "parse_${meta.ext.toLowerCase()}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.py" - def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA" || meta.biosample_type == "COPO") ? "_${meta.biosample_type}" : "" + def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA") ? "_${meta.biosample_type}" : "" def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}${is_biosample}.csv".strip('_') """ $script_name \\ From 9823831528f9783426c858b3d5417aca3d9e188f Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 19:45:16 +0300 Subject: [PATCH 10/20] edited the parse_json_copo_biosample.py file --- bin/parse_json_copo_biosample.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index 0b6e97e5..ecd55653 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -71,9 +71,9 @@ def parse_json(file_in, file_out): data = json.load(json_file) param_list = [] - record = data["records"][0] # Get the single record + record = data["data"][0] # Get the single record - if len(data["records"]) != 1: + if data["number_found"] != 1: print_error("More than one record found") for f in fetch: From d87b8a3666712994365996f6dc035603c3b26680 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 20:03:20 +0300 Subject: [PATCH 11/20] edited the parse_json_copo_biosample.py file --- bin/parse_json_copo_biosample.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index ecd55653..2138f961 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -69,9 +69,15 @@ def parse_json(file_in, file_out): biosample_type = "COPO" with open(file_in, "r") as json_file: data = json.load(json_file) + # Check if 'data' key exists and if the list is non-empty + if "data" in data and len(data["data"]) > 0: + record = data["data"][0] # Get the single record + + else: + print_error(f"Error: 'data' key missing or list is empty in the file: {file_in}") + return # Exit early if no valid data param_list = [] - record = data["data"][0] # Get the single record if data["number_found"] != 1: print_error("More than one record found") From a9758ac1b9a3ea11f1547868e9056f51c87b8afa Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 20:40:45 +0300 Subject: [PATCH 12/20] edited the parse_json_copo_biosample.py file --- bin/parse_json_copo_biosample.py | 45 ++++++++++++++------------------ 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index 2138f961..d4118866 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -66,34 +66,29 @@ def print_error(error, context="Line", context_str=""): def parse_json(file_in, file_out): - biosample_type = "COPO" - with open(file_in, "r") as json_file: - data = json.load(json_file) - # Check if 'data' key exists and if the list is non-empty - if "data" in data and len(data["data"]) > 0: - record = data["data"][0] # Get the single record - - else: - print_error(f"Error: 'data' key missing or list is empty in the file: {file_in}") - return # Exit early if no valid data + try: + with open(file_in, "r") as f: + file = json.load(f) + record = file.get('data', []) + except Exception as e: + print_error(f"Failed to read JSON file. Error: {e}") + biosample_type = "COPO" param_list = [] - if data["number_found"] != 1: - print_error("More than one record found") - - for f in fetch: - param = find_element(record, f[1], index=0) - if param is not None: - if isinstance(param, numbers.Number): - param = str(param) - if any(p in string.punctuation for p in param): - param = '"' + param + '"' - # Prefix parameter name if biosample type is COPO - param_name = f[0] - if biosample_type == "COPO": - param_name = f"{biosample_type}_{param_name}" - param_list.append([param_name, param]) + for data in record: + for f in fetch: + param = find_element(data, f[1], index=0) + if param is not None: + if isinstance(param, numbers.Number): + param = str(param) + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + # Prefix parameter name if biosample type is COPO + param_name = f[0] + if biosample_type == "COPO": + param_name = f"{biosample_type}_{param_name}" + param_list.append([param_name, param]) if len(param_list) > 0: out_dir = os.path.dirname(file_out) From a04facf566fb9a84e8453ac4c15469df66a50cac Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 21:41:58 +0300 Subject: [PATCH 13/20] removed LONGITUDE as its missing in the json file --- bin/parse_json_copo_biosample.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index d4118866..35a99335 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -9,23 +9,22 @@ fetch = [ ("SPECIMEN_ID", ("SPECIMEN_ID",)), - ("biosampleAccession", ("BIOSAMPLE_ACCESSION",)), - ("SCIENTIFIC_NAME", ("GENUS_SPECIES",)), + ("BIOSAMPLE_ACCESSION", ("biosampleAccession",)), + ("GENUS_SPECIES", ("SCIENTIFIC_NAME",)), ("COMMON_NAME", ("COMMON_NAME",)), - ("COLLECTED_BY", ("COLLECTORS",)), - ("COLLECTOR_AFFILIATION", ("COLLECTOR_INSTITUTE",)), - ("DATE_OF_COLLECTION", ("COLLECTOR_DATE",)), - ("DESCRIPTION_OF_COLLECTION_METHOD", ("COLLECTION_METHOD",)), + ("COLLECTORS", ("COLLECTED_BY",)), + ("COLLECTOR_INSTITUTE", ("COLLECTOR_AFFILIATION",)), + ("COLLECTOR_DATE", ("DATE_OF_COLLECTION",)), + ("COLLECTION_METHOD", ("DESCRIPTION_OF_COLLECTION_METHOD",)), ("COLLECTION_LOCATION", ("COLLECTION_LOCATION",)), - ("DECIMAL_LATITUDE", ("LATITUDE",)), - ("DECIMAL_LONGITUDE", ("LONGITUDE",)), + ("LATITUDE", ("DECIMAL_LATITUDE",)), ("HABITAT", ("HABITAT",)), - ("IDENTIFIED_BY", ("IDENTIFIER",)), - ("IDENTIFIER_AFFILIATION", ("IDENTIFIER_INSTITUTE",)), - ("PRESERVATION_APPROACH", ("PRESERVATION_METHOD",)), + ("IDENTIFIER", ("IDENTIFIED_BY",)), + ("IDENTIFIER_INSTITUTE", ("IDENTIFIER_AFFILIATION",)), + ("PRESERVATION_METHOD", ("PRESERVATION_APPROACH",)), ("SYMBIONT", ("SYMBIONT",)), - ("TAXON_ID", ("NCBI_TAXID",)), - ("ORDER_OR_GROUP", ("ORDER",)), + ("NCBI_TAXID", ("TAXON_ID",)), + ("ORDER", ("ORDER_OR_GROUP",)), ("FAMILY", ("FAMILY",)), ("GENUS", ("GENUS",)), ("SEX", ("SEX",)), @@ -69,14 +68,14 @@ def parse_json(file_in, file_out): try: with open(file_in, "r") as f: file = json.load(f) - record = file.get('data', []) + records = file.get('data', []) except Exception as e: print_error(f"Failed to read JSON file. Error: {e}") biosample_type = "COPO" param_list = [] - for data in record: + for data in records: for f in fetch: param = find_element(data, f[1], index=0) if param is not None: From 22f41da6724f1222a192e124b34f4b6261860d57 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 13:44:49 -0500 Subject: [PATCH 14/20] Fixing black linting issues --- bin/parse_json_copo_biosample.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index 35a99335..fd9c702e 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -68,14 +68,14 @@ def parse_json(file_in, file_out): try: with open(file_in, "r") as f: file = json.load(f) - records = file.get('data', []) + records = file.get("data", []) except Exception as e: print_error(f"Failed to read JSON file. Error: {e}") biosample_type = "COPO" param_list = [] - for data in records: + for data in records: for f in fetch: param = find_element(data, f[1], index=0) if param is not None: From c90249576c924d16ac7c37fe9084d44f846c55ba Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 21:49:32 +0300 Subject: [PATCH 15/20] added new arguments in the script --- bin/parse_json_copo_biosample.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index fd9c702e..a5d8759d 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -41,6 +41,9 @@ def parse_args(args=None): parser = argparse.ArgumentParser(description=Description, epilog=Epilog) parser.add_argument("FILE_IN", help="Input JSON Assembly file.") parser.add_argument("FILE_OUT", help="Output file.") + parser.add_argument("--copo_biosample_wgs_file", help="Input parsed COPO genomic biosample file.", required=False) + parser.add_argument("--copo_biosample_hic_file", help="Input parsed COPO HiC biosample file.", required=False) + parser.add_argument("--copo_biosample_rna_file", help="Input parsed COPO RNASeq biosample file.", required=False) parser.add_argument("--version", action="version", version="%(prog)s 1.0") return parser.parse_args(args) From 4f246017c877450451421c0b05795c76d2f33970 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 22:06:48 +0300 Subject: [PATCH 16/20] added new arguments in the script --- bin/combine_parsed_data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/combine_parsed_data.py b/bin/combine_parsed_data.py index 5a7f4bad..60cb1cc6 100755 --- a/bin/combine_parsed_data.py +++ b/bin/combine_parsed_data.py @@ -37,6 +37,9 @@ def parse_args(args=None): parser.add_argument("--ncbi_assembly_file", help="Input parsed ENA assembly file.", required=False) parser.add_argument("--ncbi_taxonomy_file", help="Input parsed ENA assembly file.", required=False) parser.add_argument("--goat_assembly_file", help="Input parsed ENA assembly file.", required=False) + parser.add_argument("--copo_biosample_wgs_file", help="Input parsed COPO genomic biosample file.", required=False) + parser.add_argument("--copo_biosample_hic_file", help="Input parsed COPO HiC biosample file.", required=False) + parser.add_argument("--copo_biosample_rna_file", help="Input parsed COPO RNASeq biosample file.", required=False) parser.add_argument("--out_consistent", help="Output file.", required=True) parser.add_argument("--out_inconsistent", help="Output file.", required=True) parser.add_argument("--version", action="version", version="%(prog)s 1.0") From 8deaf14e82d0fa5248e5a26829827f84a7d29590 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 22:07:29 +0300 Subject: [PATCH 17/20] edited the parse_json_copo_biosample.py file --- bin/parse_json_copo_biosample.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index a5d8759d..fd9c702e 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -41,9 +41,6 @@ def parse_args(args=None): parser = argparse.ArgumentParser(description=Description, epilog=Epilog) parser.add_argument("FILE_IN", help="Input JSON Assembly file.") parser.add_argument("FILE_OUT", help="Output file.") - parser.add_argument("--copo_biosample_wgs_file", help="Input parsed COPO genomic biosample file.", required=False) - parser.add_argument("--copo_biosample_hic_file", help="Input parsed COPO HiC biosample file.", required=False) - parser.add_argument("--copo_biosample_rna_file", help="Input parsed COPO RNASeq biosample file.", required=False) parser.add_argument("--version", action="version", version="%(prog)s 1.0") return parser.parse_args(args) From 940bd5d3d67d2a5ae571f8793cf60f3204c8fc4d Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Tue, 10 Sep 2024 22:11:39 +0300 Subject: [PATCH 18/20] edited the parse_json_copo_biosample.py file --- bin/parse_json_copo_biosample.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index fd9c702e..33fc7fe1 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -18,6 +18,7 @@ ("COLLECTION_METHOD", ("DESCRIPTION_OF_COLLECTION_METHOD",)), ("COLLECTION_LOCATION", ("COLLECTION_LOCATION",)), ("LATITUDE", ("DECIMAL_LATITUDE",)), + ("LONGITUDE", ("DECIMAL_LONGITUDE",)), ("HABITAT", ("HABITAT",)), ("IDENTIFIER", ("IDENTIFIED_BY",)), ("IDENTIFIER_INSTITUTE", ("IDENTIFIER_AFFILIATION",)), From e474905d233e9505638ac080a6a46f21924422f0 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Wed, 11 Sep 2024 08:06:42 +0300 Subject: [PATCH 19/20] edited the file to Extract biosample type from FILE_OUT --- bin/parse_json_copo_biosample.py | 55 +++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index 33fc7fe1..128b0c88 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -68,27 +68,46 @@ def print_error(error, context="Line", context_str=""): def parse_json(file_in, file_out): try: with open(file_in, "r") as f: - file = json.load(f) - records = file.get("data", []) + data = json.load(f) + except Exception as e: print_error(f"Failed to read JSON file. Error: {e}") - biosample_type = "COPO" - param_list = [] - - for data in records: - for f in fetch: - param = find_element(data, f[1], index=0) - if param is not None: - if isinstance(param, numbers.Number): - param = str(param) - if any(p in string.punctuation for p in param): - param = '"' + param + '"' - # Prefix parameter name if biosample type is COPO - param_name = f[0] - if biosample_type == "COPO": - param_name = f"{biosample_type}_{param_name}" - param_list.append([param_name, param]) + if data["number_found"] == 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["#paramName", "paramValue"]) + "\n") + return + + elif data["number_found"] >> 1: + print_error("More than one record found") + + else: + record = data["data"] + + # Extract biosample type from FILE_OUT + biosample_type = None + if "HIC" in file_out.upper(): + biosample_type = "HIC" + elif "RNA" in file_out.upper(): + biosample_type = "RNA" + + param_list = [] + + for data in record: + for f in fetch: + param = find_element(data, f[1], index=0) + if param is not None: + if isinstance(param, numbers.Number): + param = str(param) + if any(p in string.punctuation for p in param): + param = '"' + param + '"' + # Prefix parameter name if biosample type is COPO + param_name = f[0] + if biosample_type in ["HIC", "RNA"]: + param_name = f"{biosample_type}_{param_name}" + param_list.append([param_name, param]) if len(param_list) > 0: out_dir = os.path.dirname(file_out) From b708ededa3f7dd2de4b640ba280ac54506d39126 Mon Sep 17 00:00:00 2001 From: SandraBabirye Date: Wed, 11 Sep 2024 00:08:15 -0500 Subject: [PATCH 20/20] edited the file to Extract biosample type from FILE_OUT --- bin/parse_json_copo_biosample.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/parse_json_copo_biosample.py b/bin/parse_json_copo_biosample.py index 128b0c88..95f755bb 100755 --- a/bin/parse_json_copo_biosample.py +++ b/bin/parse_json_copo_biosample.py @@ -69,7 +69,7 @@ def parse_json(file_in, file_out): try: with open(file_in, "r") as f: data = json.load(f) - + except Exception as e: print_error(f"Failed to read JSON file. Error: {e}") @@ -80,10 +80,10 @@ def parse_json(file_in, file_out): fout.write(",".join(["#paramName", "paramValue"]) + "\n") return - elif data["number_found"] >> 1: + elif data["number_found"] >> 1: print_error("More than one record found") - else: + else: record = data["data"] # Extract biosample type from FILE_OUT @@ -95,7 +95,7 @@ def parse_json(file_in, file_out): param_list = [] - for data in record: + for data in record: for f in fetch: param = find_element(data, f[1], index=0) if param is not None: