Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a new feature branch 'copo_metadata' to solve issue #115 #137

Merged
merged 20 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
8287974
added url for COPO as metadata source
SandraBabirye Sep 3, 2024
bbcd7ab
added python script that parses json file to extract metadata from CO…
SandraBabirye Sep 3, 2024
8d6e999
added the copo files
SandraBabirye Sep 3, 2024
6570144
edited the file permissions for the python script
SandraBabirye Sep 3, 2024
dc2db13
Update parse_json_copo_biosample.py
SandraBabirye Sep 3, 2024
53e1110
Added COPO as biosample
SandraBabirye Sep 3, 2024
3111b32
Added the prefix of the biosample as COPO
SandraBabirye Sep 3, 2024
51a8dca
Fix Python Black linting issues
SandraBabirye Sep 3, 2024
80dffed
Edited file; removed COPO added as biosample
SandraBabirye Sep 4, 2024
9823831
edited the parse_json_copo_biosample.py file
SandraBabirye Sep 10, 2024
d87b8a3
edited the parse_json_copo_biosample.py file
SandraBabirye Sep 10, 2024
a9758ac
edited the parse_json_copo_biosample.py file
SandraBabirye Sep 10, 2024
a04facf
removed LONGITUDE as it's missing in the JSON file
SandraBabirye Sep 10, 2024
22f41da
Fixing black linting issues
SandraBabirye Sep 10, 2024
c902495
added new arguments in the script
SandraBabirye Sep 10, 2024
4f24601
added new arguments in the script
SandraBabirye Sep 10, 2024
8deaf14
edited the parse_json_copo_biosample.py file
SandraBabirye Sep 10, 2024
940bd5d
edited the parse_json_copo_biosample.py file
SandraBabirye Sep 10, 2024
e474905
edited the file to Extract biosample type from FILE_OUT
SandraBabirye Sep 11, 2024
b708ede
edited the file to Extract biosample type from FILE_OUT
SandraBabirye Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions assets/genome_metadata_template.csv
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ ENA,Taxonomy,https://www.ebi.ac.uk/ena/browser/api/xml/TAXONOMY_ID,xml
NCBI,Assembly,https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/ASSEMBLY_ACCESSION/dataset_report?filters.exclude_atypical=false&filters.assembly_version=current&chromosomes=1&chromosomes=2&chromosomes=3&chromosomes=X&chromosomes=Y&chromosomes=M,json
NCBI,Taxonomy,https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=taxonomy&id=TAXONOMY_ID,xml
GOAT,Assembly,http://goat.genomehubs.org/api/v2/record?recordId=ASSEMBLY_ACCESSION&result=assembly&taxonomy=ncbi,json
COPO,Biosample,https://copo-project.org/api/sample/biosampleAccession/BIOSAMPLE_ACCESSION?standard=tol&return_type=json,json
3 changes: 3 additions & 0 deletions bin/combine_parsed_data.py
SandraBabirye marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
("NCBI_ASSEMBLY", "ncbi_assembly_file"),
("NCBI_TAXONOMY", "ncbi_taxonomy_file"),
("GOAT_ASSEMBLY", "goat_assembly_file"),
("COPO_BIOSAMPLE", "copo_biosample_wgs_file"),
("COPO_BIOSAMPLE_HIC", "copo_biosample_hic_file"),
("COPO_BIOSAMPLE_RNA", "copo_biosample_rna_file"),
]


Expand Down
123 changes: 123 additions & 0 deletions bin/parse_json_copo_biosample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env python3

import argparse
import os
import json
import sys
import string
import numbers

# Flat (name, key) table from which `fetch` is built. parse_json() looks the
# key up in the COPO record and, when a value is found, writes it out under
# `name` with a "COPO_" prefix.
# NOTE(review): several first elements (e.g. "biosampleAccession",
# "DECIMAL_LATITUDE") look like COPO response keys, while the second elements
# look like genome-note parameter names - confirm the pairs are not swapped
# relative to how parse_json() consumes them.
_COPO_FIELD_PAIRS = (
    ("SPECIMEN_ID", "SPECIMEN_ID"),
    ("biosampleAccession", "BIOSAMPLE_ACCESSION"),
    ("SCIENTIFIC_NAME", "GENUS_SPECIES"),
    ("COMMON_NAME", "COMMON_NAME"),
    ("COLLECTED_BY", "COLLECTORS"),
    ("COLLECTOR_AFFILIATION", "COLLECTOR_INSTITUTE"),
    ("DATE_OF_COLLECTION", "COLLECTOR_DATE"),
    ("DESCRIPTION_OF_COLLECTION_METHOD", "COLLECTION_METHOD"),
    ("COLLECTION_LOCATION", "COLLECTION_LOCATION"),
    ("DECIMAL_LATITUDE", "LATITUDE"),
    ("DECIMAL_LONGITUDE", "LONGITUDE"),
    ("HABITAT", "HABITAT"),
    ("IDENTIFIED_BY", "IDENTIFIER"),
    ("IDENTIFIER_AFFILIATION", "IDENTIFIER_INSTITUTE"),
    ("PRESERVATION_APPROACH", "PRESERVATION_METHOD"),
    ("SYMBIONT", "SYMBIONT"),
    ("TAXON_ID", "NCBI_TAXID"),
    ("ORDER_OR_GROUP", "ORDER"),
    ("FAMILY", "FAMILY"),
    ("GENUS", "GENUS"),
    ("SEX", "SEX"),
    ("LIFESTAGE", "LIFESTAGE"),
    ("ORGANISM_PART", "ORGANISM_PART"),
    ("GAL", "GAL"),
)

# Each entry is (name, path): `path` is a one-element tuple of keys handed to
# find_element(), kept as a tuple so nested paths could be expressed later.
fetch = [(name, (key,)) for name, key in _COPO_FIELD_PAIRS]


def parse_args(args=None):
    """Parse the command-line arguments of the COPO biosample parser.

    Args:
        args: Optional list of argument strings; when ``None`` argparse reads
            them from ``sys.argv``.

    Returns:
        The argparse namespace with ``FILE_IN`` and ``FILE_OUT`` attributes.
    """
    description = "Parse contents of a COPO json file report and pull out metadata required by a genome note."
    epilog = "Example usage: python parse_json_copo_biosample.py <FILE_IN> <FILE_OUT>"

    parser = argparse.ArgumentParser(description=description, epilog=epilog)
    # The input is the COPO biosample report, not an assembly report.
    parser.add_argument("FILE_IN", help="Input COPO biosample JSON file.")
    parser.add_argument("FILE_OUT", help="Output file.")
    parser.add_argument("--version", action="version", version="%(prog)s 1.0")
    return parser.parse_args(args)


def make_dir(path):
    """Create directory *path* (and any missing parents).

    An empty path is a no-op; an already existing directory is not an error.
    """
    if len(path) == 0:
        return
    os.makedirs(path, exist_ok=True)


def print_error(error, context="Line", context_str=""):
    """Print a formatted error message and terminate with exit status 1.

    Args:
        error: Description of the problem; placed after the fixed prefix.
        context: Label printed on a second line; an empty string suppresses
            the second line entirely.
        context_str: Value printed, quoted, after the label; when empty only
            the label itself is printed.
    """
    if context == "":
        message = "ERROR: Please check json file -> {}".format(error)
    elif context_str == "":
        message = "ERROR: Please check json file -> {}\n{}".format(error, context.strip())
    else:
        message = "ERROR: Please check json file -> {}\n{}: '{}'".format(
            error, context.strip(), context_str.strip()
        )

    print(message)
    sys.exit(1)


def parse_json(file_in, file_out):
    """Extract genome-note parameters from a COPO biosample JSON report.

    The report must contain exactly one entry under its ``records`` key. Each
    path listed in the module-level ``fetch`` table is looked up in that
    record; every value found is written to ``file_out`` as a
    ``paramName,paramValue`` row, with the name prefixed by ``COPO_``.

    Args:
        file_in: Path of the COPO JSON report to read.
        file_out: Path of the CSV file to write; its parent directory is
            created when missing.

    Exits (via ``print_error``) with status 1 when the report has no records,
    more than one record, or none of the requested fields.
    """
    biosample_type = "COPO"
    file_context = "File: {}".format(file_in)

    # JSON interchange text is UTF-8; do not depend on the locale default.
    with open(file_in, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    # Validate before indexing so a missing or empty "records" list produces a
    # readable error instead of a KeyError/IndexError traceback.
    records = data.get("records") if isinstance(data, dict) else None
    if not isinstance(records, list) or not records:
        print_error("No records found", file_context)
    if len(records) != 1:
        print_error("More than one record found", file_context)
    record = records[0]  # The single record

    param_list = []
    for param_name, key_path in fetch:
        param = find_element(record, key_path, index=0)
        if param is None:
            continue
        if isinstance(param, numbers.Number):
            param = str(param)
        # Quote any value containing punctuation so embedded commas do not
        # split the CSV row.
        # NOTE(review): embedded double quotes are not escaped - confirm the
        # downstream reader tolerates them.
        if any(p in string.punctuation for p in param):
            param = '"' + param + '"'
        param_list.append([f"{biosample_type}_{param_name}", param])

    if not param_list:
        print_error("No parameters found!", file_context)

    make_dir(os.path.dirname(file_out))
    with open(file_out, "w") as fout:
        fout.write(",".join(["#paramName", "paramValue"]) + "\n")
        fout.writelines(",".join(param_pair) + "\n" for param_pair in param_list)


def find_element(data, fields, index=0):
    """Walk *data* along the keys in *fields*, starting at position *index*.

    Returns the first value reached that is neither a list nor a dict. Returns
    None when a key is absent, or when the keys run out before such a value
    is reached (so a list or dict found at the last key yields None).
    """
    node = data
    for position in range(index, len(fields)):
        key = fields[position]
        if key not in node:
            return None
        value = node[key]
        if type(value) not in (list, dict):
            return value
        # Container: descend and continue with the next key.
        node = value
    return None


def main(args=None):
    """Script entry point: parse the command line, then convert the report.

    Args:
        args: Optional list of argument strings forwarded to parse_args().
    """
    options = parse_args(args)
    parse_json(options.FILE_IN, options.FILE_OUT)


# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
Loading