Merge pull request #143 from sanger-tol/merge_dev
Merge changes on dev into public_dev ahead of preparation for release 2.0
BethYates authored Oct 2, 2024
2 parents 7e18cbd + eb89d32 commit ef1c8dc
Showing 5 changed files with 135 additions and 65 deletions.
16 changes: 14 additions & 2 deletions CHANGELOG.md
@@ -3,6 +3,18 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.2.2](https://github.com/sanger-tol/genomenote/releases/tag/1.2.2)] - Pyrenean Mountain Dog (patch 2) - [2024-09-10]

### Enhancements & fixes

- Bugfix: don't run BUSCO in scratch mode for large genomes, as it takes too much space on /tmp

## [[1.2.1](https://github.com/sanger-tol/genomenote/releases/tag/1.2.1)] - Pyrenean Mountain Dog (patch 1) - [2024-07-12]

### Enhancements & fixes

- Bugfix: Now handles missing fields in `ncbi datasets` genome report

## [[1.2.0](https://github.com/sanger-tol/genomenote/releases/tag/1.2.0)] - Pyrenean Mountain Dog - [2024-05-01]

### Enhancements & fixes
@@ -34,13 +46,13 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i
| ------------- | ------------- |
| | --cool_order |

## [[1.1.2](https://github.com/sanger-tol/genomenote/releases/tag/1.1.2)] [2024-04-29]
## [[1.1.2](https://github.com/sanger-tol/genomenote/releases/tag/1.1.2)] - Golden Retriever (patch 2) - [2024-04-29]

### Enhancements & fixes

- Bugfix: the BAM still needs to be filtered with `-F0x400`

## [[1.1.1](https://github.com/sanger-tol/genomenote/releases/tag/1.1.1)] [2024-02-26]
## [[1.1.1](https://github.com/sanger-tol/genomenote/releases/tag/1.1.1)] - Golden Retriever (patch 1) - [2024-02-26]

### Enhancements & fixes

18 changes: 11 additions & 7 deletions CITATION.cff
@@ -2,16 +2,16 @@
# Visit https://bit.ly/cffinit to generate yours today!

cff-version: 1.2.0
title: sanger-tol/genomenote v1.2.0
title: sanger-tol/genomenote v1.2.2
message: >-
If you use this software, please cite it using the
metadata from this file.
type: software
authors:
- given-names: Cibin
family-names: Sadasivan Baby
- given-names: Tyler
family-names: Chafin
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0002-8538-276X"
orcid: "https://orcid.org/0000-0001-8687-5905"
- given-names: Matthieu
family-names: Muffato
affiliation: Wellcome Sanger Institute
@@ -20,12 +20,16 @@ authors:
family-names: Qi
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0003-1262-8973"
- given-names: Cibin
family-names: Sadasivan Baby
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0002-8538-276X"
- given-names: Priyanka
family-names: Surana
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0002-7167-0875"
- given-names: Yates
family-names: Bethan
- given-names: Bethan
family-names: Yates
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0003-1658-1762"
identifiers:
@@ -34,5 +38,5 @@
repository-code: "https://github.com/sanger-tol/genomenote"
license: MIT
commit: TODO
version: 1.2.0
version: 1.2.2
date-released: "2022-10-07"
159 changes: 106 additions & 53 deletions bin/create_table.py
@@ -6,117 +6,161 @@
import sys
import csv
import re
import math


def parse_args(args=None):
Description = "Create a table by parsing json output to extract N50, BUSCO, QV and COMPLETENESS stats."

parser = argparse.ArgumentParser(description=Description)
parser.add_argument("--genome", help="Input NCBI genome summary JSON file.", required=True)
parser.add_argument("--sequence", help="Input NCBI sequence summary JSON file.", required=True)
parser.add_argument("--genome", required=True, help="Input NCBI genome summary JSON file.")
parser.add_argument("--sequence", required=True, help="Input NCBI sequence summary JSON file.")
parser.add_argument("--busco", help="Input BUSCO short summary JSON file.")
parser.add_argument("--qv", nargs="*", help="Input QV TSV file from MERQURYFK.")
parser.add_argument("--completeness", nargs="*", help="Input COMPLETENESS stats TSV file from MERQURYFK.")
parser.add_argument("--hic", action="append", help="HiC sample ID used for contact maps.")
parser.add_argument("--flagstat", action="append", help="HiC flagstat file created by Samtools.")
parser.add_argument("--outcsv", help="Output CSV file.", required=True)
parser.add_argument("--outcsv", required=True, help="Output CSV file.")
parser.add_argument("--version", action="version", version="%(prog)s 3.1")
return parser.parse_args(args)


def make_dir(path):
"""
Creates a directory if it doesn't exist.
Parameters:
path (str): Path of the directory to be created.
"""
if len(path) > 0:
os.makedirs(path, exist_ok=True)


# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to the sample names
# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to sample names
# We usually don't want it in the final output
def remove_sample_T_suffix(name):
"""
Removes the suffix like "_T1", "_T2", etc., from sample names.
Parameters:
name (str): Sample name to be processed.
Returns:
str: Sample name with the suffix removed.
"""
return re.sub(r"_T\d+", "", name)
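# Hypothetical usage (illustration only, not part of this diff; the sample names below are made up):
#     remove_sample_T_suffix("mMelMel3_T1")   # -> "mMelMel3"
#     remove_sample_T_suffix("mMelMel3")      # -> "mMelMel3" (no suffix, returned unchanged)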


def ncbi_stats(genome_in, seq_in, writer):
"""
Extracts and writes assembly information and statistics from genome and
sequence JSON files to a CSV file.
Parameters:
genome_in (str): Path to the NCBI genome summary JSON file.
seq_in (str): Path to the NCBI sequence summary JSON file.
writer (csv.writer): CSV writer object to write the extracted data.
"""
with open(genome_in, "r") as fin1:
data = json.load(fin1)
data = data.get("reports", [{}])[0]

with open(seq_in, "r") as fin2:
seq = json.load(fin2)
seq = json.load(fin2).get("reports", [])

data = data["reports"][0]
info = data["assembly_info"]
attr = info["biosample"]["attributes"]
stats = data["assembly_stats"]
seq = seq["reports"]
info = data.get("assembly_info", {})
attr = info.get("biosample", {}).get("attributes", [])
stats = data.get("assembly_stats", {})
organism = data.get("organism", {})

# Write assembly information
writer.writerow(["##Assembly_Information"])
writer.writerow(["Accession", data["accession"]])
if "common_name" in data["organism"]:
writer.writerow(["Common_Name", data["organism"]["common_name"]])
writer.writerow(["Organism_Name", data["organism"]["organism_name"]])
writer.writerow(
[
"ToL_ID",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "tolid"),
]
)
writer.writerow(["Taxon_ID", data["organism"]["tax_id"]])
writer.writerow(["Assembly_Name", info["assembly_name"]])
writer.writerow(["Assembly_Level", info["assembly_level"]])
writer.writerow(
[
"Life_Stage",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "life_stage"),
]
)
writer.writerow(
[
"Tissue",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "tissue"),
]
)
writer.writerow(["Sex", "".join(pairs["value"] for pairs in attr if pairs["name"] == "sex")])
writer.writerow(["Accession", data.get("accession", math.nan)])
writer.writerow(["Common_Name", organism.get("common_name", math.nan)])
writer.writerow(["Organism_Name", organism.get("organism_name", math.nan)])
tol_id = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tolid")
writer.writerow(["ToL_ID", tol_id if tol_id else math.nan])
writer.writerow(["Taxon_ID", organism.get("tax_id", math.nan)])
writer.writerow(["Assembly_Name", info.get("assembly_name", math.nan)])
writer.writerow(["Assembly_Level", info.get("assembly_level", math.nan)])
life_stage = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "life_stage")
writer.writerow(["Life_Stage", life_stage if life_stage else math.nan])
tissue = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tissue")
writer.writerow(["Tissue", tissue if tissue else math.nan])
sex = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "sex")
writer.writerow(["Sex", sex if sex else math.nan])

# Write assembly statistics
writer.writerow(["##Assembly_Statistics"])
writer.writerow(["Total_Sequence", stats["total_sequence_length"]])
if "total_number_of_chromosomes" in stats:
writer.writerow(["Chromosomes", stats["total_number_of_chromosomes"]])
writer.writerow(["Scaffolds", stats["number_of_scaffolds"]])
writer.writerow(["Scaffold_N50", stats["scaffold_n50"]])
writer.writerow(["Contigs", stats["number_of_contigs"]])
writer.writerow(["Contig_N50", stats["contig_n50"]])
writer.writerow(["GC_Percent", stats["gc_percent"]])
writer.writerow(["Total_Sequence", stats.get("total_sequence_length", math.nan)])
writer.writerow(["Chromosomes", stats.get("total_number_of_chromosomes", math.nan)])
writer.writerow(["Scaffolds", stats.get("number_of_scaffolds", math.nan)])
writer.writerow(["Scaffold_N50", stats.get("scaffold_n50", math.nan)])
writer.writerow(["Contigs", stats.get("number_of_contigs", math.nan)])
writer.writerow(["Contig_N50", stats.get("contig_n50", math.nan)])
writer.writerow(["GC_Percent", stats.get("gc_percent", math.nan)])

chromosome_header = False
for mol in seq:
if "gc_percent" in mol and mol["assembly_unit"] != "non-nuclear":
if mol.get("gc_percent") is not None and mol.get("assembly_unit") != "non-nuclear":
if not chromosome_header:
writer.writerow(["##Chromosome", "Length", "GC_Percent", "Accession"])
chromosome_header = True
writer.writerow(
[mol["chr_name"], round(mol["length"] / 1000000, 2), mol["gc_percent"], mol["genbank_accession"]]
[
mol.get("chr_name", math.nan),
round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan,
mol.get("gc_percent", math.nan),
mol.get("genbank_accession"),
]
)

organelle_header = False
for mol in seq:
if "gc_percent" in mol and mol["assembly_unit"] == "non-nuclear":
if mol.get("gc_percent") is not None and mol.get("assembly_unit") == "non-nuclear":
if not organelle_header:
writer.writerow(["##Organelle", "Length", "GC_Percent", "Accession"])
organelle_header = True
writer.writerow(
[
mol["assigned_molecule_location_type"],
round(mol["length"] / 1000000, 2),
mol["gc_percent"],
mol["genbank_accession"],
mol.get("assigned_molecule_location_type", math.nan),
round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan,
mol.get("gc_percent", math.nan),
mol.get("genbank_accession"),
]
)
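# Illustrative note (not part of this diff): the switch from direct indexing to .get() with
# math.nan defaults is what lets the script tolerate missing fields in the `ncbi datasets`
# report. For example, with a hypothetical report lacking "common_name":
#     organism = {"organism_name": "Canis lupus familiaris", "tax_id": 9615}
#     organism["common_name"]                   # old behaviour: raises KeyError
#     organism.get("common_name", math.nan)     # new behaviour: writes nan instead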


def extract_busco(file_in, writer):
"""
Extracts BUSCO information from a JSON file and writes it to a CSV file.
Parameters:
file_in (str): Path to the BUSCO summary JSON file.
writer (csv.writer): CSV writer object to write the extracted data.
"""
with open(file_in, "r") as fin:
data = json.load(fin)

writer.writerow(["##BUSCO", data["lineage_dataset"]["name"]])
writer.writerow(["Summary", data["results"]["one_line_summary"]])
lineage_dataset_name = data.get("lineage_dataset", {}).get("name", math.nan)
results_summary = data.get("results", {}).get("one_line_summary", math.nan)

writer.writerow(["##BUSCO", lineage_dataset_name])
writer.writerow(["Summary", results_summary])


def extract_pacbio(qv, completeness, writer):
"""
Extracts QV and completeness information from TSV files and writes it to a
CSV file.
NOTE: completeness and qv files have to be from matching specimen names
Parameters:
qv (list): List of paths to one or more QV TSV files.
completeness (list): List of paths to completeness stats TSV files.
writer (csv.writer): CSV writer object to write the extracted data.
"""
qval = 0
qv_name = None
for f in qv:
@@ -150,6 +194,15 @@ def extract_pacbio(qv, completeness, writer):


def extract_mapped(sample, file_in, writer):
"""
Extracts mapping information from a flagstat file and writes it to a CSV
file.
Parameters:
sample (str): Sample ID used for the HiC contact maps.
file_in (str): Path to the HiC flagstat file created by Samtools.
writer (csv.writer): CSV writer object to write the extracted data.
"""
writer.writerow(["##HiC", remove_sample_T_suffix(sample)])
with open(file_in, "r") as fin:
for line in fin:
3 changes: 2 additions & 1 deletion conf/modules.config
@@ -50,7 +50,8 @@ process {
}

withName: BUSCO {
scratch = true
// Obey "use_work_dir_as_temp", except for large genomes
scratch = { !params.use_work_dir_as_temp || (meta.genome_size < 2000000000) }
ext.args = { 'test' in workflow.profile.tokenize(',') ?
// Additional configuration to speed processes up during testing.
// Note: BUSCO *must* see the double-quotes around the parameters
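A rough sketch of what the new `scratch` closure evaluates to (illustration only, not part of the diff, and written in Python rather than the pipeline's Groovy config; the parameter and meta field names mirror the closure above):

# Sketch only: mirrors the logic of `scratch = { !params.use_work_dir_as_temp || (meta.genome_size < 2000000000) }`.
def busco_uses_scratch(use_work_dir_as_temp: bool, genome_size: int) -> bool:
    # Scratch (/tmp) is used unless the user asked for the work dir as temp space
    # AND the genome is at least 2 Gbp, in which case /tmp could fill up.
    return (not use_work_dir_as_temp) or (genome_size < 2_000_000_000)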
4 changes: 2 additions & 2 deletions nextflow.config
@@ -148,7 +148,7 @@ profiles {
arm {
docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64'
}
singularity {
singularity {
singularity.enabled = true
singularity.autoMounts = true
conda.enabled = false
@@ -245,7 +245,7 @@ manifest {
description = """Creating standarised genome assembly publications"""
mainScript = 'main.nf'
nextflowVersion = '!>=22.10.1'
version = '1.2.0'
version = '1.2.2'
doi = '10.5281/zenodo.7949384'
}

