Merge pull request #143 from sanger-tol/merge_dev
Merge changes on dev into public_dev ahead of preparation for release 2.0
BethYates authored Oct 2, 2024
2 parents 7e18cbd + eb89d32 commit ef1c8dc
Showing 5 changed files with 135 additions and 65 deletions.
16 changes: 14 additions & 2 deletions CHANGELOG.md
@@ -3,6 +3,18 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.2.2](https://github.com/sanger-tol/genomenote/releases/tag/1.2.2)] - Pyrenean Mountain Dog (patch 2) - [2024-09-10]

### Enhancements & fixes

- Bugfix: don't run BUSCO in scratch mode for large genomes, as it takes too much space on /tmp

## [[1.2.1](https://github.com/sanger-tol/genomenote/releases/tag/1.2.1)] - Pyrenean Mountain Dog (patch 1) - [2024-07-12]

### Enhancements & fixes

- Bugfix: Now handles missing fields in `ncbi datasets` genome report

## [[1.2.0](https://github.com/sanger-tol/genomenote/releases/tag/1.2.0)] - Pyrenean Mountain Dog - [2024-05-01]

### Enhancements & fixes
@@ -34,13 +46,13 @@ Note, since the pipeline is using Nextflow DSL2, each process will be run with i
| ------------- | ------------- |
| | --cool_order |

## [[1.1.2](https://github.com/sanger-tol/genomenote/releases/tag/1.1.2)] [2024-04-29]
## [[1.1.2](https://github.com/sanger-tol/genomenote/releases/tag/1.1.2)] - Golden Retriever (patch 2) - [2024-04-29]

### Enhancements & fixes

- Bugfix: the BAM still needs to be filtered with `-F0x400`

## [[1.1.1](https://github.com/sanger-tol/genomenote/releases/tag/1.1.1)] [2024-02-26]
## [[1.1.1](https://github.com/sanger-tol/genomenote/releases/tag/1.1.1)] - Golden Retriever (patch 1) - [2024-02-26]

### Enhancements & fixes

18 changes: 11 additions & 7 deletions CITATION.cff
@@ -2,16 +2,16 @@
# Visit https://bit.ly/cffinit to generate yours today!

cff-version: 1.2.0
title: sanger-tol/genomenote v1.2.0
title: sanger-tol/genomenote v1.2.2
message: >-
If you use this software, please cite it using the
metadata from this file.
type: software
authors:
- given-names: Cibin
family-names: Sadasivan Baby
- given-names: Tyler
family-names: Chafin
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0002-8538-276X"
orcid: "https://orcid.org/0000-0001-8687-5905"
- given-names: Matthieu
family-names: Muffato
affiliation: Wellcome Sanger Institute
@@ -20,12 +20,16 @@ authors:
family-names: Qi
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0003-1262-8973"
- given-names: Cibin
family-names: Sadasivan Baby
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0002-8538-276X"
- given-names: Priyanka
family-names: Surana
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0002-7167-0875"
- given-names: Yates
family-names: Bethan
- given-names: Bethan
family-names: Yates
affiliation: Wellcome Sanger Institute
orcid: "https://orcid.org/0000-0003-1658-1762"
identifiers:
@@ -34,5 +38,5 @@
repository-code: "https://github.com/sanger-tol/genomenote"
license: MIT
commit: TODO
version: 1.2.0
version: 1.2.2
date-released: "2022-10-07"
159 changes: 106 additions & 53 deletions bin/create_table.py
@@ -6,117 +6,161 @@
import sys
import csv
import re
import math


def parse_args(args=None):
Description = "Create a table by parsing json output to extract N50, BUSCO, QV and COMPLETENESS stats."

parser = argparse.ArgumentParser(description=Description)
parser.add_argument("--genome", help="Input NCBI genome summary JSON file.", required=True)
parser.add_argument("--sequence", help="Input NCBI sequence summary JSON file.", required=True)
parser.add_argument("--genome", required=True, help="Input NCBI genome summary JSON file.")
parser.add_argument("--sequence", required=True, help="Input NCBI sequence summary JSON file.")
parser.add_argument("--busco", help="Input BUSCO short summary JSON file.")
parser.add_argument("--qv", nargs="*", help="Input QV TSV file from MERQURYFK.")
parser.add_argument("--completeness", nargs="*", help="Input COMPLETENESS stats TSV file from MERQURYFK.")
parser.add_argument("--hic", action="append", help="HiC sample ID used for contact maps.")
parser.add_argument("--flagstat", action="append", help="HiC flagstat file created by Samtools.")
parser.add_argument("--outcsv", help="Output CSV file.", required=True)
parser.add_argument("--outcsv", required=True, help="Output CSV file.")
parser.add_argument("--version", action="version", version="%(prog)s 3.1")
return parser.parse_args(args)


def make_dir(path):
"""
Creates a directory if it doesn't exist.
Parameters:
path (str): Path of the directory to be created.
"""
if len(path) > 0:
os.makedirs(path, exist_ok=True)


# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to the sample names
# check_samplesheet.py adds a suffix like "_T1", "_T2", etc, to sample names
# We usually don't want it in the final output
def remove_sample_T_suffix(name):
"""
Removes the suffix like "_T1", "_T2", etc., from sample names.
Parameters:
name (str): Sample name to be processed.
Returns:
str: Sample name with the suffix removed.
"""
return re.sub(r"_T\d+", "", name)
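# Hypothetical usage (illustration only, not part of this diff; the sample names below are made up):
#     remove_sample_T_suffix("mMelMel3_T1")   # -> "mMelMel3"
#     remove_sample_T_suffix("mMelMel3")      # -> "mMelMel3" (no suffix, returned unchanged)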


def ncbi_stats(genome_in, seq_in, writer):
"""
Extracts and writes assembly information and statistics from genome and
sequence JSON files to a CSV file.
Parameters:
genome_in (str): Path to the NCBI genome summary JSON file.
seq_in (str): Path to the NCBI sequence summary JSON file.
writer (csv.writer): CSV writer object to write the extracted data.
"""
with open(genome_in, "r") as fin1:
data = json.load(fin1)
data = data.get("reports", [{}])[0]

with open(seq_in, "r") as fin2:
seq = json.load(fin2)
seq = json.load(fin2).get("reports", [])

data = data["reports"][0]
info = data["assembly_info"]
attr = info["biosample"]["attributes"]
stats = data["assembly_stats"]
seq = seq["reports"]
info = data.get("assembly_info", {})
attr = info.get("biosample", {}).get("attributes", [])
stats = data.get("assembly_stats", {})
organism = data.get("organism", {})

# Write assembly information
writer.writerow(["##Assembly_Information"])
writer.writerow(["Accession", data["accession"]])
if "common_name" in data["organism"]:
writer.writerow(["Common_Name", data["organism"]["common_name"]])
writer.writerow(["Organism_Name", data["organism"]["organism_name"]])
writer.writerow(
[
"ToL_ID",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "tolid"),
]
)
writer.writerow(["Taxon_ID", data["organism"]["tax_id"]])
writer.writerow(["Assembly_Name", info["assembly_name"]])
writer.writerow(["Assembly_Level", info["assembly_level"]])
writer.writerow(
[
"Life_Stage",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "life_stage"),
]
)
writer.writerow(
[
"Tissue",
"".join(pairs["value"] for pairs in attr if pairs["name"] == "tissue"),
]
)
writer.writerow(["Sex", "".join(pairs["value"] for pairs in attr if pairs["name"] == "sex")])
writer.writerow(["Accession", data.get("accession", math.nan)])
writer.writerow(["Common_Name", organism.get("common_name", math.nan)])
writer.writerow(["Organism_Name", organism.get("organism_name", math.nan)])
tol_id = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tolid")
writer.writerow(["ToL_ID", tol_id if tol_id else math.nan])
writer.writerow(["Taxon_ID", organism.get("tax_id", math.nan)])
writer.writerow(["Assembly_Name", info.get("assembly_name", math.nan)])
writer.writerow(["Assembly_Level", info.get("assembly_level", math.nan)])
life_stage = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "life_stage")
writer.writerow(["Life_Stage", life_stage if life_stage else math.nan])
tissue = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "tissue")
writer.writerow(["Tissue", tissue if tissue else math.nan])
sex = "".join(pairs.get("value", "") for pairs in attr if pairs.get("name") == "sex")
writer.writerow(["Sex", sex if sex else math.nan])

# Write assembly statistics
writer.writerow(["##Assembly_Statistics"])
writer.writerow(["Total_Sequence", stats["total_sequence_length"]])
if "total_number_of_chromosomes" in stats:
writer.writerow(["Chromosomes", stats["total_number_of_chromosomes"]])
writer.writerow(["Scaffolds", stats["number_of_scaffolds"]])
writer.writerow(["Scaffold_N50", stats["scaffold_n50"]])
writer.writerow(["Contigs", stats["number_of_contigs"]])
writer.writerow(["Contig_N50", stats["contig_n50"]])
writer.writerow(["GC_Percent", stats["gc_percent"]])
writer.writerow(["Total_Sequence", stats.get("total_sequence_length", math.nan)])
writer.writerow(["Chromosomes", stats.get("total_number_of_chromosomes", math.nan)])
writer.writerow(["Scaffolds", stats.get("number_of_scaffolds", math.nan)])
writer.writerow(["Scaffold_N50", stats.get("scaffold_n50", math.nan)])
writer.writerow(["Contigs", stats.get("number_of_contigs", math.nan)])
writer.writerow(["Contig_N50", stats.get("contig_n50", math.nan)])
writer.writerow(["GC_Percent", stats.get("gc_percent", math.nan)])

chromosome_header = False
for mol in seq:
if "gc_percent" in mol and mol["assembly_unit"] != "non-nuclear":
if mol.get("gc_percent") is not None and mol.get("assembly_unit") != "non-nuclear":
if not chromosome_header:
writer.writerow(["##Chromosome", "Length", "GC_Percent", "Accession"])
chromosome_header = True
writer.writerow(
[mol["chr_name"], round(mol["length"] / 1000000, 2), mol["gc_percent"], mol["genbank_accession"]]
[
mol.get("chr_name", math.nan),
round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan,
mol.get("gc_percent", math.nan),
mol.get("genbank_accession"),
]
)

organelle_header = False
for mol in seq:
if "gc_percent" in mol and mol["assembly_unit"] == "non-nuclear":
if mol.get("gc_percent") is not None and mol.get("assembly_unit") == "non-nuclear":
if not organelle_header:
writer.writerow(["##Organelle", "Length", "GC_Percent", "Accession"])
organelle_header = True
writer.writerow(
[
mol["assigned_molecule_location_type"],
round(mol["length"] / 1000000, 2),
mol["gc_percent"],
mol["genbank_accession"],
mol.get("assigned_molecule_location_type", math.nan),
round(mol.get("length", 0) / 1000000, 2) if mol.get("length") is not None else math.nan,
mol.get("gc_percent", math.nan),
mol.get("genbank_accession"),
]
)
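# Illustrative note (not part of this diff): the switch from direct indexing to .get() with
# math.nan defaults is what lets the script tolerate missing fields in the `ncbi datasets`
# report. For example, with a hypothetical report lacking "common_name":
#     organism = {"organism_name": "Canis lupus familiaris", "tax_id": 9615}
#     organism["common_name"]                   # old behaviour: raises KeyError
#     organism.get("common_name", math.nan)     # new behaviour: writes nan instead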


def extract_busco(file_in, writer):
"""
Extracts BUSCO information from a JSON file and writes it to a CSV file.
Parameters:
file_in (str): Path to the BUSCO summary JSON file.
writer (csv.writer): CSV writer object to write the extracted data.
"""
with open(file_in, "r") as fin:
data = json.load(fin)

writer.writerow(["##BUSCO", data["lineage_dataset"]["name"]])
writer.writerow(["Summary", data["results"]["one_line_summary"]])
lineage_dataset_name = data.get("lineage_dataset", {}).get("name", math.nan)
results_summary = data.get("results", {}).get("one_line_summary", math.nan)

writer.writerow(["##BUSCO", lineage_dataset_name])
writer.writerow(["Summary", results_summary])


def extract_pacbio(qv, completeness, writer):
"""
Extracts QV and completeness information from TSV files and writes it to a
CSV file.
NOTE: completeness and qv files have to be from matching specimen names
Parameters:
qv (list): List of paths to one or more QV TSV files.
completeness (list): List of paths to completeness stats TSV files.
writer (csv.writer): CSV writer object to write the extracted data.
"""
qval = 0
qv_name = None
for f in qv:
@@ -150,6 +194,15 @@ def extract_pacbio(qv, completeness, writer):


def extract_mapped(sample, file_in, writer):
"""
Extracts mapping information from a flagstat file and writes it to a CSV
file.
Parameters:
sample (str): Sample ID used for the HiC contact maps.
file_in (str): Path to the HiC flagstat file created by Samtools.
writer (csv.writer): CSV writer object to write the extracted data.
"""
writer.writerow(["##HiC", remove_sample_T_suffix(sample)])
with open(file_in, "r") as fin:
for line in fin:
3 changes: 2 additions & 1 deletion conf/modules.config
@@ -50,7 +50,8 @@ process {
}

withName: BUSCO {
scratch = true
// Obey "use_work_dir_as_temp", except for large genomes
scratch = { !params.use_work_dir_as_temp || (meta.genome_size < 2000000000) }
ext.args = { 'test' in workflow.profile.tokenize(',') ?
// Additional configuration to speed processes up during testing.
// Note: BUSCO *must* see the double-quotes around the parameters
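A rough sketch of what the new `scratch` closure evaluates to (illustration only, not part of the diff, and written in Python rather than the pipeline's Groovy config; the parameter and meta field names mirror the closure above):

# Sketch only: mirrors the logic of `scratch = { !params.use_work_dir_as_temp || (meta.genome_size < 2000000000) }`.
def busco_uses_scratch(use_work_dir_as_temp: bool, genome_size: int) -> bool:
    # Scratch (/tmp) is used unless the user asked for the work dir as temp space
    # AND the genome is at least 2 Gbp, in which case /tmp could fill up.
    return (not use_work_dir_as_temp) or (genome_size < 2_000_000_000)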
4 changes: 2 additions & 2 deletions nextflow.config
@@ -148,7 +148,7 @@ profiles {
arm {
docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64'
}
singularity {
singularity {
singularity.enabled = true
singularity.autoMounts = true
conda.enabled = false
@@ -245,7 +245,7 @@ manifest {
description = """Creating standarised genome assembly publications"""
mainScript = 'main.nf'
nextflowVersion = '!>=22.10.1'
version = '1.2.0'
version = '1.2.2'
doi = '10.5281/zenodo.7949384'
}

