sanger-tol · BethYates · Jul 30, 2024 · Jul 14, 2024 · Jul 30, 2024 · Jul 30, 2024
diff --git a/bin/combine_parsed_data.py b/bin/combine_parsed_data.py
@@ -10,7 +10,9 @@
 files = [
     ("ENA_ASSEMBLY", "ena_assembly_file"),
     ("ENA_BIOPROJECT", "ena_bioproject_file"),
-    ("ENA_BIOSAMPLE", "ena_biosample_file"),
+    ("ENA_BIOSAMPLE", "ena_biosample_wgs_file"),
+    ("ENA_BIOSAMPLE_HIC", "ena_biosample_hic_file"),
+    ("ENA_BIOSAMPLE_RNA", "ena_biosample_rna_file"),
     ("ENA_TAXONOMY", "ena_taxonomy_file"),
     ("NCBI_ASSEMBLY", "ncbi_assembly_file"),
     ("NCBI_TAXONOMY", "ncbi_taxonomy_file"),
@@ -25,7 +27,9 @@ def parse_args(args=None):
     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
     parser.add_argument("--ena_assembly_file", help="Input parsed ENA assembly file.", required=False)
     parser.add_argument("--ena_bioproject_file", help="Input parsed ENA assembly file.", required=False)
-    parser.add_argument("--ena_biosample_file", help="Input parsed ENA assembly file.", required=False)
+    parser.add_argument("--ena_biosample_wgs_file", help="Input parsed ENA genomic biosample file.", required=False)
+    parser.add_argument("--ena_biosample_hic_file", help="Input parsed ENA HiC biosample file.", required=False)
+    parser.add_argument("--ena_biosample_rna_file", help="Input parsed ENA RNASeq biosample file.", required=False)
     parser.add_argument("--ena_taxonomy_file", help="Input parsed ENA assembly file.", required=False)
     parser.add_argument("--ncbi_assembly_file", help="Input parsed ENA assembly file.", required=False)
     parser.add_argument("--ncbi_taxonomy_file", help="Input parsed ENA assembly file.", required=False)

diff --git a/bin/parse_xml_ena_biosample.py b/bin/parse_xml_ena_biosample.py
@@ -43,7 +43,7 @@
 
 
 def parse_args(args=None):
-    Description = "Parse contents of an ENA SAMPLE report and pul out meta data required by a genome note."
+    Description = "Parse contents of an ENA SAMPLE report and pull out meta data required by a genome note."
     Epilog = "Example usage: python parse_xml_ena_sample.py <FILE_IN> <FILE_OUT>"
 
     parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
@@ -77,6 +77,13 @@ def parse_xml(file_in, file_out):
     root = tree.getroot()
     param_list = []
 
+    # Extract biosample type from FILE_OUT
+    biosample_type = None
+    if "HIC" in file_out.upper():
+        biosample_type = "HIC"
+    elif "RNA" in file_out.upper():
+        biosample_type = "RNA"
+
     for f in fetch:
         param = None
         r = root
@@ -101,8 +108,7 @@ def parse_xml(file_in, file_out):
                             param = r.attrib.get(f[2][1])
                         except ValueError:
                             param = None
-
-                    ## Count child elements with specfic tag
+                    ## Count child elements with specific tag
                     if f[2][0] == "count":
                         if r is not None:
                             param = str(len(r.findall(f[2][1]))) if len(r.findall(f[2][1])) != 0 else None
@@ -129,8 +135,11 @@ def parse_xml(file_in, file_out):
 
                     if any(p in string.punctuation for p in param):
                         param = '"' + param + '"'
-
-                    param_list.append([f[0], param])
+                    # Prefix parameter name if biosample type is HiC or RNA
+                    param_name = f[0]
+                    if biosample_type in ["HIC", "RNA"]:
+                        param_name = f"{biosample_type}_{param_name}"
+                    param_list.append([param_name, param])
 
     if len(param_list) > 0:
         out_dir = os.path.dirname(file_out)

diff --git a/conf/test.config b/conf/test.config
@@ -33,7 +33,9 @@ params {
     species  = 'Cloeon_dipterum'
     taxon_id = '197152'
     bioproject = 'PRJEB45177'
-    biosample =  'SAMEA7520803'
+    biosample_wgs = 'SAMEA7520803'
+    biosample_hic = 'SAMEA7520846'
+    biosample_rna = 'SAMEA7521081'
 
     // Genome Notes Portal
     write_to_portal  = false

diff --git a/conf/test_full.config b/conf/test_full.config
@@ -33,7 +33,9 @@ params {
     species = 'Ypsolopha_sequella'
     taxon_id = '1870436'
     bioproject = 'PRJEB50740'
-    biosample =  'SAMEA7519929'
+    biosample_wgs =  'SAMEA7519929'
+    biosample_hic = 'SAMEA7519968'
+    biosample_rna = null
 
     // Genome Notes Portal
     write_to_portal  = false

diff --git a/docs/usage.md b/docs/usage.md
@@ -25,7 +25,9 @@ You will need to supply the assembly accession for the genome you would like to
    --species '[species name]'
    --taxon_id '[taxon id]'
    --bioproject '[bioproject accession]'
-   --biosample '[biosample accession]'
+   --biosample_wgs '[biosample accession of the biosample used to produce the genomic sequence]'
+   --biosample_hic '[biosample accession of the biosample used to produce the HiC data]'
+   --biosample_rna '[biosample accession of the biosample used to produce the RNASeq data]'
 ```
 
 If you wish to run the optional step that writes genome metatdata key value-pairs to a genome notes databases you will need to set the parameter "write_to_portal" to true and provide the base url for the REST API that writes to the database.
@@ -101,7 +103,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p
 The typical command for running the pipeline is as follows:
 
 ```bash
-nextflow run sanger-tol/genomenote --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta --assembly GCA_922984935.2 --species Epithemia_sp._CRS-2021b --taxon_id 2809013 --bioproject PRJEB49353 --biosample SAMEA7524400 -profile docker
+nextflow run sanger-tol/genomenote --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta --assembly GCA_922984935.2 --species Epithemia_sp._CRS-2021b --taxon_id 2809013 --bioproject PRJEB49353 --biosample_wgs SAMEA7524400 -profile docker
 ```
 
 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
@@ -137,7 +139,7 @@ assembly: 'GCA_922984935.2'
 species: 'Epithemia_sp._CRS-2021b'
 taxon_id: '2809013'
 bioproject: 'PRJEB49353'
-biosample: 'SAMEA7524400'
+biosample_wgs: 'SAMEA7524400'
 <...>
 ```
 

diff --git a/modules/local/combine_metadata.nf b/modules/local/combine_metadata.nf
@@ -24,7 +24,7 @@ process COMBINE_METADATA {
     for (item in  file_list){
         def file = item
         def file_ext = item.getExtension()
-        def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}") + "_file"
+        def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}").toLowerCase() + "_file"
         args.add(file_name)
         args.add(file)
     }

diff --git a/modules/local/parse_metadata.nf b/modules/local/parse_metadata.nf
@@ -22,7 +22,8 @@ process PARSE_METADATA {
     script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/
     def prefix = task.ext.prefix ?: meta.id
     def script_name = "parse_${meta.ext.toLowerCase()}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.py"
-    def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.csv"
+    def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA") ? "_${meta.biosample_type}" : ""
+    def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}${is_biosample}.csv".strip('_')
     """
     $script_name \\
         $json \\

diff --git a/modules/local/run_wget.nf b/modules/local/run_wget.nf
@@ -14,16 +14,18 @@ process RUN_WGET {
 
 
     output:
-    tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}.${meta.ext}") , emit:  file_path
+    tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}*.${meta.ext}") , emit:  file_path
     path "versions.yml"                         , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def  no_certificate = (meta.source == 'GOAT') ? '--no-check-certificate' : ''
+    def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA") ? "_${meta.biosample_type}" : ""
+    def output = "${meta.id}_${meta.source}_${meta.type}${is_biosample}.${meta.ext}".strip('_')
     """
-        wget ${no_certificate} -c -O ${meta.id}_${meta.source}_${meta.type}.${meta.ext} '${url}'
+        wget ${no_certificate} -c -O ${output} '${url}'
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":

diff --git a/nextflow.config b/nextflow.config
@@ -19,7 +19,9 @@ params {
     species                         = null
     taxon_id                        = null
     bioproject                      = null
-    biosample                       = null
+    biosample_wgs                   = null
+    biosample_hic                   = null
+    biosample_rna                   = null
 
     // Genome Notes 
     write_to_portal                 = false     

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -48,9 +48,17 @@
                     "type": "string",
                     "description": "The bioproject accesion linked to the genome assembly, for example: PRJEB49353."
                 },
-                "biosample": {
+                "biosample_wgs": {
                     "type": "string",
-                    "description": "The biosample accesion(s) linked to the samples in the experiment, for example: SAMEA7524400."
+                    "description": "The biosample accession(s) linked to the WGS samples in the experiment, for example: SAMEA7520803."
+                },
+                "biosample_rna": {
+                    "type": "string",
+                    "description": "The biosample accession(s) linked to the RNA samples in the experiment, for example: SAMEA7521081."
+                },
+                "biosample_hic": {
+                    "type": "string",
+                    "description": "The biosample accession(s) linked to the Hi-C samples in the experiment, for example: SAMEA7520846."
                 },
                 "outdir": {
                     "type": "string",

diff --git a/subworkflows/local/genome_metadata.nf b/subworkflows/local/genome_metadata.nf
@@ -15,37 +15,68 @@ workflow GENOME_METADATA {
 
     main:
     ch_versions = Channel.empty()
- 
+
     // Define channel for RUN_WGET
     ch_file_list
     | splitCsv(header: ['source', 'type', 'url', 'ext'], skip: 1)
-    | map { row -> 
-        [   
-            // meta
-            [   id: params.assembly,
-                taxon_id: params.taxon_id,
-                source: row.source, 
-                type: row.type, 
-                ext: row.ext, 
-            ],
-            // url 
-            row.url
-                .replaceAll(/ASSEMBLY_ACCESSION/, params.assembly)
-                .replaceAll(/TAXONOMY_ID/, params.taxon_id)
-                .replaceAll(/BIOPROJECT_ACCESSION/, params.bioproject)
-                .replaceAll(/BIOSAMPLE_ACCESSION/, params.biosample)
+    | flatMap { row ->
+        // Create a list to hold the final entries
+        def entries = []
+
+        // Common metadata
+        def metadata = [
+            id: params.assembly,
+            taxon_id: params.taxon_id,
+            source: row.source,
+            type: row.type,
+            ext: row.ext
+        ]
+
+        // Define biosamples with their types
+        def biosamples = [
+            ["WGS", params.biosample_wgs],
+            ["HIC", params.biosample_hic],
+            ["RNA", params.biosample_rna]
         ]
+
+        // Process each biosample
+        biosamples.each { biosampleType, biosampleID ->
+            if ( biosampleID != null ) {
+                // Only build entries for biosamples that were supplied (null IDs are skipped)
+                def url = row.url
+                    .replaceAll(/ASSEMBLY_ACCESSION/, params.assembly)
+                    .replaceAll(/TAXONOMY_ID/, params.taxon_id)
+                    .replaceAll(/BIOPROJECT_ACCESSION/, params.bioproject)
+                    .replaceAll(/BIOSAMPLE_ACCESSION/, biosampleID)
+
+                if (row.type == 'Biosample') {
+                    // Add entry with biosample type in metadata for Biosample type
+                    entries << [
+                        metadata + [biosample_type: biosampleType],
+                        url
+                    ]
+                } else {
+                    // Add entry without biosample type in metadata for other types
+                    entries << [
+                        metadata + [biosample_type: ''],
+                        url
+                    ]
+                }
+            }
+        }
+        return entries
     }
+    | unique()
     | set { file_list }
 
     // Fetch files
     RUN_WGET ( file_list )
-    ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() ) 
+    ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() )
 
     PARSE_METADATA(RUN_WGET.out.file_path)
     ch_versions = ch_versions.mix( PARSE_METADATA.out.versions.first() )
 
-    PARSE_METADATA.out.file_path 
+    PARSE_METADATA.out.file_path
     | map { it -> tuple( it[1] )}
     | collect  
     | map { it ->

diff --git a/workflows/genomenote.nf b/workflows/genomenote.nf
@@ -14,8 +14,8 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.lineage_d
 for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
 
 // Check mandatory parameters
-if (params.assembly && params.taxon_id && params.bioproject && params.biosample) { metadata_inputs = [ params.assembly, params.taxon_id, params.bioproject, params.biosample ] }
-else { exit 1, 'Metadata input not specified. Please include an assembly accession, a taxon id, a bioproject accession and a biosample_accession' }
+if (params.assembly && params.taxon_id && params.bioproject && params.biosample_wgs) { metadata_inputs = [ params.assembly, params.taxon_id, params.bioproject, params.biosample_wgs ] }
+else { exit 1, 'Metadata input not specified. Please include an assembly accession, a taxon id, a bioproject accession and a biosample accession' }
 if (params.input)     { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' }
 if (params.fasta)     { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Genome fasta not specified!' }
 if (params.binsize)   { ch_bin   = Channel.of(params.binsize)     } else { exit 1, 'Bin size for cooler/cload not specified!' }