Allow adding multiple biosamples

reichan1998 · Jul 14, 2024 · a2446eb · a2446eb · BethYates · Jul 25, 2024
1 parent 12bff64
commit a2446eb
Show file tree

Hide file tree

Showing 6 changed files with 80 additions and 26 deletions.
diff --git a/bin/combine_parsed_data.py b/bin/combine_parsed_data.py
@@ -55,7 +55,10 @@ def process_file(file_in, params):
             if any(p in string.punctuation for p in value):
                 value = '"' + value + '"'
 
-            source_dict[key] = value
+            if key in source_dict:
+                source_dict[key].append(value)
+            else:
+                source_dict[key] = [value]
 
             if key in params:
                 params[key].append(value)
@@ -72,8 +75,18 @@ def main(args=None):
     params_inconsistent = {}
 
     for file in files:
-        (params, paramDict) = process_file(getattr(args, file[1]), params)
-        param_sets[file[0]] = paramDict
+        file_list = getattr(args, file[1])
+        if file_list:
+            for single_file in file_list.split(','):
+                (params, paramDict) = process_file(single_file, params)
+                if file[0] not in param_sets:
+                    param_sets[file[0]] = paramDict
+                else:
+                    for key, values in paramDict.items():
+                        if key in param_sets[file[0]]:
+                            param_sets[file[0]][key].extend(values)
+                        else:
+                            param_sets[file[0]][key] = values
 
     for key in params.keys():
         value_set = {v for v in params[key]}
@@ -82,7 +95,8 @@ def main(args=None):
 
             for source in param_sets:
                 if key in param_sets[source]:
-                    params_inconsistent[key].append((source, param_sets[source][key]))
+                    for val in param_sets[source][key]:
+                        params_inconsistent[key].append((source, val))
 
     # Strip inconsistent data from parameter list
     for i in params_inconsistent.keys():

diff --git a/bin/combine_statistics_data.py b/bin/combine_statistics_data.py
@@ -82,7 +82,11 @@ def process_inconsistent_file(file, params, inconsistent, consistent):
             else:
                 key = row.pop(0)
 
-                if consistent.get(key) is None:
+                if key in inconsistent:
+                    # Append to existing inconsistent values
+                    inconsistent[key].extend(row)
+                else:
+                    # Add new inconsistent values
                     inconsistent[key] = row
 
     return inconsistent

diff --git a/modules/local/combine_metadata.nf b/modules/local/combine_metadata.nf
@@ -21,12 +21,24 @@ process COMBINE_METADATA {
     script:
     def args = []
     def prefix = task.ext.prefix ?: meta.id
+    def biosample_files = []
+
     for (item in  file_list){
         def file = item
         def file_ext = item.getExtension()
         def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}") + "_file"
-        args.add(file_name)
-        args.add(file)
+
+        if (file_name.contains("biosample")) {
+                biosample_files.add(file)
+        } else {
+            args.add(file_name)
+            args.add(file)
+        }
+    }
+
+    if (!biosample_files.isEmpty()) {
+        args.add("--ena_biosample_file")
+        args.add(biosample_files.join(","))
     }
 
     """

diff --git a/modules/local/parse_metadata.nf b/modules/local/parse_metadata.nf
@@ -22,7 +22,7 @@ process PARSE_METADATA {
     script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/
     def prefix = task.ext.prefix ?: meta.id
     def script_name = "parse_${meta.ext.toLowerCase()}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.py"
-    def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.csv"
+    def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}${meta.biosample}.csv"
     """
     $script_name \\
         $json \\

diff --git a/modules/local/run_wget.nf b/modules/local/run_wget.nf
@@ -14,16 +14,17 @@ process RUN_WGET {
 
 
     output:
-    tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}.${meta.ext}") , emit:  file_path
+    tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}*.${meta.ext}") , emit:  file_path
     path "versions.yml"                         , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     def  no_certificate = (meta.source == 'GOAT') ? '--no-check-certificate' : ''
+    def output = "${meta.id}_${meta.source}_${meta.type}${meta.biosample}.${meta.ext}".strip('_')
     """
-        wget ${no_certificate} -c -O ${meta.id}_${meta.source}_${meta.type}.${meta.ext} '${url}'
+        wget ${no_certificate} -c -O ${output} '${url}'
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":

diff --git a/subworkflows/local/genome_metadata.nf b/subworkflows/local/genome_metadata.nf
@@ -15,37 +15,60 @@ workflow GENOME_METADATA {
 
     main:
     ch_versions = Channel.empty()
- 
+
     // Define channel for RUN_WGET
     ch_file_list
     | splitCsv(header: ['source', 'type', 'url', 'ext'], skip: 1)
-    | map { row -> 
-        [   
-            // meta
-            [   id: params.assembly,
-                taxon_id: params.taxon_id,
-                source: row.source, 
-                type: row.type, 
-                ext: row.ext, 
-            ],
-            // url 
-            row.url
+    | flatMap { row ->
+        // Create a list to hold the final entries
+        def entries = []
+
+        // Common metadata
+        def metadata = [
+            id: params.assembly,
+            taxon_id: params.taxon_id,
+            source: row.source,
+            type: row.type,
+            ext: row.ext
+        ]
+
+        // Process each biosample
+        params.biosample.split(',').each { biosample ->
+            def url = row.url
                 .replaceAll(/ASSEMBLY_ACCESSION/, params.assembly)
                 .replaceAll(/TAXONOMY_ID/, params.taxon_id)
                 .replaceAll(/BIOPROJECT_ACCESSION/, params.bioproject)
-                .replaceAll(/BIOSAMPLE_ACCESSION/, params.biosample)
-        ]
+                .replaceAll(/BIOSAMPLE_ACCESSION/, biosample.trim())
+
+            if (row.type == 'Biosample') {
+                // Add entry with biosample in metadata for Biosample type
+                entries << [
+                    metadata + [biosample: biosample.trim()],
+                    url
+                ]
+            } else {
+                // Add entry without biosample in metadata for other types
+                entries << [
+                    metadata + [biosample: ''],
+                    url
+                ]
+            }
+        }
+
+        return entries
     }
+    | unique()
     | set { file_list }
+    file_list.view()
 
     // Fetch files
     RUN_WGET ( file_list )
-    ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() ) 
+    ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() )
 
     PARSE_METADATA(RUN_WGET.out.file_path)
     ch_versions = ch_versions.mix( PARSE_METADATA.out.versions.first() )
 
-    PARSE_METADATA.out.file_path 
+    PARSE_METADATA.out.file_path
     | map { it -> tuple( it[1] )}
     | collect  
     | map { it ->