Skip to content

Commit

Permalink
Allow adding multiple biosample
Browse files Browse the repository at this point in the history
  • Loading branch information
reichan1998 committed Jul 14, 2024
1 parent 12bff64 commit a2446eb
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 26 deletions.
22 changes: 18 additions & 4 deletions bin/combine_parsed_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ def process_file(file_in, params):
if any(p in string.punctuation for p in value):
value = '"' + value + '"'

source_dict[key] = value
if key in source_dict:
source_dict[key].append(value)
else:
source_dict[key] = [value]

if key in params:
params[key].append(value)
Expand All @@ -72,8 +75,18 @@ def main(args=None):
params_inconsistent = {}

for file in files:
(params, paramDict) = process_file(getattr(args, file[1]), params)
param_sets[file[0]] = paramDict
file_list = getattr(args, file[1])
if file_list:
for single_file in file_list.split(','):
(params, paramDict) = process_file(single_file, params)
if file[0] not in param_sets:
param_sets[file[0]] = paramDict
else:
for key, values in paramDict.items():
if key in param_sets[file[0]]:
param_sets[file[0]][key].extend(values)
else:
param_sets[file[0]][key] = values

for key in params.keys():
value_set = {v for v in params[key]}
Expand All @@ -82,7 +95,8 @@ def main(args=None):

for source in param_sets:
if key in param_sets[source]:
params_inconsistent[key].append((source, param_sets[source][key]))
for val in param_sets[source][key]:
params_inconsistent[key].append((source, val))

# Strip inconsistent data from parameter list
for i in params_inconsistent.keys():
Expand Down
6 changes: 5 additions & 1 deletion bin/combine_statistics_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ def process_inconsistent_file(file, params, inconsistent, consistent):
else:
key = row.pop(0)

if consistent.get(key) is None:
if key in inconsistent:
# Append to existing inconsistent values
inconsistent[key].extend(row)
else:
# Add new inconsistent values
inconsistent[key] = row

return inconsistent
Expand Down
16 changes: 14 additions & 2 deletions modules/local/combine_metadata.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,24 @@ process COMBINE_METADATA {
script:
def args = []
def prefix = task.ext.prefix ?: meta.id
def biosample_files = []

for (item in file_list){
def file = item
def file_ext = item.getExtension()
def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}") + "_file"
args.add(file_name)
args.add(file)

if (file_name.contains("biosample")) {
biosample_files.add(file)
} else {
args.add(file_name)
args.add(file)
}
}

if (!biosample_files.isEmpty()) {
args.add("--ena_biosample_file")
args.add(biosample_files.join(","))
}

"""
Expand Down
2 changes: 1 addition & 1 deletion modules/local/parse_metadata.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ process PARSE_METADATA {
script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/
def prefix = task.ext.prefix ?: meta.id
def script_name = "parse_${meta.ext.toLowerCase()}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.py"
def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.csv"
def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}${meta.biosample}.csv"
"""
$script_name \\
$json \\
Expand Down
5 changes: 3 additions & 2 deletions modules/local/run_wget.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,17 @@ process RUN_WGET {


output:
tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}.${meta.ext}") , emit: file_path
tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}*.${meta.ext}") , emit: file_path
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def no_certificate = (meta.source == 'GOAT') ? '--no-check-certificate' : ''
def output = "${meta.id}_${meta.source}_${meta.type}${meta.biosample}.${meta.ext}".strip('_')
"""
wget ${no_certificate} -c -O ${meta.id}_${meta.source}_${meta.type}.${meta.ext} '${url}'
wget ${no_certificate} -c -O ${output} '${url}'
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
55 changes: 39 additions & 16 deletions subworkflows/local/genome_metadata.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,60 @@ workflow GENOME_METADATA {

main:
ch_versions = Channel.empty()

// Define channel for RUN_WGET
ch_file_list
| splitCsv(header: ['source', 'type', 'url', 'ext'], skip: 1)
| map { row ->
[
// meta
[ id: params.assembly,
taxon_id: params.taxon_id,
source: row.source,
type: row.type,
ext: row.ext,
],
// url
row.url
| flatMap { row ->
// Create a list to hold the final entries
def entries = []

// Common metadata
def metadata = [
id: params.assembly,
taxon_id: params.taxon_id,
source: row.source,
type: row.type,
ext: row.ext
]

// Process each biosample
params.biosample.split(',').each { biosample ->
def url = row.url
.replaceAll(/ASSEMBLY_ACCESSION/, params.assembly)
.replaceAll(/TAXONOMY_ID/, params.taxon_id)
.replaceAll(/BIOPROJECT_ACCESSION/, params.bioproject)
.replaceAll(/BIOSAMPLE_ACCESSION/, params.biosample)
]
.replaceAll(/BIOSAMPLE_ACCESSION/, biosample.trim())

if (row.type == 'Biosample') {
// Add entry with biosample in metadata for Biosample type
entries << [
metadata + [biosample: biosample.trim()],
url
]
} else {
// Add entry without biosample in metadata for other types
entries << [
metadata + [biosample: ''],
url
]
}
}

return entries
}
| unique()
| set { file_list }
file_list.view()

// Fetch files
RUN_WGET ( file_list )
ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() )
ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() )

PARSE_METADATA(RUN_WGET.out.file_path)
ch_versions = ch_versions.mix( PARSE_METADATA.out.versions.first() )

PARSE_METADATA.out.file_path
PARSE_METADATA.out.file_path
| map { it -> tuple( it[1] )}
| collect
| map { it ->
Expand Down

4 comments on commit a2446eb

@BethYates
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is really close to doing what we need, but there are a few changes that I think could improve things. I've updated the original issue to try to explain things a bit better, but I'll add some specific comments here too.

Rather than passing through a single parameter with a list of biosample IDs, I would prefer to pass them as individual parameters. This is because the data type for each of the biosamples is important and will be needed later when generating the genome note document. There are three types of biosample data that we are concerned with:

  • the biosample used to produce the genomic sequence
  • the biosample used to produce the HiC data
  • the biosample used to produce the RNASeq data

I would like to see a parameter for each of these added to the config. You could then replace your line in genome_metadata.nf

params.biosample.split(',').each { biosample ->
with something like
var biosamples = [["WGS", params.biosample_wgs], ["HIC", params.biosample_hic], ["RNA", params.biosample_rna]]
biosamples.each { biosampleType, biosampleID ->

In your metadata array you could then include the biosampleType as a new item "biosample_type" and remove the "biosample" entry. biosampleID would then be used to replace the "BIOSAMPLE_ACCESSION" in the replaceAll()

@BethYates
Copy link

@BethYates BethYates commented on a2446eb Jul 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In run_wget.nf it is more informative for us to know what type of biosample data we are dealing with than to know the biosample ID in the output file, so I would suggest modifying the name of the output file. You could do something like this:
def is_biosample = (meta.biosample_type == "WGS" | meta.biosample_type == "HIC" | meta.biosample_type == "RNA" ) ? "_${meta.biosample_type}" : ""
def output = "${meta.id}_${meta.source}_${meta.type}${is_biosample}.${meta.ext}"

I would make the same change to the output file in parse_metadata.nf too

@BethYates
Copy link

@BethYates BethYates commented on a2446eb Jul 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would then modify the script that processes the biosample related files, parse_xml_ena_biosample.py. As the output file name in the FILE_OUT argument passed to the script now contains information on the biosample data type, you can extract this from the file name and use it to prefix the parameter names in the output file, e.g. COLLECTOR would become either "COLLECTOR" or "HIC_COLLECTOR" or "RNA_COLLECTOR". That way, when we merge all the individual metadata files, we will know which values correspond to which type of biosample and we will be able to use these values to fill in the genome note template file.

I would only prefix the parameters that correspond to the HiC and RNASeq biosample accessions as this is what the template file we are using to produce the genome note expects.

@BethYates
Copy link

@BethYates BethYates commented on a2446eb Jul 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you make the changes suggested above I think that the changes you have made to combine_statistics_data.py and combine_metadata.nf should not be needed.

In combine_metadata.nf you will need to change

def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}") + "_file"

to

def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}").toLowerCase() + "_file"

though.

And in combine_parsed_data.py you can simplify things if you remove your changes and instead add the following to the files list

    ("ENA_BIOSAMPLE", "ena_biosample_wgs_file"),
    ("ENA_BIOSAMPLE_HIC", "ena_biosample_hic_file"),
    ("ENA_BIOSAMPLE_RNA", "ena_biosample_rna_file"), 

and add

    parser.add_argument("--ena_biosample_wgs_file", help="Input parsed ENA genomic biosample file.", required=False)
    parser.add_argument("--ena_biosample_hic_file", help="Input parsed ENA HiC biosample file.", required=False)
    parser.add_argument("--ena_biosample_rna_file", help="Input parsed ENA RNASeq biosample file.", required=False)

to the parse_args function

Please sign in to comment.