Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow running of metadata subworkflow on different biosample types #132

Merged
merged 3 commits into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions bin/combine_parsed_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
files = [
("ENA_ASSEMBLY", "ena_assembly_file"),
("ENA_BIOPROJECT", "ena_bioproject_file"),
("ENA_BIOSAMPLE", "ena_biosample_file"),
("ENA_BIOSAMPLE", "ena_biosample_wgs_file"),
("ENA_BIOSAMPLE_HIC", "ena_biosample_hic_file"),
("ENA_BIOSAMPLE_RNA", "ena_biosample_rna_file"),
("ENA_TAXONOMY", "ena_taxonomy_file"),
("NCBI_ASSEMBLY", "ncbi_assembly_file"),
("NCBI_TAXONOMY", "ncbi_taxonomy_file"),
Expand All @@ -25,7 +27,9 @@ def parse_args(args=None):
parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("--ena_assembly_file", help="Input parsed ENA assembly file.", required=False)
parser.add_argument("--ena_bioproject_file", help="Input parsed ENA assembly file.", required=False)
parser.add_argument("--ena_biosample_file", help="Input parsed ENA assembly file.", required=False)
parser.add_argument("--ena_biosample_wgs_file", help="Input parsed ENA genomic biosample file.", required=False)
parser.add_argument("--ena_biosample_hic_file", help="Input parsed ENA HiC biosample file.", required=False)
parser.add_argument("--ena_biosample_rna_file", help="Input parsed ENA RNASeq biosample file.", required=False)
parser.add_argument("--ena_taxonomy_file", help="Input parsed ENA assembly file.", required=False)
parser.add_argument("--ncbi_assembly_file", help="Input parsed ENA assembly file.", required=False)
parser.add_argument("--ncbi_taxonomy_file", help="Input parsed ENA assembly file.", required=False)
Expand Down
19 changes: 14 additions & 5 deletions bin/parse_xml_ena_biosample.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@


def parse_args(args=None):
Description = "Parse contents of an ENA SAMPLE report and pul out meta data required by a genome note."
Description = "Parse contents of an ENA SAMPLE report and pull out meta data required by a genome note."
Epilog = "Example usage: python parse_xml_ena_sample.py <FILE_IN> <FILE_OUT>"

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
Expand Down Expand Up @@ -77,6 +77,13 @@ def parse_xml(file_in, file_out):
root = tree.getroot()
param_list = []

# Extract biosample type from FILE_OUT
biosample_type = None
if "HIC" in file_out.upper():
biosample_type = "HIC"
elif "RNA" in file_out.upper():
biosample_type = "RNA"

for f in fetch:
param = None
r = root
Expand All @@ -101,8 +108,7 @@ def parse_xml(file_in, file_out):
param = r.attrib.get(f[2][1])
except ValueError:
param = None

## Count child elements with specfic tag
## Count child elements with specific tag
if f[2][0] == "count":
if r is not None:
param = str(len(r.findall(f[2][1]))) if len(r.findall(f[2][1])) != 0 else None
Expand All @@ -129,8 +135,11 @@ def parse_xml(file_in, file_out):

if any(p in string.punctuation for p in param):
param = '"' + param + '"'

param_list.append([f[0], param])
# Prefix parameter name if biosample type is HiC or RNA
param_name = f[0]
if biosample_type in ["HIC", "RNA"]:
param_name = f"{biosample_type}_{param_name}"
param_list.append([param_name, param])

if len(param_list) > 0:
out_dir = os.path.dirname(file_out)
Expand Down
4 changes: 3 additions & 1 deletion conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ params {
species = 'Cloeon_dipterum'
taxon_id = '197152'
bioproject = 'PRJEB45177'
biosample = 'SAMEA7520803'
biosample_wgs = 'SAMEA7520803'
biosample_hic = 'SAMEA7520846'
biosample_rna = 'SAMEA7521081'

// Genome Notes Portal
write_to_portal = false
Expand Down
4 changes: 3 additions & 1 deletion conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ params {
species = 'Ypsolopha_sequella'
taxon_id = '1870436'
bioproject = 'PRJEB50740'
biosample = 'SAMEA7519929'
biosample_wgs = 'SAMEA7519929'
biosample_hic = 'SAMEA7519968'
biosample_rna = null

// Genome Notes Portal
write_to_portal = false
Expand Down
8 changes: 5 additions & 3 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ You will need to supply the assembly accession for the genome you would like to
--species '[species name]'
--taxon_id '[taxon id]'
--bioproject '[bioproject accession]'
--biosample '[biosample accession]'
--biosample_wgs '[biosample accession of the biosample used to produce the genomic sequence]'
--biosample_hic '[biosample accession of the biosample used to produce the HiC data]'
--biosample_rna '[biosample accession of the biosample used to produce the RNASeq data]'
```

If you wish to run the optional step that writes genome metadata key-value pairs to a genome notes database, you will need to set the parameter "write_to_portal" to true and provide the base URL for the REST API that writes to the database.
Expand Down Expand Up @@ -101,7 +103,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p
The typical command for running the pipeline is as follows:

```bash
nextflow run sanger-tol/genomenote --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta --assembly GCA_922984935.2 --species Epithemia_sp._CRS-2021b --taxon_id 2809013 --bioproject PRJEB49353 --biosample SAMEA7524400 -profile docker
nextflow run sanger-tol/genomenote --input samplesheet.csv --outdir <OUTDIR> --fasta genome.fasta --assembly GCA_922984935.2 --species Epithemia_sp._CRS-2021b --taxon_id 2809013 --bioproject PRJEB49353 --biosample_wgs SAMEA7524400 -profile docker
```

This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
Expand Down Expand Up @@ -137,7 +139,7 @@ assembly: 'GCA_922984935.2'
species: 'Epithemia_sp._CRS-2021b'
taxon_id: '2809013'
bioproject: 'PRJEB49353'
biosample: 'SAMEA7524400'
biosample_wgs: 'SAMEA7524400'
<...>
```

Expand Down
2 changes: 1 addition & 1 deletion modules/local/combine_metadata.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ process COMBINE_METADATA {
for (item in file_list){
def file = item
def file_ext = item.getExtension()
def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}") + "_file"
def file_name = "--" + item.getName().minus("${prefix}_").minus(".${file_ext}").toLowerCase() + "_file"
args.add(file_name)
args.add(file)
}
Expand Down
3 changes: 2 additions & 1 deletion modules/local/parse_metadata.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ process PARSE_METADATA {
script: // This script is bundled with the pipeline, in nf-core/genomenote/bin/
def prefix = task.ext.prefix ?: meta.id
def script_name = "parse_${meta.ext.toLowerCase()}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.py"
def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}.csv"
def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA") ? "_${meta.biosample_type}" : ""
def output_file = "${prefix}_${meta.source.toLowerCase()}_${meta.type.toLowerCase()}${is_biosample}.csv".strip('_')
"""
$script_name \\
$json \\
Expand Down
6 changes: 4 additions & 2 deletions modules/local/run_wget.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,18 @@ process RUN_WGET {


output:
tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}.${meta.ext}") , emit: file_path
tuple val(meta), path("${meta.id}_${meta.source}_${meta.type}*.${meta.ext}") , emit: file_path
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def no_certificate = (meta.source == 'GOAT') ? '--no-check-certificate' : ''
def is_biosample = (meta.biosample_type == "WGS" || meta.biosample_type == "HIC" || meta.biosample_type == "RNA") ? "_${meta.biosample_type}" : ""
def output = "${meta.id}_${meta.source}_${meta.type}${is_biosample}.${meta.ext}".strip('_')
"""
wget ${no_certificate} -c -O ${meta.id}_${meta.source}_${meta.type}.${meta.ext} '${url}'
wget ${no_certificate} -c -O ${output} '${url}'

cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 3 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ params {
species = null
taxon_id = null
bioproject = null
biosample = null
biosample_wgs = null
biosample_hic = null
biosample_rna = null

// Genome Notes
write_to_portal = false
Expand Down
12 changes: 10 additions & 2 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,17 @@
"type": "string",
"description": "The bioproject accession linked to the genome assembly, for example: PRJEB49353."
},
"biosample": {
"biosample_wgs": {
"type": "string",
"description": "The biosample accession(s) linked to the samples in the experiment, for example: SAMEA7524400."
"description": "The biosample accession(s) linked to the WGS samples in the experiment, for example: SAMEA7520803."
},
"biosample_rna": {
"type": "string",
"description": "The biosample accession(s) linked to the RNA samples in the experiment, for example: SAMEA7521081."
},
"biosample_hic": {
"type": "string",
"description": "The biosample accession(s) linked to the Hi-C samples in the experiment, for example: SAMEA7520846."
},
"outdir": {
"type": "string",
Expand Down
67 changes: 49 additions & 18 deletions subworkflows/local/genome_metadata.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,68 @@ workflow GENOME_METADATA {

main:
ch_versions = Channel.empty()

// Define channel for RUN_WGET
ch_file_list
| splitCsv(header: ['source', 'type', 'url', 'ext'], skip: 1)
| map { row ->
[
// meta
[ id: params.assembly,
taxon_id: params.taxon_id,
source: row.source,
type: row.type,
ext: row.ext,
],
// url
row.url
.replaceAll(/ASSEMBLY_ACCESSION/, params.assembly)
.replaceAll(/TAXONOMY_ID/, params.taxon_id)
.replaceAll(/BIOPROJECT_ACCESSION/, params.bioproject)
.replaceAll(/BIOSAMPLE_ACCESSION/, params.biosample)
| flatMap { row ->
// Create a list to hold the final entries
def entries = []

// Common metadata
def metadata = [
id: params.assembly,
taxon_id: params.taxon_id,
source: row.source,
type: row.type,
ext: row.ext
]

// Define biosamples with their types
def biosamples = [
["WGS", params.biosample_wgs],
["HIC", params.biosample_hic],
["RNA", params.biosample_rna]
]

// Process each biosample
biosamples.each { biosampleType, biosampleID ->
if ( biosampleID != null ) {
// Only build an entry when this biosample accession was supplied (null IDs are skipped)
def url = row.url
.replaceAll(/ASSEMBLY_ACCESSION/, params.assembly)
.replaceAll(/TAXONOMY_ID/, params.taxon_id)
.replaceAll(/BIOPROJECT_ACCESSION/, params.bioproject)
.replaceAll(/BIOSAMPLE_ACCESSION/, biosampleID)

if (row.type == 'Biosample') {
// Add entry with biosample type in metadata for Biosample type
entries << [
metadata + [biosample_type: biosampleType],
url
]
} else {
// Add entry without biosample type in metadata for other types
entries << [
metadata + [biosample_type: ''],
url
]
}
}
}
return entries
}
| unique()
| set { file_list }

// Fetch files
RUN_WGET ( file_list )
ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() )
ch_versions = ch_versions.mix( RUN_WGET.out.versions.first() )

PARSE_METADATA(RUN_WGET.out.file_path)
ch_versions = ch_versions.mix( PARSE_METADATA.out.versions.first() )

PARSE_METADATA.out.file_path
PARSE_METADATA.out.file_path
| map { it -> tuple( it[1] )}
| collect
| map { it ->
Expand Down
4 changes: 2 additions & 2 deletions workflows/genomenote.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def checkPathParamList = [ params.input, params.multiqc_config, params.lineage_d
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }

// Check mandatory parameters
if (params.assembly && params.taxon_id && params.bioproject && params.biosample) { metadata_inputs = [ params.assembly, params.taxon_id, params.bioproject, params.biosample ] }
else { exit 1, 'Metadata input not specified. Please include an assembly accession, a taxon id, a bioproject accession and a biosample_accession' }
if (params.assembly && params.taxon_id && params.bioproject && params.biosample_wgs) { metadata_inputs = [ params.assembly, params.taxon_id, params.bioproject, params.biosample_wgs ] }
else { exit 1, 'Metadata input not specified. Please include an assembly accession, a taxon id, a bioproject accession and a biosample accession' }
if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' }
if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Genome fasta not specified!' }
if (params.binsize) { ch_bin = Channel.of(params.binsize) } else { exit 1, 'Bin size for cooler/cload not specified!' }
Expand Down
Loading