diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f8a290..e508e7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ Initial release of nf-core/variantprioritization, created with the [nf-core](htt ### `Added` +- [#47](https://github.com/nf-core/variantprioritization/pull/47) - Use bcftools/isec nf-core module for computing VCF intersection between multiple callers + ### `Fixed` - [#32](https://github.com/nf-core/variantprioritization/pull/32) - Rename to nf-core/variantprioritization diff --git a/bin/isec_vcfs.py b/bin/isec_vcfs.py index c87d8c6..e28cb05 100755 --- a/bin/isec_vcfs.py +++ b/bin/isec_vcfs.py @@ -7,7 +7,14 @@ import argparse -def intersect_variants(sample): +def create_tool_order_dict(tool_order_str): + tool_order_list = tool_order_str.split(",") + tool_order_dict = {} + for idx, tool in enumerate(tool_order_list): + tool_order_dict[idx] = tool + return tool_order_dict + +def intersect_variants(sample, tool_order): sample_id = sample suffixes = (".cns", ".tbi") @@ -15,76 +22,61 @@ def intersect_variants(sample): sample_files = os.listdir("./") sample_files = list(filter(r.match, sample_files)) sample_files = [file for file in sample_files if not file.endswith(suffixes)] - print(sample_files) - - tool_names = {} - for idx, file in enumerate(sample_files): - tool = file.split(".")[2] # change this if you change prefix - tool_names[idx] = tool - - if len(sample_files) > 1: - - for idx, x in enumerate(sample_files): - idx = idx + 1 # cant use 0 for -n - os.system(f'bcftools isec -c none -n={idx} -p {idx} {" ".join(str(x) for x in sample_files)}') - - pattern = "./**/sites.txt" - fn_size = {} - file_list = glob.glob(pattern, recursive=True) - for file in file_list: - file_size = os.stat(file).st_size - fn_size[file] = file_size - # remove sites.txt files that are empty - fn_size = {key: val for key, val in fn_size.items() if val != 0} - file_list = list(fn_size.keys()) - - li = [] - for filename in file_list: - df = pd.read_table(filename, 
sep="\t", header=None, converters={4: str}) # preserve leading zeros - li.append(df) - - frame = pd.concat(li, axis=0, ignore_index=True) - - # loop over the 4th column containing bytes '0101' etc - convert_column = [] - for byte_str in frame[4]: - # print(byte_str) - # convert 0101 to itemized list - code = [x for x in byte_str] - # init list to match 1's in bytestring to corresponding tool name - grab_index = [] - for idx, val in enumerate(code): - if val != "0": - grab_index.append(int(idx)) - bytes_2_tal = {k: tool_names[k] for k in grab_index if k in tool_names} - bytes_2_tal = ",".join(bytes_2_tal.values()) - convert_column.append(bytes_2_tal) - - assert len(convert_column) == len(frame), f"bytes to TAL section failed - length of list != length DF" - frame[4] = convert_column - # I noticed duplicate rows in the output file during testing. Worrying as I'm not sure how they got there... - # chr1 3866080 C T freebayes - # chr1 3866080 C T freebayes - frame = frame.drop_duplicates() - frame.to_csv(f"{sample}_keys.txt", sep="\t", index=None, header=None) - - else: - - os.system( - f'bcftools view {sample_files[0]} -G -H | awk -v OFS="\t" \'{{print $1, $2, $4, $5, "{tool_names[0]}"}}\' > {sample}_keys.txt' - ) - - -def main(): + + tool_names = create_tool_order_dict(tool_order) + pattern = "./**/sites.txt" + fn_size = {} + file_list = glob.glob(pattern, recursive=True) + for file in file_list: + file_size = os.stat(file).st_size + fn_size[file] = file_size + # remove sites.txt files that are empty + fn_size = {key: val for key, val in fn_size.items() if val != 0} + file_list = list(fn_size.keys()) + + li = [] + for filename in file_list: + df = pd.read_table(filename, sep="\t", header=None, converters={4: str}) # preserve leading zeros + li.append(df) + + frame = pd.concat(li, axis=0, ignore_index=True) + + # loop over the 4th column containing bytes '0101' etc + convert_column = [] + for byte_str in frame[4]: + # print(byte_str) + # convert 0101 to itemized list + 
code = [x for x in byte_str] + # init list to match 1's in bytestring to corresponding tool name + grab_index = [] + for idx, val in enumerate(code): + if val != "0": + grab_index.append(int(idx)) + bytes_2_tal = {k: tool_names[k] for k in grab_index if k in tool_names} + bytes_2_tal = ",".join(bytes_2_tal.values()) + convert_column.append(bytes_2_tal) + + assert len(convert_column) == len(frame), f"bytes to TAL section failed - length of list != length DF" + frame[4] = convert_column + # I noticed duplicate rows in the output file during testing. Worrying as I'm not sure how they got there... + # Don't worry, be happy - Not a core member - Hackathon 2025 + # chr1 3866080 C T freebayes + # chr1 3866080 C T freebayes + frame = frame.drop_duplicates() + frame.to_csv(f"{sample}_keys.txt", sep="\t", index=None, header=None) + + +def main(): # Argument parsing using argparse parser = argparse.ArgumentParser(description="Reformat somatic CNA files for PCGR input.") parser.add_argument("-s", "--sample", required=True, help="Sample name (meta.id) for the output.") + parser.add_argument("--tool_order", required=True, help="Comma-separated list of tool names in order of VCFs in intersection.") args = parser.parse_args() # Call reformat_cna function with arguments - intersect_variants(args.sample) + intersect_variants(args.sample, args.tool_order) if __name__ == "__main__": diff --git a/conf/modules.config b/conf/modules.config index 6f2f013..3ccfd2f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -47,6 +47,26 @@ process { ] } + withName: 'BCFTOOLS_ISEC' { + ext.args = {[ + "-c none", + "-n=${meta.vcf_size}", + ].join(" ").trim() + } + ext.prefix = { "${meta.vcf_size}"} + + publishDir = [ + enabled: false + ] + } + + withName: 'BCFTOOLS_VIEW_TO_KEYS' { + ext.prefix = { "${meta.patient}.${meta.sample}" } + publishDir = [ + enabled: false + ] + } + withName: 'BCFTOOLS_FILTER' { label = 'process_low' ext.args = { diff --git a/modules.json b/modules.json index
4e5ab8f..e5f0a0b 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", "installed_by": ["modules"] }, + "bcftools/isec": { + "branch": "master", + "git_sha": "f17049e03697726ace7499d2fe342f892594f6f3", + "installed_by": ["modules"] + }, "bcftools/norm": { "branch": "master", "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", diff --git a/modules/local/reformat_input/bcftools_view_to_keys.nf b/modules/local/reformat_input/bcftools_view_to_keys.nf new file mode 100644 index 0000000..075bfba --- /dev/null +++ b/modules/local/reformat_input/bcftools_view_to_keys.nf @@ -0,0 +1,31 @@ +process BCFTOOLS_VIEW_TO_KEYS { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bcftools=1.21" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5a/5acacb55c52bec97c61fd34ffa8721fce82ce823005793592e2a80bf71632cd0/data': + 'community.wave.seqera.io/library/bcftools:1.21--4335bec1d7b44d11' }" + + input: + tuple val(meta), path(vcf), path(tbi) + + output: + tuple val(meta), path("${prefix}_keys.txt"), emit: variant_tool_map + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + bcftools view ${vcf} -G -H | awk -v OFS="\t" \'{print \$1, \$2, \$4, \$5, "${meta.tools[0]}"}\' > ${prefix}_keys.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/reformat_input/isec_vcf.nf b/modules/local/reformat_input/isec_vcf.nf index b45e13a..785fa51
'docker.io/barryd237/pysam-xcmds:latest' }" input: - tuple val(meta), path(vcf), path(tbi) + tuple val(meta), path(isec_results) output: tuple val(meta), path("${prefix}_keys.txt"), emit: variant_tool_map @@ -21,6 +21,7 @@ process INTERSECT_SOMATIC_VARIANTS { prefix = task.ext.prefix ?: "${meta.id}" // meta.sample, toggle using modules.config """ isec_vcfs.py \ + --tool_order ${meta.tools.join(",")} \ --sample ${prefix} cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/bcftools/isec/environment.yml b/modules/nf-core/bcftools/isec/environment.yml new file mode 100644 index 0000000..ba863b3 --- /dev/null +++ b/modules/nf-core/bcftools/isec/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + # renovate: datasource=conda depName=bioconda/htslib + - bioconda::htslib=1.22.1 + # renovate: datasource=conda depName=bioconda/bcftools + - bioconda::bcftools=1.22 diff --git a/modules/nf-core/bcftools/isec/main.nf b/modules/nf-core/bcftools/isec/main.nf new file mode 100644 index 0000000..c9bdf4c --- /dev/null +++ b/modules/nf-core/bcftools/isec/main.nf @@ -0,0 +1,51 @@ +process BCFTOOLS_ISEC { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/47/474a5ea8dc03366b04df884d89aeacc4f8e6d1ad92266888e7a8e7958d07cde8/data': + 'community.wave.seqera.io/library/bcftools_htslib:0a3fa2654b52006f' }" + + input: + tuple val(meta), path(vcfs), path(tbis) + + output: + tuple val(meta), path("${prefix}", type: "dir"), emit: results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + bcftools isec \\ + $args \\ + -p $prefix \\ + ${vcfs} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir ${prefix} + touch ${prefix}/README.txt + touch ${prefix}/sites.txt + echo "" | gzip > ${prefix}/0000.vcf.gz + touch ${prefix}/0000.vcf.gz.tbi + echo "" | gzip > ${prefix}/0001.vcf.gz + touch ${prefix}/0001.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/isec/meta.yml b/modules/nf-core/bcftools/isec/meta.yml new file mode 100644 index 0000000..55cb49f --- /dev/null +++ b/modules/nf-core/bcftools/isec/meta.yml @@ -0,0 +1,58 @@ +name: bcftools_isec +description: Apply set operations to VCF files +keywords: + - variant calling + - intersect + - union + - complement + - VCF + - BCF +tools: + - isec: + description: | + Computes intersections, unions and complements of VCF files. 
+ homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + identifier: biotools:bcftools +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcfs: + type: list + description: | + List containing 2 or more vcf/bcf files. These must be compressed and have an associated index. + e.g. [ 'file1.vcf.gz', 'file2.vcf' ] + - tbis: + type: list + description: | + List containing the tbi index files corresponding to the vcf/bcf input files + e.g. [ 'file1.vcf.tbi', 'file2.vcf.tbi' ] +output: + results: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}: + type: directory + description: Folder containing the set operations results perform on the vcf files + pattern: "${prefix}" + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bcftools/isec/tests/main.nf.test b/modules/nf-core/bcftools/isec/tests/main.nf.test new file mode 100644 index 0000000..ca8fc6e --- /dev/null +++ b/modules/nf-core/bcftools/isec/tests/main.nf.test @@ -0,0 +1,82 @@ +nextflow_process { + + name "Test Process BCFTOOLS_ISEC" + script "../main.nf" + process "BCFTOOLS_ISEC" + + tag "modules" + tag "modules_nfcore" + tag "bcftools" + tag "bcftools/isec" + + config "./nextflow.config" + + test("sarscov2 - [[vcf1.gz, vcf2.gz], [tbi1, tbi2]]") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/vcf/test2.vcf.gz', checkIfExists: true) + ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test2.vcf.gz.tbi', checkIfExists: true) + ] + ] + """ + } + } + + then { + def results_dir = new File(process.out.results[0][1]) + def results_list = [] + results_dir.eachFileRecurse { file -> results_list << file.getName() } + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + results_list.sort(), + path("${process.out.results[0][1]}").list().findAll { + it.getFileName().toString() != "0000.vcf.gz.tbi" && it.getFileName().toString() != "0001.vcf.gz.tbi" + } + ).match() + } + ) + } + + } + + test("sarscov2 - [[vcf1, vcf2], [tbi1, tbi2]] - stub") { + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test2.vcf.gz', checkIfExists: true) + ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz.tbi', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test2.vcf.gz.tbi', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/isec/tests/main.nf.test.snap b/modules/nf-core/bcftools/isec/tests/main.nf.test.snap new file mode 100644 index 0000000..b8407c3 --- /dev/null +++ b/modules/nf-core/bcftools/isec/tests/main.nf.test.snap @@ -0,0 +1,75 @@ +{ + "sarscov2 - [[vcf1.gz, vcf2.gz], [tbi1, tbi2]]": { + "content": [ + [ + "versions.yml:md5,60b924654d2911d4f90d8b357c9d2d56" + ], + [ + "0000.vcf.gz", + 
"0000.vcf.gz.tbi", + "0001.vcf.gz", + "0001.vcf.gz.tbi", + "README.txt", + "sites.txt" + ], + [ + "0000.vcf.gz:md5,8e722884ffb75155212a3fc053918766", + "0001.vcf.gz:md5,b39a72f91458b94b346dd73690207649", + "README.txt:md5,10fc33b66522645600d44afbd41fb792", + "sites.txt:md5,1cea3fbde7f6d3c97f3d39036f9690df" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-11T14:19:09.011186" + }, + "sarscov2 - [[vcf1, vcf2], [tbi1, tbi2]] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "0000.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "0000.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e", + "0001.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "0001.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e", + "README.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "sites.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,60b924654d2911d4f90d8b357c9d2d56" + ], + "results": [ + [ + { + "id": "test" + }, + [ + "0000.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "0000.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e", + "0001.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "0001.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e", + "README.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "sites.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,60b924654d2911d4f90d8b357c9d2d56" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-11T14:19:13.592383" + } +} \ No newline at end of file diff --git a/modules/nf-core/bcftools/isec/tests/nextflow.config b/modules/nf-core/bcftools/isec/tests/nextflow.config new file mode 100644 index 0000000..ac887d6 --- /dev/null +++ b/modules/nf-core/bcftools/isec/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = '--nfiles +2 --output-type z --no-version' +} diff --git a/subworkflows/local/format_files.nf b/subworkflows/local/format_files.nf index 55664b9..c356a85 
100644 --- a/subworkflows/local/format_files.nf +++ b/subworkflows/local/format_files.nf @@ -8,6 +8,8 @@ include { REFORMAT_VCF } from '../../modules/local/reform include { REFORMAT_CNA } from '../../modules/local/reformat_input/reformat_cna' include { INTERSECT_SOMATIC_VARIANTS } from '../../modules/local/reformat_input/isec_vcf' include { PCGR_VCF } from '../../modules/local/reformat_input/pcgr_vcf' +include { BCFTOOLS_ISEC } from '../../modules/nf-core/bcftools/isec/main' +include { BCFTOOLS_VIEW_TO_KEYS } from '../../modules/local/reformat_input/bcftools_view_to_keys' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -56,10 +58,74 @@ workflow FORMAT_FILES { skip: 1 )*/ - INTERSECT_SOMATIC_VARIANTS( per_sample_somatic_vcfs ) + // This could be refactored with the .collect() groovy list method. + per_sample_somatic_vcfs.transpose().map { + meta, vcf, tbis -> + def tool_name = vcf.toString().tokenize('.')[2] + [ meta , tool_name, vcf, tbis ] + }.groupTuple().map { + meta, tool_names, vcfs, tbis -> + [ meta + ['tools': tool_names] , vcfs, tbis ] + }.set { per_sample_somatic_vcfs } + + per_sample_somatic_vcfs.map{ + meta, vcfs, tbis -> + [meta, vcfs, tbis, (1..vcfs.size()).toList()] + }.branch { + meta, vcfs, tbis, vcf_size -> + single: vcfs.size() < 2 + multiple: vcfs.size() > 1 + }.set { per_sample_somatic_vcfs } + + per_sample_somatic_vcfs.multiple.transpose(by: 3).map{ + meta, vcfs, tbis, vcf_size -> + [ meta + ['vcf_size': vcf_size], vcfs, tbis ] + }.set { per_sample_somatic_vcfs_multiple } + + per_sample_somatic_vcfs.single.map { + meta, vcfs, tbis, isec_size -> + [ meta , vcfs, tbis ] + }.set { per_sample_somatic_vcfs_single } + + + BCFTOOLS_ISEC ( per_sample_somatic_vcfs_multiple ) + + ch_isec_somatic_postprocess = BCFTOOLS_ISEC.out.results.map { + meta, results -> + [ meta.subMap([ 'patient', 'status', 'sample', 'tools']), results ] + }.groupTuple() + + BCFTOOLS_VIEW_TO_KEYS ( per_sample_somatic_vcfs_single ) + + 
+ INTERSECT_SOMATIC_VARIANTS( ch_isec_somatic_postprocess ) + + INTERSECT_SOMATIC_VARIANTS.out.variant_tool_map.mix( + BCFTOOLS_VIEW_TO_KEYS.out.variant_tool_map + ).set{ variant_tool_map_ch } + // merge mapping key back with sample VCFs, produce PCGR ready VCFs. - sample_vcfs_keys = INTERSECT_SOMATIC_VARIANTS.out.variant_tool_map.join(per_sample_somatic_vcfs) + + per_sample_somatic_vcfs_single.map { + meta, vcf, tbi -> + [ meta.subMap([ 'patient', 'status', 'sample']), vcf, tbi ] + }.mix( + per_sample_somatic_vcfs.multiple.map { + meta, vcf, tbi, isec_iter -> + [ meta.subMap([ 'patient', 'status', 'sample']), vcf, tbi ] + } + ).set { per_sample_somatic_vcfs_all } + + variant_tool_map_ch.map{ + meta, variant_tool_map -> + [ meta.subMap([ 'patient', 'status', 'sample']), variant_tool_map ] + }.join( + per_sample_somatic_vcfs_all + ). set { sample_vcfs_keys } + + + /*sample_vcfs_keys.map{ it -> "My values are:\n$it\n"