nf-core · maxibor · Oct 28, 2025 · Oct 29, 2025 · Oct 29, 2025 · Oct 29, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,8 @@ Initial release of nf-core/variantprioritization, created with the [nf-core](htt
 
 ### `Added`
 
+- [#47](https://github.com/nf-core/variantprioritization/pull/47) - Use bcftools/isec nf-core module for computing VCF intersection between multiple callers
+
 ### `Fixed`
 
 - [#32](https://github.com/nf-core/variantprioritization/pull/32) - Rename to nf-core/variantprioritization

diff --git a/bin/isec_vcfs.py b/bin/isec_vcfs.py
@@ -7,84 +7,76 @@
 import argparse
 
 
-def intersect_variants(sample):
+def create_tool_order_dict(tool_order_str):
+    """Map each 0-based VCF position to its tool name, e.g. "a,b" -> {0: "a", 1: "b"}."""
+    if not tool_order_str:
+        # Fail with a clear message rather than AttributeError on None (option omitted).
+        raise ValueError("tool_order must be a comma-separated list of tool names")
+    return dict(enumerate(tool_order_str.split(",")))
+
+def intersect_variants(sample, tool_order):
+    """Write ``{sample}_keys.txt``: one row per intersected site, labelled with its callers.
+
+    Every non-empty ``sites.txt`` found below the working directory is read and
+    the tables are concatenated. Their presence bitmask column (e.g. ``0101``,
+    one character per input VCF) is replaced by the comma-separated names of
+    the tools whose bit is set, duplicate rows are dropped and the result is
+    written tab-separated without header. An empty output file is written when
+    there is no non-empty ``sites.txt`` at all.
+
+    Args:
+        sample: Prefix of the output file name.
+        tool_order: Comma-separated tool names, in the order of the VCFs in the
+            intersection; parsed by ``create_tool_order_dict``.
+    """
+    tool_names = create_tool_order_dict(tool_order)
+    out_file = f"{sample}_keys.txt"
+
+    # Zero-byte sites.txt files are skipped: pandas cannot parse an empty table.
+    file_list = [
+        path
+        for path in glob.glob("./**/sites.txt", recursive=True)
+        if os.stat(path).st_size != 0
+    ]
+    if not file_list:
+        # pd.concat([]) raises ValueError; still produce the expected output file.
+        with open(out_file, "w"):
+            pass
+        return
+
+    frame = pd.concat(
+        # converters={4: str} keeps the bitmask as text so leading zeros survive
+        [pd.read_table(name, sep="\t", header=None, converters={4: str}) for name in file_list],
+        axis=0,
+        ignore_index=True,
+    )
+
+    # Translate each bitmask into tool names: position i of the mask belongs to
+    # tool_names[i], and any character other than "0" counts as present.
+    labels = []
+    for mask in frame[4]:
+        called_by = [tool_names[idx] for idx, bit in enumerate(mask) if bit != "0" and idx in tool_names]
+        labels.append(",".join(called_by))
+    frame[4] = labels
+
+    # Duplicate rows were observed in the output during testing (cause unknown), e.g.
+    # chr1    3866080 C       T       freebayes
+    # chr1    3866080 C       T       freebayes
+    frame = frame.drop_duplicates()
+    frame.to_csv(out_file, sep="\t", index=None, header=None)
-
-    sample_id = sample
-    suffixes = (".cns", ".tbi")
-    r = re.compile(f"{sample_id}*")
-    sample_files = os.listdir("./")
-    sample_files = list(filter(r.match, sample_files))
-    sample_files = [file for file in sample_files if not file.endswith(suffixes)]
-    print(sample_files)
-
-    tool_names = {}
-    for idx, file in enumerate(sample_files):
-        tool = file.split(".")[2]  # change this if you change prefix
-        tool_names[idx] = tool
-
-    if len(sample_files) > 1:
-
-        for idx, x in enumerate(sample_files):
-            idx = idx + 1  # cant use 0 for -n
-            os.system(f'bcftools isec -c none -n={idx} -p {idx} {" ".join(str(x) for x in sample_files)}')
-
-        pattern = "./**/sites.txt"
-        fn_size = {}
-        file_list = glob.glob(pattern, recursive=True)
-        for file in file_list:
-            file_size = os.stat(file).st_size
-            fn_size[file] = file_size
-        # remove sites.txt files that are empty
-        fn_size = {key: val for key, val in fn_size.items() if val != 0}
-        file_list = list(fn_size.keys())
-
-        li = []
-        for filename in file_list:
-            df = pd.read_table(filename, sep="\t", header=None, converters={4: str})  # preserve leading zeros
-            li.append(df)
-
-        frame = pd.concat(li, axis=0, ignore_index=True)
-
-        # loop over the 4th column containing bytes '0101' etc
-        convert_column = []
-        for byte_str in frame[4]:
-            # print(byte_str)
-            # convert 0101 to itemized list
-            code = [x for x in byte_str]
-            # init list to match 1's in bytestring to corresponding tool name
-            grab_index = []
-            for idx, val in enumerate(code):
-                if val != "0":
-                    grab_index.append(int(idx))
-            bytes_2_tal = {k: tool_names[k] for k in grab_index if k in tool_names}
-            bytes_2_tal = ",".join(bytes_2_tal.values())
-            convert_column.append(bytes_2_tal)
-
-        assert len(convert_column) == len(frame), f"bytes to TAL section failed - length of list != length DF"
-        frame[4] = convert_column
-        # I noticed duplicate rows in the output file during testing. Worrying as I'm not sure how they got there...
-        # chr1    3866080 C       T       freebayes
-        # chr1    3866080 C       T       freebayes
-        frame = frame.drop_duplicates()
-        frame.to_csv(f"{sample}_keys.txt", sep="\t", index=None, header=None)
-
-    else:
-
-        os.system(
-            f'bcftools view {sample_files[0]} -G -H | awk -v OFS="\t" \'{{print $1, $2, $4, $5, "{tool_names[0]}"}}\' > {sample}_keys.txt'
-        )
-
-
-def main():
+
+
+def main():
+    """Parse command-line arguments and run intersect_variants."""
     # Argument parsing using argparse
-    parser = argparse.ArgumentParser(description="Reformat somatic CNA files for PCGR input.")
+    parser = argparse.ArgumentParser(description="Map intersected variant sites to the tools that called them.")
     parser.add_argument("-s", "--sample", required=True, help="Sample name (meta.id) for the output.")
+    parser.add_argument("--tool_order", required=True, help="Comma-separated list of tool names in order of VCFs in intersection.")
 
     args = parser.parse_args()
 
-
-    # Call reformat_cna function with arguments
+    # --tool_order is mandatory: without it the sites cannot be labelled with their callers
-    intersect_variants(args.sample)
+    intersect_variants(args.sample, args.tool_order)
 
 
 if __name__ == "__main__":

diff --git a/conf/modules.config b/conf/modules.config
@@ -47,6 +47,26 @@ process {
         ]
     }
 
+    withName: 'BCFTOOLS_ISEC' {
+        // -c none : records are shared only when REF and ALT match exactly
+        // -n=N    : report sites present in exactly N of the input VCFs
+        ext.args = {
+            ["-c none", "-n=${meta.vcf_size}"].join(" ").trim()
+        }
+        // NOTE(review): the isec module presumably uses the prefix as its output directory name — confirm
+        ext.prefix = { "${meta.vcf_size}" }
+
+        // Intermediate result: never published
+        publishDir = [enabled: false]
+    }
+
+    withName: 'BCFTOOLS_VIEW_TO_KEYS' {
+        // The process appends "_keys.txt" to this prefix to name its output file
+        ext.prefix = { "${meta.patient}.${meta.sample}" }
+        // Intermediate result: never published
+        publishDir = [enabled: false]
+    }
+
     withName: 'BCFTOOLS_FILTER' {
         label = 'process_low'
         ext.args = {

diff --git a/modules.json b/modules.json
@@ -10,6 +10,11 @@
                         "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
                         "installed_by": ["modules"]
                     },
+                    "bcftools/isec": {
+                        "branch": "master",
+                        "git_sha": "f17049e03697726ace7499d2fe342f892594f6f3",
+                        "installed_by": ["modules"]
+                    },
                     "bcftools/norm": {
                         "branch": "master",
                         "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",

diff --git a/modules/local/reformat_input/bcftools_view_to_keys.nf b/modules/local/reformat_input/bcftools_view_to_keys.nf
@@ -0,0 +1,38 @@
+// Write a tab-separated "<prefix>_keys.txt" table (CHROM, POS, REF, ALT, tool) from one VCF.
+// Every record is labelled with meta.tools[0] only, so this is presumably meant for samples
+// with a single caller's VCF — NOTE(review): confirm against the calling workflow.
+process BCFTOOLS_VIEW_TO_KEYS {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::bcftools=1.21"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5a/5acacb55c52bec97c61fd34ffa8721fce82ce823005793592e2a80bf71632cd0/data':
+        'community.wave.seqera.io/library/bcftools:1.21--4335bec1d7b44d11' }"
+
+    input:
+    tuple val(meta), path(vcf), path(tbi)
+
+    output:
+    tuple val(meta), path("${prefix}_keys.txt"), emit: variant_tool_map
+    path "versions.yml"                        , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    // No "def": prefix must stay visible to the output block above.
+    prefix = task.ext.prefix ?: "${meta.id}"
+    // -G drops the genotype columns and -H the header, so awk only sees VCF body lines;
+    // the tool name is handed to awk with -v instead of being spliced into the program text.
+    """
+    bcftools view ${args} -G -H ${vcf} \\
+        | awk -v OFS="\t" -v tool="${meta.tools[0]}" '{ print \$1, \$2, \$4, \$5, tool }' \\
+        > ${prefix}_keys.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/reformat_input/isec_vcf.nf b/modules/local/reformat_input/isec_vcf.nf
@@ -7,7 +7,7 @@ process INTERSECT_SOMATIC_VARIANTS {
         'docker.io/barryd237/pysam-xcmds:latest' }"
 
     input:
-    tuple val(meta), path(vcf), path(tbi)
+    tuple val(meta), path(isec_results)
 
     output:
     tuple val(meta), path("${prefix}_keys.txt"), emit: variant_tool_map
@@ -21,6 +21,7 @@ process INTERSECT_SOMATIC_VARIANTS {
     prefix = task.ext.prefix ?: "${meta.id}" // meta.sample, toggle using modules.config
     """
     isec_vcfs.py \
+        --tool_order ${meta.tools.join(",")} \
         --sample ${prefix}
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/modules/nf-core/bcftools/isec/environment.yml b/modules/nf-core/bcftools/isec/environment.yml
diff --git a/modules/nf-core/bcftools/isec/main.nf b/modules/nf-core/bcftools/isec/main.nf
diff --git a/modules/nf-core/bcftools/isec/meta.yml b/modules/nf-core/bcftools/isec/meta.yml