Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ Initial release of nf-core/variantprioritization, created with the [nf-core](htt

### `Added`

- [#47](https://github.com/nf-core/variantprioritization/pull/47) - Use bcftools/isec nf-core module for computing VCF intersection between multiple callers

### `Fixed`

- [#32](https://github.com/nf-core/variantprioritization/pull/32) - Rename to nf-core/variantprioritization
Expand Down
118 changes: 55 additions & 63 deletions bin/isec_vcfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,84 +7,76 @@
import argparse


def intersect_variants(sample):
def create_tool_order_dict(tool_order_str):
    """Map positions in the intersection bitmask to variant-caller names.

    Args:
        tool_order_str: Comma-separated tool names, listed in the same order
            as the VCFs that were intersected (e.g. ``"mutect2,strelka"``).
            ``None`` or a blank string is accepted because the ``--tool_order``
            command-line option is optional.

    Returns:
        dict: Zero-based position -> tool name. Empty when no names were given.
    """
    if tool_order_str is None or not tool_order_str.strip():
        return {}
    # Positions must stay aligned with the order of the intersected VCFs, so
    # entries are only trimmed of whitespace, never dropped or reordered.
    return {idx: tool.strip() for idx, tool in enumerate(tool_order_str.split(","))}

def intersect_variants(sample, tool_order):
    """Write ``{sample}_keys.txt`` mapping each intersected variant to its callers.

    Every non-empty ``sites.txt`` found below the current working directory is
    read, the bitmask held in column index 4 (e.g. ``0101``) is translated into
    a comma-separated list of tool names, duplicate rows are dropped, and the
    result is written as a headerless tab-separated file.

    Args:
        sample: Sample identifier; used as the output file prefix.
        tool_order: Comma-separated tool names in the order the VCFs were
            intersected; position ``i`` in the bitmask maps to the ``i``-th name.

    Returns:
        None. The only result is the ``{sample}_keys.txt`` file.
    """
    # Debug aid: show which staged inputs belong to this sample. A plain prefix
    # test replaces the former unescaped regex (f"{sample}*"), in which "*"
    # only quantified the last character of the sample name.
    skipped_suffixes = (".cns", ".tbi")
    sample_files = [
        name
        for name in os.listdir("./")
        if name.startswith(sample) and not name.endswith(skipped_suffixes)
    ]
    print(sample_files)

    tool_names = create_tool_order_dict(tool_order)
    out_path = f"{sample}_keys.txt"

    # Keep only sites.txt files that actually contain records; sort so the
    # output row order does not depend on filesystem traversal order.
    site_files = sorted(
        path
        for path in glob.glob("./**/sites.txt", recursive=True)
        if os.stat(path).st_size != 0
    )

    if not site_files:
        # pd.concat raises ValueError on an empty list. No shared or private
        # sites means there is nothing to map, so emit an empty keys file.
        # NOTE(review): confirm downstream steps accept an empty keys file.
        with open(out_path, "w"):
            pass
        return

    frames = [
        # converters keeps the bitmask as text so leading zeros are preserved.
        pd.read_table(path, sep="\t", header=None, converters={4: str})
        for path in site_files
    ]
    frame = pd.concat(frames, axis=0, ignore_index=True)

    # Translate each bitmask into tool names: every character other than "0"
    # at position i selects tool_names[i]; positions without a name are ignored.
    frame[4] = [
        ",".join(
            tool_names[pos]
            for pos, flag in enumerate(mask)
            if flag != "0" and pos in tool_names
        )
        for mask in frame[4]
    ]

    # Duplicate rows were observed in the output during testing (cause not
    # established), e.g. two identical "chr1 3866080 C T freebayes" lines,
    # so they are removed before writing.
    frame = frame.drop_duplicates()
    frame.to_csv(out_path, sep="\t", index=None, header=None)


def main():
    """Parse the command line and build the variant-to-caller keys file."""
    parser = argparse.ArgumentParser(
        description="Map intersected somatic variants to the variant callers that reported them."
    )
    parser.add_argument("-s", "--sample", required=True, help="Sample name (meta.id) for the output.")
    parser.add_argument(
        "--tool_order",
        required=False,
        # Default to an empty string so an omitted option never reaches the
        # comma-splitting code as None.
        default="",
        help="Comma-separated list of tool names in order of VCFs in intersection.",
    )

    args = parser.parse_args()

    # Single call with the current two-argument signature; the former
    # one-argument call would raise TypeError.
    intersect_variants(args.sample, args.tool_order)


if __name__ == "__main__":
Expand Down
20 changes: 20 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,26 @@ process {
]
}

// Settings for the nf-core bcftools/isec module, configured per task from meta.vcf_size.
withName: 'BCFTOOLS_ISEC' {
ext.args = {[
// NOTE(review): per the bcftools manual, "-c none" treats records as matching
// only when REF and ALT are identical — confirm this is the intended collapse mode.
"-c none",
// NOTE(review): per the bcftools manual, "-n=N" keeps sites present in exactly N
// input files; N is taken from meta.vcf_size here.
"-n=${meta.vcf_size}",
].join(" ").trim()
}
// Output prefix is the same meta.vcf_size value.
// NOTE(review): presumably the module uses this as the "-p" output directory,
// mirroring the "-n={idx} -p {idx}" call of the replaced script — confirm in the module's main.nf.
ext.prefix = { "${meta.vcf_size}"}

// Results of this step are not published.
publishDir = [
enabled: false
]
}

// Settings for the local BCFTOOLS_VIEW_TO_KEYS process.
withName: 'BCFTOOLS_VIEW_TO_KEYS' {
// Output files are named "<patient>.<sample>_keys.txt" (the process appends "_keys.txt" to this prefix).
ext.prefix = { "${meta.patient}.${meta.sample}" }
// Results of this step are not published.
publishDir = [
enabled: false
]
}

withName: 'BCFTOOLS_FILTER' {
label = 'process_low'
ext.args = {
Expand Down
5 changes: 5 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@
"git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
"installed_by": ["modules"]
},
"bcftools/isec": {
"branch": "master",
"git_sha": "f17049e03697726ace7499d2fe342f892594f6f3",
"installed_by": ["modules"]
},
"bcftools/norm": {
"branch": "master",
"git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46",
Expand Down
31 changes: 31 additions & 0 deletions modules/local/reformat_input/bcftools_view_to_keys.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Build the "<prefix>_keys.txt" variant-to-tool table straight from a single VCF.
// Each record is written as: column 1, column 2, column 4, column 5 of the VCF body
// (CHROM, POS, REF, ALT in the VCF layout) followed by the first entry of meta.tools.
process BCFTOOLS_VIEW_TO_KEYS {
    tag "$meta.id"
    label 'process_medium'

    conda "bioconda::bcftools=1.21"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5a/5acacb55c52bec97c61fd34ffa8721fce82ce823005793592e2a80bf71632cd0/data':
        'community.wave.seqera.io/library/bcftools:1.21--4335bec1d7b44d11' }"

    input:
    tuple val(meta), path(vcf), path(tbi)

    output:
    tuple val(meta), path("${prefix}_keys.txt"), emit: variant_tool_map
    path "versions.yml"                        , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // ext.args is now forwarded to bcftools view; previously it was read but never used.
    def args = task.ext.args ?: ''
    prefix = task.ext.prefix ?: "${meta.id}"
    // -G drops genotype columns and -H suppresses the header, so awk only sees variant records.
    // The awk program uses single braces: the doubled braces were an escape needed only by the
    // Python f-string this command was ported from.
    """
    bcftools view ${args} ${vcf} -G -H | awk -v OFS="\t" '{print \$1, \$2, \$4, \$5, "${meta.tools[0]}"}' > ${prefix}_keys.txt

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
    END_VERSIONS
    """
}
3 changes: 2 additions & 1 deletion modules/local/reformat_input/isec_vcf.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ process INTERSECT_SOMATIC_VARIANTS {
'docker.io/barryd237/pysam-xcmds:latest' }"

input:
tuple val(meta), path(vcf), path(tbi)
tuple val(meta), path(isec_results)

output:
tuple val(meta), path("${prefix}_keys.txt"), emit: variant_tool_map
Expand All @@ -21,6 +21,7 @@ process INTERSECT_SOMATIC_VARIANTS {
prefix = task.ext.prefix ?: "${meta.id}" // meta.sample, toggle using modules.config
"""
isec_vcfs.py \
--tool_order ${meta.tools.join(",")} \
--sample ${prefix}

cat <<-END_VERSIONS > versions.yml
Expand Down
10 changes: 10 additions & 0 deletions modules/nf-core/bcftools/isec/environment.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

51 changes: 51 additions & 0 deletions modules/nf-core/bcftools/isec/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 58 additions & 0 deletions modules/nf-core/bcftools/isec/meta.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading