diff --git a/workflow/rules/cram_variantCalling-Discovery.smk b/workflow/rules/cram_variantCalling-Discovery.smk new file mode 100644 index 0000000..0d96844 --- /dev/null +++ b/workflow/rules/cram_variantCalling-Discovery.smk @@ -0,0 +1,104 @@ +CHROMS=list(range(1, 23)) +CHROMS.extend(['X']) + +# conditional statement in sequenceChrStringCheck.smk to either proceed with 1 or add "chr" for "chr1" +rule check_for_chr_string: + input: + SMP = "results/cram/{sample}.cram", + output: + ChrStrCheck="results/vcf/{sample}/chrStrCheck/alignmentFileHeader.out", + params: + sample=lambda wc: wc.get("sample"), + resources: + mem_mb=1500, + runtime="00:10:00", + benchmark: + repeat("benchmarks/check_for_chr_string.{sample}.RawSeq", 1) + script: + "../scripts/chr-string-check-cram.py" + +rule cram_variant_calling: + input: + cram="results/cram/{sample}.cram", + cramIndex="results/cram/{sample}.cram.crai", + ref = which_ref, + chrStrCheck="results/vcf/{sample}/chrStrCheck/alignmentFileHeader.out", + regions=CHROMS, + output: + splitChrVCF="results/vcf/{sample}/chr{chroms}.vcf.gz", + params: + sample=lambda wc: wc.get("sample"), + chroms_from_wc=lambda wc: wc.get("chroms"), + extra_mpileup=get_variant_calling_mpileup_params, + extra_call=get_variant_calling_call_params, + extra_NormTrueFalse=get_NormTrueFalse, + extra_normalize=get_NormLeftAlign, + resources: + mem_mb=3500, + runtime="24:00:00", + benchmark: + repeat("benchmarks/cram_variant_calling.{sample}.chr{chroms}.StoredSequence", 1) + script: + "../scripts/cram-variant-calling-discovery.py" + + +rule variant_calling_index: + input: + ChrVCF="results/vcf/{sample}/chr{chroms}.vcf.gz", + output: + ChrTBI="results/vcf/{sample}/chr{chroms}.vcf.gz.tbi", + resources: + mem_mb=1500, + runtime="00:30:00", + benchmark: + repeat("benchmarks/variant_calling_index.{sample}.chr{chroms}.StoredSequence", 1) + shell: + """ + bcftools index --tbi {input.ChrVCF} -o {output.ChrTBI} + """ + + +rule merge_sample_chrom_list: + input: + concatChrVCF=expand("results/vcf/{sample}/chr{{chroms}}.vcf.gz", sample=cramSamples["cramSample"]), + concatChrTBI=expand("results/vcf/{sample}/chr{{chroms}}.vcf.gz.tbi", sample=cramSamples["cramSample"]), + output: + mergeList="results/vcf/chr{chroms}-merge-sampleList.txt", + resources: + mem_mb=200, + runtime="00:10:00", + benchmark: + repeat("benchmarks/merge_sample_chrom_list.chr{chroms}.StoredSequence", 1) + script: + "../scripts/merge-sample-chrom-list-discovery.py" + +rule merge_samples_per_chrom: + input: + mergeList="results/vcf/chr{chroms}-merge-sampleList.txt", + output: + mergeChrVCF="results/vcf/Merged-chr{chroms}-allSamples.vcf.gz", + params: + chroms_from_wc=lambda wc: wc.get("chroms"), + resources: + mem_mb=10000, + runtime="00:10:00", + benchmark: + repeat("benchmarks/merge_samples_per_chrom.chr{chroms}.StoredSequence", 1) + script: + "../scripts/merge-sample-chroms.py" + + +rule merge_samples_per_chrom_index: + input: + mergeChrVCF="results/vcf/Merged-chr{chroms}-allSamples.vcf.gz", + output: + mergeChrTBI="results/vcf/Merged-chr{chroms}-allSamples.vcf.gz.tbi", + resources: + mem_mb=1500, + runtime="00:30:00", + benchmark: + repeat("benchmarks/merge_samples_per_chrom_index.chr{chroms}.StoredSequence", 1) + shell: + """ + bcftools index --tbi {input.mergeChrVCF} -o {output.mergeChrTBI} + """ diff --git a/workflow/rules/variantCalling-Discovery.smk b/workflow/rules/variantCalling-Discovery.smk new file mode 100644 index 0000000..cf25ae2 --- /dev/null +++ b/workflow/rules/variantCalling-Discovery.smk @@ -0,0 +1,102 @@ +CHROMS=list(range(1, 23)) +CHROMS.extend(['X']) + +# conditional statement in sequenceChrStringCheck.smk to either proceed with 1 or add "chr" for "chr1" +rule check_for_chr_string: + input: + sortedBam="results/sortedBam/{sample}.sorted.bam", + bamIndex="results/sortedBam/{sample}.sorted.bam.bai", + output: + ChrStrCheck="results/vcf/{sample}/chrStrCheck/alignmentFileHeader.out", + params: + sample=lambda wc: wc.get("sample"), + resources: + mem_mb=1500, + runtime="00:10:00", + benchmark: + repeat("benchmarks/check_for_chr_string.{sample}.RawSeq", 1) + script: + "../scripts/chr-string-check-bam.py" + + +rule variant_calling: + input: + sortedbam="results/sortedBam/{sample}.sorted.bam", + bamIndex="results/sortedBam/{sample}.sorted.bam.bai", + chrStrCheck="results/vcf/{sample}/chrStrCheck/alignmentFileHeader.out", + ref=which_ref, + regions=CHROMS, + output: + ChrVCF="results/vcf/{sample}/chr{chroms}.vcf.gz", + resources: + mem_mb=2500, + runtime="24:00:00", + params: + sample=lambda wc: wc.get("sample"), + extra_mpileup=get_variant_calling_mpileup_params, + extra_call=get_variant_calling_call_params, + extra_NormTrueFalse=get_NormTrueFalse, + extra_normalize=get_NormLeftAlign, + benchmark: + repeat("benchmarks/variant_calling.{sample}.chr{chroms}.RawSeq", 1) + script: + "../scripts/variant-calling-discovery.py" + +rule variant_calling_index: + input: + ChrVCF="results/vcf/{sample}/chr{chroms}.vcf.gz", + output: + ChrTBI="results/vcf/{sample}/chr{chroms}.vcf.gz.tbi", + resources: + mem_mb=1500, + runtime="06:00:00", + benchmark: + repeat("benchmarks/variant_calling_index.{sample}.chr{chroms}.RawSeq", 1) + shell: + """ + bcftools index --tbi {input.ChrVCF} -o {output.ChrTBI} + """ + +rule merge_sample_chrom_list: + input: + concatChrVCF=expand("results/vcf/{sample}/chr{{chroms}}.vcf.gz", sample=samples["sample"]), + concatChrTBI=expand("results/vcf/{sample}/chr{{chroms}}.vcf.gz.tbi", sample=samples["sample"]), + output: + mergeList="results/vcf/chr{chroms}-merge-sampleList.txt", + resources: + mem_mb=200, + runtime="00:10:00", + benchmark: + repeat("benchmarks/merge_sample_chrom_list.chr{chroms}.RawSeq", 1) + script: + "../scripts/merge-sample-chrom-list.py" + +rule merge_samples_per_chrom: + input: + mergeList="results/vcf/chr{chroms}-merge-sampleList.txt", + output: + mergeChrVCF="results/vcf/Merged-chr{chroms}-allSamples.vcf.gz", + params: + chroms_from_wc=lambda wc: wc.get("chroms"), + resources: + mem_mb=10000, + runtime="00:10:00", + benchmark: + repeat("benchmarks/merge_samples_per_chrom.chr{chroms}.RawSeq", 1) + script: + "../scripts/merge-sample-chroms.py" + +rule merge_samples_per_chrom_index: + input: + mergeChrVCF="results/vcf/Merged-chr{chroms}-allSamples.vcf.gz", + output: + mergeChrTBI="results/vcf/Merged-chr{chroms}-allSamples.vcf.gz.tbi", + resources: + mem_mb=1500, + runtime="00:30:00", + benchmark: + repeat("benchmarks/merge_samples_per_chrom_index.chr{chroms}.RawSeq", 1) + shell: + """ + bcftools index --tbi {input.mergeChrVCF} -o {output.mergeChrTBI} + """ diff --git a/workflow/scripts/cram-variant-calling-discovery.py b/workflow/scripts/cram-variant-calling-discovery.py new file mode 100644 index 0000000..536a51f --- /dev/null +++ b/workflow/scripts/cram-variant-calling-discovery.py @@ -0,0 +1,127 @@ +from os import path +from snakemake.shell import shell + +# Extract arguments. +extra = snakemake.params.get("extra", "") +extra_mpileup = snakemake.params.get("extra_mpileup", "") +extra_call = snakemake.params.get("extra_call", "") +extra_NormTrueFalse = snakemake.params.get("extra_NormTrueFalse", "") +extra_normalize = snakemake.params.get("extra_normalize", "") + +sample = snakemake.params.get("sample", "") +chromsWC = snakemake.params.get("chroms_from_wc", "") + +reference = snakemake.input.ref +# comment block below if you have to add specific path to referene in cram_variantCalling.smk file +# if isinstance(reference, str): +# reference = path.splitext(snakemake.input.ref)[0] +# else: +# reference = path.splitext(snakemake.input.ref[0])[0] + +log = snakemake.log_fmt_shell(stdout=False, stderr=True) + +shell( + "mkdir -p results/vcf/{sample}/TempFiles/chr{chromsWC}" +) + +def findSequenceName(Filename,Value): + with open(Filename) as f: + if Value in f.read(): # read if value is in file + print('chr1 is present, thus, chr string exists') + print('Adding chr to NYGC regions file') + return True + else: + return False + f.close() + +Norm = extra_NormTrueFalse + +filename = snakemake.input.chrStrCheck +value = 'SN:chr1' + +chrStrYes = findSequenceName(filename,value) + +if Norm == True: + if chrStrYes == True: + # keeps the chr string for "chr1" + shell( + "bcftools mpileup" + " -f {reference}" + " {extra_mpileup}" + " -r {snakemake.input.regions}" + " -O u" + " results/cram/{sample}.cram |" + " bcftools call" + " {extra_call}" + " -O u |" + " bcftools sort" + " -m 2G" + " -T results/vcf/{sample}/TempFiles/chr{chromsWC}" + " -O u |" + " bcftools norm -f {reference}" + " {extra_normalize}" + " -O z" + " -o {snakemake.output.ChrVCF}" + ) + + elif chrStrYes == False: + # removes the chr string for "1" + shell( + "bcftools mpileup" + " -f {reference}" + " {extra_mpileup}" + " -r {snakemake.input.regions}" + " -O u" + " results/cram/{sample}.cram |" + " bcftools call" + " {extra_call}" + " -O u |" + " bcftools sort" + " -m 2G" + " -T results/vcf/{sample}/TempFiles/chr{chromsWC}" + " -O u |" + " bcftools norm -f {reference}" + " {extra_normalize}" + " -O z" + " -o {snakemake.output.ChrVCF}" + ) + +elif Norm == False: + if chrStrYes == True: + # keeps the chr string for "chr1" + shell( + "bcftools mpileup" + " -f {reference}" + " {extra_mpileup}" + " -r {snakemake.input.regions}" + " -O u" + " results/cram/{sample}.cram |" + " bcftools call" + " {extra_call}" + " -O u |" + " bcftools sort" + " -m 2G" + " -T results/vcf/{sample}/TempFiles/chr{chromsWC}" + " -O z" + " -o {snakemake.output.ChrVCF}" + ) + + elif chrStrYes == False: + # removes the chr string for "1" + shell( + "bcftools mpileup" + " -f {reference}" + " {extra_mpileup}" + " -r {snakemake.input.regions}" + " -O u" + " results/cram/{sample}.cram |" + " bcftools call" + " {extra_call}" + " -O u |" + " bcftools sort" + " -m 2G" + " -T results/vcf/{sample}/TempFiles/chr{chromsWC}" + " -O z" + " -o {snakemake.output.ChrVCF}" + ) + diff --git a/workflow/scripts/merge-sample-chrom-list-discovery.py b/workflow/scripts/merge-sample-chrom-list-discovery.py new file mode 100644 index 0000000..175ddc0 --- /dev/null +++ b/workflow/scripts/merge-sample-chrom-list-discovery.py @@ -0,0 +1,16 @@ +from os import path +import re +import tempfile +from snakemake.shell import shell + +# Extract arguments. +# extra = snakemake.params.get("extra", "") + +log = snakemake.log_fmt_shell(stdout=False, stderr=True) +sed_arg = [ "sed 's/ /\\n/g'" ] + +shell( + "echo {snakemake.input.ChrVCF} |" + " {sed_arg} - > {snakemake.output.mergeList}" +) + diff --git a/workflow/scripts/variant-calling-discovery.py b/workflow/scripts/variant-calling-discovery.py new file mode 100644 index 0000000..536a51f --- /dev/null +++ b/workflow/scripts/variant-calling-discovery.py @@ -0,0 +1,127 @@ +from os import path +from snakemake.shell import shell + +# Extract arguments. +extra = snakemake.params.get("extra", "") +extra_mpileup = snakemake.params.get("extra_mpileup", "") +extra_call = snakemake.params.get("extra_call", "") +extra_NormTrueFalse = snakemake.params.get("extra_NormTrueFalse", "") +extra_normalize = snakemake.params.get("extra_normalize", "") + +sample = snakemake.params.get("sample", "") +chromsWC = snakemake.params.get("chroms_from_wc", "") + +reference = snakemake.input.ref +# comment block below if you have to add specific path to referene in cram_variantCalling.smk file +# if isinstance(reference, str): +# reference = path.splitext(snakemake.input.ref)[0] +# else: +# reference = path.splitext(snakemake.input.ref[0])[0] + +log = snakemake.log_fmt_shell(stdout=False, stderr=True) + +shell( + "mkdir -p results/vcf/{sample}/TempFiles/chr{chromsWC}" +) + +def findSequenceName(Filename,Value): + with open(Filename) as f: + if Value in f.read(): # read if value is in file + print('chr1 is present, thus, chr string exists') + print('Adding chr to NYGC regions file') + return True + else: + return False + f.close() + +Norm = extra_NormTrueFalse + +filename = snakemake.input.chrStrCheck +value = 'SN:chr1' + +chrStrYes = findSequenceName(filename,value) + +if Norm == True: + if chrStrYes == True: + # keeps the chr string for "chr1" + shell( + "bcftools mpileup" + " -f {reference}" + " {extra_mpileup}" + " -r {snakemake.input.regions}" + " -O u" + " results/cram/{sample}.cram |" + " bcftools call" + " {extra_call}" + " -O u |" + " bcftools sort" + " -m 2G" + " -T results/vcf/{sample}/TempFiles/chr{chromsWC}" + " -O u |" + " bcftools norm -f {reference}" + " {extra_normalize}" + " -O z" + " -o {snakemake.output.ChrVCF}" + ) + + elif chrStrYes == False: + # removes the chr string for "1" + shell( + "bcftools mpileup" + " -f {reference}" + " {extra_mpileup}" + " -r {snakemake.input.regions}" + " -O u" + " results/cram/{sample}.cram |" + " bcftools call" + " {extra_call}" + " -O u |" + " bcftools sort" + " -m 2G" + " -T results/vcf/{sample}/TempFiles/chr{chromsWC}" + " -O u |" + " bcftools norm -f {reference}" + " {extra_normalize}" + " -O z" + " -o {snakemake.output.ChrVCF}" + ) + +elif Norm == False: + if chrStrYes == True: + # keeps the chr string for "chr1" + shell( + "bcftools mpileup" + " -f {reference}" + " {extra_mpileup}" + " -r {snakemake.input.regions}" + " -O u" + " results/cram/{sample}.cram |" + " bcftools call" + " {extra_call}" + " -O u |" + " bcftools sort" + " -m 2G" + " -T results/vcf/{sample}/TempFiles/chr{chromsWC}" + " -O z" + " -o {snakemake.output.ChrVCF}" + ) + + elif chrStrYes == False: + # removes the chr string for "1" + shell( + "bcftools mpileup" + " -f {reference}" + " {extra_mpileup}" + " -r {snakemake.input.regions}" + " -O u" + " results/cram/{sample}.cram |" + " bcftools call" + " {extra_call}" + " -O u |" + " bcftools sort" + " -m 2G" + " -T results/vcf/{sample}/TempFiles/chr{chromsWC}" + " -O z" + " -o {snakemake.output.ChrVCF}" + ) +