Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Germline CNV WDLs for WGS #6607

Merged
merged 10 commits into from
Aug 11, 2020
73 changes: 72 additions & 1 deletion scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ task CollectCounts {
File ref_fasta
File ref_fasta_fai
File ref_fasta_dict
Array[String]? disabled_read_filters
Boolean? enable_indexing
String? format
File? gatk4_jar_override
Expand All @@ -201,11 +202,22 @@ task CollectCounts {
Int? preemptible_attempts
}

parameter_meta {
bam: {
localization_optional: true
}
bam_idx: {
localization_optional: true
}
}

Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
Int command_mem_mb = machine_mem_mb - 1000

Boolean enable_indexing_ = select_first([enable_indexing, false])

Array[String] disabled_read_filters_arr = if defined(disabled_read_filters) then prefix("--disable-read-filter ", select_first([disabled_read_filters])) else []

# Sample name is derived from the bam filename
String base_filename = basename(bam, ".bam")
String format_ = select_first([format, "HDF5"])
Expand Down Expand Up @@ -257,7 +269,8 @@ task CollectCounts {
--reference ~{ref_fasta} \
--format ~{default="HDF5" hdf5_or_tsv_or_null_format} \
--interval-merging-rule OVERLAPPING_ONLY \
--output ~{counts_filename_for_collect_read_counts}
--output ~{counts_filename_for_collect_read_counts} \
~{sep=' ' disabled_read_filters_arr}

if [ ~{do_block_compression} = "true" ]; then
bgzip ~{counts_filename_for_collect_read_counts}
Expand Down Expand Up @@ -303,6 +316,15 @@ task CollectAllelicCounts {
Int? preemptible_attempts
}

parameter_meta {
bam: {
localization_optional: true
}
bam_idx: {
localization_optional: true
}
}

Int machine_mem_mb = select_first([mem_gb, 13]) * 1000
Int command_mem_mb = machine_mem_mb - 1000

Expand Down Expand Up @@ -605,3 +627,52 @@ task CollectModelQualityMetrics {
String qc_status_string = read_string("qcStatus.txt")
}
}

# Splits a combined contig-ploidy calls archive into one gzipped tar archive per sample.
#
# The command unpacks the input tarball into a local "calls" directory, then, for each
# index i in [0, number of samples), re-archives the contents of calls/SAMPLE_i as
#   sample_<zero-padded i>.<sample id>.contig_ploidy_calls.tar.gz
# The pad width is the number of decimal digits in the sample count, so every index is
# printed with the same width.
task ScatterPloidyCallsBySample {
input {
# Gzipped tarball that is extracted into calls/; the loop below reads one
# SAMPLE_<index> sub-directory per sample from it.
File contig_ploidy_calls_tar
# Sample identifiers, used only to build the output file names. Element i is paired
# with directory SAMPLE_i, so the order must match the sample indexing in the tarball.
# NOTE(review): the IDs are joined with single spaces into an unquoted bash array, so
# an ID containing whitespace or glob characters would be split or expanded by the shell.
Array[String] samples

# Runtime parameters
String docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
}

# Sample count; drives both the loop bound and the zero-padding width in the command.
Int num_samples = length(samples)
# NOTE(review): out_dir is not referenced anywhere in the visible task - confirm it can be removed.
String out_dir = "calls_renamed"

# Shell steps: "set -eu" stops on the first failing command or unset variable; the
# tarball is extracted; then one archive per sample is written to the working directory.
command <<<
set -eu

# Extract ploidy calls
mkdir calls
tar xzf ~{contig_ploidy_calls_tar} -C calls/

# Archive call files by sample, renaming so they will be glob'd in order
sample_ids=(~{sep=" " samples})
num_samples=~{num_samples}
num_digits=${#num_samples}
for (( i=0; i<~{num_samples}; i++ ))
do
sample_id=${sample_ids[$i]}
padded_sample_index=$(printf "%0${num_digits}d" $i)
tar -czf sample_${padded_sample_index}.${sample_id}.contig_ploidy_calls.tar.gz -C calls/SAMPLE_${i} .
done
>>>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Space here for consistency

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


# Resource requests; each optional input falls back to the default given here
# (2 GiB memory, 10 GB local disk, 1 CPU, 5 preemptible attempts).
runtime {
docker: docker
memory: select_first([mem_gb, 2]) + " GiB"
# Disk type is HDD unless use_ssd is set to true.
disks: "local-disk " + select_first([disk_space_gb, 10]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 5])
}

output {
# Every per-sample archive written by the command. The zero-padded index in each file
# name is intended to make the glob return them in sample order.
# NOTE(review): this relies on the execution engine returning glob results sorted by name - confirm.
Array[File] sample_contig_ploidy_calls_tar = glob("sample_*.contig_ploidy_calls.tar.gz")
}
}
18 changes: 10 additions & 8 deletions scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
##############################################
#### optional arguments for CollectCounts ####
##############################################
Array[String]? disabled_read_filters_for_collect_counts
String? collect_counts_format
Boolean? collect_counts_enable_indexing
Int? mem_gb_for_collect_counts
Expand Down Expand Up @@ -149,6 +150,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
preemptible_attempts = preemptible_attempts,
padding = padding,
bin_length = bin_length,
disabled_read_filters_for_collect_counts = disabled_read_filters_for_collect_counts,
collect_counts_format = collect_counts_format,
collect_counts_enable_indexing = collect_counts_enable_indexing,
mem_gb_for_collect_counts = mem_gb_for_collect_counts,
Expand Down Expand Up @@ -196,16 +198,16 @@ workflow CNVGermlineCaseScatteredWorkflow {

output {
Array[File] preprocessed_intervals = CNVGermlineCaseWorkflow.preprocessed_intervals
Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id
Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts
Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar
Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id)
Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts)
Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars)
Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf
Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf
Array[Array[File]] qc_status_files = CNVGermlineCaseWorkflow.qc_status_files
Array[Array[String]] qc_status_strings = CNVGermlineCaseWorkflow.qc_status_strings
Array[Array[File]] denoised_copy_ratios = CNVGermlineCaseWorkflow.denoised_copy_ratios
Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf)
Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf)
Array[File] denoised_copy_ratios = flatten(CNVGermlineCaseWorkflow.denoised_copy_ratios)
Array[File] qc_status_files = flatten(CNVGermlineCaseWorkflow.qc_status_files)
Array[String] qc_status_strings = flatten(CNVGermlineCaseWorkflow.qc_status_strings)
}
}

Expand Down
18 changes: 15 additions & 3 deletions scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ workflow CNVGermlineCaseWorkflow {
##############################################
#### optional arguments for CollectCounts ####
##############################################
Array[String]? disabled_read_filters_for_collect_counts
String? collect_counts_format
Boolean? collect_counts_enable_indexing
Int? mem_gb_for_collect_counts
Expand Down Expand Up @@ -116,6 +117,8 @@ workflow CNVGermlineCaseWorkflow {
###################################################
Int ref_copy_number_autosomal_contigs
Array[String]? allosomal_contigs
Int? disk_space_gb_for_postprocess_germline_cnv_calls
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The task name in the comment section header should match the name of the WDL task.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Int? mem_gb_for_postprocess_germline_cnv_calls

##########################
#### arguments for QC ####
Expand Down Expand Up @@ -150,6 +153,7 @@ workflow CNVGermlineCaseWorkflow {
ref_fasta_dict = ref_fasta_dict,
format = collect_counts_format,
enable_indexing = collect_counts_enable_indexing,
disabled_read_filters = disabled_read_filters_for_collect_counts,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
mem_gb = mem_gb_for_collect_counts,
Expand Down Expand Up @@ -253,18 +257,26 @@ workflow CNVGermlineCaseWorkflow {
}
}

call CNVTasks.ScatterPloidyCallsBySample {
input :
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
samples = CollectCounts.entity_id,
docker = gatk_docker,
preemptible_attempts = preemptible_attempts
}

output {
File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
Array[File] read_counts_entity_id = CollectCounts.entity_id
Array[File] read_counts = CollectCounts.counts
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar
Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars
Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar
Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf
Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
Array[File] qc_status_files = CollectSampleQualityMetrics.qc_status_file
Array[String] qc_status_strings = CollectSampleQualityMetrics.qc_status_string
Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
}
}

Expand Down Expand Up @@ -314,7 +326,7 @@ task DetermineGermlineContigPloidyCaseMode {
--mapping-error-rate ~{default="0.01" mapping_error_rate} \
--sample-psi-scale ~{default="0.0001" sample_psi_scale}

tar czf case-contig-ploidy-calls.tar.gz -C ~{output_dir_}/case-calls .
tar c -C ~{output_dir_}/case-calls . | gzip -1 > case-contig-ploidy-calls.tar.gz
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain this change? Also, why is it not in cohort mode as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Faster compression with gzip -1, I believe. This is okay in case mode since the calls tars usually aren't kept in storage except as intermediates, so the larger file size is an acceptable tradeoff for the reduced cost of compressing/decompressing on VMs.


rm -rf contig-ploidy-model
>>>
Expand Down
16 changes: 14 additions & 2 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ workflow CNVGermlineCohortWorkflow {
##############################################
#### optional arguments for CollectCounts ####
##############################################
Array[String]? disabled_read_filters_for_collect_counts
String? collect_counts_format
Boolean? collect_counts_enable_indexing
Int? mem_gb_for_collect_counts
Expand Down Expand Up @@ -152,6 +153,8 @@ workflow CNVGermlineCohortWorkflow {
#### arguments for PostprocessGermlineCNVCalls ####
###################################################
Int ref_copy_number_autosomal_contigs
Int? mem_gb_for_postprocess_germline_cnv_calls
Int? disk_space_gb_for_postprocess_germline_cnv_calls
Array[String]? allosomal_contigs

##########################
Expand Down Expand Up @@ -206,6 +209,7 @@ workflow CNVGermlineCohortWorkflow {
ref_fasta_dict = ref_fasta_dict,
format = collect_counts_format,
enable_indexing = collect_counts_enable_indexing,
disabled_read_filters = disabled_read_filters_for_collect_counts,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
mem_gb = mem_gb_for_collect_counts,
Expand Down Expand Up @@ -353,24 +357,32 @@ workflow CNVGermlineCohortWorkflow {
preemptible_attempts = preemptible_attempts
}

call CNVTasks.ScatterPloidyCallsBySample {
input :
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
samples = CollectCounts.entity_id,
docker = gatk_docker,
preemptible_attempts = preemptible_attempts
}

output {
File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
Array[File] read_counts_entity_ids = CollectCounts.entity_id
Array[File] read_counts = CollectCounts.counts
File? annotated_intervals = AnnotateIntervals.annotated_intervals
File filtered_intervals = FilterIntervals.filtered_intervals
File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar
Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
Array[File] sample_qc_status_files = CollectSampleQualityMetrics.qc_status_file
Array[String] sample_qc_status_strings = CollectSampleQualityMetrics.qc_status_string
File model_qc_status_file = CollectModelQualityMetrics.qc_status_file
String model_qc_string = CollectModelQualityMetrics.qc_status_string
Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
}
}

Expand Down