From f146f664851417ad76d584d40e8995676fcd8447 Mon Sep 17 00:00:00 2001
From: Mark Walker
Date: Mon, 13 Jan 2020 18:17:28 -0500
Subject: [PATCH] gCNV WDLs for WGS

---
Reviewer notes (text between the "---" line and the first "diff --git" line
is commentary and is not part of the commit):

* Layout. This patch had been flattened onto a handful of very long lines.
  It is laid out again with one diff line per physical line. Indentation and
  the placement of blank context lines are reconstructed so that every hunk
  matches its "@@" header; treat them as assumptions and regenerate the
  patch from commit f146f664 before applying it.
* What the diff shows.
  - CollectCounts gains an optional `disabled_read_filters` input; each entry
    is prefixed with "--disable-read-filter " and appended to the command.
  - CollectCounts and CollectAllelicCounts mark `bam` and `bam_idx` as
    `localization_optional`.
  - PostprocessGermlineCNVCalls is renamed BundledPostprocessGermlineCNVCalls
    and reads one `invariants_tar` (built by the new
    BundlePostprocessingInvariants task) instead of per-shard file arrays.
    Its default disk size becomes ceil(5.0 * size(invariants_tar) + 20.0).
  - The new ScatterPloidyCallsBySample task re-archives calls/SAMPLE_<i> from
    the ploidy-calls tar into one tar per sample.
  - GermlineCNVCallerCaseMode / GermlineCNVCallerCohortMode emit one calls
    tar per shard instead of one per shard and sample.
* Changes made to added ("+") lines during review (hunk line counts are
  unchanged):
  - BundledPostprocessGermlineCNVCalls: the shard count uses
    `find . -maxdepth 1 -type d -name 'CALLS_*'`, so only top-level
    directories are counted; the loop bound uses `$(( ))` instead of nested
    backticks around `expr`, which exits with status 1 when its result is 0.
  - BundlePostprocessingInvariants: `read -r`, so backslashes in a path are
    not treated as escape characters.
  - `Float tar_disk_factor = 5.0` spacing.

 scripts/cnv_wdl/cnv_common_tasks.wdl          | 246 +++++++++++++-----
 .../cnv_germline_case_scattered_workflow.wdl  |  22 +-
 .../germline/cnv_germline_case_workflow.wdl   |  69 +++--
 .../germline/cnv_germline_cohort_workflow.wdl |  77 +++---
 4 files changed, 292 insertions(+), 122 deletions(-)

diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl
index a4fe6b08237..85835f01061 100644
--- a/scripts/cnv_wdl/cnv_common_tasks.wdl
+++ b/scripts/cnv_wdl/cnv_common_tasks.wdl
@@ -188,6 +188,7 @@ task CollectCounts {
       File ref_fasta
       File ref_fasta_fai
       File ref_fasta_dict
+      Array[String]? disabled_read_filters
       Boolean? enable_indexing
       String? format
       File? gatk4_jar_override
@@ -201,10 +202,27 @@ task CollectCounts {
       Int? preemptible_attempts
     }
 
+    parameter_meta {
+      bam: {
+        localization_optional: true
+      }
+      bam_idx: {
+        localization_optional: true
+      }
+    }
+
     Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
     Int command_mem_mb = machine_mem_mb - 1000
 
     Boolean enable_indexing_ = select_first([enable_indexing, false])
+    Array[String] disabled_read_filters_arr = if(defined(disabled_read_filters))
+      then
+        prefix(
+          "--disable-read-filter ",
+          select_first([disabled_read_filters])
+        )
+      else
+        []
 
     # Sample name is derived from the bam filename
     String base_filename = basename(bam, ".bam")
@@ -257,7 +275,8 @@ task CollectCounts {
             --reference ~{ref_fasta} \
             --format ~{default="HDF5" hdf5_or_tsv_or_null_format} \
             --interval-merging-rule OVERLAPPING_ONLY \
-            --output ~{counts_filename_for_collect_read_counts}
+            --output ~{counts_filename_for_collect_read_counts} \
+            ~{sep=' ' disabled_read_filters_arr}
 
         if [ ~{do_block_compression} = "true" ]; then
             bgzip ~{counts_filename_for_collect_read_counts}
@@ -303,6 +322,15 @@ task CollectAllelicCounts {
       Int? preemptible_attempts
     }
 
+    parameter_meta {
+      bam: {
+        localization_optional: true
+      }
+      bam_idx: {
+        localization_optional: true
+      }
+    }
+
     Int machine_mem_mb = select_first([mem_gb, 13]) * 1000
     Int command_mem_mb = machine_mem_mb - 1000
 
@@ -413,33 +441,33 @@ task ScatterIntervals {
     }
 }
 
-task PostprocessGermlineCNVCalls {
+task BundledPostprocessGermlineCNVCalls {
     input {
-      String entity_id
-      Array[File] gcnv_calls_tars
-      Array[File] gcnv_model_tars
-      Array[File] calling_configs
-      Array[File] denoising_configs
-      Array[File] gcnvkernel_version
-      Array[File] sharded_interval_lists
-      File contig_ploidy_calls_tar
-      Array[String]? allosomal_contigs
-      Int ref_copy_number_autosomal_contigs
-      Int sample_index
-      File? gatk4_jar_override
-
-      # Runtime parameters
-      String gatk_docker
-      Int? mem_gb
-      Int? disk_space_gb
-      Boolean use_ssd = false
-      Int? cpu
-      Int? preemptible_attempts
+      File invariants_tar
+      String entity_id
+      File contig_ploidy_calls_tar
+      Array[String]? allosomal_contigs
+      Int ref_copy_number_autosomal_contigs
+      Int sample_index
+      File? gatk4_jar_override
+
+      # Runtime parameters
+      String gatk_docker
+      Int? mem_gb
+      Int? disk_space_gb
+      Boolean use_ssd = false
+      Int? cpu
+      Int? preemptible_attempts
     }
 
     Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
     Int command_mem_mb = machine_mem_mb - 1000
 
+    Float invariants_size = size(invariants_tar, "GiB")
+    Float disk_overhead = 20.0
+    Float tar_disk_factor = 5.0
+    Int vm_disk_size = ceil(tar_disk_factor * invariants_size + disk_overhead)
+
     String genotyped_intervals_vcf_filename = "genotyped-intervals-~{entity_id}.vcf.gz"
     String genotyped_segments_vcf_filename = "genotyped-segments-~{entity_id}.vcf.gz"
     String denoised_copy_ratios_filename = "denoised_copy_ratios-~{entity_id}.tsv"
@@ -447,49 +475,31 @@ task PostprocessGermlineCNVCalls {
     Array[String] allosomal_contigs_args = if defined(allosomal_contigs) then prefix("--allosomal-contig ", select_first([allosomal_contigs])) else []
 
     command <<<
-        set -eu
-        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}
+        set -euo pipefail
 
-        sharded_interval_lists_array=(~{sep=" " sharded_interval_lists})
+        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}
 
         # untar calls to CALLS_0, CALLS_1, etc directories and build the command line
         # also copy over shard config and interval files
-        gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars})
-        calling_configs_array=(~{sep=" " calling_configs})
-        denoising_configs_array=(~{sep=" " denoising_configs})
-        gcnvkernel_version_array=(~{sep=" " gcnvkernel_version})
-        sharded_interval_lists_array=(~{sep=" " sharded_interval_lists})
-        calls_args=""
-        for index in ${!gcnv_calls_tar_array[@]}; do
-            gcnv_calls_tar=${gcnv_calls_tar_array[$index]}
-            mkdir -p CALLS_$index/SAMPLE_~{sample_index}
-            tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_~{sample_index}
-            cp ${calling_configs_array[$index]} CALLS_$index/
-            cp ${denoising_configs_array[$index]} CALLS_$index/
-            cp ${gcnvkernel_version_array[$index]} CALLS_$index/
-            cp ${sharded_interval_lists_array[$index]} CALLS_$index/
-            calls_args="$calls_args --calls-shard-path CALLS_$index"
-        done
-
-        # untar models to MODEL_0, MODEL_1, etc directories and build the command line
-        gcnv_model_tar_array=(~{sep=" " gcnv_model_tars})
-        model_args=""
-        for index in ${!gcnv_model_tar_array[@]}; do
-            gcnv_model_tar=${gcnv_model_tar_array[$index]}
-            mkdir MODEL_$index
-            tar xzf $gcnv_model_tar -C MODEL_$index
-            model_args="$model_args --model-shard-path MODEL_$index"
+        tar xzf ~{invariants_tar}
+        rm ~{invariants_tar}
+        number_of_shards=$(find . -maxdepth 1 -type d -name 'CALLS_*' | wc -l)
+
+        touch calls_and_model_args.txt
+        for i in $(seq 0 $((number_of_shards - 1))); do
+            echo "--calls-shard-path CALLS_$i" >> calls_and_model_args.txt
+            echo "--model-shard-path MODEL_$i" >> calls_and_model_args.txt
         done
 
-        mkdir contig-ploidy-calls
-        tar xzf ~{contig_ploidy_calls_tar} -C contig-ploidy-calls
+        mkdir -p extracted-contig-ploidy-calls
+        tar xzf ~{contig_ploidy_calls_tar} -C extracted-contig-ploidy-calls
+        rm ~{contig_ploidy_calls_tar}
 
-        gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \
-            $calls_args \
-            $model_args \
+        time gatk --java-options "-Xmx~{command_mem_mb}m" PostprocessGermlineCNVCalls \
+            --arguments_file calls_and_model_args.txt \
             ~{sep=" " allosomal_contigs_args} \
             --autosomal-ref-copy-number ~{ref_copy_number_autosomal_contigs} \
-            --contig-ploidy-calls contig-ploidy-calls \
+            --contig-ploidy-calls extracted-contig-ploidy-calls \
             --sample-index ~{sample_index} \
             --output-genotyped-intervals ~{genotyped_intervals_vcf_filename} \
             --output-genotyped-segments ~{genotyped_segments_vcf_filename} \
@@ -497,15 +507,16 @@ task PostprocessGermlineCNVCalls {
 
         rm -rf CALLS_*
         rm -rf MODEL_*
-        rm -rf contig-ploidy-calls
+        rm -rf extracted-contig-ploidy-calls
     >>>
 
     runtime {
       docker: gatk_docker
       memory: machine_mem_mb + " MB"
-      disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD"
+      disks: "local-disk " + select_first([disk_space_gb, vm_disk_size]) + if use_ssd then " SSD" else " HDD"
       cpu: select_first([cpu, 1])
       preemptible: select_first([preemptible_attempts, 5])
+      maxRetries: 1
     }
 
     output {
@@ -605,3 +616,122 @@ task CollectModelQualityMetrics {
       String qc_status_string = read_string("qcStatus.txt")
     }
 }
+
+task BundlePostprocessingInvariants {
+    input {
+      Array[File] calls_tars
+      Array[File] model_tars
+      Array[File] calling_configs
+      Array[File] denoising_configs
+      Array[File] gcnvkernel_version
+      Array[File] sharded_interval_lists
+
+      # Runtime parameters
+      String docker
+      Int? mem_gb
+      Int? disk_space_gb
+      Boolean use_ssd = false
+      Int? cpu
+      Int? preemptible_attempts
+    }
+
+    command <<<
+        set -euo pipefail
+        mkdir -p out
+
+        calls_files_tar_list=~{write_lines(calls_tars)}
+        model_files_tar_list=~{write_lines(model_tars)}
+
+        calling_configs_list=~{write_lines(calling_configs)}
+        denoising_configs_list=~{write_lines(denoising_configs)}
+        gcnvkernel_version_list=~{write_lines(gcnvkernel_version)}
+        sharded_interval_lists_list=~{write_lines(sharded_interval_lists)}
+
+        cat $calls_files_tar_list | sort -V > calls_files_tar_list.sorted
+        cat $model_files_tar_list | sort -V > model_files_tar_list.sorted
+
+        cat $calling_configs_list | sort -V > calling_configs_list.sorted
+        cat $denoising_configs_list | sort -V > denoising_configs_list.sorted
+        cat $gcnvkernel_version_list | sort -V > gcnvkernel_version_list.sorted
+        cat $sharded_interval_lists_list | sort -V > sharded_interval_lists_list.sorted
+
+        paste calls_files_tar_list.sorted model_files_tar_list.sorted calling_configs_list.sorted denoising_configs_list.sorted gcnvkernel_version_list.sorted sharded_interval_lists_list.sorted |\
+              awk '{print (NR-1)"\t"$0}' > file_sets.sorted
+        OIFS=$IFS
+        IFS=$'\t'
+        while read -r index calls_tar model_tar call_config denoise version intervals; do
+            mkdir -p out/CALLS_$index
+            mkdir -p out/MODEL_$index
+            tar xzf $calls_tar -C out/CALLS_$index
+            tar xzf $model_tar -C out/MODEL_$index
+            cp $call_config out/CALLS_$index
+            cp $denoise out/CALLS_$index
+            cp $version out/CALLS_$index
+            cp $intervals out/CALLS_$index
+            rm $calls_tar $model_tar $call_config $denoise $version $intervals
+
+        done < file_sets.sorted
+        IFS=$OIFS
+
+        tar c -C out . | gzip -1 > case-gcnv-postprocessing-invariants.tar.gz
+        rm -Rf out
+    >>>
+
+    runtime {
+        docker: docker
+        memory: select_first([mem_gb, 2]) + " GiB"
+        disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD"
+        cpu: select_first([cpu, 1])
+        preemptible: select_first([preemptible_attempts, 5])
+    }
+
+    output {
+        File bundle_tar = "case-gcnv-postprocessing-invariants.tar.gz"
+    }
+}
+
+task ScatterPloidyCallsBySample {
+    input {
+      File contig_ploidy_calls_tar
+      Array[String] samples
+
+      # Runtime parameters
+      String docker
+      Int? mem_gb
+      Int? disk_space_gb
+      Boolean use_ssd = false
+      Int? cpu
+      Int? preemptible_attempts
+    }
+
+    Int num_samples = length(samples)
+    String out_dir = "calls_renamed"
+
+    command <<<
+      set -eu
+
+      # Extract ploidy calls
+      mkdir calls
+      tar xzf ~{contig_ploidy_calls_tar} -C calls/
+
+      # Archive call files by sample, renaming so they will be glob'd in order
+      sample_ids=(~{sep=" " samples})
+      for (( i=0; i<~{num_samples}; i++ ))
+      do
+        sample_id=${sample_ids[$i]}
+        sample_no=`printf %04d $i`
+        tar -czf sample_${sample_no}.${sample_id}.contig_ploidy_calls.tar.gz -C calls/SAMPLE_${i} .
+      done
+    >>>
+    runtime {
+        docker: docker
+        memory: select_first([mem_gb, 2]) + " GiB"
+        disks: "local-disk " + select_first([disk_space_gb, 10]) + if use_ssd then " SSD" else " HDD"
+        cpu: select_first([cpu, 1])
+        preemptible: select_first([preemptible_attempts, 5])
+    }
+
+    output {
+        Array[File] sample_contig_ploidy_calls_tar = glob("sample_*.contig_ploidy_calls.tar.gz")
+    }
+}
\ No newline at end of file
diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
index 24634c4c369..ec1c8c6636b 100644
--- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
+++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
@@ -49,6 +49,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -149,6 +150,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
                 preemptible_attempts = preemptible_attempts,
                 padding = padding,
                 bin_length = bin_length,
+                disabled_read_filters_for_collect_counts = disabled_read_filters_for_collect_counts,
                 collect_counts_format = collect_counts_format,
                 collect_counts_enable_indexing = collect_counts_enable_indexing,
                 mem_gb_for_collect_counts = mem_gb_for_collect_counts,
@@ -196,16 +198,16 @@ workflow CNVGermlineCaseScatteredWorkflow {
 
     output {
         Array[File] preprocessed_intervals = CNVGermlineCaseWorkflow.preprocessed_intervals
-        Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id
-        Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts
-        Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar
-        Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
-        Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
-        Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf
-        Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf
-        Array[Array[File]] qc_status_files = CNVGermlineCaseWorkflow.qc_status_files
-        Array[Array[String]] qc_status_strings = CNVGermlineCaseWorkflow.qc_status_strings
-        Array[Array[File]] denoised_copy_ratios = CNVGermlineCaseWorkflow.denoised_copy_ratios
+        Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id)
+        Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts)
+        Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars)
+        Array[File] gcnv_calls_tars = flatten(CNVGermlineCaseWorkflow.gcnv_calls_tars)
+        Array[File] gcnv_tracking_tars = flatten(CNVGermlineCaseWorkflow.gcnv_tracking_tars)
+        Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf)
+        Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf)
+        Array[File] denoised_copy_ratios = flatten(CNVGermlineCaseWorkflow.denoised_copy_ratios)
+        Array[File] qc_status_files = flatten(CNVGermlineCaseWorkflow.qc_status_files)
+        Array[String] qc_status_strings = flatten(CNVGermlineCaseWorkflow.qc_status_strings)
     }
 }
 
diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
index 81654ba7cc3..d13cee99c10 100644
--- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
+++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
@@ -59,6 +59,7 @@ workflow CNVGermlineCaseWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -111,11 +112,19 @@ workflow CNVGermlineCaseWorkflow {
       Float? gcnv_caller_external_admixing_rate
       Boolean? gcnv_disable_annealing
 
+      ######################################################
+      #### arguments for BundlePostprocessingInvariants ####
+      ######################################################
+      Int? mem_gb_for_bundle_postprocessing_invariants
+      Int? disk_space_gb_for_bundle_postprocessing_invariants
+
       ###################################################
       #### arguments for PostprocessGermlineCNVCalls ####
       ###################################################
       Int ref_copy_number_autosomal_contigs
       Array[String]? allosomal_contigs
+      Int? disk_space_gb_for_postprocess_germline_cnv_calls
+      Int? mem_gb_for_postprocess_germline_cnv_calls
 
       ##########################
       #### arguments for QC ####
@@ -150,6 +159,7 @@ workflow CNVGermlineCaseWorkflow {
                 ref_fasta_dict = ref_fasta_dict,
                 format = collect_counts_format,
                 enable_indexing = collect_counts_enable_indexing,
+                disabled_read_filters = disabled_read_filters_for_collect_counts,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
                 mem_gb = mem_gb_for_collect_counts,
@@ -222,30 +232,39 @@ workflow CNVGermlineCaseWorkflow {
         }
     }
 
-    Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars)
+    call CNVTasks.BundlePostprocessingInvariants {
+        input:
+            calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar,
+            model_tars = gcnv_model_tars,
+            calling_configs = GermlineCNVCallerCaseMode.calling_config_json,
+            denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json,
+            gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json,
+            sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list,
+            docker = gatk_docker,
+            mem_gb = mem_gb_for_bundle_postprocessing_invariants,
+            disk_space_gb = disk_space_gb_for_bundle_postprocessing_invariants,
+            preemptible_attempts = preemptible_attempts
+    }
 
     scatter (sample_index in range(length(normal_bams))) {
-        call CNVTasks.PostprocessGermlineCNVCalls {
+        call CNVTasks.BundledPostprocessGermlineCNVCalls {
             input:
+                invariants_tar = BundlePostprocessingInvariants.bundle_tar,
                 entity_id = CollectCounts.entity_id[sample_index],
-                gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
-                gcnv_model_tars = gcnv_model_tars,
-                calling_configs = GermlineCNVCallerCaseMode.calling_config_json,
-                denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json,
-                gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json,
-                sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list,
                 allosomal_contigs = allosomal_contigs,
                 ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
                 contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
                 sample_index = sample_index,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
-                preemptible_attempts = preemptible_attempts
+                preemptible_attempts = preemptible_attempts,
+                mem_gb = mem_gb_for_postprocess_germline_cnv_calls,
+                disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls
         }
 
         call CNVTasks.CollectSampleQualityMetrics {
             input:
-                genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf,
+                genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf,
                 entity_id = CollectCounts.entity_id[sample_index],
                 maximum_number_events = maximum_number_events_per_sample,
                 gatk_docker = gatk_docker,
@@ -253,18 +272,26 @@ workflow CNVGermlineCaseWorkflow {
         }
     }
 
+    call CNVTasks.ScatterPloidyCallsBySample {
+        input :
+            contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
+            samples = CollectCounts.entity_id,
+            docker = gatk_docker,
+            preemptible_attempts = preemptible_attempts
+    }
+
     output {
         File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
         Array[File] read_counts_entity_id = CollectCounts.entity_id
         Array[File] read_counts = CollectCounts.counts
-        File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar
-        Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars
+        Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
+        Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar
         Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar
-        Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
-        Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf
+        Array[File] genotyped_intervals_vcf = BundledPostprocessGermlineCNVCalls.genotyped_intervals_vcf
+        Array[File] genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf
+        Array[File] denoised_copy_ratios = BundledPostprocessGermlineCNVCalls.denoised_copy_ratios
         Array[File] qc_status_files = CollectSampleQualityMetrics.qc_status_file
         Array[String] qc_status_strings = CollectSampleQualityMetrics.qc_status_string
-        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
     }
 }
 
@@ -441,15 +468,7 @@ task GermlineCNVCallerCaseMode {
             --disable-annealing ~{default="false" disable_annealing}
 
         tar czf case-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-tracking .
-
-        CURRENT_SAMPLE=0
-        NUM_SAMPLES=~{num_samples}
-        NUM_DIGITS=${#NUM_SAMPLES}
-        while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
-            CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE)
-            tar czf case-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE .
-            let CURRENT_SAMPLE=CURRENT_SAMPLE+1
-        done
+        tar czf case-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-calls .
 
         rm -rf contig-ploidy-calls
         rm -rf gcnv-model
@@ -464,7 +483,7 @@ task GermlineCNVCallerCaseMode {
     }
 
     output {
-        Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz")
+        File gcnv_calls_tar = "case-gcnv-calls-shard-~{scatter_index}.tar.gz"
         File gcnv_tracking_tar = "case-gcnv-tracking-shard-~{scatter_index}.tar.gz"
         File calling_config_json = "~{output_dir_}/case-calls/calling_config.json"
         File denoising_config_json = "~{output_dir_}/case-calls/denoising_config.json"
diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
index 66e54f2f39c..dc0a9af4123 100644
--- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
+++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
@@ -87,6 +87,7 @@ workflow CNVGermlineCohortWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -148,10 +149,18 @@ workflow CNVGermlineCohortWorkflow {
       Float? gcnv_caller_external_admixing_rate
       Boolean? gcnv_disable_annealing
 
+      ######################################################
+      #### arguments for BundlePostprocessingInvariants ####
+      ######################################################
+      Int? mem_gb_for_bundle_postprocessing_invariants
+      Int? disk_space_gb_for_bundle_postprocessing_invariants
+
       ###################################################
       #### arguments for PostprocessGermlineCNVCalls ####
       ###################################################
       Int ref_copy_number_autosomal_contigs
+      Int? mem_gb_for_postprocess_germline_cnv_calls
+      Int? disk_space_gb_for_postprocess_germline_cnv_calls
       Array[String]? allosomal_contigs
 
       ##########################
@@ -206,6 +215,7 @@ workflow CNVGermlineCohortWorkflow {
                 ref_fasta_dict = ref_fasta_dict,
                 format = collect_counts_format,
                 enable_indexing = collect_counts_enable_indexing,
+                disabled_read_filters = disabled_read_filters_for_collect_counts,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
                 mem_gb = mem_gb_for_collect_counts,
@@ -315,30 +325,39 @@ workflow CNVGermlineCohortWorkflow {
         }
     }
 
-    Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars)
+    call CNVTasks.BundlePostprocessingInvariants {
+        input:
+            calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar,
+            model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
+            calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
+            denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
+            gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
+            sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
+            docker = gatk_docker,
+            mem_gb = mem_gb_for_bundle_postprocessing_invariants,
+            disk_space_gb = disk_space_gb_for_bundle_postprocessing_invariants,
+            preemptible_attempts = preemptible_attempts
+    }
 
-    scatter (sample_index in range(length(CollectCounts.entity_id))) {
-        call CNVTasks.PostprocessGermlineCNVCalls {
+    scatter (sample_index in range(length(normal_bams))) {
+        call CNVTasks.BundledPostprocessGermlineCNVCalls {
             input:
+                invariants_tar = BundlePostprocessingInvariants.bundle_tar,
                 entity_id = CollectCounts.entity_id[sample_index],
-                gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
-                gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
-                calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
-                denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
-                gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
-                sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
-                contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
                 allosomal_contigs = allosomal_contigs,
                 ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
+                contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
                 sample_index = sample_index,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
-                preemptible_attempts = preemptible_attempts
+                preemptible_attempts = preemptible_attempts,
+                mem_gb = mem_gb_for_postprocess_germline_cnv_calls,
+                disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls
         }
 
         call CNVTasks.CollectSampleQualityMetrics {
             input:
-                genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf,
+                genotyped_segments_vcf = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf,
                 entity_id = CollectCounts.entity_id[sample_index],
                 maximum_number_events = maximum_number_events_per_sample,
                 gatk_docker = gatk_docker,
@@ -353,6 +372,14 @@ workflow CNVGermlineCohortWorkflow {
             preemptible_attempts = preemptible_attempts
     }
 
+    call CNVTasks.ScatterPloidyCallsBySample {
+        input :
+            contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
+            samples = CollectCounts.entity_id,
+            docker = gatk_docker,
+            preemptible_attempts = preemptible_attempts
+    }
+
     output {
         File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
         Array[File] read_counts_entity_ids = CollectCounts.entity_id
@@ -360,17 +387,17 @@ workflow CNVGermlineCohortWorkflow {
         File? annotated_intervals = AnnotateIntervals.annotated_intervals
         File filtered_intervals = FilterIntervals.filtered_intervals
         File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
-        File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
+        Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
         Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
-        Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
+        Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar
         Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar
-        Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
-        Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
+        Array[File] genotyped_intervals_vcfs = BundledPostprocessGermlineCNVCalls.genotyped_intervals_vcf
+        Array[File] genotyped_segments_vcfs = BundledPostprocessGermlineCNVCalls.genotyped_segments_vcf
+        Array[File] denoised_copy_ratios = BundledPostprocessGermlineCNVCalls.denoised_copy_ratios
         Array[File] sample_qc_status_files = CollectSampleQualityMetrics.qc_status_file
         Array[String] sample_qc_status_strings = CollectSampleQualityMetrics.qc_status_string
         File model_qc_status_file = CollectModelQualityMetrics.qc_status_file
         String model_qc_string = CollectModelQualityMetrics.qc_status_string
-        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
     }
 }
 
@@ -572,17 +599,9 @@ task GermlineCNVCallerCohortMode {
             --caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \
             --disable-annealing ~{default="false" disable_annealing}
 
-        tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model .
-        tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking .
-
-        CURRENT_SAMPLE=0
-        NUM_SAMPLES=~{num_samples}
-        NUM_DIGITS=${#NUM_SAMPLES}
-        while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
-            CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE)
-            tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE .
-            let CURRENT_SAMPLE=CURRENT_SAMPLE+1
-        done
+        tar c -C ~{output_dir_}/~{cohort_entity_id}-tracking . | gzip -1 > ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz
+        tar c -C ~{output_dir_}/~{cohort_entity_id}-calls . | gzip -1 > ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz
+        tar c -C ~{output_dir_}/~{cohort_entity_id}-model . | gzip -1 > ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz
 
         rm -rf contig-ploidy-calls
     >>>
@@ -597,7 +616,7 @@ task GermlineCNVCallerCohortMode {
 
     output {
         File gcnv_model_tar = "~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz"
-        Array[File] gcnv_call_tars = glob("~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz")
+        File gcnv_calls_tar = "~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz"
         File gcnv_tracking_tar = "~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz"
         File calling_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/calling_config.json"
         File denoising_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/denoising_config.json"