diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl
index a4fe6b08237..def1e031479 100644
--- a/scripts/cnv_wdl/cnv_common_tasks.wdl
+++ b/scripts/cnv_wdl/cnv_common_tasks.wdl
@@ -188,6 +188,7 @@ task CollectCounts {
       File ref_fasta
       File ref_fasta_fai
       File ref_fasta_dict
+      Array[String]? disabled_read_filters
       Boolean? enable_indexing
       String? format
       File? gatk4_jar_override
@@ -201,11 +202,22 @@ task CollectCounts {
       Int? preemptible_attempts
     }
 
+    parameter_meta {
+      bam: {
+        localization_optional: true
+      }
+      bam_idx: {
+        localization_optional: true
+      }
+    }
+
     Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
     Int command_mem_mb = machine_mem_mb - 1000
 
     Boolean enable_indexing_ = select_first([enable_indexing, false])
 
+    Array[String] disabled_read_filters_arr = if defined(disabled_read_filters) then prefix("--disable-read-filter ", select_first([disabled_read_filters])) else []
+
     # Sample name is derived from the bam filename
     String base_filename = basename(bam, ".bam")
     String format_ = select_first([format, "HDF5"])
@@ -257,7 +269,8 @@ task CollectCounts {
             --reference ~{ref_fasta} \
             --format ~{default="HDF5" hdf5_or_tsv_or_null_format} \
             --interval-merging-rule OVERLAPPING_ONLY \
-            --output ~{counts_filename_for_collect_read_counts}
+            --output ~{counts_filename_for_collect_read_counts} \
+            ~{sep=' ' disabled_read_filters_arr}
 
         if [ ~{do_block_compression} = "true" ]; then
             bgzip ~{counts_filename_for_collect_read_counts}
@@ -303,6 +316,15 @@ task CollectAllelicCounts {
       Int? preemptible_attempts
     }
 
+    parameter_meta {
+      bam: {
+        localization_optional: true
+      }
+      bam_idx: {
+        localization_optional: true
+      }
+    }
+
     Int machine_mem_mb = select_first([mem_gb, 13]) * 1000
     Int command_mem_mb = machine_mem_mb - 1000
 
@@ -605,3 +627,55 @@ task CollectModelQualityMetrics {
         String qc_status_string = read_string("qcStatus.txt")
     }
 }
+
+# Splits the contig-ploidy calls tarball into one tarball per sample.
+# Directory calls/SAMPLE_<i> is archived under the name samples[i]; the zero-padded
+# index in each output file name is there so the tarballs are glob'd in sample order.
+task ScatterPloidyCallsBySample {
+    input {
+      File contig_ploidy_calls_tar
+      Array[String] samples
+
+      # Runtime parameters
+      String docker
+      Int? mem_gb
+      Int? disk_space_gb
+      Boolean use_ssd = false
+      Int? cpu
+      Int? preemptible_attempts
+    }
+
+    Int num_samples = length(samples)
+
+    command <<<
+      set -eu
+
+      # Extract ploidy calls
+      mkdir calls
+      tar xzf ~{contig_ploidy_calls_tar} -C calls/
+
+      # Archive call files by sample, renaming so they will be glob'd in order
+      # IDs are read one per line so names with spaces or glob characters stay intact
+      mapfile -t sample_ids < ~{write_lines(samples)}
+      num_samples=~{num_samples}
+      num_digits=${#num_samples}
+      for (( i=0; i<~{num_samples}; i++ ))
+      do
+        sample_id="${sample_ids[$i]}"
+        padded_sample_index=$(printf "%0${num_digits}d" "$i")
+        tar -czf "sample_${padded_sample_index}.${sample_id}.contig_ploidy_calls.tar.gz" -C "calls/SAMPLE_${i}" .
+      done
+    >>>
+
+    runtime {
+      docker: docker
+      memory: select_first([mem_gb, 2]) + " GiB"
+      disks: "local-disk " + select_first([disk_space_gb, 10]) + if use_ssd then " SSD" else " HDD"
+      cpu: select_first([cpu, 1])
+      preemptible: select_first([preemptible_attempts, 5])
+    }
+
+    output {
+      Array[File] sample_contig_ploidy_calls_tar = glob("sample_*.contig_ploidy_calls.tar.gz")
+    }
+}
diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
index 24634c4c369..3991fea48f5 100644
--- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
+++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
@@ -49,6 +49,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -149,6 +150,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
                 preemptible_attempts = preemptible_attempts,
                 padding = padding,
                 bin_length = bin_length,
+                disabled_read_filters_for_collect_counts = disabled_read_filters_for_collect_counts,
                 collect_counts_format = collect_counts_format,
                 collect_counts_enable_indexing = collect_counts_enable_indexing,
                 mem_gb_for_collect_counts = mem_gb_for_collect_counts,
@@ -196,16 +198,16 @@ workflow CNVGermlineCaseScatteredWorkflow {
 
     output {
         Array[File] preprocessed_intervals = CNVGermlineCaseWorkflow.preprocessed_intervals
-        Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id
-        Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts
-        Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar
+        Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id)
+        Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts)
+        Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars)
         Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
         Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
-        Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf
-        Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf
-        Array[Array[File]] qc_status_files = CNVGermlineCaseWorkflow.qc_status_files
-        Array[Array[String]] qc_status_strings = CNVGermlineCaseWorkflow.qc_status_strings
-        Array[Array[File]] denoised_copy_ratios = CNVGermlineCaseWorkflow.denoised_copy_ratios
+        Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf)
+        Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf)
+        Array[File] denoised_copy_ratios = flatten(CNVGermlineCaseWorkflow.denoised_copy_ratios)
+        Array[File] qc_status_files = flatten(CNVGermlineCaseWorkflow.qc_status_files)
+        Array[String] qc_status_strings = flatten(CNVGermlineCaseWorkflow.qc_status_strings)
     }
 }
 
diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
index 81654ba7cc3..67f1c9b5932 100644
--- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
+++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
@@ -59,6 +59,7 @@ workflow CNVGermlineCaseWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -116,6 +117,8 @@ workflow CNVGermlineCaseWorkflow {
       ###################################################
       Int ref_copy_number_autosomal_contigs
       Array[String]? allosomal_contigs
+      Int? disk_space_gb_for_postprocess_germline_cnv_calls
+      Int? mem_gb_for_postprocess_germline_cnv_calls
 
       ##########################
       #### arguments for QC ####
@@ -150,6 +153,7 @@ workflow CNVGermlineCaseWorkflow {
                 ref_fasta_dict = ref_fasta_dict,
                 format = collect_counts_format,
                 enable_indexing = collect_counts_enable_indexing,
+                disabled_read_filters = disabled_read_filters_for_collect_counts,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
                 mem_gb = mem_gb_for_collect_counts,
@@ -253,18 +257,26 @@ workflow CNVGermlineCaseWorkflow {
         }
     }
 
+    call CNVTasks.ScatterPloidyCallsBySample {
+        input :
+            contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
+            samples = CollectCounts.entity_id,
+            docker = gatk_docker,
+            preemptible_attempts = preemptible_attempts
+    }
+
     output {
         File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
         Array[File] read_counts_entity_id = CollectCounts.entity_id
         Array[File] read_counts = CollectCounts.counts
-        File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar
+        Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
         Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars
         Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar
         Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
         Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf
+        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
         Array[File] qc_status_files = CollectSampleQualityMetrics.qc_status_file
         Array[String] qc_status_strings = CollectSampleQualityMetrics.qc_status_string
-        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
     }
 }
 
@@ -314,7 +326,9 @@ task DetermineGermlineContigPloidyCaseMode {
             --mapping-error-rate ~{default="0.01" mapping_error_rate} \
             --sample-psi-scale ~{default="0.0001" sample_psi_scale}
 
-        tar czf case-contig-ploidy-calls.tar.gz -C ~{output_dir_}/case-calls .
+        # pipefail: make the pipeline's exit status reflect a tar failure, not only gzip's
+        set -o pipefail
+        tar c -C ~{output_dir_}/case-calls . | gzip -1 > case-contig-ploidy-calls.tar.gz
 
         rm -rf contig-ploidy-model
     >>>
diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
index 66e54f2f39c..5f92933b834 100644
--- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
+++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
@@ -87,6 +87,7 @@ workflow CNVGermlineCohortWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -152,6 +153,8 @@ workflow CNVGermlineCohortWorkflow {
       #### arguments for PostprocessGermlineCNVCalls ####
       ###################################################
       Int ref_copy_number_autosomal_contigs
+      Int? mem_gb_for_postprocess_germline_cnv_calls
+      Int? disk_space_gb_for_postprocess_germline_cnv_calls
       Array[String]? allosomal_contigs
 
       ##########################
@@ -206,6 +209,7 @@ workflow CNVGermlineCohortWorkflow {
                 ref_fasta_dict = ref_fasta_dict,
                 format = collect_counts_format,
                 enable_indexing = collect_counts_enable_indexing,
+                disabled_read_filters = disabled_read_filters_for_collect_counts,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
                 mem_gb = mem_gb_for_collect_counts,
@@ -353,6 +357,14 @@ workflow CNVGermlineCohortWorkflow {
             preemptible_attempts = preemptible_attempts
     }
 
+    call CNVTasks.ScatterPloidyCallsBySample {
+        input :
+            contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
+            samples = CollectCounts.entity_id,
+            docker = gatk_docker,
+            preemptible_attempts = preemptible_attempts
+    }
+
     output {
         File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
         Array[File] read_counts_entity_ids = CollectCounts.entity_id
@@ -360,17 +372,17 @@ workflow CNVGermlineCohortWorkflow {
         File? annotated_intervals = AnnotateIntervals.annotated_intervals
         File filtered_intervals = FilterIntervals.filtered_intervals
         File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
-        File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
+        Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
         Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
         Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
         Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar
         Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
         Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
+        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
         Array[File] sample_qc_status_files = CollectSampleQualityMetrics.qc_status_file
         Array[String] sample_qc_status_strings = CollectSampleQualityMetrics.qc_status_string
         File model_qc_status_file = CollectModelQualityMetrics.qc_status_file
         String model_qc_string = CollectModelQualityMetrics.qc_status_string
-        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
     }
 }
 