broadinstitute · mwalker174 · Aug 11, 2020 · Jan 13, 2020 · May 22, 2020 · May 28, 2020
diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl
@@ -188,6 +188,7 @@ task CollectCounts {
       File ref_fasta
       File ref_fasta_fai
       File ref_fasta_dict
+      Array[String]? disabled_read_filters
       Boolean? enable_indexing
       String? format
       File? gatk4_jar_override
@@ -201,11 +202,22 @@ task CollectCounts {
       Int? preemptible_attempts
     }
 
+    parameter_meta {
+      bam: {
+        localization_optional: true
+      }
+      bam_idx: {
+        localization_optional: true
+      }
+    }
+
     Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
     Int command_mem_mb = machine_mem_mb - 1000
 
     Boolean enable_indexing_ = select_first([enable_indexing, false])
 
+    Array[String] disabled_read_filters_arr = if defined(disabled_read_filters) then prefix("--disable-read-filter ", select_first([disabled_read_filters])) else []
+
     # Sample name is derived from the bam filename
     String base_filename = basename(bam, ".bam")
     String format_ = select_first([format, "HDF5"])
@@ -257,7 +269,8 @@ task CollectCounts {
             --reference ~{ref_fasta} \
             --format ~{default="HDF5" hdf5_or_tsv_or_null_format} \
             --interval-merging-rule OVERLAPPING_ONLY \
-            --output ~{counts_filename_for_collect_read_counts}
+            --output ~{counts_filename_for_collect_read_counts} \
+            ~{sep=' ' disabled_read_filters_arr}
 
         if [ ~{do_block_compression} = "true" ]; then
             bgzip ~{counts_filename_for_collect_read_counts}
@@ -303,6 +316,15 @@ task CollectAllelicCounts {
       Int? preemptible_attempts
     }
 
+    parameter_meta {
+      bam: {
+        localization_optional: true
+      }
+      bam_idx: {
+        localization_optional: true
+      }
+    }
+
     Int machine_mem_mb = select_first([mem_gb, 13]) * 1000
     Int command_mem_mb = machine_mem_mb - 1000
 
@@ -605,3 +627,52 @@ task CollectModelQualityMetrics {
         String qc_status_string = read_string("qcStatus.txt")
     }
 }
+
+task ScatterPloidyCallsBySample {
+    # Splits a combined contig-ploidy calls tarball into one tarball per sample.
+    input {
+      File contig_ploidy_calls_tar
+      # Sample ID at index i labels the archive built from calls/SAMPLE_<i>
+      Array[String] samples
+      # Runtime parameters
+      String docker
+      Int? mem_gb
+      Int? disk_space_gb
+      Boolean use_ssd = false
+      Int? cpu
+      Int? preemptible_attempts
+    }
+
+    Int num_samples = length(samples)
+
+    command <<<
+      set -eu
+
+      # Extract ploidy calls
+      mkdir calls
+      tar xzf "~{contig_ploidy_calls_tar}" -C calls/
+
+      # Archive call files by sample, renaming so they will be glob'd in order
+      sample_ids=(~{sep=" " samples})
+      num_samples=~{num_samples}
+      num_digits=${#num_samples}
+      for (( i=0; i<~{num_samples}; i++ ))
+      do
+        sample_id="${sample_ids[$i]}"
+        padded_sample_index=$(printf "%0${num_digits}d" "$i")
+        tar -czf "sample_${padded_sample_index}.${sample_id}.contig_ploidy_calls.tar.gz" -C "calls/SAMPLE_${i}" .
+      done
+    >>>
+
+    runtime {
+        docker: docker
+        memory: select_first([mem_gb, 2]) + " GiB"
+        disks: "local-disk " + select_first([disk_space_gb, 10]) + if use_ssd then " SSD" else " HDD"
+        cpu: select_first([cpu, 1])
+        preemptible: select_first([preemptible_attempts, 5])
+    }
+
+    output {
+        Array[File] sample_contig_ploidy_calls_tar = glob("sample_*.contig_ploidy_calls.tar.gz")
+    }
+}
diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl
@@ -49,6 +49,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -149,6 +150,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
                 preemptible_attempts = preemptible_attempts,
                 padding = padding,
                 bin_length = bin_length,
+                disabled_read_filters_for_collect_counts = disabled_read_filters_for_collect_counts,
                 collect_counts_format = collect_counts_format,
                 collect_counts_enable_indexing = collect_counts_enable_indexing,
                 mem_gb_for_collect_counts = mem_gb_for_collect_counts,
@@ -196,16 +198,16 @@ workflow CNVGermlineCaseScatteredWorkflow {
 
     output {
         Array[File] preprocessed_intervals = CNVGermlineCaseWorkflow.preprocessed_intervals
-        Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id
-        Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts
-        Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar
+        Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id)
+        Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts)
+        Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars)
         Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
         Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
-        Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf
-        Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf
-        Array[Array[File]] qc_status_files = CNVGermlineCaseWorkflow.qc_status_files
-        Array[Array[String]] qc_status_strings = CNVGermlineCaseWorkflow.qc_status_strings
-        Array[Array[File]] denoised_copy_ratios = CNVGermlineCaseWorkflow.denoised_copy_ratios
+        Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf)
+        Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf)
+        Array[File] denoised_copy_ratios = flatten(CNVGermlineCaseWorkflow.denoised_copy_ratios)
+        Array[File] qc_status_files = flatten(CNVGermlineCaseWorkflow.qc_status_files)
+        Array[String] qc_status_strings = flatten(CNVGermlineCaseWorkflow.qc_status_strings)
     }
 }
 

diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
@@ -59,6 +59,7 @@ workflow CNVGermlineCaseWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -116,6 +117,8 @@ workflow CNVGermlineCaseWorkflow {
       ###################################################
       Int ref_copy_number_autosomal_contigs
       Array[String]? allosomal_contigs
+      Int? disk_space_gb_for_postprocess_germline_cnv_calls
+      Int? mem_gb_for_postprocess_germline_cnv_calls
 
       ##########################
       #### arguments for QC ####
@@ -150,6 +153,7 @@ workflow CNVGermlineCaseWorkflow {
                 ref_fasta_dict = ref_fasta_dict,
                 format = collect_counts_format,
                 enable_indexing = collect_counts_enable_indexing,
+                disabled_read_filters = disabled_read_filters_for_collect_counts,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
                 mem_gb = mem_gb_for_collect_counts,
@@ -253,18 +257,26 @@ workflow CNVGermlineCaseWorkflow {
         }
     }
 
+    call CNVTasks.ScatterPloidyCallsBySample {
+        input :
+            contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
+            samples = CollectCounts.entity_id,
+            docker = gatk_docker,
+            preemptible_attempts = preemptible_attempts
+    }
+
     output {
         File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
         Array[File] read_counts_entity_id = CollectCounts.entity_id
         Array[File] read_counts = CollectCounts.counts
-        File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar
+        Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
         Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars
         Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar
         Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
         Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf
+        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
         Array[File] qc_status_files = CollectSampleQualityMetrics.qc_status_file
         Array[String] qc_status_strings = CollectSampleQualityMetrics.qc_status_string
-        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
     }
 }
 
@@ -314,7 +326,7 @@ task DetermineGermlineContigPloidyCaseMode {
             --mapping-error-rate ~{default="0.01" mapping_error_rate} \
             --sample-psi-scale ~{default="0.0001" sample_psi_scale}
 
-        tar czf case-contig-ploidy-calls.tar.gz -C ~{output_dir_}/case-calls .
+        tar c -C ~{output_dir_}/case-calls . | gzip -1 > case-contig-ploidy-calls.tar.gz
 
         rm -rf contig-ploidy-model
     >>>

diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
@@ -87,6 +87,7 @@ workflow CNVGermlineCohortWorkflow {
       ##############################################
       #### optional arguments for CollectCounts ####
       ##############################################
+      Array[String]? disabled_read_filters_for_collect_counts
       String? collect_counts_format
       Boolean? collect_counts_enable_indexing
       Int? mem_gb_for_collect_counts
@@ -152,6 +153,8 @@ workflow CNVGermlineCohortWorkflow {
       #### arguments for PostprocessGermlineCNVCalls ####
       ###################################################
       Int ref_copy_number_autosomal_contigs
+      Int? mem_gb_for_postprocess_germline_cnv_calls
+      Int? disk_space_gb_for_postprocess_germline_cnv_calls
       Array[String]? allosomal_contigs
 
       ##########################
@@ -206,6 +209,7 @@ workflow CNVGermlineCohortWorkflow {
                 ref_fasta_dict = ref_fasta_dict,
                 format = collect_counts_format,
                 enable_indexing = collect_counts_enable_indexing,
+                disabled_read_filters = disabled_read_filters_for_collect_counts,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
                 mem_gb = mem_gb_for_collect_counts,
@@ -353,24 +357,32 @@ workflow CNVGermlineCohortWorkflow {
             preemptible_attempts = preemptible_attempts
     }
 
+    call CNVTasks.ScatterPloidyCallsBySample {
+        input :
+            contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
+            samples = CollectCounts.entity_id,
+            docker = gatk_docker,
+            preemptible_attempts = preemptible_attempts
+    }
+
     output {
         File preprocessed_intervals = PreprocessIntervals.preprocessed_intervals
         Array[File] read_counts_entity_ids = CollectCounts.entity_id
         Array[File] read_counts = CollectCounts.counts
         File? annotated_intervals = AnnotateIntervals.annotated_intervals
         File filtered_intervals = FilterIntervals.filtered_intervals
         File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
-        File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
+        Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
         Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
         Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
         Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar
         Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
         Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
+        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
         Array[File] sample_qc_status_files = CollectSampleQualityMetrics.qc_status_file
         Array[String] sample_qc_status_strings = CollectSampleQualityMetrics.qc_status_string
         File model_qc_status_file = CollectModelQualityMetrics.qc_status_file
         String model_qc_string = CollectModelQualityMetrics.qc_status_string
-        Array[File] denoised_copy_ratios = PostprocessGermlineCNVCalls.denoised_copy_ratios
     }
 }