Skip to content

Commit

Permalink
Revert back to 2d file array
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 committed Jul 15, 2020
1 parent e00692b commit 7084775
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 111 deletions.
110 changes: 38 additions & 72 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -437,23 +437,26 @@ task ScatterIntervals {

task PostprocessGermlineCNVCalls {
input {
File gcnv_calls_sample_tar
Array[File] gcnv_model_tars
File gcnv_shard_configs_tar
String entity_id
File contig_ploidy_calls_tar
Array[String]? allosomal_contigs
Int ref_copy_number_autosomal_contigs
Int sample_index
File? gatk4_jar_override

# Runtime parameters
String gatk_docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
String entity_id
Array[File] gcnv_calls_tars
Array[File] gcnv_model_tars
Array[File] calling_configs
Array[File] denoising_configs
Array[File] gcnvkernel_version
Array[File] sharded_interval_lists
File contig_ploidy_calls_tar
Array[String]? allosomal_contigs
Int ref_copy_number_autosomal_contigs
Int sample_index
File? gatk4_jar_override

# Runtime parameters
String gatk_docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
}

Int machine_mem_mb = select_first([mem_gb, 7]) * 1000
Expand All @@ -466,15 +469,28 @@ task PostprocessGermlineCNVCalls {
Array[String] allosomal_contigs_args = if defined(allosomal_contigs) then prefix("--allosomal-contig ", select_first([allosomal_contigs])) else []

command <<<
set -euo pipefail
set -eu
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}

sharded_interval_lists_array=(~{sep=" " sharded_interval_lists})

# untar calls to CALLS_0, CALLS_1, etc directories and build the command line
tar xzf ~{gcnv_calls_sample_tar}
tar xzf ~{gcnv_shard_configs_tar}
# also copy over shard config and interval files
gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars})
calling_configs_array=(~{sep=" " calling_configs})
denoising_configs_array=(~{sep=" " denoising_configs})
gcnvkernel_version_array=(~{sep=" " gcnvkernel_version})
sharded_interval_lists_array=(~{sep=" " sharded_interval_lists})
calls_args=""
for calls_dir in CALLS_*; do
calls_args="$calls_args --calls-shard-path $calls_dir"
for index in ${!gcnv_calls_tar_array[@]}; do
gcnv_calls_tar=${gcnv_calls_tar_array[$index]}
mkdir -p CALLS_$index/SAMPLE_~{sample_index}
tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_~{sample_index}
cp ${calling_configs_array[$index]} CALLS_$index/
cp ${denoising_configs_array[$index]} CALLS_$index/
cp ${gcnvkernel_version_array[$index]} CALLS_$index/
cp ${sharded_interval_lists_array[$index]} CALLS_$index/
calls_args="$calls_args --calls-shard-path CALLS_$index"
done

# untar models to MODEL_0, MODEL_1, etc directories and build the command line
Expand Down Expand Up @@ -512,7 +528,6 @@ task PostprocessGermlineCNVCalls {
disks: "local-disk " + select_first([disk_space_gb, 40]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 5])
maxRetries: 1
}

output {
Expand Down Expand Up @@ -613,55 +628,6 @@ task CollectModelQualityMetrics {
}
}

# Regroups gCNV caller output from "one tarball per shard" into "one tarball per sample".
#
# The command performs these steps, in order:
#   1. Extracts the i-th input tarball into a fresh directory CALLS_<i>.
#   2. Counts the SAMPLE_* directories found under CALLS_0 and uses that count as the
#      number of samples for every shard.
#   3. For each sample index s in [0, NUM_SAMPLES), archives CALLS_*/SAMPLE_<s> (that
#      sample's directory from every shard) into gcnv-calls-sample-<s, zero-padded>.tar.gz.
#      The padding width is the number of digits in NUM_SAMPLES; only the archive file name
#      is padded, the SAMPLE_<s> paths inside it are not.
#      NOTE(review): the padding presumably keeps the glob() output in sample order -- confirm.
#   4. Deletes every CALLS_*/SAMPLE_* directory and archives whatever is left under CALLS_*
#      into gcnv-shard-configs.tar.gz (presumably the per-shard config/interval files -- verify
#      against the caller task's output directory layout).
# Both kinds of archive are compressed with `gzip -1`.
task TransposeCallerOutputs {
input {
# Gzipped tarballs, one per shard; array position i becomes directory CALLS_<i>.
# Each tarball is expected to unpack SAMPLE_<n> directories at its top level.
Array[File] gcnv_calls_tars

# Runtime parameters
String docker
Int? mem_gb
Int? disk_space_gb
Boolean use_ssd = false
Int? cpu
Int? preemptible_attempts
}

# See the step-by-step description in the comment at the top of this task.
command <<<
set -euo pipefail

gcnv_calls_tar_array=(~{sep=" " gcnv_calls_tars})
for index in ${!gcnv_calls_tar_array[@]}; do
mkdir CALLS_$index
tar xzf ${gcnv_calls_tar_array[$index]} -C CALLS_$index
done

CURRENT_SAMPLE=0
NUM_SAMPLES=$(ls -d CALLS_0/SAMPLE_* | wc -l)
NUM_DIGITS=${#NUM_SAMPLES}
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE)
tar c CALLS_*/SAMPLE_$CURRENT_SAMPLE | gzip -1 > gcnv-calls-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done

rm -r CALLS_*/SAMPLE_*
tar c CALLS_* | gzip -1 > gcnv-shard-configs.tar.gz
>>>

# Fallbacks when the optional inputs are unset: memory 2 GiB, disk size 150, 1 cpu,
# 5 preemptible attempts. The disk type is "SSD" only when use_ssd is true, otherwise "HDD".
runtime {
docker: docker
memory: select_first([mem_gb, 2]) + " GiB"
disks: "local-disk " + select_first([disk_space_gb, 150]) + if use_ssd then " SSD" else " HDD"
cpu: select_first([cpu, 1])
preemptible: select_first([preemptible_attempts, 5])
}

output {
# One archive per sample, each holding that sample's SAMPLE_<s> directory from every shard.
Array[File] gcnv_calls_sample_tars = glob("gcnv-calls-sample-*.tar.gz")
# The CALLS_* directories after their SAMPLE_* subdirectories have been removed.
File gcnv_shard_configs_tar = "gcnv-shard-configs.tar.gz"
}
}

task ScatterPloidyCallsBySample {
input {
File contig_ploidy_calls_tar
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
Array[File] read_counts_entity_id = flatten(CNVGermlineCaseWorkflow.read_counts_entity_id)
Array[File] read_counts = flatten(CNVGermlineCaseWorkflow.read_counts)
Array[File] sample_contig_ploidy_calls_tars = flatten(CNVGermlineCaseWorkflow.sample_contig_ploidy_calls_tars)
Array[Array[File]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
Array[File] genotyped_intervals_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_intervals_vcf)
Array[File] genotyped_segments_vcf = flatten(CNVGermlineCaseWorkflow.genotyped_segments_vcf)
Expand Down
41 changes: 23 additions & 18 deletions scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -232,32 +232,25 @@ workflow CNVGermlineCaseWorkflow {
}
}

call CNVTasks.TransposeCallerOutputs {
input:
gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar,
docker = gatk_docker,
mem_gb = mem_gb_for_transpose_caller_outputs,
disk_space_gb = disk_space_gb_for_transpose_caller_outputs,
preemptible_attempts = preemptible_attempts
}
Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars)

scatter (sample_index in range(length(normal_bams))) {

call CNVTasks.PostprocessGermlineCNVCalls {
input:
gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index],
gcnv_model_tars = gcnv_model_tars,
gcnv_shard_configs_tar = TransposeCallerOutputs.gcnv_shard_configs_tar,
entity_id = CollectCounts.entity_id[sample_index],
gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
gcnv_model_tars = gcnv_model_tars,
calling_configs = GermlineCNVCallerCaseMode.calling_config_json,
denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list,
allosomal_contigs = allosomal_contigs,
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
sample_index = sample_index,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
mem_gb = mem_gb_for_postprocess_germline_cnv_calls,
disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls
preemptible_attempts = preemptible_attempts
}

call CNVTasks.CollectSampleQualityMetrics {
Expand All @@ -283,7 +276,7 @@ workflow CNVGermlineCaseWorkflow {
Array[File] read_counts_entity_id = CollectCounts.entity_id
Array[File] read_counts = CollectCounts.counts
Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars
Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar
Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf
Expand Down Expand Up @@ -465,9 +458,17 @@ task GermlineCNVCallerCaseMode {
--caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \
--disable-annealing ~{default="false" disable_annealing}

tar czf case-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-calls .
tar czf case-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/case-tracking .

CURRENT_SAMPLE=0
NUM_SAMPLES=~{num_samples}
NUM_DIGITS=${#NUM_SAMPLES}
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE)
tar czf case-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE .
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done

rm -rf contig-ploidy-calls
rm -rf gcnv-model
>>>
Expand All @@ -481,7 +482,11 @@ task GermlineCNVCallerCaseMode {
}

output {
File gcnv_calls_tar = "case-gcnv-calls-shard-~{scatter_index}.tar.gz"
Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz")
File gcnv_tracking_tar = "case-gcnv-tracking-shard-~{scatter_index}.tar.gz"
File calling_config_json = "~{output_dir_}/case-calls/calling_config.json"
File denoising_config_json = "~{output_dir_}/case-calls/denoising_config.json"
File gcnvkernel_version_json = "~{output_dir_}/case-calls/gcnvkernel_version.json"
File sharded_interval_list = "~{output_dir_}/case-calls/interval_list.tsv"
}
}
45 changes: 25 additions & 20 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -325,31 +325,26 @@ workflow CNVGermlineCohortWorkflow {
}
}

call CNVTasks.TransposeCallerOutputs {
input:
gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar,
docker = gatk_docker,
mem_gb = mem_gb_for_transpose_caller_outputs,
disk_space_gb = disk_space_gb_for_transpose_caller_outputs,
preemptible_attempts = preemptible_attempts
}

scatter (sample_index in range(length(normal_bams))) {
Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars)

scatter (sample_index in range(length(CollectCounts.entity_id))) {
call CNVTasks.PostprocessGermlineCNVCalls {
input:
gcnv_calls_sample_tar = TransposeCallerOutputs.gcnv_calls_sample_tars[sample_index],
gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
gcnv_shard_configs_tar = TransposeCallerOutputs.gcnv_shard_configs_tar,
entity_id = CollectCounts.entity_id[sample_index],
gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
allosomal_contigs = allosomal_contigs,
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
sample_index = sample_index,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
preemptible_attempts = preemptible_attempts,
mem_gb = mem_gb_for_postprocess_germline_cnv_calls,
disk_space_gb = disk_space_gb_for_postprocess_germline_cnv_calls
preemptible_attempts = preemptible_attempts
}

call CNVTasks.CollectSampleQualityMetrics {
Expand Down Expand Up @@ -386,7 +381,7 @@ workflow CNVGermlineCohortWorkflow {
File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
Array[File] sample_contig_ploidy_calls_tars = ScatterPloidyCallsBySample.sample_contig_ploidy_calls_tar
Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar
Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
Expand Down Expand Up @@ -537,6 +532,8 @@ task GermlineCNVCallerCohortMode {
String output_dir_ = select_first([output_dir, "out"])
Int num_samples = length(read_count_files)

String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819
command <<<
set -eu
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk4_jar_override}
Expand Down Expand Up @@ -594,9 +591,17 @@ task GermlineCNVCallerCohortMode {
--caller-external-admixing-rate ~{default="1.00" caller_external_admixing_rate} \
--disable-annealing ~{default="false" disable_annealing}

tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model .
tar czf ~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-tracking .
tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls .
tar czf ~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-model .

CURRENT_SAMPLE=0
NUM_SAMPLES=~{num_samples}
NUM_DIGITS=${#NUM_SAMPLES}
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${NUM_DIGITS}d" $CURRENT_SAMPLE)
tar czf ~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ~{output_dir_}/~{cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE .
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done

rm -rf contig-ploidy-calls
>>>
Expand All @@ -611,7 +616,7 @@ task GermlineCNVCallerCohortMode {

output {
File gcnv_model_tar = "~{cohort_entity_id}-gcnv-model-shard-~{scatter_index}.tar.gz"
File gcnv_calls_tar = "~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}.tar.gz"
Array[File] gcnv_call_tars = glob("~{cohort_entity_id}-gcnv-calls-shard-~{scatter_index}-sample-*.tar.gz")
File gcnv_tracking_tar = "~{cohort_entity_id}-gcnv-tracking-shard-~{scatter_index}.tar.gz"
File calling_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/calling_config.json"
File denoising_config_json = "~{output_dir_}/~{cohort_entity_id}-calls/denoising_config.json"
Expand Down

0 comments on commit 7084775

Please sign in to comment.