Skip to content

Commit

Permalink
Restore array output in gCNV WDLs for efficient postprocessing. (#5490)
Browse files Browse the repository at this point in the history
* Added canonicalization of all file paths passed to gCNV python scripts.

* Restore array output in gCNV WDLs for efficient postprocessing.
  • Loading branch information
samuelklee authored Jan 8, 2019
1 parent 0570670 commit c89c189
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 30 deletions.
19 changes: 17 additions & 2 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,10 @@ task PostprocessGermlineCNVCalls {
String entity_id
Array[File] gcnv_calls_tars
Array[File] gcnv_model_tars
Array[File] calling_configs
Array[File] denoising_configs
Array[File] gcnvkernel_version
Array[File] sharded_interval_lists
File contig_ploidy_calls_tar
Array[String]? allosomal_contigs
Int ref_copy_number_autosomal_contigs
Expand Down Expand Up @@ -370,13 +374,24 @@ task PostprocessGermlineCNVCalls {
set -e
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

sharded_interval_lists_array=(${sep=" " sharded_interval_lists})

# untar calls to CALLS_0, CALLS_1, etc directories and build the command line
# also copy over shard config and interval files
gcnv_calls_tar_array=(${sep=" " gcnv_calls_tars})
calling_configs_array=(${sep=" " calling_configs})
denoising_configs_array=(${sep=" " denoising_configs})
gcnvkernel_version_array=(${sep=" " gcnvkernel_version})
sharded_interval_lists_array=(${sep=" " sharded_interval_lists})
calls_args=""
for index in ${dollar}{!gcnv_calls_tar_array[@]}; do
gcnv_calls_tar=${dollar}{gcnv_calls_tar_array[$index]}
mkdir CALLS_$index
tar xzf $gcnv_calls_tar -C CALLS_$index
mkdir -p CALLS_$index/SAMPLE_${sample_index}
tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_${sample_index}
cp ${dollar}{calling_configs_array[$index]} CALLS_$index/
cp ${dollar}{denoising_configs_array[$index]} CALLS_$index/
cp ${dollar}{gcnvkernel_version_array[$index]} CALLS_$index/
cp ${dollar}{sharded_interval_lists_array[$index]} CALLS_$index/
calls_args="$calls_args --calls-shard-path CALLS_$index"
done

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id
Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts
Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar
Array[Array[File]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf
Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf
Expand Down
29 changes: 23 additions & 6 deletions scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,18 @@ workflow CNVGermlineCaseWorkflow {
}
}

Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars)

scatter (sample_index in range(length(normal_bams))) {
call CNVTasks.PostprocessGermlineCNVCalls {
input:
entity_id = CollectCounts.entity_id[sample_index],
gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar,
gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
gcnv_model_tars = gcnv_model_tars,
calling_configs = GermlineCNVCallerCaseMode.calling_config_json,
denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list,
allosomal_contigs = allosomal_contigs,
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
Expand All @@ -232,7 +238,7 @@ workflow CNVGermlineCaseWorkflow {
Array[File] read_counts_entity_id = CollectCounts.entity_id
Array[File] read_counts = CollectCounts.counts
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar
Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars
Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar
Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf
Expand Down Expand Up @@ -355,6 +361,9 @@ task GermlineCNVCallerCaseMode {

# If optional output_dir not specified, use "out"
String output_dir_ = select_first([output_dir, "out"])
Int num_samples = length(read_count_files)

String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819

command <<<
set -e
Expand Down Expand Up @@ -406,8 +415,16 @@ task GermlineCNVCallerCaseMode {
--caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \
--disable-annealing ${default="false" disable_annealing}

tar czf case-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/case-calls .
tar czf case-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking .
tar czf case-gcnv-tracking-shard-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking .

CURRENT_SAMPLE=0
NUM_SAMPLES=${num_samples}
NUM_DIGITS=${dollar}{#NUM_SAMPLES}
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${dollar}{NUM_DIGITS}d" $CURRENT_SAMPLE)
tar czf case-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE .
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done
>>>

runtime {
Expand All @@ -419,8 +436,8 @@ task GermlineCNVCallerCaseMode {
}

output {
File gcnv_calls_tar = "case-gcnv-calls-${scatter_index}.tar.gz"
File gcnv_tracking_tar = "case-gcnv-tracking-${scatter_index}.tar.gz"
Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-${scatter_index}-sample-*.tar.gz")
File gcnv_tracking_tar = "case-gcnv-tracking-shard-${scatter_index}.tar.gz"
File calling_config_json = "${output_dir_}/case-calls/calling_config.json"
File denoising_config_json = "${output_dir_}/case-calls/denoising_config.json"
File gcnvkernel_version_json = "${output_dir_}/case-calls/gcnvkernel_version.json"
Expand Down
33 changes: 25 additions & 8 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -304,12 +304,18 @@ workflow CNVGermlineCohortWorkflow {
}
}

Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars)

scatter (sample_index in range(length(CollectCounts.entity_id))) {
call CNVTasks.PostprocessGermlineCNVCalls {
input:
entity_id = CollectCounts.entity_id[sample_index],
gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar,
gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
allosomal_contigs = allosomal_contigs,
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
Expand All @@ -329,7 +335,7 @@ workflow CNVGermlineCohortWorkflow {
File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar
Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
Expand Down Expand Up @@ -470,6 +476,9 @@ task GermlineCNVCallerCohortMode {

# If optional output_dir not specified, use "out"
String output_dir_ = select_first([output_dir, "out"])
Int num_samples = length(read_count_files)

String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819
command <<<
set -e
Expand Down Expand Up @@ -529,9 +538,17 @@ task GermlineCNVCallerCohortMode {
--caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \
--disable-annealing ${default="false" disable_annealing}

tar czf ${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model .
tar czf ${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls .
tar czf ${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking .
tar czf ${cohort_entity_id}-gcnv-model-shard-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model .
tar czf ${cohort_entity_id}-gcnv-tracking-shard-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking .

CURRENT_SAMPLE=0
NUM_SAMPLES=${num_samples}
NUM_DIGITS=${dollar}{#NUM_SAMPLES}
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${dollar}{NUM_DIGITS}d" $CURRENT_SAMPLE)
tar czf ${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE .
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done
>>>

runtime {
Expand All @@ -543,9 +560,9 @@ task GermlineCNVCallerCohortMode {
}

output {
File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz"
File gcnv_calls_tar = "${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz"
File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz"
File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-shard-${scatter_index}.tar.gz"
Array[File] gcnv_call_tars = glob("${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-*.tar.gz")
File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-shard-${scatter_index}.tar.gz"
File calling_config_json = "${output_dir_}/${cohort_entity_id}-calls/calling_config.json"
File denoising_config_json = "${output_dir_}/${cohort_entity_id}-calls/denoising_config.json"
File gcnvkernel_version_json = "${output_dir_}/${cohort_entity_id}-calls/gcnvkernel_version.json"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.CoveragePerContig;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.tools.copynumber.utils.CopyNumberUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
Expand Down Expand Up @@ -385,20 +386,20 @@ private boolean executeDeterminePloidyAndDepthPythonScript(final File samplesByC
: outputDir + File.separator; //add trailing slash if necessary
//note that the samples x coverage-by-contig table is referred to as "metadata" by gcnvkernel
final List<String> arguments = new ArrayList<>(Arrays.asList(
"--sample_coverage_metadata=" + samplesByCoveragePerContigFile.getAbsolutePath(),
"--output_calls_path=" + outputDirArg + outputPrefix + CALLS_PATH_SUFFIX));
"--sample_coverage_metadata=" + CopyNumberUtils.getCanonicalPath(samplesByCoveragePerContigFile),
"--output_calls_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + CALLS_PATH_SUFFIX)));
arguments.addAll(germlineContigPloidyModelArgumentCollection.generatePythonArguments(runMode));
arguments.addAll(germlineContigPloidyHybridADVIArgumentCollection.generatePythonArguments());

final String script;
if (runMode == RunMode.COHORT) {
script = COHORT_DETERMINE_PLOIDY_AND_DEPTH_PYTHON_SCRIPT;
arguments.add("--interval_list=" + intervalsFile.getAbsolutePath());
arguments.add("--contig_ploidy_prior_table=" + inputContigPloidyPriorsFile.getAbsolutePath());
arguments.add("--output_model_path=" + outputDirArg + outputPrefix + MODEL_PATH_SUFFIX);
arguments.add("--interval_list=" + CopyNumberUtils.getCanonicalPath(intervalsFile));
arguments.add("--contig_ploidy_prior_table=" + CopyNumberUtils.getCanonicalPath(inputContigPloidyPriorsFile));
arguments.add("--output_model_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + MODEL_PATH_SUFFIX));
} else {
script = CASE_DETERMINE_PLOIDY_AND_DEPTH_PYTHON_SCRIPT;
arguments.add("--input_model_path=" + inputModelDir);
arguments.add("--input_model_path=" + CopyNumberUtils.getCanonicalPath(inputModelDir));
}
return executor.executeScript(
new Resource(script, GermlineCNVCaller.class),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.tools.copynumber.utils.CopyNumberUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
Expand Down Expand Up @@ -406,21 +407,21 @@ private boolean executeGermlineCNVCallerPythonScript(final List<File> intervalSu

//add required arguments
final List<String> arguments = new ArrayList<>(Arrays.asList(
"--ploidy_calls_path=" + inputContigPloidyCallsDir,
"--output_calls_path=" + outputDirArg + outputPrefix + CALLS_PATH_SUFFIX,
"--output_tracking_path=" + outputDirArg + outputPrefix + TRACKING_PATH_SUFFIX));
"--ploidy_calls_path=" + CopyNumberUtils.getCanonicalPath(inputContigPloidyCallsDir),
"--output_calls_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + CALLS_PATH_SUFFIX),
"--output_tracking_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + TRACKING_PATH_SUFFIX)));

//if a model path is given, add it to the argument (both COHORT and CASE modes)
if (inputModelDir != null) {
arguments.add("--input_model_path=" + inputModelDir);
arguments.add("--input_model_path=" + CopyNumberUtils.getCanonicalPath(inputModelDir));
}

final String script;
if (runMode == RunMode.COHORT) {
script = COHORT_DENOISING_CALLING_PYTHON_SCRIPT;
//these are the annotated intervals, if provided
arguments.add("--modeling_interval_list=" + specifiedIntervalsFile.getAbsolutePath());
arguments.add("--output_model_path=" + outputDirArg + outputPrefix + MODEL_PATH_SUFFIX);
arguments.add("--modeling_interval_list=" + CopyNumberUtils.getCanonicalPath(specifiedIntervalsFile));
arguments.add("--output_model_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + MODEL_PATH_SUFFIX));
if (inputAnnotatedIntervalsFile != null) {
arguments.add("--enable_explicit_gc_bias_modeling=True");
} else {
Expand All @@ -432,7 +433,7 @@ private boolean executeGermlineCNVCallerPythonScript(final List<File> intervalSu
}

arguments.add("--read_count_tsv_files");
arguments.addAll(intervalSubsetReadCountFiles.stream().map(File::getAbsolutePath).collect(Collectors.toList()));
arguments.addAll(intervalSubsetReadCountFiles.stream().map(CopyNumberUtils::getCanonicalPath).collect(Collectors.toList()));

arguments.addAll(germlineDenoisingModelArgumentCollection.generatePythonArguments(runMode));
arguments.addAll(germlineCallingArgumentCollection.generatePythonArguments(runMode));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.broadinstitute.hellbender.tools.copynumber.utils;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor;

import java.io.File;
import java.io.IOException;

/**
 * Static helper methods shared by the copy-number tools. Not instantiable.
 */
public final class CopyNumberUtils {
    private CopyNumberUtils() {}

    /**
     * File paths that are passed to {@link PythonScriptExecutor} must be canonical (rather than absolute).
     * See https://github.com/broadinstitute/gatk/issues/4724.
     *
     * @param file  file whose canonical path is requested; checked with {@code Utils.nonNull}
     * @return      the result of {@link File#getCanonicalPath()} for {@code file}
     * @throws UserException.BadInput if the canonical path cannot be resolved; the originating
     *                                {@link IOException} is attached as the cause
     */
    public static String getCanonicalPath(final File file) {
        Utils.nonNull(file);
        try {
            return file.getCanonicalPath();
        } catch (final IOException e) {
            // Chain the IOException so the underlying filesystem failure is not lost to the caller/log.
            throw new UserException.BadInput(String.format("Could not resolve a canonical file path: %s", file), e);
        }
    }

    /**
     * File paths that are passed to {@link PythonScriptExecutor} must be canonical (rather than absolute).
     * See https://github.com/broadinstitute/gatk/issues/4724.
     *
     * Convenience overload that wraps {@code filename} in a {@link File} and delegates to
     * {@link #getCanonicalPath(File)}.
     *
     * @param filename  path string whose canonical form is requested; checked with {@code Utils.nonEmpty}
     * @return          the canonical path of {@code new File(filename)}
     * @throws UserException.BadInput if the canonical path cannot be resolved
     */
    public static String getCanonicalPath(final String filename) {
        Utils.nonEmpty(filename);
        return getCanonicalPath(new File(filename));
    }
}

0 comments on commit c89c189

Please sign in to comment.