Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restore array output in gCNV WDLs for efficient postprocessing. #5490

Merged
merged 2 commits into from
Jan 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,10 @@ task PostprocessGermlineCNVCalls {
String entity_id
Array[File] gcnv_calls_tars
Array[File] gcnv_model_tars
Array[File] calling_configs
Array[File] denoising_configs
Array[File] gcnvkernel_version
Array[File] sharded_interval_lists
File contig_ploidy_calls_tar
Array[String]? allosomal_contigs
Int ref_copy_number_autosomal_contigs
Expand Down Expand Up @@ -370,13 +374,24 @@ task PostprocessGermlineCNVCalls {
set -e
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override}

sharded_interval_lists_array=(${sep=" " sharded_interval_lists})

# untar calls to CALLS_0, CALLS_1, etc directories and build the command line
# also copy over shard config and interval files
gcnv_calls_tar_array=(${sep=" " gcnv_calls_tars})
calling_configs_array=(${sep=" " calling_configs})
denoising_configs_array=(${sep=" " denoising_configs})
gcnvkernel_version_array=(${sep=" " gcnvkernel_version})
sharded_interval_lists_array=(${sep=" " sharded_interval_lists})
calls_args=""
for index in ${dollar}{!gcnv_calls_tar_array[@]}; do
gcnv_calls_tar=${dollar}{gcnv_calls_tar_array[$index]}
mkdir CALLS_$index
tar xzf $gcnv_calls_tar -C CALLS_$index
mkdir -p CALLS_$index/SAMPLE_${sample_index}
tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_${sample_index}
cp ${dollar}{calling_configs_array[$index]} CALLS_$index/
cp ${dollar}{denoising_configs_array[$index]} CALLS_$index/
cp ${dollar}{gcnvkernel_version_array[$index]} CALLS_$index/
cp ${dollar}{sharded_interval_lists_array[$index]} CALLS_$index/
calls_args="$calls_args --calls-shard-path CALLS_$index"
done

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ workflow CNVGermlineCaseScatteredWorkflow {
Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id
Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts
Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar
Array[Array[File]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars
Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars
Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf
Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf
Expand Down
29 changes: 23 additions & 6 deletions scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,18 @@ workflow CNVGermlineCaseWorkflow {
}
}

Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars)

scatter (sample_index in range(length(normal_bams))) {
call CNVTasks.PostprocessGermlineCNVCalls {
input:
entity_id = CollectCounts.entity_id[sample_index],
gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar,
gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
gcnv_model_tars = gcnv_model_tars,
calling_configs = GermlineCNVCallerCaseMode.calling_config_json,
denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list,
allosomal_contigs = allosomal_contigs,
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar,
Expand All @@ -232,7 +238,7 @@ workflow CNVGermlineCaseWorkflow {
Array[File] read_counts_entity_id = CollectCounts.entity_id
Array[File] read_counts = CollectCounts.counts
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar
Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars
Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar
Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf
Expand Down Expand Up @@ -355,6 +361,9 @@ task GermlineCNVCallerCaseMode {

# If optional output_dir not specified, use "out"
String output_dir_ = select_first([output_dir, "out"])
Int num_samples = length(read_count_files)

String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819

command <<<
set -e
Expand Down Expand Up @@ -406,8 +415,16 @@ task GermlineCNVCallerCaseMode {
--caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \
--disable-annealing ${default="false" disable_annealing}

tar czf case-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/case-calls .
tar czf case-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking .
tar czf case-gcnv-tracking-shard-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking .

CURRENT_SAMPLE=0
NUM_SAMPLES=${num_samples}
NUM_DIGITS=${dollar}{#NUM_SAMPLES}
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${dollar}{NUM_DIGITS}d" $CURRENT_SAMPLE)
tar czf case-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE .
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done
>>>

runtime {
Expand All @@ -419,8 +436,8 @@ task GermlineCNVCallerCaseMode {
}

output {
File gcnv_calls_tar = "case-gcnv-calls-${scatter_index}.tar.gz"
File gcnv_tracking_tar = "case-gcnv-tracking-${scatter_index}.tar.gz"
Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-${scatter_index}-sample-*.tar.gz")
File gcnv_tracking_tar = "case-gcnv-tracking-shard-${scatter_index}.tar.gz"
File calling_config_json = "${output_dir_}/case-calls/calling_config.json"
File denoising_config_json = "${output_dir_}/case-calls/denoising_config.json"
File gcnvkernel_version_json = "${output_dir_}/case-calls/gcnvkernel_version.json"
Expand Down
33 changes: 25 additions & 8 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -304,12 +304,18 @@ workflow CNVGermlineCohortWorkflow {
}
}

Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars)

scatter (sample_index in range(length(CollectCounts.entity_id))) {
call CNVTasks.PostprocessGermlineCNVCalls {
input:
entity_id = CollectCounts.entity_id[sample_index],
gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar,
gcnv_calls_tars = call_tars_sample_by_shard[sample_index],
gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar,
calling_configs = GermlineCNVCallerCohortMode.calling_config_json,
denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json,
gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json,
sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list,
contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar,
allosomal_contigs = allosomal_contigs,
ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs,
Expand All @@ -329,7 +335,7 @@ workflow CNVGermlineCohortWorkflow {
File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar
File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar
Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar
Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar
Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars
Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar
Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf
Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf
Expand Down Expand Up @@ -470,6 +476,9 @@ task GermlineCNVCallerCohortMode {

# If optional output_dir not specified, use "out"
String output_dir_ = select_first([output_dir, "out"])
Int num_samples = length(read_count_files)

String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819

command <<<
set -e
Expand Down Expand Up @@ -529,9 +538,17 @@ task GermlineCNVCallerCohortMode {
--caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \
--disable-annealing ${default="false" disable_annealing}

tar czf ${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model .
tar czf ${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls .
tar czf ${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking .
tar czf ${cohort_entity_id}-gcnv-model-shard-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model .
tar czf ${cohort_entity_id}-gcnv-tracking-shard-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking .

CURRENT_SAMPLE=0
NUM_SAMPLES=${num_samples}
NUM_DIGITS=${dollar}{#NUM_SAMPLES}
while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do
CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${dollar}{NUM_DIGITS}d" $CURRENT_SAMPLE)
tar czf ${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE .
let CURRENT_SAMPLE=CURRENT_SAMPLE+1
done
>>>

runtime {
Expand All @@ -543,9 +560,9 @@ task GermlineCNVCallerCohortMode {
}

output {
File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz"
File gcnv_calls_tar = "${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz"
File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz"
File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-shard-${scatter_index}.tar.gz"
Array[File] gcnv_call_tars = glob("${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-*.tar.gz")
File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-shard-${scatter_index}.tar.gz"
File calling_config_json = "${output_dir_}/${cohort_entity_id}-calls/calling_config.json"
File denoising_config_json = "${output_dir_}/${cohort_entity_id}-calls/denoising_config.json"
File gcnvkernel_version_json = "${output_dir_}/${cohort_entity_id}-calls/gcnvkernel_version.json"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.CoveragePerContig;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.tools.copynumber.utils.CopyNumberUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
Expand Down Expand Up @@ -385,20 +386,20 @@ private boolean executeDeterminePloidyAndDepthPythonScript(final File samplesByC
: outputDir + File.separator; //add trailing slash if necessary
//note that the samples x coverage-by-contig table is referred to as "metadata" by gcnvkernel
final List<String> arguments = new ArrayList<>(Arrays.asList(
"--sample_coverage_metadata=" + samplesByCoveragePerContigFile.getAbsolutePath(),
"--output_calls_path=" + outputDirArg + outputPrefix + CALLS_PATH_SUFFIX));
"--sample_coverage_metadata=" + CopyNumberUtils.getCanonicalPath(samplesByCoveragePerContigFile),
"--output_calls_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + CALLS_PATH_SUFFIX)));
arguments.addAll(germlineContigPloidyModelArgumentCollection.generatePythonArguments(runMode));
arguments.addAll(germlineContigPloidyHybridADVIArgumentCollection.generatePythonArguments());

final String script;
if (runMode == RunMode.COHORT) {
script = COHORT_DETERMINE_PLOIDY_AND_DEPTH_PYTHON_SCRIPT;
arguments.add("--interval_list=" + intervalsFile.getAbsolutePath());
arguments.add("--contig_ploidy_prior_table=" + inputContigPloidyPriorsFile.getAbsolutePath());
arguments.add("--output_model_path=" + outputDirArg + outputPrefix + MODEL_PATH_SUFFIX);
arguments.add("--interval_list=" + CopyNumberUtils.getCanonicalPath(intervalsFile));
samuelklee marked this conversation as resolved.
Show resolved Hide resolved
arguments.add("--contig_ploidy_prior_table=" + CopyNumberUtils.getCanonicalPath(inputContigPloidyPriorsFile));
arguments.add("--output_model_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + MODEL_PATH_SUFFIX));
} else {
script = CASE_DETERMINE_PLOIDY_AND_DEPTH_PYTHON_SCRIPT;
arguments.add("--input_model_path=" + inputModelDir);
arguments.add("--input_model_path=" + CopyNumberUtils.getCanonicalPath(inputModelDir));
}
return executor.executeScript(
new Resource(script, GermlineCNVCaller.class),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.tools.copynumber.utils.CopyNumberUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
Expand Down Expand Up @@ -406,21 +407,21 @@ private boolean executeGermlineCNVCallerPythonScript(final List<File> intervalSu

//add required arguments
final List<String> arguments = new ArrayList<>(Arrays.asList(
"--ploidy_calls_path=" + inputContigPloidyCallsDir,
"--output_calls_path=" + outputDirArg + outputPrefix + CALLS_PATH_SUFFIX,
"--output_tracking_path=" + outputDirArg + outputPrefix + TRACKING_PATH_SUFFIX));
"--ploidy_calls_path=" + CopyNumberUtils.getCanonicalPath(inputContigPloidyCallsDir),
"--output_calls_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + CALLS_PATH_SUFFIX),
"--output_tracking_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + TRACKING_PATH_SUFFIX)));

//if a model path is given, add it to the argument (both COHORT and CASE modes)
if (inputModelDir != null) {
arguments.add("--input_model_path=" + inputModelDir);
arguments.add("--input_model_path=" + CopyNumberUtils.getCanonicalPath(inputModelDir));
}

final String script;
if (runMode == RunMode.COHORT) {
script = COHORT_DENOISING_CALLING_PYTHON_SCRIPT;
//these are the annotated intervals, if provided
arguments.add("--modeling_interval_list=" + specifiedIntervalsFile.getAbsolutePath());
arguments.add("--output_model_path=" + outputDirArg + outputPrefix + MODEL_PATH_SUFFIX);
arguments.add("--modeling_interval_list=" + CopyNumberUtils.getCanonicalPath(specifiedIntervalsFile));
arguments.add("--output_model_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + MODEL_PATH_SUFFIX));
if (inputAnnotatedIntervalsFile != null) {
arguments.add("--enable_explicit_gc_bias_modeling=True");
} else {
Expand All @@ -432,7 +433,7 @@ private boolean executeGermlineCNVCallerPythonScript(final List<File> intervalSu
}

arguments.add("--read_count_tsv_files");
arguments.addAll(intervalSubsetReadCountFiles.stream().map(File::getAbsolutePath).collect(Collectors.toList()));
arguments.addAll(intervalSubsetReadCountFiles.stream().map(CopyNumberUtils::getCanonicalPath).collect(Collectors.toList()));

arguments.addAll(germlineDenoisingModelArgumentCollection.generatePythonArguments(runMode));
arguments.addAll(germlineCallingArgumentCollection.generatePythonArguments(runMode));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.broadinstitute.hellbender.tools.copynumber.utils;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor;

import java.io.File;
import java.io.IOException;

/**
 * Static helper methods shared by the copy-number tools. Not instantiable.
 */
public final class CopyNumberUtils {
    private CopyNumberUtils() {}

    /**
     * Resolves the canonical path of a file.
     *
     * File paths that are passed to {@link PythonScriptExecutor} must be canonical (rather than absolute).
     * See https://github.com/broadinstitute/gatk/issues/4724.
     *
     * @param file  file whose canonical path is requested; checked with {@link Utils#nonNull}
     * @return      the result of {@link File#getCanonicalPath()} for {@code file}
     * @throws UserException.BadInput if the canonical path cannot be resolved; the underlying
     *                                {@link IOException} is attached as the cause
     */
    public static String getCanonicalPath(final File file) {
        Utils.nonNull(file);
        try {
            return file.getCanonicalPath();
        } catch (final IOException e) {
            // keep the original IOException as the cause so the reason for the failure is not lost
            throw new UserException.BadInput(String.format("Could not resolve a canonical file path: %s", file), e);
        }
    }

    /**
     * Resolves the canonical path of the file with the given name; delegates to
     * {@link #getCanonicalPath(File)}.
     *
     * File paths that are passed to {@link PythonScriptExecutor} must be canonical (rather than absolute).
     * See https://github.com/broadinstitute/gatk/issues/4724.
     *
     * @param filename  name of the file whose canonical path is requested; checked with {@link Utils#nonEmpty}
     * @return          the canonical path of {@code new File(filename)}
     * @throws UserException.BadInput if the canonical path cannot be resolved
     */
    public static String getCanonicalPath(final String filename) {
        Utils.nonEmpty(filename);
        return getCanonicalPath(new File(filename));
    }
}