From cc4f379a9a3851f045e12d1ad5af6453f44036e3 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Mon, 10 Sep 2018 15:15:05 -0400 Subject: [PATCH 1/2] Added canonicalization of all file paths passed to gCNV python scripts. --- .../DetermineGermlineContigPloidy.java | 13 +++---- .../tools/copynumber/GermlineCNVCaller.java | 15 ++++---- .../copynumber/utils/CopyNumberUtils.java | 34 +++++++++++++++++++ 3 files changed, 49 insertions(+), 13 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/CopyNumberUtils.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java index dde103aac67..36225a57989 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java @@ -18,6 +18,7 @@ import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata; import org.broadinstitute.hellbender.tools.copynumber.formats.records.CoveragePerContig; import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; +import org.broadinstitute.hellbender.tools.copynumber.utils.CopyNumberUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.io.IOUtils; @@ -385,20 +386,20 @@ private boolean executeDeterminePloidyAndDepthPythonScript(final File samplesByC : outputDir + File.separator; //add trailing slash if necessary //note that the samples x coverage-by-contig table is referred to as "metadata" by gcnvkernel final List arguments = new ArrayList<>(Arrays.asList( - "--sample_coverage_metadata=" + samplesByCoveragePerContigFile.getAbsolutePath(), - "--output_calls_path=" + outputDirArg + outputPrefix + 
CALLS_PATH_SUFFIX)); + "--sample_coverage_metadata=" + CopyNumberUtils.getCanonicalPath(samplesByCoveragePerContigFile), + "--output_calls_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + CALLS_PATH_SUFFIX))); arguments.addAll(germlineContigPloidyModelArgumentCollection.generatePythonArguments(runMode)); arguments.addAll(germlineContigPloidyHybridADVIArgumentCollection.generatePythonArguments()); final String script; if (runMode == RunMode.COHORT) { script = COHORT_DETERMINE_PLOIDY_AND_DEPTH_PYTHON_SCRIPT; - arguments.add("--interval_list=" + intervalsFile.getAbsolutePath()); - arguments.add("--contig_ploidy_prior_table=" + inputContigPloidyPriorsFile.getAbsolutePath()); - arguments.add("--output_model_path=" + outputDirArg + outputPrefix + MODEL_PATH_SUFFIX); + arguments.add("--interval_list=" + CopyNumberUtils.getCanonicalPath(intervalsFile)); + arguments.add("--contig_ploidy_prior_table=" + CopyNumberUtils.getCanonicalPath(inputContigPloidyPriorsFile)); + arguments.add("--output_model_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + MODEL_PATH_SUFFIX)); } else { script = CASE_DETERMINE_PLOIDY_AND_DEPTH_PYTHON_SCRIPT; - arguments.add("--input_model_path=" + inputModelDir); + arguments.add("--input_model_path=" + CopyNumberUtils.getCanonicalPath(inputModelDir)); } return executor.executeScript( new Resource(script, GermlineCNVCaller.class), diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/GermlineCNVCaller.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/GermlineCNVCaller.java index bc13db70c93..3498deaccc5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/GermlineCNVCaller.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/GermlineCNVCaller.java @@ -13,6 +13,7 @@ import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import 
org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; +import org.broadinstitute.hellbender.tools.copynumber.utils.CopyNumberUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.io.IOUtils; @@ -406,21 +407,21 @@ private boolean executeGermlineCNVCallerPythonScript(final List intervalSu //add required arguments final List arguments = new ArrayList<>(Arrays.asList( - "--ploidy_calls_path=" + inputContigPloidyCallsDir, - "--output_calls_path=" + outputDirArg + outputPrefix + CALLS_PATH_SUFFIX, - "--output_tracking_path=" + outputDirArg + outputPrefix + TRACKING_PATH_SUFFIX)); + "--ploidy_calls_path=" + CopyNumberUtils.getCanonicalPath(inputContigPloidyCallsDir), + "--output_calls_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + CALLS_PATH_SUFFIX), + "--output_tracking_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + TRACKING_PATH_SUFFIX))); //if a model path is given, add it to the argument (both COHORT and CASE modes) if (inputModelDir != null) { - arguments.add("--input_model_path=" + inputModelDir); + arguments.add("--input_model_path=" + CopyNumberUtils.getCanonicalPath(inputModelDir)); } final String script; if (runMode == RunMode.COHORT) { script = COHORT_DENOISING_CALLING_PYTHON_SCRIPT; //these are the annotated intervals, if provided - arguments.add("--modeling_interval_list=" + specifiedIntervalsFile.getAbsolutePath()); - arguments.add("--output_model_path=" + outputDirArg + outputPrefix + MODEL_PATH_SUFFIX); + arguments.add("--modeling_interval_list=" + CopyNumberUtils.getCanonicalPath(specifiedIntervalsFile)); + arguments.add("--output_model_path=" + CopyNumberUtils.getCanonicalPath(outputDirArg + outputPrefix + MODEL_PATH_SUFFIX)); if (inputAnnotatedIntervalsFile != null) { 
arguments.add("--enable_explicit_gc_bias_modeling=True"); } else { @@ -432,7 +433,7 @@ private boolean executeGermlineCNVCallerPythonScript(final List intervalSu } arguments.add("--read_count_tsv_files"); - arguments.addAll(intervalSubsetReadCountFiles.stream().map(File::getAbsolutePath).collect(Collectors.toList())); + arguments.addAll(intervalSubsetReadCountFiles.stream().map(CopyNumberUtils::getCanonicalPath).collect(Collectors.toList())); arguments.addAll(germlineDenoisingModelArgumentCollection.generatePythonArguments(runMode)); arguments.addAll(germlineCallingArgumentCollection.generatePythonArguments(runMode)); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/CopyNumberUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/CopyNumberUtils.java new file mode 100644 index 00000000000..e423b464119 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/CopyNumberUtils.java @@ -0,0 +1,34 @@ +package org.broadinstitute.hellbender.tools.copynumber.utils; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; + +import java.io.File; +import java.io.IOException; + +public final class CopyNumberUtils { + private CopyNumberUtils() {} + + /** + * Returns the canonical path of {@code file}. File paths that are passed to {@link PythonScriptExecutor} must be canonical (rather than absolute). + * See https://github.com/broadinstitute/gatk/issues/4724. Throws {@link UserException.BadInput}, with the underlying {@link IOException} as its cause, if the path cannot be resolved. + */ + public static String getCanonicalPath(final File file) { + Utils.nonNull(file); + try { + return file.getCanonicalPath(); + } catch (final IOException e) { + throw new UserException.BadInput(String.format("Could not resolve a canonical file path: %s", file), e); + } + } + + /** + * File paths that are passed to {@link PythonScriptExecutor} must be canonical (rather than absolute). + * See https://github.com/broadinstitute/gatk/issues/4724. 
+ */ + public static String getCanonicalPath(final String filename) { + Utils.nonEmpty(filename); + return getCanonicalPath(new File(filename)); + } +} From 08ffd754367508fa36052001c3e49031ce2f2241 Mon Sep 17 00:00:00 2001 From: Samuel Lee Date: Mon, 19 Nov 2018 16:25:33 -0500 Subject: [PATCH 2/2] Restore array output in gCNV WDLs for efficient postprocessing. --- scripts/cnv_wdl/cnv_common_tasks.wdl | 19 +++++++++-- .../cnv_germline_case_scattered_workflow.wdl | 2 +- .../germline/cnv_germline_case_workflow.wdl | 29 ++++++++++++---- .../germline/cnv_germline_cohort_workflow.wdl | 33 ++++++++++++++----- 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl index c686e7b35a3..a46a0391981 100644 --- a/scripts/cnv_wdl/cnv_common_tasks.wdl +++ b/scripts/cnv_wdl/cnv_common_tasks.wdl @@ -342,6 +342,10 @@ task PostprocessGermlineCNVCalls { String entity_id Array[File] gcnv_calls_tars Array[File] gcnv_model_tars + Array[File] calling_configs + Array[File] denoising_configs + Array[File] gcnvkernel_version + Array[File] sharded_interval_lists File contig_ploidy_calls_tar Array[String]? 
allosomal_contigs Int ref_copy_number_autosomal_contigs @@ -370,13 +374,24 @@ task PostprocessGermlineCNVCalls { set -e export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk4_jar_override} + sharded_interval_lists_array=(${sep=" " sharded_interval_lists}) + # untar calls to CALLS_0, CALLS_1, etc directories and build the command line + # also copy over shard config and interval files gcnv_calls_tar_array=(${sep=" " gcnv_calls_tars}) + calling_configs_array=(${sep=" " calling_configs}) + denoising_configs_array=(${sep=" " denoising_configs}) + gcnvkernel_version_array=(${sep=" " gcnvkernel_version}) + sharded_interval_lists_array=(${sep=" " sharded_interval_lists}) calls_args="" for index in ${dollar}{!gcnv_calls_tar_array[@]}; do gcnv_calls_tar=${dollar}{gcnv_calls_tar_array[$index]} - mkdir CALLS_$index - tar xzf $gcnv_calls_tar -C CALLS_$index + mkdir -p CALLS_$index/SAMPLE_${sample_index} + tar xzf $gcnv_calls_tar -C CALLS_$index/SAMPLE_${sample_index} + cp ${dollar}{calling_configs_array[$index]} CALLS_$index/ + cp ${dollar}{denoising_configs_array[$index]} CALLS_$index/ + cp ${dollar}{gcnvkernel_version_array[$index]} CALLS_$index/ + cp ${dollar}{sharded_interval_lists_array[$index]} CALLS_$index/ calls_args="$calls_args --calls-shard-path CALLS_$index" done diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl index e2ef5e0c4ae..7638385a3c3 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_scattered_workflow.wdl @@ -187,7 +187,7 @@ workflow CNVGermlineCaseScatteredWorkflow { Array[Array[File]] read_counts_entity_id = CNVGermlineCaseWorkflow.read_counts_entity_id Array[Array[File]] read_counts = CNVGermlineCaseWorkflow.read_counts Array[File] contig_ploidy_calls_tars = CNVGermlineCaseWorkflow.contig_ploidy_calls_tar - Array[Array[File]] gcnv_calls_tars = 
CNVGermlineCaseWorkflow.gcnv_calls_tars + Array[Array[Array[File]]] gcnv_calls_tars = CNVGermlineCaseWorkflow.gcnv_calls_tars Array[Array[File]] gcnv_tracking_tars = CNVGermlineCaseWorkflow.gcnv_tracking_tars Array[Array[File]] genotyped_intervals_vcf = CNVGermlineCaseWorkflow.genotyped_intervals_vcf Array[Array[File]] genotyped_segments_vcf = CNVGermlineCaseWorkflow.genotyped_segments_vcf diff --git a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl index d9dde999fdb..135edc176a1 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_case_workflow.wdl @@ -211,12 +211,18 @@ workflow CNVGermlineCaseWorkflow { } } + Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCaseMode.gcnv_call_tars) + scatter (sample_index in range(length(normal_bams))) { call CNVTasks.PostprocessGermlineCNVCalls { input: entity_id = CollectCounts.entity_id[sample_index], - gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar, + gcnv_calls_tars = call_tars_sample_by_shard[sample_index], gcnv_model_tars = gcnv_model_tars, + calling_configs = GermlineCNVCallerCaseMode.calling_config_json, + denoising_configs = GermlineCNVCallerCaseMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCaseMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCaseMode.sharded_interval_list, allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar, @@ -232,7 +238,7 @@ workflow CNVGermlineCaseWorkflow { Array[File] read_counts_entity_id = CollectCounts.entity_id Array[File] read_counts = CollectCounts.counts File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCaseMode.contig_ploidy_calls_tar - Array[File] gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_calls_tar + Array[Array[File]] 
gcnv_calls_tars = GermlineCNVCallerCaseMode.gcnv_call_tars Array[File] gcnv_tracking_tars = GermlineCNVCallerCaseMode.gcnv_tracking_tar Array[File] genotyped_intervals_vcf = PostprocessGermlineCNVCalls.genotyped_intervals_vcf Array[File] genotyped_segments_vcf = PostprocessGermlineCNVCalls.genotyped_segments_vcf @@ -355,6 +361,9 @@ task GermlineCNVCallerCaseMode { # If optional output_dir not specified, use "out" String output_dir_ = select_first([output_dir, "out"]) + Int num_samples = length(read_count_files) + + String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819 command <<< set -e @@ -406,8 +415,16 @@ task GermlineCNVCallerCaseMode { --caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \ --disable-annealing ${default="false" disable_annealing} - tar czf case-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/case-calls . - tar czf case-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking . + tar czf case-gcnv-tracking-shard-${scatter_index}.tar.gz -C ${output_dir_}/case-tracking . + + CURRENT_SAMPLE=0 + NUM_SAMPLES=${num_samples} + NUM_DIGITS=${dollar}{#NUM_SAMPLES} + while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do + CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${dollar}{NUM_DIGITS}d" $CURRENT_SAMPLE) + tar czf case-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/case-calls/SAMPLE_$CURRENT_SAMPLE . 
+ let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done >>> runtime { @@ -419,8 +436,8 @@ task GermlineCNVCallerCaseMode { } output { - File gcnv_calls_tar = "case-gcnv-calls-${scatter_index}.tar.gz" - File gcnv_tracking_tar = "case-gcnv-tracking-${scatter_index}.tar.gz" + Array[File] gcnv_call_tars = glob("case-gcnv-calls-shard-${scatter_index}-sample-*.tar.gz") + File gcnv_tracking_tar = "case-gcnv-tracking-shard-${scatter_index}.tar.gz" File calling_config_json = "${output_dir_}/case-calls/calling_config.json" File denoising_config_json = "${output_dir_}/case-calls/denoising_config.json" File gcnvkernel_version_json = "${output_dir_}/case-calls/gcnvkernel_version.json" diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl index ad583e7fd14..48d165ae551 100644 --- a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl +++ b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl @@ -304,12 +304,18 @@ workflow CNVGermlineCohortWorkflow { } } + Array[Array[File]] call_tars_sample_by_shard = transpose(GermlineCNVCallerCohortMode.gcnv_call_tars) + scatter (sample_index in range(length(CollectCounts.entity_id))) { call CNVTasks.PostprocessGermlineCNVCalls { input: entity_id = CollectCounts.entity_id[sample_index], - gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar, + gcnv_calls_tars = call_tars_sample_by_shard[sample_index], gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar, + calling_configs = GermlineCNVCallerCohortMode.calling_config_json, + denoising_configs = GermlineCNVCallerCohortMode.denoising_config_json, + gcnvkernel_version = GermlineCNVCallerCohortMode.gcnvkernel_version_json, + sharded_interval_lists = GermlineCNVCallerCohortMode.sharded_interval_list, contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar, allosomal_contigs = allosomal_contigs, ref_copy_number_autosomal_contigs = ref_copy_number_autosomal_contigs, @@ -329,7 
+335,7 @@ workflow CNVGermlineCohortWorkflow { File contig_ploidy_model_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_model_tar File contig_ploidy_calls_tar = DetermineGermlineContigPloidyCohortMode.contig_ploidy_calls_tar Array[File] gcnv_model_tars = GermlineCNVCallerCohortMode.gcnv_model_tar - Array[File] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_calls_tar + Array[Array[File]] gcnv_calls_tars = GermlineCNVCallerCohortMode.gcnv_call_tars Array[File] gcnv_tracking_tars = GermlineCNVCallerCohortMode.gcnv_tracking_tar Array[File] genotyped_intervals_vcfs = PostprocessGermlineCNVCalls.genotyped_intervals_vcf Array[File] genotyped_segments_vcfs = PostprocessGermlineCNVCalls.genotyped_segments_vcf @@ -470,6 +476,9 @@ task GermlineCNVCallerCohortMode { # If optional output_dir not specified, use "out" String output_dir_ = select_first([output_dir, "out"]) + Int num_samples = length(read_count_files) + + String dollar = "$" #WDL workaround, see https://github.com/broadinstitute/cromwell/issues/1819 command <<< set -e @@ -529,9 +538,17 @@ task GermlineCNVCallerCohortMode { --caller-external-admixing-rate ${default="1.00" caller_external_admixing_rate} \ --disable-annealing ${default="false" disable_annealing} - tar czf ${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model . - tar czf ${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls . - tar czf ${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking . + tar czf ${cohort_entity_id}-gcnv-model-shard-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-model . + tar czf ${cohort_entity_id}-gcnv-tracking-shard-${scatter_index}.tar.gz -C ${output_dir_}/${cohort_entity_id}-tracking . 
+ + CURRENT_SAMPLE=0 + NUM_SAMPLES=${num_samples} + NUM_DIGITS=${dollar}{#NUM_SAMPLES} + while [ $CURRENT_SAMPLE -lt $NUM_SAMPLES ]; do + CURRENT_SAMPLE_WITH_LEADING_ZEROS=$(printf "%0${dollar}{NUM_DIGITS}d" $CURRENT_SAMPLE) + tar czf ${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-$CURRENT_SAMPLE_WITH_LEADING_ZEROS.tar.gz -C ${output_dir_}/${cohort_entity_id}-calls/SAMPLE_$CURRENT_SAMPLE . + let CURRENT_SAMPLE=CURRENT_SAMPLE+1 + done >>> runtime { @@ -543,9 +560,9 @@ task GermlineCNVCallerCohortMode { } output { - File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-${scatter_index}.tar.gz" - File gcnv_calls_tar = "${cohort_entity_id}-gcnv-calls-${scatter_index}.tar.gz" - File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-${scatter_index}.tar.gz" + File gcnv_model_tar = "${cohort_entity_id}-gcnv-model-shard-${scatter_index}.tar.gz" + Array[File] gcnv_call_tars = glob("${cohort_entity_id}-gcnv-calls-shard-${scatter_index}-sample-*.tar.gz") + File gcnv_tracking_tar = "${cohort_entity_id}-gcnv-tracking-shard-${scatter_index}.tar.gz" File calling_config_json = "${output_dir_}/${cohort_entity_id}-calls/calling_config.json" File denoising_config_json = "${output_dir_}/${cohort_entity_id}-calls/denoising_config.json" File gcnvkernel_version_json = "${output_dir_}/${cohort_entity_id}-calls/gcnvkernel_version.json"