Skip to content

Commit

Permalink
Put PGEN deliverable (instead of intermediate) files in output_gcs_dir [VS-431] (#8919)
Browse files Browse the repository at this point in the history
  • Loading branch information
rsasch authored Jul 19, 2024
1 parent c1cc9fd commit afdd774
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ workflows:
- master
- ah_var_store
- EchoCallset
- vs_1412_pgen_pvars_not_compressed
- rsa_vs_1431
- name: MergePgenHierarchicalWdl
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/MergePgenHierarchical.wdl
Expand Down
43 changes: 15 additions & 28 deletions scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -130,13 +130,11 @@ workflow GvsExtractCallsetPgen {
}

Int effective_scatter_count = if defined(scatter_count) then select_first([scatter_count])
else if GetNumSamplesLoaded.num_samples < 100 then 100 # Quickstart
else if GetNumSamplesLoaded.num_samples < 1000 then 500
else if GetNumSamplesLoaded.num_samples < 5000 then 1000
else if GetNumSamplesLoaded.num_samples < 20000 then 2000 # Stroke Anderson
else if GetNumSamplesLoaded.num_samples < 50000 then 10000
else if GetNumSamplesLoaded.num_samples < 100000 then 20000 # Charlie
else 34000
else if GetNumSamplesLoaded.num_samples < 5000 then 1
else if GetNumSamplesLoaded.num_samples < 20000 then 2000 # Stroke Anderson
else if GetNumSamplesLoaded.num_samples < 50000 then 10000
else if GetNumSamplesLoaded.num_samples < 100000 then 20000 # Charlie
else 34000

Int effective_split_intervals_disk_size_override = select_first([split_intervals_disk_size_override,
if GetNumSamplesLoaded.num_samples < 100 then 50 # Quickstart
Expand Down Expand Up @@ -166,7 +164,8 @@ workflow GvsExtractCallsetPgen {
split_intervals_mem_override = split_intervals_mem_override,
gatk_docker = effective_gatk_docker,
gatk_override = gatk_override,
}
output_gcs_dir = output_gcs_dir,
}

call Utils.GetBQTableLastModifiedDatetime as FilterSetInfoTimestamp {
input:
Expand Down Expand Up @@ -250,7 +249,6 @@ workflow GvsExtractCallsetPgen {
drop_state = drop_state,
output_pgen_basename = pgen_basename,
zero_pad_output_pgen_filenames = zero_pad_output_pgen_filenames,
output_gcs_dir = output_gcs_dir,
max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
extract_preemptible_override = extract_preemptible_override,
extract_maxretries_override = extract_maxretries_override,
Expand All @@ -265,6 +263,7 @@ workflow GvsExtractCallsetPgen {
call SumBytes {
input:
file_sizes_bytes = flatten([PgenExtractTask.output_pgen_bytes, PgenExtractTask.output_pvar_bytes, PgenExtractTask.output_psam_bytes]),
output_gcs_dir = output_gcs_dir,
cloud_sdk_docker = effective_cloud_sdk_docker,
}

Expand Down Expand Up @@ -346,7 +345,6 @@ task PgenExtractTask {
String read_project_id
String output_pgen_basename
Boolean zero_pad_output_pgen_filenames
String? output_gcs_dir

String cost_observability_tablename = "cost_observability"

Expand Down Expand Up @@ -452,9 +450,6 @@ task PgenExtractTask {
--allow-empty-pgen


# Drop trailing slash if one exists
OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//')

OUTPUT_FILE_BYTES="$(du -b ~{output_pgen_basename}.pgen | cut -f1)"
echo ${OUTPUT_FILE_BYTES} > pgen_bytes.txt

Expand All @@ -464,18 +459,9 @@ task PgenExtractTask {
OUTPUT_FILE_PSAM_BYTES="$(du -b ~{output_pgen_basename}.psam | cut -f1)"
echo ${OUTPUT_FILE_PSAM_BYTES} > psam_bytes.txt

if [ -n "${OUTPUT_GCS_DIR}" ]; then
gsutil cp ~{output_pgen_basename}.pgen ${OUTPUT_GCS_DIR}/
gsutil cp ~{output_pgen_basename}.pvar.zst ${OUTPUT_GCS_DIR}/
gsutil cp ~{output_pgen_basename}.psam ${OUTPUT_GCS_DIR}/
OUTPUT_FILE_DEST="${OUTPUT_GCS_DIR}/~{output_pgen_basename}.pgen"
OUTPUT_FILE_PVAR_DEST="${OUTPUT_GCS_DIR}/~{output_pgen_basename}.pvar.zst"
OUTPUT_FILE_PSAM_DEST="${OUTPUT_GCS_DIR}/~{output_pgen_basename}.psam"
else
OUTPUT_FILE_DEST="~{output_pgen_basename}.pgen"
OUTPUT_FILE_PVAR_DEST="~{output_pgen_basename}.pvar.zst"
OUTPUT_FILE_PSAM_DEST="~{output_pgen_basename}.psam"
fi
OUTPUT_FILE_DEST="~{output_pgen_basename}.pgen"
OUTPUT_FILE_PVAR_DEST="~{output_pgen_basename}.pvar.zst"
OUTPUT_FILE_PSAM_DEST="~{output_pgen_basename}.psam"

# Parent Task will collect manifest lines and create a joined file
# Currently, the schema is `[interval_number], [output_file_location], [output_file_size_bytes], [output_file_pvar_location], [output_file_pvar_size_bytes], [output_file_psam_location], [output_file_psam_size_bytes]`
Expand Down Expand Up @@ -510,6 +496,7 @@ task SumBytes {
input {
Array[Float] file_sizes_bytes
String cloud_sdk_docker
String? output_gcs_dir
}
meta {
# Not `volatile: true` since there shouldn't be a need to re-run this if there has already been a successful execution.
Expand Down Expand Up @@ -561,7 +548,7 @@ task CreateManifest {
OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//')

if [ -n "$OUTPUT_GCS_DIR" ]; then
gsutil cp manifest.txt ${OUTPUT_GCS_DIR}/
gsutil cp manifest.txt ${OUTPUT_GCS_DIR}/
fi
>>>
output {
Expand Down Expand Up @@ -608,7 +595,7 @@ task GenerateSampleListFile {
'SELECT sample_name FROM `~{fq_samples_to_extract_table}`' | sed 1d > sample-name-list.txt

if [ -n "$OUTPUT_GCS_DIR" ]; then
gsutil cp sample-name-list.txt ${OUTPUT_GCS_DIR}/
gsutil cp sample-name-list.txt ${OUTPUT_GCS_DIR}/
fi
>>>
output {
Expand All @@ -622,4 +609,4 @@ task GenerateSampleListFile {
preemptible: 3
cpu: 1
}
}
}
6 changes: 4 additions & 2 deletions scripts/variantstore/wdl/GvsExtractCallsetPgenMerged.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ workflow GvsExtractCallsetPgenMerged {
x_bed_weight_scaling = x_bed_weight_scaling,
y_bed_weight_scaling = y_bed_weight_scaling,
write_cost_to_db = write_cost_to_db,
output_gcs_dir = output_gcs_dir,
}

call SplitFilesByChromosome {
Expand Down Expand Up @@ -145,6 +146,7 @@ workflow GvsExtractCallsetPgenMerged {
split_count = split_count,
zero_padded_prefix = zero_pad_output_pgen_filenames,
variants_docker = effective_variants_docker,
output_gcs_dir = output_gcs_dir,
}
}

Expand Down Expand Up @@ -206,12 +208,12 @@ task SplitFilesByChromosome {
Array[File] pvar_lists = glob("*.pvar_list")
Array[File] psam_lists = glob("*.psam_list")
}

runtime {
docker: "ubuntu:20.04"
memory: "1 GB"
disks: "local-disk ${disk_size} HDD"
cpu: "1"
preemptible: 1
}
}
}
17 changes: 14 additions & 3 deletions scripts/variantstore/wdl/MergePgenHierarchical.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ workflow MergePgenWorkflow {
Int split_count
Boolean zero_padded_prefix
String variants_docker
String? output_gcs_dir
}

call SortFileLists {
Expand Down Expand Up @@ -66,7 +67,8 @@ workflow MergePgenWorkflow {
plink_docker = plink_docker,
output_file_base_name = output_file_base_name,
threads = threads,
disk_in_gb = merge_disk_size
disk_in_gb = merge_disk_size,
output_gcs_dir = output_gcs_dir,
}

output {
Expand All @@ -84,6 +86,7 @@ task MergePgen {
String pgen_chromosome_code
String plink_docker
String output_file_base_name
String? output_gcs_dir
Int threads = 1
Int disk_in_gb
}
Expand All @@ -95,6 +98,9 @@ task MergePgen {
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace

# Drop trailing slash if one exists
OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//')

# Download files using gsutil
mkdir pgen_dir
cat ~{pgen_list} | gsutil -m cp -I pgen_dir
Expand Down Expand Up @@ -149,6 +155,11 @@ task MergePgen {
plink2 --zst-decompress ~{output_file_base_name}.pvar.zst > ~{output_file_base_name}.pvar
fi

if [ -n "${OUTPUT_GCS_DIR}" ]; then
gsutil cp ~{output_file_base_name}.pgen ${OUTPUT_GCS_DIR}/
gsutil cp ~{output_file_base_name}.pvar ${OUTPUT_GCS_DIR}/
gsutil cp ~{output_file_base_name}.psam ${OUTPUT_GCS_DIR}/
fi
>>>

output {
Expand Down Expand Up @@ -185,7 +196,7 @@ task MakeFileLists {
localization_optional: true
}
}

meta {
# This causes issues when call cached for some reason, so we don't want to do that
volatile: true
Expand Down Expand Up @@ -339,4 +350,4 @@ task SplitFileLists {
bootDiskSizeGb: 15
noAddress: true
}
}
}

0 comments on commit afdd774

Please sign in to comment.