@@ -5,6 +5,7 @@ from scripts.common import (
5
5
joint_option
6
6
)
7
7
8
+
8
9
# Helper functions for tumor, normal pairs
9
10
def get_normal_recal_bam (wildcards ):
10
11
"""
@@ -797,7 +798,6 @@ rule deepsomatic_make_examples:
797
798
normal_name_option = lambda w : "--sample_name_normal {0}" .format (
798
799
tumor2normal [w .name ]
799
800
) if tumor2normal [w .name ] else "" ,
800
- message : "Running DeepSomatic make_examples on '{input.tumor}' input file"
801
801
threads : int (allocated ("threads" , "deepsomatic_make_examples" , cluster ))
802
802
container : config ['images' ]['deepsomatic' ]
803
803
envmodules : config ['tools' ]['deepsomatic' ]
@@ -849,76 +849,176 @@ rule deepsomatic_make_examples:
849
849
&& touch {output.success}
850
850
"""
851
851
852
-
853
- rule deepsomatic_call_variants :
854
- """
855
- Data processing step to call somatic variants using deep neural
856
- network. The make_examples step prepares the input data for the
857
- deepsomatic's CNN. DeepSomatic is an extension of deep learning-
858
- based variant caller Deepvariant. It is composed of multiple steps
859
- that takes aligned reads (in BAM or CRAM format), produces pileup
860
- image tensors from them, classifies each tensor using a convolutional
861
- neural network, and finally reports the results in a standard VCF or
862
- gVCF file. This rule is the first step in the deepsomatic pipeline:
863
- 1. make_examples (CPU, parallelizable with gnu-parallel)
864
- * 2. call_variants (GPU, use a GPU node)
865
- 3. postprocess_variants (CPU, single-threaded)
866
- Running deepsomatic in a single step using run_deepsomatic is not
867
- optimal for large-scale projects as it will consume resources very
868
- inefficently. As so, it is better to run the 1st/3rd step on a
869
- compute node and run the 2nd step on a GPU node.
870
- @Input:
871
- Flag file to indicate success of make_examples (scatter)
872
- @Output:
873
- Per sample call_variants tensorflow records file
874
- """
875
- input :
876
- success = join (workpath , "deepsomatic" , "mk_examples" , "{name}.make_examples.success" ),
877
- output :
878
- callvar = join (workpath , "deepsomatic" , "call_variants" , "{name}.call_variants.tfrecord.gz" ),
879
- params :
880
- rname = "ds_callvars" ,
881
- genome = config ['references' ]['GENOME' ],
882
- tmpdir = tmpdir ,
883
- # NOTE: There BE dragons here!
884
- # We need allocation info from make_examples rule
885
- # to determine the number of shards that were
886
- # used in the make_examples step, this is used
887
- # to resolve a dependency file of this rule,
888
- # which is the examples tf record file produced by
889
- # make_examples. This file gets passed to the
890
- # --examples option of call_variants.
891
- example = lambda w : join (workpath , "deepsomatic" , "mk_examples" , "{0}.make_examples.tfrecord@{1}.gz" .format (
892
- w .name ,
893
- int (allocated ("threads" , "deepsomatic_make_examples" , cluster ))
894
- )),
895
- # TODO: add option --ffpe option to pipeline, that
896
- # selects either the ffpe_wgs or ffpe_wes checkpoints.
897
- # Building option for checkpoint file (assumes TN-pairs and
898
- # non-FFPE samples), where:
899
- # @WES = "/opt/models/deepsomatic/wes"
900
- # @WGS = "/opt/models/deepsomatic/wgs"
901
- ckpt = lambda _ : "/opt/models/deepsomatic/wes" if run_wes else "/opt/models/deepsomatic/wgs" ,
902
- message : "Running deepsomatic call_variants on '{wildcards.name}' sample"
903
- threads : int (allocated ("threads" , "deepsomatic_call_variants" , cluster ))
904
- container : config ['images' ]['deepsomatic' ]
905
- envmodules : config ['tools' ]['deepsomatic' ]
906
- shell : """
907
- # Setups temporary directory for
908
- # intermediate files with built-in
909
- # mechanism for deletion on exit
910
- if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
911
- tmp=$(mktemp -d -p "{params.tmpdir}")
912
- trap 'rm -rf "${{tmp}}"' EXIT
913
- echo "Using tmpdir: ${{tmp}}"
914
- export TMPDIR="${{tmp}}"
915
-
916
- # Run DeepSomatic call_variants
917
- time call_variants \\
918
- --outfile {output.callvar} \\
919
- --examples {params.example} \\
920
- --checkpoint {params.ckpt}
921
- """
852
+ if use_gpus :
853
+ # Use GPU-acceleration to speed up
854
+ # the second step in deepsomatic
855
+ rule deepsomatic_call_variants_gpu :
856
+ """
857
+ Data processing step to call somatic variants using deep neural
858
+ network. The make_examples step prepares the input data for the
859
+ deepsomatic's CNN. DeepSomatic is an extension of deep learning-
860
+ based variant caller Deepvariant. It is composed of multiple steps
861
+ that takes aligned reads (in BAM or CRAM format), produces pileup
862
+ image tensors from them, classifies each tensor using a convolutional
863
+ neural network, and finally reports the results in a standard VCF or
864
+ gVCF file. This rule is the first step in the deepsomatic pipeline:
865
+ 1. make_examples (CPU, parallelizable with gnu-parallel)
866
+ * 2. call_variants (GPU, use a GPU node)
867
+ 3. postprocess_variants (CPU, single-threaded)
868
+ Running deepsomatic in a single step using run_deepsomatic is not
869
+ optimal for large-scale projects as it will consume resources very
870
+ inefficently. As so, it is better to run the 1st/3rd step on a
871
+ compute node and run the 2nd step on a GPU node.
872
+ NOTE: When deepsomatic is run on a GPU/TPU, it will scatter the
873
+ writing of the output *.call_variants.tfrecord.gz across a pool
874
+ of processes (by default, --writer_threads 16). This causes causes
875
+ the final output file to be different if you are running DeepSomatic
876
+ on a CPU versus GPU.
877
+ @Input:
878
+ Flag file to indicate success of make_examples (scatter)
879
+ @Output:
880
+ Flag file to indicate success of call_variants,
881
+ actually produces (given 16 writer threads):
882
+ {name}.call_variants-00000-of-00016.tfrecord.gz, ...
883
+ {name}.call_variants-00015-of-00016.tfrecord.gz
884
+ """
885
+ input :
886
+ success = join (workpath , "deepsomatic" , "mk_examples" , "{name}.make_examples.success" ),
887
+ output :
888
+ success = join (workpath , "deepsomatic" , "call_variants" , "{name}.cv.success" ),
889
+ params :
890
+ rname = "ds_callvars_gpu" ,
891
+ genome = config ['references' ]['GENOME' ],
892
+ # Singularity options
893
+ sif = config ['images' ]['deepsomatic_gpu' ],
894
+ bindpaths = ',' .join (bindpath ),
895
+ tmpdir = tmpdir ,
896
+ # NOTE: There BE dragons here!
897
+ # We need allocation info from make_examples rule
898
+ # to determine the number of shards that were
899
+ # used in the make_examples step, this is used
900
+ # to resolve a dependency file of this rule,
901
+ # which is the examples tf record file produced by
902
+ # make_examples. This file gets passed to the
903
+ # --examples option of call_variants.
904
+ example = lambda w : join (workpath , "deepsomatic" , "mk_examples" , "{0}.make_examples.tfrecord@{1}.gz" .format (
905
+ w .name ,
906
+ int (allocated ("threads" , "deepsomatic_make_examples" , cluster ))
907
+ )),
908
+ callvar = join (workpath , "deepsomatic" , "call_variants" , "{name}.call_variants.tfrecord.gz" ),
909
+ # TODO: add option --ffpe option to pipeline, that
910
+ # selects either the ffpe_wgs or ffpe_wes checkpoints.
911
+ # Building option for checkpoint file (assumes TN-pairs and
912
+ # non-FFPE samples), where:
913
+ # @WES = "/opt/models/deepsomatic/wes"
914
+ # @WGS = "/opt/models/deepsomatic/wgs"
915
+ ckpt = lambda _ : "/opt/models/deepsomatic/wes" if run_wes else "/opt/models/deepsomatic/wgs" ,
916
+ threads : max (int (allocated ("threads" , "deepsomatic_call_variants_gpu" , cluster )) - 2 , 4 ),
917
+ envmodules : config ['tools' ]['deepsomatic' ]
918
+ shell : """
919
+ # Setups temporary directory for
920
+ # intermediate files with built-in
921
+ # mechanism for deletion on exit
922
+ if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
923
+ tmp=$(mktemp -d -p "{params.tmpdir}")
924
+ trap 'rm -rf "${{tmp}}"' EXIT
925
+ echo "Using tmpdir: ${{tmp}}"
926
+ export TMPDIR="${{tmp}}"
927
+
928
+ # Run DeepSomatic call_variants
929
+ # using a GPU acceleration
930
+ singularity exec \\
931
+ -c \\
932
+ --nv \\
933
+ -B {params.bindpaths},${{tmp}}:/tmp \\
934
+ {params.sif} /bin/bash -c \\
935
+ 'time call_variants \\
936
+ --outfile {params.callvar} \\
937
+ --examples {params.example} \\
938
+ --checkpoint {params.ckpt} \\
939
+ --writer_threads {threads}'
940
+ touch "{output.success}"
941
+ """
942
+ else :
943
+ # Use CPU accelerated version for the
944
+ # second step in deepsomatic
945
+ rule deepsomatic_call_variants_cpu :
946
+ """
947
+ Data processing step to call somatic variants using deep neural
948
+ network. The make_examples step prepares the input data for the
949
+ deepsomatic's CNN. DeepSomatic is an extension of deep learning-
950
+ based variant caller Deepvariant. It is composed of multiple steps
951
+ that takes aligned reads (in BAM or CRAM format), produces pileup
952
+ image tensors from them, classifies each tensor using a convolutional
953
+ neural network, and finally reports the results in a standard VCF or
954
+ gVCF file. This rule is the first step in the deepsomatic pipeline:
955
+ 1. make_examples (CPU, parallelizable with gnu-parallel)
956
+ * 2. call_variants (CPU, multi-threaded)
957
+ 3. postprocess_variants (CPU, single-threaded)
958
+ Running deepsomatic in a single step using run_deepsomatic is not
959
+ optimal for large-scale projects as it will consume resources very
960
+ inefficently. As so, it is better to run the 1st/3rd step on a
961
+ compute node and run the 2nd step on a GPU node.
962
+ NOTE: There be dragens here! When deepsomatic is run on a GPU/TPU,
963
+ it will scatter the writing of the output *.call_variants.tfrecord.gz
964
+ across a pool of processes (by default, --writer_threads 16). This
965
+ causes causes the final output file to be different if you are
966
+ running DeepSomatic on a CPU versus GPU.
967
+ @Input:
968
+ Flag file to indicate success of make_examples (scatter)
969
+ @Output:
970
+ Flag file to indicate success of call_variants,
971
+ actually produces:
972
+ {name}.call_variants.tfrecord.gz
973
+ """
974
+ input :
975
+ success = join (workpath , "deepsomatic" , "mk_examples" , "{name}.make_examples.success" ),
976
+ output :
977
+ success = join (workpath , "deepsomatic" , "call_variants" , "{name}.cv.success" ),
978
+ params :
979
+ rname = "ds_callvars_cpu" ,
980
+ genome = config ['references' ]['GENOME' ],
981
+ tmpdir = tmpdir ,
982
+ # NOTE: There BE dragons here!
983
+ # We need allocation info from make_examples rule
984
+ # to determine the number of shards that were
985
+ # used in the make_examples step, this is used
986
+ # to resolve a dependency file of this rule,
987
+ # which is the examples tf record file produced by
988
+ # make_examples. This file gets passed to the
989
+ # --examples option of call_variants.
990
+ example = lambda w : join (workpath , "deepsomatic" , "mk_examples" , "{0}.make_examples.tfrecord@{1}.gz" .format (
991
+ w .name ,
992
+ int (allocated ("threads" , "deepsomatic_make_examples" , cluster ))
993
+ )),
994
+ callvar = join (workpath , "deepsomatic" , "call_variants" , "{name}.call_variants.tfrecord.gz" ),
995
+ # TODO: add option --ffpe option to pipeline, that
996
+ # selects either the ffpe_wgs or ffpe_wes checkpoints.
997
+ # Building option for checkpoint file (assumes TN-pairs and
998
+ # non-FFPE samples), where:
999
+ # @WES = "/opt/models/deepsomatic/wes"
1000
+ # @WGS = "/opt/models/deepsomatic/wgs"
1001
+ ckpt = lambda _ : "/opt/models/deepsomatic/wes" if run_wes else "/opt/models/deepsomatic/wgs" ,
1002
+ threads : int (allocated ("threads" , "deepsomatic_call_variants_cpu" , cluster ))
1003
+ container : config ['images' ]['deepsomatic' ]
1004
+ envmodules : config ['tools' ]['deepsomatic' ]
1005
+ shell : """
1006
+ # Setups temporary directory for
1007
+ # intermediate files with built-in
1008
+ # mechanism for deletion on exit
1009
+ if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
1010
+ tmp=$(mktemp -d -p "{params.tmpdir}")
1011
+ trap 'rm -rf "${{tmp}}"' EXIT
1012
+ echo "Using tmpdir: ${{tmp}}"
1013
+ export TMPDIR="${{tmp}}"
1014
+
1015
+ # Run CPU DeepSomatic call_variants
1016
+ time call_variants \\
1017
+ --outfile {params.callvar} \\
1018
+ --examples {params.example} \\
1019
+ --checkpoint {params.ckpt}
1020
+ touch "{output.success}"
1021
+ """
922
1022
923
1023
924
1024
rule deepsomatic_postprocess_variants :
@@ -938,20 +1038,29 @@ rule deepsomatic_postprocess_variants:
938
1038
optimal for large-scale projects as it will consume resources very
939
1039
inefficently. As so, it is better to run the 1st/3rd step on a
940
1040
compute node and run the 2nd step on a GPU node.
1041
+ NOTE: There be dragens here! Deepsomatic will generate a different
1042
+ set of output files at the call_variants steps if it is run on a
1043
+ CPU versus a GPU. A flag file is used to indicate this step was
1044
+ successful and the actual sharded/non-shared file (which is input
1045
+ to this step) is resolved in the params section. Please note this
1046
+ file will not actually exist if call_variants was run with a GPU.
1047
+ Looking at their source code, it appears deepsomatic has some logic
1048
+ to detect if a sharded writer was used in the previous step, and it
1049
+ will read in the set of sharded call_variants files without issues.
941
1050
@Input:
942
1051
Per-sample call_variants tensorflow records file (scatter)
943
1052
@Output:
944
- Single-sample gVCF file with called variants
1053
+ Single-sample VCF file with called variants
945
1054
"""
946
1055
input :
947
- callvar = join (workpath , "deepsomatic" , "call_variants" , "{name}.call_variants.tfrecord.gz " ),
1056
+ success = join (workpath , "deepsomatic" , "call_variants" , "{name}.cv.success " ),
948
1057
output :
949
1058
vcf = join (workpath , "deepsomatic" , "somatic" , "{name}.deepsomatic.vcf" ),
950
1059
params :
951
- rname = "ds_postprovars" ,
952
- genome = config ['references' ]['GENOME' ],
953
- tmpdir = tmpdir ,
954
- message : "Running DeepSomatic postprocess_variants on '{input. callvar}' input file"
1060
+ rname = "ds_postprovars" ,
1061
+ genome = config ['references' ]['GENOME' ],
1062
+ tmpdir = tmpdir ,
1063
+ callvar = join ( workpath , "deepsomatic" , "call_variants" , "{name}.call_variants.tfrecord.gz" ),
955
1064
threads : int (allocated ("threads" , "deepsomatic_postprocess_variants" , cluster ))
956
1065
container : config ['images' ]['deepsomatic' ]
957
1066
envmodules : config ['tools' ]['deepsomatic' ]
@@ -968,9 +1077,10 @@ rule deepsomatic_postprocess_variants:
968
1077
# Run DeepSomatic postprocess_variants
969
1078
time postprocess_variants \\
970
1079
--ref {params.genome} \\
971
- --infile {input .callvar} \\
1080
+ --infile {params .callvar} \\
972
1081
--outfile {output.vcf} \\
973
- --process_somatic=true
1082
+ --process_somatic=true \\
1083
+ --cpus={threads}
974
1084
"""
975
1085
976
1086
0 commit comments