Breaking up deepsomatic into its constituent steps

skchronicles · skchronicles · commit 57b666c2a4dc · 2024-11-05T15:24:27.000-05:00
diff --git a/config/cluster/slurm.json b/config/cluster/slurm.json
@@ -73,11 +73,30 @@
         "gres": "lscratch:512"
     },
     "deepsomatic": {
-        "threads": "24",
-        "mem": "64G",
+        "threads": "36",
+        "mem": "192G",
         "time": "1-18:00:00",
         "gres": "lscratch:750"
     },
+    "deepsomatic_make_examples": {
+        "threads": "36",
+        "mem": "96G",
+        "time": "1-00:00:00",
+        "gres": "lscratch:750"
+    },
+    "deepsomatic_call_variants": {
+        "threads": "16",
+        "mem": "60G",
+        "partition": "gpu",
+        "gres": "gpu:a100:1,lscratch:450",
+        "time": "1-00:00:00"
+    },
+    "deepsomatic_postprocess_variants": {
+        "threads": "4",
+        "mem": "64G",
+        "time": "1-00:00:00",
+        "gres": "lscratch:256"
+    },
     "deepvariant": {
         "threads": "18",
         "mem": "48G",
diff --git a/config/cluster/uge.json b/config/cluster/uge.json
@@ -75,6 +75,21 @@
     "mem": "4G",
     "partition": ""
   },
+  "deepsomatic_call_variants": {
+    "mem": "4G",
+    "partition": "",
+    "threads": "8"
+  },
+  "deepsomatic_make_examples": {
+    "mem": "4G",
+    "partition": "",
+    "threads": "8"
+  },
+  "deepsomatic_postprocess_variants": {
+    "mem": "8G",
+    "partition": "",
+    "threads": "4"
+  },
   "deepvariant": {
     "mem": "3G",
     "partition": "",
diff --git a/workflow/rules/depreciated.smk b/workflow/rules/depreciated.smk
@@ -1,4 +1,17 @@
 # Depreciated rules that may still be useful for some projects
+def get_normal_sorted_bam(wildcards):
+    """
+    Returns the sorted BAM path of a tumor sample's paired normal, or an
+    empty list in tumor-only mode. See config['pairs'] for tumor, normal pairs.
+    """
+    normal = tumor2normal[wildcards.name]
+    if normal:
+        # Runs in a tumor, normal mode
+        return join(workpath, "BAM", "{0}.sorted.bam".format(normal))
+    else:
+        # Runs in tumor-only mode
+        return []
+
 
 # Depreciated germline variant calling rule(s)
 rule deepvariant:
@@ -57,4 +70,83 @@ rule deepvariant:
         --output_vcf={output.vcf} \\
         --num_shards={threads} \\
         --intermediate_results_dir=${{tmp}}
-    """
+    """
+
+# Depreciated somatic variant calling rule(s)
+rule deepsomatic:
+    """
+    Data processing step to call somatic variants using a deep neural
+    network in tumor-normal pairs. DeepSomatic is an extension of the
+    deep learning-based variant caller DeepVariant that takes aligned
+    reads (in BAM or CRAM format) from tumor and normal data, produces
+    pileup image tensors from them, classifies each tensor using a CNN,
+    and finally reports somatic variants in a standard VCF or gVCF file.
+    This rule runs all three steps in the deepsomatic pipeline as one
+    step: i.e. make_examples, call_variants, and postprocess_variants.
+    This is not optimal for large-scale projects as it will consume a lot
+    of resources inefficiently (only the 2nd step in the dv pipeline can
+    make use of GPU-computing). As such, it is better to run the 1st/3rd
+    step on a normal compute node and run the 2nd step on a GPU node.
+    @Input:
+        Duplicate marked, sorted Tumor-Normal BAM file (scatter)
+    @Output:
+        Single-sample VCF file with called somatic variants
+    """
+    input: 
+        tumor  = join(workpath, "BAM", "{name}.sorted.bam"),
+        normal = get_normal_sorted_bam
+    output:
+        vcf  = join(workpath, "deepsomatic", "somatic", "{name}.deepsomatic.vcf"),
+    params: 
+        rname  = "deepsom",
+        genome = config['references']['GENOME'],
+        tmpdir = tmpdir,
+        # Building option for deepsomatic config, where:
+        #  @WGS = --model_type=WGS
+        #  @WES = --model_type=WES  (may be added in future)
+        dv_model_type = "WGS",
+        # Tumor sample name, taken from the {name} wildcard
+        tumor  = '{name}',
+        # Building option for the paired normal sorted bam
+        normal_bam_option = lambda w: "--reads_normal={0}.sorted.bam".format(
+            join(workpath, "BAM", tumor2normal[w.name])
+        ) if tumor2normal[w.name] else "",
+        # Building option for the normal sample name
+        normal_name_option = lambda w: "--sample_name_normal={0}".format(
+            tumor2normal[w.name]
+        ) if tumor2normal[w.name] else "",
+    threads: int(allocated("threads", "deepsomatic", cluster))
+    container: config['images']['deepsomatic']
+    envmodules: config['tools']['deepsomatic']
+    shell: """
+    # Setups temporary directory for
+    # intermediate files with built-in 
+    # mechanism for deletion on exit
+    if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+    tmp=$(mktemp -d -p "{params.tmpdir}")
+    trap 'du -sh "${{tmp}}"; rm -rf "${{tmp}}"' EXIT
+
+    # Export OpenBLAS variable to
+    # control the number of threads
+    # in a thread pool. By setting
+    # this variable to 1, work is
+    # done in the thread that ran
+    # the operation, rather than
+    # disbatching the work to a
+    # thread pool. If this option
+    # is not provided, it can lead
+    # to nested parallelism.
+    # See this issue for more info:
+    # https://github.com/google/deepsomatic/issues/28
+    export OPENBLAS_NUM_THREADS=1
+
+    # Run deepsomatic
+    run_deepsomatic \\
+        --model_type={params.dv_model_type} \\
+        --ref={params.genome} \\
+        --reads_tumor={input.tumor} {params.normal_bam_option} \\
+        --sample_name_tumor={params.tumor} {params.normal_name_option} \\
+        --output_vcf={output.vcf} \\
+        --num_shards={threads} \\
+        --intermediate_results_dir=${{tmp}}
+    """
diff --git a/workflow/rules/somatic.smk b/workflow/rules/somatic.smk