Configurable approaches to prep alignment splits
- Adds a configuration option to explore alternatives to our current bgzip/grabix preparation for fastqs, to see if we can improve speed.
- Adds an indexing option using rtg SDF files, which improves speed at the cost of a ~3x larger disk footprint.
Showing 8 changed files with 164 additions and 35 deletions.
@@ -0,0 +1,100 @@
"""Provide indexing and retrieval of files using Real Time Genomics SDF format. | ||
Prepares a sdf representation of reads suitable for indexed retrieval, | ||
normalizing many different input types. | ||
https://github.com/RealTimeGenomics/rtg-tools | ||
""" | ||
import os | ||
import subprocess | ||
|
||
from bcbio import bam, utils | ||
from bcbio.distributed.transaction import file_transaction | ||
from bcbio.pipeline import datadict as dd | ||
from bcbio.provenance import do | ||
|
||
def to_sdf(files, data):
    """Convert a fastq or BAM input into a SDF indexed file.
    """
    # BAM
    if len(files) == 1 and files[0].endswith(".bam"):
        qual = []
        format = ["-f", "sam-pe" if bam.is_paired(files[0]) else "sam-se"]
        inputs = [files[0]]
    # fastq
    else:
        qual = ["-q", "illumina" if dd.get_quality_format(data).lower() == "illumina" else "sanger"]
        format = ["-f", "fastq"]
        if len(files) == 2:
            inputs = ["-l", files[0], "-r", files[1]]
        else:
            assert len(files) == 1
            inputs = [files[0]]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    out_file = os.path.join(work_dir,
                            "%s.sdf" % utils.splitext_plus(os.path.basename(os.path.commonprefix(files)))[0])
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = _rtg_cmd(["rtg", "format", "-o", tx_out_file] + format + qual + inputs)
            do.run(cmd, "Format inputs to indexed SDF")
    return out_file

def _rtg_cmd(cmd):
    return "export RTG_JAVA_OPTS='-Xms500m' && export RTG_MEM=2g && " + " ".join(cmd)

def calculate_splits(sdf_file, split_size):
    """Retrieve start-end read ranges covering the SDF file in split_size chunks.
    """
    counts = _sdfstats(sdf_file)["counts"]
    splits = []
    cur = 0
    for i in range(counts // split_size + (0 if counts % split_size == 0 else 1)):
        splits.append("%s-%s" % (cur, min(counts, cur + split_size)))
        cur += split_size
    return splits

def _sdfstats(sdf_file):
    cmd = ["rtg", "sdfstats", sdf_file]
    pairs = []
    counts = []
    lengths = []
    for line in subprocess.check_output(_rtg_cmd(cmd), shell=True).split("\n"):
        if line.startswith("Paired arm"):
            pairs.append(line.strip().split()[-1])
        elif line.startswith("Number of sequences"):
            counts.append(int(line.strip().split()[-1]))
        elif line.startswith("Minimum length"):
            lengths.append(int(line.strip().split()[-1]))
    assert len(set(counts)) == 1, counts
    return {"counts": counts[0], "pairs": pairs, "min_size": min(lengths)}

def min_read_size(sdf_file):
    """Retrieve minimum read size from SDF statistics.
    """
    return _sdfstats(sdf_file)["min_size"]

def is_paired(sdf_file):
    """Check if we have paired inputs in a SDF file.
    """
    pairs = _sdfstats(sdf_file)["pairs"]
    return len(set(pairs)) > 1

def to_fastq_apipe_cl(sdf_file, start=None, end=None):
    """Return command lines to provide streaming fastq input.

    For paired end, returns a forward and reverse command line. For
    single end returns a single command line and None for the pair.
    """
    cmd = ["rtg", "sdf2fastq", "--no-gzip", "-o", "-"]
    if start is not None:
        cmd += ["--start-id=%s" % start]
    if end is not None:
        cmd += ["--end-id=%s" % end]
    if is_paired(sdf_file):
        out = []
        for ext in ["left", "right"]:
            out.append("<(%s)" % _rtg_cmd(cmd + ["-i", os.path.join(sdf_file, ext)]))
        return out
    else:
        cmd += ["-i", sdf_file]
        return ["<(%s)" % _rtg_cmd(cmd), None]
One thing I added in my preprocessing is piped quality format conversion during bgzip re-compression using seqtk. Basically I am reading the first 10k fastq entries with seqtk and extracting average quality, and then parametrize streamed conversion to Sanger format with seqtk|bgzip if the quality range indicates that a file is in Illumina format. That simplifies a lot downstream (and during configuration, since I do not have to know the quality format beforehand), and I have to touch the fastqs anyway.
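To make the detection step concrete, a rough sketch of the idea described above (not the actual seqtk pipeline): scan the quality strings of the first 10k fastq records and infer the encoding from the observed ASCII range. The function name, the record count default and the ASCII 59 cutoff are the usual offset-33 versus offset-64 heuristics, added here as assumptions for illustration.

# Sketch of quality format detection from the first 10k records; files that
# come back as "illumina" would then be converted to Sanger (e.g. via
# seqtk, as described above) during bgzip re-compression.
import gzip
import itertools

def guess_quality_format(fastq_file, n_records=10000):
    """Guess 'sanger' or 'illumina' quality encoding from the first records."""
    opener = gzip.open if fastq_file.endswith(".gz") else open
    min_qual = 255
    with opener(fastq_file, "rt") as in_handle:
        for i, line in enumerate(itertools.islice(in_handle, n_records * 4)):
            if i % 4 == 3 and line.strip():  # every 4th line holds qualities
                min_qual = min(min_qual, min(ord(c) for c in line.strip()))
    # Characters below ASCII 59 only occur with the Sanger/Illumina 1.8+
    # offset of 33; otherwise assume the older Illumina offset of 64.
    return "sanger" if min_qual < 59 else "illumina"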
Sven-Eric;
Thanks for this. As normal, you're reading my mind. We already do the quality conversion as part of bgzip and indexing so everything downstream can assume sanger:
https://github.com/chapmanb/bcbio-nextgen/blob/master/bcbio/ngsalign/alignprep.py#L434
https://github.com/chapmanb/bcbio-nextgen/blob/master/bcbio/ngsalign/alignprep.py#L87
The goal here is to try and make the preparation fast enough that we can always do it, which will enable more parallelization and hopefully better runs as well. I was hoping SDF indices from RTG would be a good solution. They are faster to generate but take ~3x more space than bgzipped fastqs. @Lenbok, are there ways to compress RTG fastq file storage to make them more disk friendly? I love the built in read indexing and retrieval.
We do have auto-detection of format, although only as a check instead of automatically setting it:
https://github.com/chapmanb/bcbio-nextgen/blob/master/bcbio/ngsalign/alignprep.py#L87
Maybe we could add a quality_format: auto option to guess and set quality formats if folks want to trust bcbio to figure it out. Would that be helpful and save upstream processing?
Great; automatic conversion in bcbio would definitely be a plus. I guess it just depends on the number of records you're looking at to infer the format. 10k were always okay in my experience. I wasn't using the bcbio implementation since I also wanted to parallelize the S3 download and bgzip/conversion over the cluster (bcbio was only using one job for this some time ago). Perhaps that has changed in the meantime. Regarding the RTG fastqs, I'd be fine with larger indexes but not with overall larger data files. In fact, I'd even vote for unaligned 8-binned CRAM if that is a reasonable alternative to fastq for all the external tools (but I guess it isn't yet). I'd assume CRAM would also include the ability for random access, but I'm not so sure if samtools/cramtools/scramble support that nicely enough for bcbio to use that everywhere.
@chapmanb short answer, no, not unless you count the use of --no-names or --no-quality to remove some of the data altogether. Longer answer:
The SDF format uses a fixed-bit-packed representation for storage of nucleotides/amino acids/q-scores (and q-scores are always stored in sanger encoding) in order to get all the parsing/normalization out of the way once and allow direct access to read subsets (or to pull out subsequences of longer data such as chromosomes, e.g. rtg sdfsubseq). While it's better than storing the data uncompressed, there's definitely a large gap between it and a compressed storage format. A couple of years ago we did some experiments with compression (there's even an arithmetic coder implementation in the source) -- it was too slow to be practical, but that was in the context of looking for in-memory RAM reduction during mapping.
The margin is larger now that people have started quantizing the q-scores, which compression techniques naturally take advantage of but which SDF still stores at a little over 6 bits per score. Looking at the q-score distribution of a typical fastq we have here, which seems to have been binned to 7 distinct values, a 0-order compression model should be able to get this to about 1.9 bits per score. That alone would take the 3x down to 2x, and more could be obtained from compression of name or base data.
Thanks for the feedback!
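For reference, the 0-order figure quoted above is just the Shannon entropy of the observed q-score distribution. A small sketch of that calculation follows; the bin counts are made-up placeholders standing in for a fastq binned to 7 quality values, not measured data.

# Bits-per-score achievable by a 0-order model = Shannon entropy of the
# q-score distribution.  The counts below are illustrative placeholders.
import math

def zero_order_bits_per_score(counts):
    """Shannon entropy in bits per symbol for a frequency distribution."""
    total = float(sum(counts))
    return -sum((c / total) * math.log(c / total, 2) for c in counts if c > 0)

binned_qual_counts = [50, 150, 400, 900, 1500, 3000, 4000]  # 7 quality bins
print("%.2f bits per q-score" % zero_order_bits_per_score(binned_qual_counts))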
Len -- thanks for the overview. That helps a ton. The SDF indexing and access is really nice and I understand the space tradeoff.
Sven-Eric -- thanks for the feedback. I'll look at adding in auto-detection as an option. We have the RTG SDF option available but currently default to bgzip and grabix indexed fastqs. They are a bit slower to generate, but parallel bgzip helps quite a bit to make it comparable with RTG's preparation process as long as there are multiple cores to use. I don't know of any CRAM approaches for pulling blocks of reads from unaligned inputs, but would be happy to try them out if you know a practical approach. Thanks again.
Well, samtools can read CRAM by now and is using htslib internally (https://github.com/samtools/htslib).
Somehow this sounds as if extracting blocks of reads should work, in principle.
So, further following up the CRAM proposal (since CRAM will be the dominant read format soon anyway, with support for lossy and greedy reference-based compression also for unaligned reads): obviously one would need someone to implement extraction of unaligned reads by blocks (i.e., give me the first ends of all read pairs with index 1001-2000). Perhaps we could approach these guys: https://github.com/jwalabroad/VariantBam/blob/master/README.md That tool builds on htslib and looks useful for other areas of bcbio as well. Given their powerful rule engine it might be comparatively easy for them to implement paired chunking of reads. Shall we ask?
Guys and gals, I should have said - at least two ladies are on the team.
Sven-Eric;
Nice find with VariantBam, that looks really useful. The ability to downsample by coverage is incredible and something I've looked for in a tool for a while. I don't know about the ability to extend the current CRAM indexes to pull out read blocks, but it definitely seems worth asking if that's of interest to the team. Thanks again.
I approached the htslib people first, since they should know most about the block structure of the CRAM format. samtools/htslib#316