Merge branch 'model-choice' into 'dev'

[CW-2686] Stub out auto models See merge request research/medaka!565
nanoporetech · Oct 23, 2023 · 92fbd06 · 92fbd06
2 parents 2cf7a6a + c477d35
commit 92fbd06
Show file tree

Hide file tree

Showing 9 changed files with 315 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -110,7 +110,7 @@ not be provided by the user.
 
 **Using a GPU**
 
-Since version 1.1.0 `medaka` uses Tensorflow 2.2, prior versions used Tensorflow 1.4.
+Since version 1.1.0 `medaka` uses Tensorflow 2, prior versions used Tensorflow 1.
 For `medaka` 1.1.0 and higher installation from source or using `pip` can make
 immediate use of GPUs. However, note that the `tensorflow` package is compiled against
 specific versions of the NVIDIA CUDA and cuDNN libraries; users are directed to the
@@ -214,6 +214,33 @@ For best results it is important to specify the correct model, `-m` in the
 above, according to the basecaller used. Allowed values can be found by
 running `medaka tools list\_models`.
 
+**Recent basecallers**
+
+Recent basecaller versions annotate their output with their model version.
+In such cases medaka can inspect the files and attempt to select an appropriate
+model for itself. This typically works best in the case of BAM output from
+basecallers. It will work also for FASTQ input provided the FASTQ has been
+created from basecaller output using:
+
+```
+samtools fastq -T '*' dorado.bam | gzip -c > dorado.fastq.gz
+```
+
+The command `medaka consensus` will attempt to automatically determine a
+correct model by inspecting its BAM input file. The helper scripts
+`medaka_consensus` and `medaka_haploid_variant` will make similar attempts
+from their FASTQ input.
+
+To inspect files for yourself, the command:
+
+```
+medaka tools resolve_model --auto_model <consensus/variant> <input.bam/input.fastq>
+```
+
+will print the model that automatic model selection will use.
+
+**For older basecallers and when automatic selection is unsuccessful**
+
 Medaka models are named to indicate i) the pore type, ii) the sequencing
 device (MinION or PromethION), iii) the basecaller variant, and iv) the
 basecaller version, with the format:

diff --git a/medaka/medaka.py b/medaka/medaka.py
@@ -20,6 +20,7 @@
 import medaka.vcf
 import medaka.wrappers
 
+
 class ResolveModel(argparse.Action):
     """Resolve model filename or ID into filename"""
     def __init__(
@@ -36,7 +37,22 @@ def __call__(self, parser, namespace, values, option_string=None):
         except Exception as e:
             msg = "Error validating model from '--{}' argument: {}."
             raise RuntimeError(msg.format(self.dest, str(e)))
-        #TODO: verify the file is a model?
+        setattr(namespace, f"{self.dest}_was_given", True)
+        setattr(namespace, self.dest, model_fp)
+
+
+class AutoModel(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        variant, input_file = values
+        if variant not in {'consensus', 'variant'}:
+            raise ValueError("'TYPE' must be one of 'consensus' or 'variant'.")
+        variant = variant == 'variant'
+        model = medaka.models.model_from_basecaller(input_file, variant=variant)
+        try:
+            model_fp = medaka.models.resolve_model(model)
+        except Exception as e:
+            msg = "Error validating model from '--{}' argument: {}."
+            raise RuntimeError(msg.format(self.dest, str(e)))
         setattr(namespace, self.dest, model_fp)
 
 
@@ -91,7 +107,7 @@ def check_read_groups(fname, rg=None):
 
         :raises: RuntimeError
         """
-        with pysam.AlignmentFile(fname) as bam:
+        with pysam.AlignmentFile(fname, check_sq=False) as bam:
             # As of 13/12/19 pypi still has no wheel for pysam v0.15.3 so we
             # pinned to v0.15.2. However bioconda's v0.15.2 package
             # conflicts with the libdeflate they have so we are forced
@@ -177,9 +193,13 @@ def _log_level():
 def _model_arg():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False)
-    parser.add_argument('--model', action=ResolveModel,
+    grp = parser.add_mutually_exclusive_group()
+    grp.add_argument('--model', action=ResolveModel,
             default=medaka.options.default_models['consensus'],
             help='Model to use.')
+    grp.add_argument('--auto_model', nargs=2, action=AutoModel,
+            metavar=("TYPE", "INPUT"), dest='model',
+            help="Automatically choose model according to INPUT. TYPE should be one of 'consensus' or 'variant'.")
     return parser
 
 
@@ -245,18 +265,47 @@ def _chunking_feature_args(batch_size=100, chunk_len=10000, chunk_ovlp=1000):
     return parser
 
 
-def _validate_common_args(args):
+def _validate_common_args(args, parser):
     """Do some common argument validation."""
     logger = medaka.common.get_named_logger('ValidArgs')
+
+    # check BAM has some required fields, fail early
     if hasattr(args, 'bam') and args.bam is not None:
         RG = args.RG if hasattr(args, 'RG') else None
         CheckBam.check_read_groups(args.bam, RG)
         if RG is not None:
             msg = "Reads will be filtered to only those with RG tag: {}"
-            logger.info(msg.format(RG))
-    # if model is default, resolve to file, save mess in help text
-    if hasattr(args, 'model') and args.model is not None:
-        args.model = medaka.models.resolve_model(args.model)
+            logger.debug(msg.format(RG))
+
+    # rationalise the model
+    if hasattr(args, 'model'):
+        # if --model was not given on the command-line try to guess from the
+        # the input file. otherwise leave alone
+        if (
+                hasattr(args, 'bam')
+                and args.bam is not None
+                and not hasattr(args, 'model_was_given')):
+            # try to guess model using the input file, assume consensus
+            # assuming consensus might not be right, but this is not a change
+            # in behaviour from the historic.
+            logger.debug("Guessing model")
+            if hasattr(args, 'bam') and args.bam is not None:
+                model = medaka.models.model_from_basecaller(
+                    args.bam, variant="consensus")
+                try:
+                    args.model = medaka.models.resolve_model(model)
+                except Exception as e:
+                    logger.warning(
+                        "Failed to guess medaka model input file. Using default.")
+                else:
+                    logger.debug(
+                        f"Chosen model '{args.model}' for input '{args.bam}'.")
+        elif args.model is not None:
+            # TODO: why is this done? it will have been done in ResolveModel?
+            #       the resolve_model function is idempotent so doesn't really
+            #       matter too much
+            args.model = medaka.models.resolve_model(args.model)
+            logger.debug(f"Model is: {args.model}")
 
 
 def print_model_path(args):
@@ -757,5 +806,6 @@ def main():
         # TODO: is there a cleaner way to access this?
         parser.__dict__['_actions'][1].choices['tools'].print_help()
     else:
-        _validate_common_args(args)
+        # perform some post processing on the values, then run entry point
+        _validate_common_args(args, parser)
         args.func(args)
diff --git a/medaka/models.py b/medaka/models.py
@@ -1,9 +1,11 @@
 """Creation and loading of models."""
 
+import itertools
 import os
 import pathlib
 import tempfile
 
+import pysam
 import requests
 
 import medaka.common
@@ -90,6 +92,98 @@ def resolve_model(model):
     raise RuntimeError("Model resolution failed")
 
 
+def model_from_basecaller(fname, variant=False):
+    """Determine correct medaka model from basecaller output file.
+
+    :param fname: a basecaller output (.sam/.bam/.cram/.fastq).
+    :param variant: whether to return variant model (otherwise consensus).
+
+    There are slight differences is the search strategy for .bam and .fastq
+    files due to differences in what information is available in each.
+
+    For .bam (and related) files the DS subfield of the read group header
+    is examined to find the "basecall_model=" key=value entry. The found
+    model is returned vebatim without fursther checks.
+
+    For .fastq (and related) a RG:Z key=value in record comments is searched
+    in the first 100 records. Due to ambiguities in the representation the
+    search looks explicitely for known models.
+    """
+    logger = medaka.common.get_named_logger("MdlInspect")
+    logger.info("Trying to find model")
+    try:
+        models = _model_from_bam(fname)
+    except Exception:
+        try:
+            models = _model_from_fastq(fname)
+        except Exception:
+            raise IOError(
+                "Failed to parse basecaller models from input file.")
+
+    if len(models) != 1:
+        # TODO: this potentially conflicts with medaka's ability to use
+        #       multiple data types. In that case a user can just
+        #       explicitely provide the correct model.
+        raise ValueError(
+            "Input file did not contain precisely 1 basecaller "
+            "model reference.")
+
+    basecaller = list(models)[0]
+    if basecaller not in medaka.options.basecaller_models.keys():
+        raise KeyError(
+            "Unknown basecaller model. Please provide a medaka model "
+            "explicitely using --model.")
+
+    consensus, var = medaka.options.basecaller_models[basecaller]
+    model = var if variant else consensus
+    if model is None:
+        txt = "variant" if variant else "consensus"
+        raise ValueError(
+            f"No {txt} model available for basecaller {basecaller}.")
+    return model
+
+
+def _model_from_bam(fname):
+    """Search for basecaller models listed in a .bam."""
+    models = set()
+    with pysam.AlignmentFile(fname, check_sq=False) as bam:
+        callers = [rg['DS'] for rg in bam.header['RG']]
+        logger.info(f"Found basecall models: {callers}")
+        for caller in callers:
+            models.add(caller.split("basecall_model=")[1].split()[0])
+    return models
+
+
+def _model_from_fastq(fname):
+    """Search for model files listed in a .fastq."""
+    known_models = list(medaka.options.basecaller_models.keys())
+    models = set()
+    with pysam.FastxFile(fname, 'r') as fastq:
+        for rec in itertools.islice(fastq, 100):
+            # model is embedded in RG:Z: tag of comment as
+            # <run_id>_<model>_<barcode>, but model has _
+            # characters in also so search for known models
+            try:
+                read_group = rec.comment.split("RG:Z:")[1].split()[0]
+                for model in known_models:
+                    if model in read_group:
+                        models.add(model)
+            except Exception:
+                pass
+    if len(models) > 1:
+        # filter out any models without an `@`. These are likely FPs of
+        # the search above (there are unversioned models whose name
+        # is a substring of the versioned models).
+        unversioned = {m for m in models if '@' not in m}
+        versioned = {m for m in models if '@' in m}
+        remove = set()
+        for unver, ver in itertools.product(unversioned, versioned):
+            if unver in ver:
+                remove.add(unver)
+        models = models - remove
+    return models
+
+
 def open_model(fname):
     """Determine model type from model name.
 

diff --git a/medaka/options.py b/medaka/options.py
@@ -7,16 +7,13 @@
 
 import pkg_resources
 
+# the models used by default for CLI entry points
+default_models = {
+    'consensus': 'r1041_e82_400bps_sup_v4.2.0',
+    'variant': 'r1041_e82_400bps_sup_variant_v4.2.0'
+}
 
-model_subdir = 'data'
-model_stores = (
-    pkg_resources.resource_filename(__package__, model_subdir),
-    os.path.join(
-        str(pathlib.Path.home()), '.{}'.format(__package__), model_subdir)
-)
-model_url_template = \
-    'https://github.com/nanoporetech/{pkg}/raw/master/{pkg}/{subdir}/{fname}'
-
+# current models are those included in PyPI packages
 current_models = [
     # r1041 e82 (kit14) consensus
     'r1041_e82_400bps_hac_v4.2.0',
@@ -25,6 +22,63 @@
     'r1041_e82_400bps_hac_variant_v4.2.0',
     'r1041_e82_400bps_sup_variant_v4.2.0',
 ]
+
+# mapping from basecaller model names to medaka models.
+# name: (consensus, variant)
+# models here are automatically added to full archived list
+basecaller_models = {
+    # R10.3
+    'dna_r10.3_450bps_hac':
+        ('r103_hac_g507', 'r103_hac_variant_g507'),
+    'dna_r10.3_450bps_hac_prom':
+    ('r103_hac_g507', 'r103_hac_variant_g507'),
+    # R10.4.1 260bps
+    'dna_r10.4.1_e8.2_260bps_hac':
+    ('r1041_e82_260bps_hac_g632', 'r1041_e82_260bps_hac_variant_g632'),
+    '[email protected]':
+        ('r1041_e82_260bps_hac_v4.0.0', None),
+    '[email protected]':
+        ('r1041_e82_260bps_hac_v4.1.0', 'r1041_e82_260bps_hac_variant_v4.1.0'),
+    'dna_r10.4.1_e8.2_260bps_hac_prom':
+        ('r1041_e82_260bps_hac_g632', 'r1041_e82_260bps_hac_variant_g632'),
+    '[email protected]':
+        ('r1041_e82_260bps_sup_v4.0.0', None),
+    '[email protected]':
+        ('r1041_e82_260bps_sup_v4.1.0', 'r1041_e82_260bps_sup_variant_v4.1.0'),
+    # R10.4.1 400bps
+    'dna_r10.4.1_e8.2_400bps_hac':
+        ('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'),
+    '[email protected]':
+        ('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'),
+    '[email protected]':
+        ('r1041_e82_400bps_hac_v4.0.0', None),
+    '[email protected]':
+        ('r1041_e82_400bps_hac_v4.1.0', 'r1041_e82_400bps_hac_variant_v4.1.0'),
+    '[email protected]':
+        ('r1041_e82_400bps_hac_v4.2.0', 'r1041_e82_400bps_hac_variant_v4.2.0'),
+    'dna_r10.4.1_e8.2_400bps_hac_prom':
+        ('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'),
+    '[email protected]':
+        ('r1041_e82_400bps_sup_g615', 'r1041_e82_400bps_sup_variant_g615'),
+    '[email protected]':
+        ('r1041_e82_400bps_sup_v4.0.0', None),
+    '[email protected]':
+        ('r1041_e82_400bps_sup_v4.1.0', 'r1041_e82_400bps_sup_variant_v4.1.0'),
+    '[email protected]':
+        ('r1041_e82_400bps_sup_v4.2.0', 'r1041_e82_400bps_sup_variant_v4.2.0'),
+    # R9.4.1 This is a little dodgy
+    # note: 'dna_r9.4.1_450bps_hac' is not present here as there is not a
+    #       injective mapping.
+    '[email protected]':
+        ('r941_min_fast_g507', 'r941_min_fast_variant_g507'),
+    '[email protected]':
+        ('r941_min_hac_g507', 'r941_min_hac_variant_g507'),
+    '[email protected]':
+        ('r941_min_sup_g507', 'r941_min_sup_variant_g507')}
+
+# archived models are not included in packages but can be downloaded on the fly
+# this list SHOULD NOT be added to, any new models should go into the above
+# basecaller_models structure
 archived_models = [
     # r9 consensus
     'r941_sup_plant_g610',
@@ -95,12 +149,23 @@
     'r941_e81_fast_variant_g514', 'r941_e81_hac_variant_g514',
     'r941_e81_sup_variant_g514',
 ]
-allowed_models = sorted(current_models + archived_models)
-default_models = {
-    'consensus': 'r1041_e82_400bps_sup_v4.2.0',
-    'variant': 'r1041_e82_400bps_sup_variant_v4.2.0'
-}
 
+# add basecaller models, then deduplicate and sort
+for models in basecaller_models.values():
+    archived_models.extend((m for m in models if m is not None))
+allowed_models = sorted(set(current_models + archived_models))
+
+# where we look for model files and store them
+model_subdir = 'data'
+model_stores = (
+    pkg_resources.resource_filename(__package__, model_subdir),
+    os.path.join(
+        str(pathlib.Path.home()), '.{}'.format(__package__), model_subdir)
+)
+model_url_template = \
+    'https://github.com/nanoporetech/{pkg}/raw/master/{pkg}/{subdir}/{fname}'
+
+# suspect this isn't used anymore...
 alignment_params = {
     'rle': "-M 5 -S 4 -O 2 -E 3",
     'non-rle': "-M 2 -S 4 -O 4,24 -E 2,1"}
diff --git a/medaka/test/data/bc_model_scrape.bam b/medaka/test/data/bc_model_scrape.bam
diff --git a/medaka/test/data/bc_model_scrape.fastq.gz b/medaka/test/data/bc_model_scrape.fastq.gz