Skip to content

Commit

Permalink
Merge branch 'model-choice' into 'dev'
Browse files Browse the repository at this point in the history
[CW-2686] Stub out auto models

See merge request research/medaka!565
  • Loading branch information
cjw85 committed Oct 23, 2023
2 parents 2cf7a6a + c477d35 commit 92fbd06
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 28 deletions.
29 changes: 28 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ not be provided by the user.

**Using a GPU**

Since version 1.1.0 `medaka` uses Tensorflow 2.2, prior versions used Tensorflow 1.4.
Since version 1.1.0 `medaka` uses Tensorflow 2, prior versions used Tensorflow 1.
For `medaka` 1.1.0 and higher installation from source or using `pip` can make
immediate use of GPUs. However, note that the `tensorflow` package is compiled against
specific versions of the NVIDIA CUDA and cuDNN libraries; users are directed to the
Expand Down Expand Up @@ -214,6 +214,33 @@ For best results it is important to specify the correct model, `-m` in the
above, according to the basecaller used. Allowed values can be found by
running `medaka tools list\_models`.

**Recent basecallers**

Recent basecaller versions annotate their output with their model version.
In such cases medaka can inspect the files and attempt to select an appropriate
model for itself. This typically works best in the case of BAM output from
basecallers. It will work also for FASTQ input provided the FASTQ has been
created from basecaller output using:

```
samtools fastq -T '*' dorado.bam | gzip -c > dorado.fastq.gz
```

The command `medaka consensus` will attempt to automatically determine a
correct model by inspecting its BAM input file. The helper scripts
`medaka_consensus` and `medaka_haploid_variant` will make similar attempts
from their FASTQ input.

To inspect files for yourself, the command:

```
medaka tools resolve_model --auto_model <consensus/variant> <input.bam/input.fastq>
```

will print the model that automatic model selection will use.

**For older basecallers and when automatic selection is unsuccessful**

Medaka models are named to indicate i) the pore type, ii) the sequencing
device (MinION or PromethION), iii) the basecaller variant, and iv) the
basecaller version, with the format:
Expand Down
68 changes: 59 additions & 9 deletions medaka/medaka.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import medaka.vcf
import medaka.wrappers


class ResolveModel(argparse.Action):
"""Resolve model filename or ID into filename"""
def __init__(
Expand All @@ -36,7 +37,22 @@ def __call__(self, parser, namespace, values, option_string=None):
except Exception as e:
msg = "Error validating model from '--{}' argument: {}."
raise RuntimeError(msg.format(self.dest, str(e)))
#TODO: verify the file is a model?
setattr(namespace, f"{self.dest}_was_given", True)
setattr(namespace, self.dest, model_fp)


class AutoModel(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
variant, input_file = values
if variant not in {'consensus', 'variant'}:
raise ValueError("'TYPE' must be one of 'consensus' or 'variant'.")
variant = variant == 'variant'
model = medaka.models.model_from_basecaller(input_file, variant=variant)
try:
model_fp = medaka.models.resolve_model(model)
except Exception as e:
msg = "Error validating model from '--{}' argument: {}."
raise RuntimeError(msg.format(self.dest, str(e)))
setattr(namespace, self.dest, model_fp)


Expand Down Expand Up @@ -91,7 +107,7 @@ def check_read_groups(fname, rg=None):
:raises: RuntimeError
"""
with pysam.AlignmentFile(fname) as bam:
with pysam.AlignmentFile(fname, check_sq=False) as bam:
# As of 13/12/19 pypi still has no wheel for pysam v0.15.3 so we
# pinned to v0.15.2. However bioconda's v0.15.2 package
# conflicts with the libdeflate they have so we are forced
Expand Down Expand Up @@ -177,9 +193,13 @@ def _log_level():
def _model_arg():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False)
parser.add_argument('--model', action=ResolveModel,
grp = parser.add_mutually_exclusive_group()
grp.add_argument('--model', action=ResolveModel,
default=medaka.options.default_models['consensus'],
help='Model to use.')
grp.add_argument('--auto_model', nargs=2, action=AutoModel,
metavar=("TYPE", "INPUT"), dest='model',
help="Automatically choose model according to INPUT. TYPE should be one of 'consensus' or 'variant'.")
return parser


Expand Down Expand Up @@ -245,18 +265,47 @@ def _chunking_feature_args(batch_size=100, chunk_len=10000, chunk_ovlp=1000):
return parser


def _validate_common_args(args):
def _validate_common_args(args, parser):
"""Do some common argument validation."""
logger = medaka.common.get_named_logger('ValidArgs')

# check BAM has some required fields, fail early
if hasattr(args, 'bam') and args.bam is not None:
RG = args.RG if hasattr(args, 'RG') else None
CheckBam.check_read_groups(args.bam, RG)
if RG is not None:
msg = "Reads will be filtered to only those with RG tag: {}"
logger.info(msg.format(RG))
# if model is default, resolve to file, save mess in help text
if hasattr(args, 'model') and args.model is not None:
args.model = medaka.models.resolve_model(args.model)
logger.debug(msg.format(RG))

# rationalise the model
if hasattr(args, 'model'):
# if --model was not given on the command-line try to guess from the
# the input file. otherwise leave alone
if (
hasattr(args, 'bam')
and args.bam is not None
and not hasattr(args, 'model_was_given')):
# try to guess model using the input file, assume consensus
# assuming consensus might not be right, but this is not a change
# in behaviour from the historic.
logger.debug("Guessing model")
if hasattr(args, 'bam') and args.bam is not None:
model = medaka.models.model_from_basecaller(
args.bam, variant="consensus")
try:
args.model = medaka.models.resolve_model(model)
except Exception as e:
logger.warning(
"Failed to guess medaka model input file. Using default.")
else:
logger.debug(
f"Chosen model '{args.model}' for input '{args.bam}'.")
elif args.model is not None:
# TODO: why is this done? it will have been done in ResolveModel?
# the resolve_model function is idempotent so doesn't really
# matter too much
args.model = medaka.models.resolve_model(args.model)
logger.debug(f"Model is: {args.model}")


def print_model_path(args):
Expand Down Expand Up @@ -757,5 +806,6 @@ def main():
# TODO: is there a cleaner way to access this?
parser.__dict__['_actions'][1].choices['tools'].print_help()
else:
_validate_common_args(args)
# perform some post processing on the values, then run entry point
_validate_common_args(args, parser)
args.func(args)
94 changes: 94 additions & 0 deletions medaka/models.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Creation and loading of models."""

import itertools
import os
import pathlib
import tempfile

import pysam
import requests

import medaka.common
Expand Down Expand Up @@ -90,6 +92,98 @@ def resolve_model(model):
raise RuntimeError("Model resolution failed")


def model_from_basecaller(fname, variant=False):
"""Determine correct medaka model from basecaller output file.
:param fname: a basecaller output (.sam/.bam/.cram/.fastq).
:param variant: whether to return variant model (otherwise consensus).
There are slight differences is the search strategy for .bam and .fastq
files due to differences in what information is available in each.
For .bam (and related) files the DS subfield of the read group header
is examined to find the "basecall_model=" key=value entry. The found
model is returned vebatim without fursther checks.
For .fastq (and related) a RG:Z key=value in record comments is searched
in the first 100 records. Due to ambiguities in the representation the
search looks explicitely for known models.
"""
logger = medaka.common.get_named_logger("MdlInspect")
logger.info("Trying to find model")
try:
models = _model_from_bam(fname)
except Exception:
try:
models = _model_from_fastq(fname)
except Exception:
raise IOError(
"Failed to parse basecaller models from input file.")

if len(models) != 1:
# TODO: this potentially conflicts with medaka's ability to use
# multiple data types. In that case a user can just
# explicitely provide the correct model.
raise ValueError(
"Input file did not contain precisely 1 basecaller "
"model reference.")

basecaller = list(models)[0]
if basecaller not in medaka.options.basecaller_models.keys():
raise KeyError(
"Unknown basecaller model. Please provide a medaka model "
"explicitely using --model.")

consensus, var = medaka.options.basecaller_models[basecaller]
model = var if variant else consensus
if model is None:
txt = "variant" if variant else "consensus"
raise ValueError(
f"No {txt} model available for basecaller {basecaller}.")
return model


def _model_from_bam(fname):
"""Search for basecaller models listed in a .bam."""
models = set()
with pysam.AlignmentFile(fname, check_sq=False) as bam:
callers = [rg['DS'] for rg in bam.header['RG']]
logger.info(f"Found basecall models: {callers}")
for caller in callers:
models.add(caller.split("basecall_model=")[1].split()[0])
return models


def _model_from_fastq(fname):
"""Search for model files listed in a .fastq."""
known_models = list(medaka.options.basecaller_models.keys())
models = set()
with pysam.FastxFile(fname, 'r') as fastq:
for rec in itertools.islice(fastq, 100):
# model is embedded in RG:Z: tag of comment as
# <run_id>_<model>_<barcode>, but model has _
# characters in also so search for known models
try:
read_group = rec.comment.split("RG:Z:")[1].split()[0]
for model in known_models:
if model in read_group:
models.add(model)
except Exception:
pass
if len(models) > 1:
# filter out any models without an `@`. These are likely FPs of
# the search above (there are unversioned models whose name
# is a substring of the versioned models).
unversioned = {m for m in models if '@' not in m}
versioned = {m for m in models if '@' in m}
remove = set()
for unver, ver in itertools.product(unversioned, versioned):
if unver in ver:
remove.add(unver)
models = models - remove
return models


def open_model(fname):
"""Determine model type from model name.
Expand Down
93 changes: 79 additions & 14 deletions medaka/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,13 @@

import pkg_resources

# the models used by default for CLI entry points
default_models = {
'consensus': 'r1041_e82_400bps_sup_v4.2.0',
'variant': 'r1041_e82_400bps_sup_variant_v4.2.0'
}

model_subdir = 'data'
model_stores = (
pkg_resources.resource_filename(__package__, model_subdir),
os.path.join(
str(pathlib.Path.home()), '.{}'.format(__package__), model_subdir)
)
model_url_template = \
'https://github.com/nanoporetech/{pkg}/raw/master/{pkg}/{subdir}/{fname}'

# current models are those included in PyPI packages
current_models = [
# r1041 e82 (kit14) consensus
'r1041_e82_400bps_hac_v4.2.0',
Expand All @@ -25,6 +22,63 @@
'r1041_e82_400bps_hac_variant_v4.2.0',
'r1041_e82_400bps_sup_variant_v4.2.0',
]

# mapping from basecaller model names to medaka models.
# name: (consensus, variant)
# models here are automatically added to full archived list
basecaller_models = {
# R10.3
'dna_r10.3_450bps_hac':
('r103_hac_g507', 'r103_hac_variant_g507'),
'dna_r10.3_450bps_hac_prom':
('r103_hac_g507', 'r103_hac_variant_g507'),
# R10.4.1 260bps
'dna_r10.4.1_e8.2_260bps_hac':
('r1041_e82_260bps_hac_g632', 'r1041_e82_260bps_hac_variant_g632'),
'[email protected]':
('r1041_e82_260bps_hac_v4.0.0', None),
'[email protected]':
('r1041_e82_260bps_hac_v4.1.0', 'r1041_e82_260bps_hac_variant_v4.1.0'),
'dna_r10.4.1_e8.2_260bps_hac_prom':
('r1041_e82_260bps_hac_g632', 'r1041_e82_260bps_hac_variant_g632'),
'[email protected]':
('r1041_e82_260bps_sup_v4.0.0', None),
'[email protected]':
('r1041_e82_260bps_sup_v4.1.0', 'r1041_e82_260bps_sup_variant_v4.1.0'),
# R10.4.1 400bps
'dna_r10.4.1_e8.2_400bps_hac':
('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'),
'[email protected]':
('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'),
'[email protected]':
('r1041_e82_400bps_hac_v4.0.0', None),
'[email protected]':
('r1041_e82_400bps_hac_v4.1.0', 'r1041_e82_400bps_hac_variant_v4.1.0'),
'[email protected]':
('r1041_e82_400bps_hac_v4.2.0', 'r1041_e82_400bps_hac_variant_v4.2.0'),
'dna_r10.4.1_e8.2_400bps_hac_prom':
('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'),
'[email protected]':
('r1041_e82_400bps_sup_g615', 'r1041_e82_400bps_sup_variant_g615'),
'[email protected]':
('r1041_e82_400bps_sup_v4.0.0', None),
'[email protected]':
('r1041_e82_400bps_sup_v4.1.0', 'r1041_e82_400bps_sup_variant_v4.1.0'),
'[email protected]':
('r1041_e82_400bps_sup_v4.2.0', 'r1041_e82_400bps_sup_variant_v4.2.0'),
# R9.4.1 This is a little dodgy
# note: 'dna_r9.4.1_450bps_hac' is not present here as there is not a
# injective mapping.
'[email protected]':
('r941_min_fast_g507', 'r941_min_fast_variant_g507'),
'[email protected]':
('r941_min_hac_g507', 'r941_min_hac_variant_g507'),
'[email protected]':
('r941_min_sup_g507', 'r941_min_sup_variant_g507')}

# archived models are not included in packages but can be downloaded on the fly
# this list SHOULD NOT be added to, any new models should go into the above
# basecaller_models structure
archived_models = [
# r9 consensus
'r941_sup_plant_g610',
Expand Down Expand Up @@ -95,12 +149,23 @@
'r941_e81_fast_variant_g514', 'r941_e81_hac_variant_g514',
'r941_e81_sup_variant_g514',
]
allowed_models = sorted(current_models + archived_models)
default_models = {
'consensus': 'r1041_e82_400bps_sup_v4.2.0',
'variant': 'r1041_e82_400bps_sup_variant_v4.2.0'
}

# add basecaller models, then deduplicate and sort
for models in basecaller_models.values():
archived_models.extend((m for m in models if m is not None))
allowed_models = sorted(set(current_models + archived_models))

# where we look for model files and store them
model_subdir = 'data'
model_stores = (
pkg_resources.resource_filename(__package__, model_subdir),
os.path.join(
str(pathlib.Path.home()), '.{}'.format(__package__), model_subdir)
)
model_url_template = \
'https://github.com/nanoporetech/{pkg}/raw/master/{pkg}/{subdir}/{fname}'

# suspect this isn't used anymore...
alignment_params = {
'rle': "-M 5 -S 4 -O 2 -E 3",
'non-rle': "-M 2 -S 4 -O 4,24 -E 2,1"}
Binary file added medaka/test/data/bc_model_scrape.bam
Binary file not shown.
Binary file added medaka/test/data/bc_model_scrape.fastq.gz
Binary file not shown.
Loading

0 comments on commit 92fbd06

Please sign in to comment.