diff --git a/README.md b/README.md index a0f1dfd..4e42556 100755 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ not be provided by the user. **Using a GPU** -Since version 1.1.0 `medaka` uses Tensorflow 2.2, prior versions used Tensorflow 1.4. +Since version 1.1.0 `medaka` uses Tensorflow 2, prior versions used Tensorflow 1. For `medaka` 1.1.0 and higher installation from source or using `pip` can make immediate use of GPUs. However, note that the `tensorflow` package is compiled against specific versions of the NVIDIA CUDA and cuDNN libraries; users are directed to the @@ -214,6 +214,33 @@ For best results it is important to specify the correct model, `-m` in the above, according to the basecaller used. Allowed values can be found by running `medaka tools list\_models`. +**Recent basecallers** + +Recent basecaller versions annotate their output with their model version. +In such cases medaka can inspect the files and attempt to select an appropriate +model for itself. This typically works best in the case of BAM output from +basecallers. It will work also for FASTQ input provided the FASTQ has been +created from basecaller output using: + +``` +samtools fastq -T '*' dorado.bam | gzip -c > dorado.fastq.gz +``` + +The command `medaka consensus` will attempt to automatically determine a +correct model by inspecting its BAM input file. The helper scripts +`medaka_consensus` and `medaka_haploid_variant` will make similar attempts +from their FASTQ input. + +To inspect files for yourself, the command: + +``` +medaka tools resolve_model --auto_model +``` + +will print the model that automatic model selection will use. + +**For older basecallers and when automatic selection is unsuccessful** + Medaka models are named to indicate i) the pore type, ii) the sequencing device (MinION or PromethION), iii) the basecaller variant, and iv) the basecaller version, with the format: diff --git a/medaka/medaka.py b/medaka/medaka.py index 22a8e77..9166f26 100644 --- a/medaka/medaka.py +++ b/medaka/medaka.py @@ -20,6 +20,7 @@ import medaka.vcf import medaka.wrappers + class ResolveModel(argparse.Action): """Resolve model filename or ID into filename""" def __init__( @@ -36,7 +37,22 @@ def __call__(self, parser, namespace, values, option_string=None): except Exception as e: msg = "Error validating model from '--{}' argument: {}." raise RuntimeError(msg.format(self.dest, str(e))) - #TODO: verify the file is a model? + setattr(namespace, f"{self.dest}_was_given", True) + setattr(namespace, self.dest, model_fp) + + +class AutoModel(argparse.Action): + def __call__(self, parser, namespace, values, option_string=None): + variant, input_file = values + if variant not in {'consensus', 'variant'}: + raise ValueError("'TYPE' must be one of 'consensus' or 'variant'.") + variant = variant == 'variant' + model = medaka.models.model_from_basecaller(input_file, variant=variant) + try: + model_fp = medaka.models.resolve_model(model) + except Exception as e: + msg = "Error validating model from '--{}' argument: {}." + raise RuntimeError(msg.format(self.dest, str(e))) setattr(namespace, self.dest, model_fp) @@ -91,7 +107,7 @@ def check_read_groups(fname, rg=None): :raises: RuntimeError """ - with pysam.AlignmentFile(fname) as bam: + with pysam.AlignmentFile(fname, check_sq=False) as bam: # As of 13/12/19 pypi still has no wheel for pysam v0.15.3 so we # pinned to v0.15.2. However bioconda's v0.15.2 package # conflicts with the libdeflate they have so we are forced @@ -177,9 +193,13 @@ def _log_level(): def _model_arg(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) - parser.add_argument('--model', action=ResolveModel, + grp = parser.add_mutually_exclusive_group() + grp.add_argument('--model', action=ResolveModel, default=medaka.options.default_models['consensus'], help='Model to use.') + grp.add_argument('--auto_model', nargs=2, action=AutoModel, + metavar=("TYPE", "INPUT"), dest='model', + help="Automatically choose model according to INPUT. TYPE should be one of 'consensus' or 'variant'.") return parser @@ -245,18 +265,47 @@ def _chunking_feature_args(batch_size=100, chunk_len=10000, chunk_ovlp=1000): return parser -def _validate_common_args(args): +def _validate_common_args(args, parser): """Do some common argument validation.""" logger = medaka.common.get_named_logger('ValidArgs') + + # check BAM has some required fields, fail early if hasattr(args, 'bam') and args.bam is not None: RG = args.RG if hasattr(args, 'RG') else None CheckBam.check_read_groups(args.bam, RG) if RG is not None: msg = "Reads will be filtered to only those with RG tag: {}" - logger.info(msg.format(RG)) - # if model is default, resolve to file, save mess in help text - if hasattr(args, 'model') and args.model is not None: - args.model = medaka.models.resolve_model(args.model) + logger.debug(msg.format(RG)) + + # rationalise the model + if hasattr(args, 'model'): + # if --model was not given on the command-line try to guess from the + # the input file. otherwise leave alone + if ( + hasattr(args, 'bam') + and args.bam is not None + and not hasattr(args, 'model_was_given')): + # try to guess model using the input file, assume consensus + # assuming consensus might not be right, but this is not a change + # in behaviour from the historic. + logger.debug("Guessing model") + if hasattr(args, 'bam') and args.bam is not None: + model = medaka.models.model_from_basecaller( + args.bam, variant="consensus") + try: + args.model = medaka.models.resolve_model(model) + except Exception as e: + logger.warning( + "Failed to guess medaka model input file. Using default.") + else: + logger.debug( + f"Chosen model '{args.model}' for input '{args.bam}'.") + elif args.model is not None: + # TODO: why is this done? it will have been done in ResolveModel? + # the resolve_model function is idempotent so doesn't really + # matter too much + args.model = medaka.models.resolve_model(args.model) + logger.debug(f"Model is: {args.model}") def print_model_path(args): @@ -757,5 +806,6 @@ def main(): # TODO: is there a cleaner way to access this? parser.__dict__['_actions'][1].choices['tools'].print_help() else: - _validate_common_args(args) + # perform some post processing on the values, then run entry point + _validate_common_args(args, parser) args.func(args) diff --git a/medaka/models.py b/medaka/models.py index 4e38ad6..c5e53c3 100644 --- a/medaka/models.py +++ b/medaka/models.py @@ -1,9 +1,11 @@ """Creation and loading of models.""" +import itertools import os import pathlib import tempfile +import pysam import requests import medaka.common @@ -90,6 +92,98 @@ def resolve_model(model): raise RuntimeError("Model resolution failed") +def model_from_basecaller(fname, variant=False): + """Determine correct medaka model from basecaller output file. + + :param fname: a basecaller output (.sam/.bam/.cram/.fastq). + :param variant: whether to return variant model (otherwise consensus). + + There are slight differences is the search strategy for .bam and .fastq + files due to differences in what information is available in each. + + For .bam (and related) files the DS subfield of the read group header + is examined to find the "basecall_model=" key=value entry. The found + model is returned vebatim without fursther checks. + + For .fastq (and related) a RG:Z key=value in record comments is searched + in the first 100 records. Due to ambiguities in the representation the + search looks explicitely for known models. + """ + logger = medaka.common.get_named_logger("MdlInspect") + logger.info("Trying to find model") + try: + models = _model_from_bam(fname) + except Exception: + try: + models = _model_from_fastq(fname) + except Exception: + raise IOError( + "Failed to parse basecaller models from input file.") + + if len(models) != 1: + # TODO: this potentially conflicts with medaka's ability to use + # multiple data types. In that case a user can just + # explicitely provide the correct model. + raise ValueError( + "Input file did not contain precisely 1 basecaller " + "model reference.") + + basecaller = list(models)[0] + if basecaller not in medaka.options.basecaller_models.keys(): + raise KeyError( + "Unknown basecaller model. Please provide a medaka model " + "explicitely using --model.") + + consensus, var = medaka.options.basecaller_models[basecaller] + model = var if variant else consensus + if model is None: + txt = "variant" if variant else "consensus" + raise ValueError( + f"No {txt} model available for basecaller {basecaller}.") + return model + + +def _model_from_bam(fname): + """Search for basecaller models listed in a .bam.""" + models = set() + with pysam.AlignmentFile(fname, check_sq=False) as bam: + callers = [rg['DS'] for rg in bam.header['RG']] + logger.info(f"Found basecall models: {callers}") + for caller in callers: + models.add(caller.split("basecall_model=")[1].split()[0]) + return models + + +def _model_from_fastq(fname): + """Search for model files listed in a .fastq.""" + known_models = list(medaka.options.basecaller_models.keys()) + models = set() + with pysam.FastxFile(fname, 'r') as fastq: + for rec in itertools.islice(fastq, 100): + # model is embedded in RG:Z: tag of comment as + # __, but model has _ + # characters in also so search for known models + try: + read_group = rec.comment.split("RG:Z:")[1].split()[0] + for model in known_models: + if model in read_group: + models.add(model) + except Exception: + pass + if len(models) > 1: + # filter out any models without an `@`. These are likely FPs of + # the search above (there are unversioned models whose name + # is a substring of the versioned models). + unversioned = {m for m in models if '@' not in m} + versioned = {m for m in models if '@' in m} + remove = set() + for unver, ver in itertools.product(unversioned, versioned): + if unver in ver: + remove.add(unver) + models = models - remove + return models + + def open_model(fname): """Determine model type from model name. diff --git a/medaka/options.py b/medaka/options.py index 239a005..6a37d6f 100755 --- a/medaka/options.py +++ b/medaka/options.py @@ -7,16 +7,13 @@ import pkg_resources +# the models used by default for CLI entry points +default_models = { + 'consensus': 'r1041_e82_400bps_sup_v4.2.0', + 'variant': 'r1041_e82_400bps_sup_variant_v4.2.0' +} -model_subdir = 'data' -model_stores = ( - pkg_resources.resource_filename(__package__, model_subdir), - os.path.join( - str(pathlib.Path.home()), '.{}'.format(__package__), model_subdir) -) -model_url_template = \ - 'https://github.com/nanoporetech/{pkg}/raw/master/{pkg}/{subdir}/{fname}' - +# current models are those included in PyPI packages current_models = [ # r1041 e82 (kit14) consensus 'r1041_e82_400bps_hac_v4.2.0', @@ -25,6 +22,63 @@ 'r1041_e82_400bps_hac_variant_v4.2.0', 'r1041_e82_400bps_sup_variant_v4.2.0', ] + +# mapping from basecaller model names to medaka models. +# name: (consensus, variant) +# models here are automatically added to full archived list +basecaller_models = { + # R10.3 + 'dna_r10.3_450bps_hac': + ('r103_hac_g507', 'r103_hac_variant_g507'), + 'dna_r10.3_450bps_hac_prom': + ('r103_hac_g507', 'r103_hac_variant_g507'), + # R10.4.1 260bps + 'dna_r10.4.1_e8.2_260bps_hac': + ('r1041_e82_260bps_hac_g632', 'r1041_e82_260bps_hac_variant_g632'), + 'dna_r10.4.1_e8.2_260bps_hac@v4.0.0': + ('r1041_e82_260bps_hac_v4.0.0', None), + 'dna_r10.4.1_e8.2_260bps_hac@v4.1.0': + ('r1041_e82_260bps_hac_v4.1.0', 'r1041_e82_260bps_hac_variant_v4.1.0'), + 'dna_r10.4.1_e8.2_260bps_hac_prom': + ('r1041_e82_260bps_hac_g632', 'r1041_e82_260bps_hac_variant_g632'), + 'dna_r10.4.1_e8.2_260bps_sup@v4.0.0': + ('r1041_e82_260bps_sup_v4.0.0', None), + 'dna_r10.4.1_e8.2_260bps_sup@v4.1.0': + ('r1041_e82_260bps_sup_v4.1.0', 'r1041_e82_260bps_sup_variant_v4.1.0'), + # R10.4.1 400bps + 'dna_r10.4.1_e8.2_400bps_hac': + ('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'), + 'dna_r10.4.1_e8.2_400bps_hac@v3.5.2': + ('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'), + 'dna_r10.4.1_e8.2_400bps_hac@v4.0.0': + ('r1041_e82_400bps_hac_v4.0.0', None), + 'dna_r10.4.1_e8.2_400bps_hac@v4.1.0': + ('r1041_e82_400bps_hac_v4.1.0', 'r1041_e82_400bps_hac_variant_v4.1.0'), + 'dna_r10.4.1_e8.2_400bps_hac@v4.2.0': + ('r1041_e82_400bps_hac_v4.2.0', 'r1041_e82_400bps_hac_variant_v4.2.0'), + 'dna_r10.4.1_e8.2_400bps_hac_prom': + ('r1041_e82_400bps_hac_g632', 'r1041_e82_400bps_hac_variant_g632'), + 'dna_r10.4.1_e8.2_400bps_sup@v3.5.2': + ('r1041_e82_400bps_sup_g615', 'r1041_e82_400bps_sup_variant_g615'), + 'dna_r10.4.1_e8.2_400bps_sup@v4.0.0': + ('r1041_e82_400bps_sup_v4.0.0', None), + 'dna_r10.4.1_e8.2_400bps_sup@v4.1.0': + ('r1041_e82_400bps_sup_v4.1.0', 'r1041_e82_400bps_sup_variant_v4.1.0'), + 'dna_r10.4.1_e8.2_400bps_sup@v4.2.0': + ('r1041_e82_400bps_sup_v4.2.0', 'r1041_e82_400bps_sup_variant_v4.2.0'), + # R9.4.1 This is a little dodgy + # note: 'dna_r9.4.1_450bps_hac' is not present here as there is not a + # injective mapping. + 'dna_r9.4.1_e8_fast@v3.4': + ('r941_min_fast_g507', 'r941_min_fast_variant_g507'), + 'dna_r9.4.1_e8_hac@v3.3': + ('r941_min_hac_g507', 'r941_min_hac_variant_g507'), + 'dna_r9.4.1_e8_sup@v3.3': + ('r941_min_sup_g507', 'r941_min_sup_variant_g507')} + +# archived models are not included in packages but can be downloaded on the fly +# this list SHOULD NOT be added to, any new models should go into the above +# basecaller_models structure archived_models = [ # r9 consensus 'r941_sup_plant_g610', @@ -95,12 +149,23 @@ 'r941_e81_fast_variant_g514', 'r941_e81_hac_variant_g514', 'r941_e81_sup_variant_g514', ] -allowed_models = sorted(current_models + archived_models) -default_models = { - 'consensus': 'r1041_e82_400bps_sup_v4.2.0', - 'variant': 'r1041_e82_400bps_sup_variant_v4.2.0' -} +# add basecaller models, then deduplicate and sort +for models in basecaller_models.values(): + archived_models.extend((m for m in models if m is not None)) +allowed_models = sorted(set(current_models + archived_models)) + +# where we look for model files and store them +model_subdir = 'data' +model_stores = ( + pkg_resources.resource_filename(__package__, model_subdir), + os.path.join( + str(pathlib.Path.home()), '.{}'.format(__package__), model_subdir) +) +model_url_template = \ + 'https://github.com/nanoporetech/{pkg}/raw/master/{pkg}/{subdir}/{fname}' + +# suspect this isn't used anymore... alignment_params = { 'rle': "-M 5 -S 4 -O 2 -E 3", 'non-rle': "-M 2 -S 4 -O 4,24 -E 2,1"} diff --git a/medaka/test/data/bc_model_scrape.bam b/medaka/test/data/bc_model_scrape.bam new file mode 100644 index 0000000..1b486a2 Binary files /dev/null and b/medaka/test/data/bc_model_scrape.bam differ diff --git a/medaka/test/data/bc_model_scrape.fastq.gz b/medaka/test/data/bc_model_scrape.fastq.gz new file mode 100644 index 0000000..b6e251f Binary files /dev/null and b/medaka/test/data/bc_model_scrape.fastq.gz differ diff --git a/medaka/test/test_model.py b/medaka/test/test_model.py index a6e8174..30ca2e3 100755 --- a/medaka/test/test_model.py +++ b/medaka/test/test_model.py @@ -61,6 +61,29 @@ def _load_one(self, model_file): self.assertIsInstance(label_scheme, BaseLabelScheme) +class TestScrapBasecaller(unittest.TestCase): + + root_dir = os.path.abspath(os.path.dirname(__file__)) + bam = os.path.join(root_dir, 'data/bc_model_scrape.bam') + fastq = os.path.join(root_dir, 'data/bc_model_scrape.fastq.gz') + + def test_000_from_bam_consensus(self): + model = models.model_from_basecaller(self.bam, variant=False) + self.assertEqual(model, "r1041_e82_400bps_hac_v4.2.0") + + def test_001_from_bam_variant(self): + model = models.model_from_basecaller(self.bam, variant=True) + self.assertEqual(model, "r1041_e82_400bps_hac_variant_v4.2.0") + + def test_010_from_fastq_consensus(self): + model = models.model_from_basecaller(self.fastq, variant=False) + self.assertEqual(model, "r1041_e82_400bps_hac_v4.2.0") + + def test_011_from_fastq_variant(self): + model = models.model_from_basecaller(self.fastq, variant=True) + self.assertEqual(model, "r1041_e82_400bps_hac_variant_v4.2.0") + + class TestBuildModel(unittest.TestCase): def test_000_build_all_models(self): diff --git a/scripts/medaka_consensus b/scripts/medaka_consensus index ed1e4be..81e2f2b 100755 --- a/scripts/medaka_consensus +++ b/scripts/medaka_consensus @@ -36,6 +36,7 @@ QUALITIES=false iflag=false dflag=false xflag=false +mflag=false usage=" ${medaka_version} @@ -55,6 +56,8 @@ $(basename "$0") [-h] -i -d -m medaka model, (default: ${MODEL}). Choices: ${MODELS} Alternatively a .tar.gz/.hdf file from 'medaka train'. + If not provided, and automatic choice will be attempted based on + the contents of the input file. -f Force overwrite of outputs (default will reuse existing outputs). -x Force recreation of alignment index. -t number of threads with which to create features (default: 1). @@ -70,7 +73,7 @@ while getopts ':hi::d:o:gr:m:fxt:b:q' option; do o ) OUTPUT=$OPTARG;; g ) NOFILLGAPS=true;; r ) GAPFILLCHAR=$OPTARG;; - m ) MODEL=$(medaka tools resolve_model --model $OPTARG);; + m ) mflag=true; MODEL=$(medaka tools resolve_model --model $OPTARG);; f ) FORCE=true;; x ) xflag=true;; t ) THREADS=$OPTARG;; @@ -96,6 +99,17 @@ if ! $dflag; then exit 1; fi +if ! $mflag; then + echo "Attempting to automatically select model version." + model=$(medaka tools resolve_model --auto_model consensus "${BASECALLS}" 2>/dev/null || true) + if [[ "${model}" == "" ]] ; then + echo "WARNING: Failed to detect a model version, will use default: '${MODEL}'" + else + echo "SUCCESS: Automatic model selection chose model: '${model}'" + MODEL=${model} + fi +fi + echo "Checking program versions" echo "This is ${medaka_version}" medaka_version_report || exit 1 @@ -103,9 +117,9 @@ medaka_version_report || exit 1 if [[ ! -e "${OUTPUT}" ]]; then mkdir -p "${OUTPUT}" elif ${FORCE}; then - echo "Warning: Output will be overwritten (-f flag)" + echo "WARNING: Output will be overwritten (-f flag)" else - echo "Warning: Output ${OUTPUT} already exists, may use old results." + echo "WARNING: Output ${OUTPUT} already exists, may use old results." fi cd "${OUTPUT}" diff --git a/scripts/medaka_haploid_variant b/scripts/medaka_haploid_variant index 1f33114..f0a41b5 100755 --- a/scripts/medaka_haploid_variant +++ b/scripts/medaka_haploid_variant @@ -34,6 +34,7 @@ FORCE=false iflag=false rflag=false xflag=false +mflag=false usage=" ${medaka_version} @@ -49,6 +50,8 @@ $(basename "$0") [-h] -i -r -o output folder (default: medaka). -m medaka model, (default: ${MODEL}). Choices: $MODELS. + If not provided, and automatic choice will be attempted based on + the contents of the input file. -s Perform read realignment when annotating variants. -f Force overwrite of outputs (default will reuse existing outputs). -x Force recreation of alignment index. @@ -62,7 +65,7 @@ while getopts ':hi:r:o:m:sfxt:b:' option; do i ) iflag=true; BASECALLS=$(follow_link $OPTARG);; r ) rflag=true; REFERENCE=$(follow_link $OPTARG);; o ) OUTPUT=$OPTARG;; - m ) MODEL=$(medaka tools resolve_model --model $OPTARG);; + m ) mflag=true; MODEL=$(medaka tools resolve_model --model $OPTARG);; s ) ANNOT_OPTS="--dpsp";; f ) FORCE=true;; x ) xflag=true;; @@ -95,6 +98,17 @@ if $rleflag; then exit 1 fi +if ! $mflag; then + echo "Attempting to automatically select model version." + model=$(medaka tools resolve_model --auto_model variant "${BASECALLS}" 2>/dev/null || true) + if [[ "${model}" == "" ]] ; then + echo "WARNING: Failed to detect a model version, will use default: '${MODEL}'" + else + echo "SUCCESS: Automatic model selection chose model: '${model}'" + MODEL=${model} + fi +fi + if [[ "${MODEL}" != *"variant"* ]]; then echo "WARNING: The model '${MODEL}' is not recommended for use with this program." echo " Please select a model named with 'variant' in its name. It is"