Skip to content

Commit f20cb1a

Browse files
committed
Parse model names from guppy/minknow files
1 parent 78d47ef commit f20cb1a

File tree

5 files changed

+24
-6
lines changed

5 files changed

+24
-6
lines changed

CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

7-
## [unreleased]
7+
## [v1.11.2]
8+
### Added
9+
- Parsing model information from fastq headers output by Guppy and MinKNOW.
810
### Changed
911
- Additional explanatory information in VCF INFO fields concerning depth calculations.
1012

medaka/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import subprocess
66
import sys
77

8-
__version__ = "1.11.1"
8+
__version__ = "1.11.2"
99

1010
try:
1111
import pyabpoa as abpoa

medaka/models.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -173,16 +173,24 @@ def _model_from_fastq(fname):
173173
models = set()
174174
with pysam.FastxFile(fname, 'r') as fastq:
175175
for rec in itertools.islice(fastq, 100):
176-
# model is embedded in RG:Z: tag of comment as
177-
# <run_id>_<model>_<barcode>, but model has _
178-
# characters in also so search for known models
179176
try:
177+
# dorado SAM converted to FASTQ with e.g. samtools fastq
178+
# model is embedded in RG:Z: tag of comment as
179+
# <run_id>_<model>_<barcode>, but model has _
180+
# characters in also so search for known models
180181
read_group = rec.comment.split("RG:Z:")[1].split()[0]
181182
for model in known_models:
182183
if model in read_group:
183184
models.add(model)
184185
except Exception:
185-
pass
186+
# minknow/guppy
187+
# basecall_model_version_id=<model>
188+
try:
189+
model = rec.comment.split(
190+
"basecall_model_version_id=")[1].split()[0]
191+
models.add(model)
192+
except Exception:
193+
pass
186194
if len(models) > 1:
187195
# filter out any models without an `@`. These are likely FPs of
188196
# the search above (there are unversioned models whose name
Binary file not shown.

medaka/test/test_model.py

+8
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ class TestScrapBasecaller(unittest.TestCase):
7171
root_dir = os.path.abspath(os.path.dirname(__file__))
7272
bam = os.path.join(root_dir, 'data/bc_model_scrape.bam')
7373
fastq = os.path.join(root_dir, 'data/bc_model_scrape.fastq.gz')
74+
fastq_minknow = os.path.join(root_dir, 'data/bc_model_scrape_minknow.fastq.gz')
7475

7576
def test_000_from_bam_consensus(self):
7677
model = models.model_from_basecaller(self.bam, variant=False)
@@ -88,6 +89,13 @@ def test_011_from_fastq_variant(self):
8889
model = models.model_from_basecaller(self.fastq, variant=True)
8990
self.assertEqual(model, "r1041_e82_400bps_hac_variant_v4.2.0")
9091

92+
def test_020_from_fastq_minknow(self):
93+
model = models.model_from_basecaller(self.fastq_minknow, variant=False)
94+
self.assertEqual(model, "r1041_e82_400bps_sup_v4.2.0")
95+
96+
def test_021_from_fastq_minknow_variant(self):
97+
model = models.model_from_basecaller(self.fastq_minknow, variant=True)
98+
self.assertEqual(model, "r1041_e82_400bps_sup_variant_v4.2.0")
9199

92100
class TestBuildModel(unittest.TestCase):
93101

0 commit comments

Comments
 (0)