diff --git a/pymzml/file_classes/standardMzml.py b/pymzml/file_classes/standardMzml.py index a8115410..56a0acbd 100755 --- a/pymzml/file_classes/standardMzml.py +++ b/pymzml/file_classes/standardMzml.py @@ -203,7 +203,9 @@ def _binary_search(self, target_index): matches = re.finditer(regex_patterns.SPECTRUM_OPEN_PATTERN, chunk) for _match_number, match in enumerate(matches): if match is not None: - scan = int(re.search(b"[0-9]*$", match.group("id")).group()) + spec_info = match.groups() + spec_info = dict(zip(spec_info[0::2], spec_info[1::2])) + scan = int(re.search(b"[0-9]*$", spec_info[b"id"]).group()) # print(">>", _match_number, scan) if jump_direction == 'forwards': if scan > target_index: @@ -493,8 +495,10 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100): if spec_start is not None: spec_start_offset = file_pointer + spec_start.start() seeker.seek(spec_start_offset) + spec_info = self.spec_open.search(data).groups() + spec_info = dict(zip(spec_info[0::2], spec_info[1::2])) current_index = int( - re.search(b"[0-9]*$", spec_start.group("id")).group() + re.search(b"[0-9]*$", spec_info[b"id"]).group() ) self.offset_dict[current_index] = (spec_start_offset,) @@ -519,9 +523,10 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100): current_position = seeker.tell() data = seeker.read(chunk_size) if self.spec_open.search(data): - spec_start = self.spec_open.search(data) + spec_info = self.spec_open.search(data).groups() + spec_info = dict(zip(spec_info[0::2], spec_info[1::2])) current_index = int( - re.search(b"[0-9]*$", spec_start.group("id")).group() + re.search(b"[0-9]*$", spec_info[b"id"]).group() ) seeker.seek(current_position) spectrum = self._search_linear(seeker, target_index) @@ -685,8 +690,10 @@ def _search_linear(self, seeker, index, chunk_size=8): if spec_start: spec_start_offset = file_pointer + spec_start.start() seeker.seek(spec_start_offset) + spec_info = spec_start.groups() + spec_info = dict(zip(spec_info[0::2], spec_info[1::2])) current_index = int( - re.search(b"[0-9]*$", spec_start.group("id")).group() + re.search(b"[0-9]*$", spec_info[b"id"]).group() ) # print(current_index) spec_end = self.spec_close.search(data[spec_start.start() :]) diff --git a/pymzml/regex_patterns.py b/pymzml/regex_patterns.py index 00839c9a..f91eb82a 100755 --- a/pymzml/regex_patterns.py +++ b/pymzml/regex_patterns.py @@ -38,10 +38,7 @@ """ Regex to catch moby dick chapter number used in the index gezip writer example. """ - -SPECTRUM_OPEN_PATTERN = re.compile( - b'<*spectrum[^>]*index="(?P[0-9]+)" id="(?P[^"]+)" defaultArrayLength="[0-9]+">' -) +SPECTRUM_OPEN_PATTERN = re.compile(b'<*spectrum[^>]*(index|id)="(.*?)".*(index|id)="(.*?)"') """ Regex to catch specturm open xml tag with encoded array length """ diff --git a/tests/regex_test.py b/tests/regex_test.py index 8726ee3c..443ba49a 100755 --- a/tests/regex_test.py +++ b/tests/regex_test.py @@ -9,6 +9,7 @@ import pymzml.regex_patterns as rp import unittest from collections import OrderedDict as odict +import re class RegexTest(unittest.TestCase): @@ -62,6 +63,15 @@ def test_spectrum_tag_patter(self): for tag in self.spec_tags.values(): self.assertRegex(tag.decode("utf-8"), rp.SPECTRUM_TAG_PATTERN) + def test_index_and_id_order_does_not_matter(self): + a = b'' + b = b'' + a_match = re.search(rp.SPECTRUM_OPEN_PATTERN, a).groups() + b_match = re.search(rp.SPECTRUM_OPEN_PATTERN, b).groups() + a_dict = dict(zip(a_match[0::2], a_match[1::2])) + b_dict = dict(zip(b_match[0::2], b_match[1::2])) + assert a_dict == b_dict + if __name__ == "__main__": unittest.main(verbosity=3)