Skip to content

Commit

Permalink
Compare sequence objects for equality.
Browse files Browse the repository at this point in the history
str(record) is the string representation of the entire record and
comparing the string representations compares more than the sequences.
  • Loading branch information
groutr committed Mar 30, 2020
1 parent b3b9e3d commit 3656fac
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 8 deletions.
7 changes: 3 additions & 4 deletions augur/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,12 @@ def read_sequences(*fnames):
try:
for fname in fnames:
for record in SeqIO.parse(fname, 'fasta'):
if record.name in seqs:
if str(record) != str(seqs[record.name]):
raise AlignmentError("Detected duplicate input strains \"%s\" but the sequences are different."%record.name)
if record.name in seqs and record.seq != seqs[record.name].seq:
raise AlignmentError("Detected duplicate input strains \"%s\" but the sequences are different." % record.name)
# if the same sequence then we can proceed (and we only take one)
seqs[record.name] = record
except FileNotFoundError:
raise AlignmentError("\nCannot read sequences -- make sure the file %s exists and contains sequences in fasta format"%fname)
raise AlignmentError("\nCannot read sequences -- make sure the file %s exists and contains sequences in fasta format" % fname)
except ValueError as error:
raise AlignmentError("\nERROR: Problem reading in {}: {}".format(fname, str(error)))
return seqs
Expand Down
28 changes: 28 additions & 0 deletions tests/data/align/aa-seq_h3n2_ha_2y_HA1_dup.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
>A/Victoria/361/2011
QKLPGNDNSTATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQNSSIGEICDSPHQILD
GENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEF
NNESFNWTGVTQNGTSSACIRRSNNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIW
GVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRNIPSRISIYWTIVKPG
DILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCNSECITPNGSIPNDKPFQNVNRI
TYGACPRYVKQSTLKLATGMRNVPEKQTR
>A/Hawaii/22/2012
QKLPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILD
GENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEF
NNESFNWTGVTQNGTSSACIRRSNNSFFSRLNWLTHSNFKYPALNVTMPNNEQFDKLYIW
GVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRNIPSRISIYWTIVKPG
DILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRI
TYGACPRYVKQSTLKLATGMRNVPEKQTR
>A/Texas/50/2012
QKLPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILD
GENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEF
NNESFNWNGVTQNGTSSACIRRSNNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIW
GVHHPGTDKDQIFLYAQPSGRITVSTKRSQQAVIPNIGSRPRIRNIPSRISIYWTIVKPG
DILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRI
TYGACPRYVKQSTLKLATGMRNVPEKQTR
>A/Texas/50/2012
QKLPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILD
GENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEF
NNESFNWNGVTQNGTSSACIRRSNNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIW
GVHHPGTDKDQIFLYAQPSGRITVSTKRSQQAVIPNIGSRPRIRNIPSRISIYWTIVKPG
DILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRI
TYGACPRYATQSTLKLATGMRNVPEKQTR
11 changes: 7 additions & 4 deletions tests/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from Bio.SeqRecord import SeqRecord

from shlex import quote
import pathlib

from augur import align

import pytest
import pathlib


class TestAlign:
Expand Down Expand Up @@ -76,7 +76,7 @@ def test_check_duplicates_MSA_and_string_with_duplicates(self):
)
with pytest.raises(align.AlignmentError):
assert align.check_duplicates(alignment, "seq3")

def test_prune_seqs_matching_alignment(self):
sequence = {
"seq1": SeqRecord(Seq("GTAC"), name="seq1"),
Expand Down Expand Up @@ -139,5 +139,8 @@ def test_read_sequences(self):
data_file = pathlib.Path('tests/data/align/test_aligned_sequences.fasta')
result = align.read_sequences(data_file)
assert len(result.keys()) == 3



def test_read_seq_compare(self):
    """Duplicate strain names with differing sequences must be rejected.

    The fixture contains two ``A/Texas/50/2012`` records whose sequences
    differ, so ``align.read_sequences`` is expected to raise
    ``align.AlignmentError``.
    """
    data_file = pathlib.Path("tests/data/align/aa-seq_h3n2_ha_2y_HA1_dup.fasta")
    # read_sequences also converts FileNotFoundError into AlignmentError, so
    # match on the duplicate-strain message: a missing or misnamed fixture
    # must not be able to make this test pass vacuously.
    with pytest.raises(align.AlignmentError, match="Detected duplicate input strains"):
        align.read_sequences(data_file)

0 comments on commit 3656fac

Please sign in to comment.