Merge branch 'release/0.8.0'
tshauck committed Jul 8, 2019
2 parents 9a9ba77 + 0e4f2ac commit 088504b
Showing 16 changed files with 93 additions and 65 deletions.
1 change: 0 additions & 1 deletion .yeyo.json

This file was deleted.

12 changes: 12 additions & 0 deletions .yeyo.yaml

```diff
@@ -0,0 +1,12 @@
+version: 0.8.0
+tag_template: v{{ yeyo_version }}
+commit_template: v{{ yeyo_version }}
+files:
+  - file_path: VERSION
+    match_template: yeyo_version
+  - file_path: docs/index.md
+    match_template: yeyo_version
+  - file_path: gcgc/__init__.py
+    match_template: yeyo_version
+  - file_path: pyproject.toml
+    match_template: yeyo_version
```
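
This YAML file appears to replace the deleted `.yeyo.json` above: the release tooling's configuration moves to YAML, recording the current version, the tag and commit message templates, and each file in which yeyo rewrites the version string it matches via `match_template`.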
17 changes: 17 additions & 0 deletions CHANGELOG.md

```diff
@@ -2,6 +2,23 @@

 ## Development

+## 0.8.0 (2019-07-04)
+
+### Fixed
+
+- Broken test due to platform differences in `Path.glob` sorting.
+
+### Added
+
+- User can specify to use start or end tokens optionally.
+
+### Removed
+
+- Removed one_hot_encoding. The user can do that pretty easily if needed. E.g.
+  see `scatter` in PyTorch.
+
 ## 0.7.0 (2019-06-22)

 ### Added

 - Properties to access the integer encodings of special tokens. (35cae2a)
```
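
Since the changelog points to `scatter` as the replacement for the removed one-hot encoding, here is a minimal sketch, assuming PyTorch is installed; the sequence values and alphabet size are illustrative, mirroring the `[1, 2, 3, 0]` example removed from the docs below:

```python
import torch

# Illustrative integer-encoded sequence and alphabet size (not from gcgc).
integer_encoded = torch.LongTensor([1, 2, 3, 0])
alphabet_size = 7

# One row per sequence position; scatter_ writes a 1 at each token's index.
one_hot = torch.zeros(len(integer_encoded), alphabet_size, dtype=torch.long)
one_hot.scatter_(1, integer_encoded.unsqueeze(1), 1)
```

This reproduces the matrix the old `one_hot_encoded` property returned for the same input.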
2 changes: 1 addition & 1 deletion VERSION

```diff
@@ -1 +1 @@
-0.7.0
+0.8.0
```
5 changes: 5 additions & 0 deletions docker-compose.yml

```diff
@@ -5,3 +5,8 @@ services:
     build:
       context: .
     image: tshauck/gcgc
+  yeyo:
+    image: docker.io/thauck/yeyo:0.3.0-dev.2
+    volumes:
+      - $PWD:/project
+      - $HOME/.gitconfig:/etc/gitconfig
```
7 changes: 0 additions & 7 deletions docs/concepts/encoding_seq.md

````diff
@@ -39,13 +39,6 @@ es.encapsulate().conform(7)

 After the sequence has been modified, integer encodings are available as properties.

 ```python
-es.one_hot_encoded
-# [[0, 1, 0, 0, 0, 0, 0],
-# [0, 0, 1, 0, 0, 0, 0],
-# [0, 0, 0, 1, 0, 0, 0],
-# [1, 0, 0, 0, 0, 0, 0]]
-
 es.integer_encoded
 # [1, 2, 3, 0]
 ```
````
2 changes: 1 addition & 1 deletion docs/index.md

```diff
@@ -39,4 +39,4 @@ $ pip install gcgc[torch]

 ## Documentation Version

-The documentation you're reading was build for version: `0.7.0`.
+The documentation you're reading was build for version: `0.8.0`.
```
2 changes: 1 addition & 1 deletion gcgc/__init__.py

```diff
@@ -2,4 +2,4 @@
 # All Rights Reserved
 """Top-level GCGC module."""

-__version__ = "0.7.0"
+__version__ = "0.8.0"
```
39 changes: 31 additions & 8 deletions gcgc/alphabet/base.py

```diff
@@ -12,25 +12,48 @@
 class EncodingAlphabet:
     """The Encoding Alphabet is meant to be a baseclass for other alphabets."""

-    PADDING: str = "|"
     START: str = ">"
     END: str = "<"
+    PADDING: str = "|"

     # Convince linting that EncodingAlphabet will have a letters attribute.
     letters: str

-    def __init__(self, kmer_size: int = 1):
+    def __init__(self, kmer_size: int = 1, start_token: bool = True, end_token: bool = True):
         """Create the EncodingAlphabet object."""

-        self.letters_and_tokens = self.START + self.END + self.PADDING + self.letters
+        self.start = start_token
+        self.end = end_token
         self.kmer_size = kmer_size

-        self.kmers = ["".join(kmer) for kmer in it.product(self.letters, repeat=self.kmer_size)]
-        self.kmers_and_tokens = list(self.START) + list(self.END) + list(self.PADDING) + self.kmers
-
         self.encoding_index = {letter: idx for idx, letter in enumerate(self.kmers_and_tokens)}
         self.decoding_index = {idx: letter for letter, idx in self.encoding_index.items()}

+    @property
+    def letters_and_tokens(self):
+        """Return the letters and tokens combined into a single string."""
+        return self.tokens + self.letters
+
+    @property
+    def tokens(self):
+        """Returns the token string given the start and end configuration."""
+        append_string = [self.PADDING]
+        if self.start:
+            append_string.append(self.START)
+        if self.end:
+            append_string.append(self.END)
+
+        return "".join(append_string)
+
+    @property
+    def kmers(self):
+        """Return the possible kmers given the letters and kmer size."""
+        return ["".join(kmer) for kmer in it.product(self.letters, repeat=self.kmer_size)]
+
+    @property
+    def kmers_and_tokens(self):
+        return list(self.tokens) + self.kmers
+
     @property
     def encoded_padding(self):
         """Get the integer for the padding character."""
@@ -47,8 +70,8 @@ def encoded_end(self):
         return self.encode_token(self.END)

     def __len__(self) -> int:
-        """Get the lenght of the Alphabet."""
-        return len(self.letters_and_tokens)
+        """Get the length of the Alphabet."""
+        return len(self.encoding_index)

     def encode_token(self, token: str) -> int:
         """Given a particular token, return the integer representation."""
```
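
To make the new constructor flags concrete, a small usage sketch follows, assuming the alphabet module is imported the way the tests import it; the lengths fall out of the four DNA letters plus whichever special tokens remain enabled:

```python
from gcgc import alphabet

# Defaults keep all three special tokens: 4 bases + "|", ">", "<" -> 7.
dna = alphabet.IUPACUnambiguousDNAEncoding()
assert dna.tokens == "|><"
assert len(dna) == 7

# With start/end disabled, only padding survives: 4 bases + "|" -> 5.
bare = alphabet.IUPACUnambiguousDNAEncoding(start_token=False, end_token=False)
assert bare.tokens == "|"
assert len(bare) == 5
```

The same arithmetic explains the parametrized lengths in the updated `test_len` below: 7 for `kmer_size=1` with all tokens, and 18 for `kmer_size=2` without a start token (16 dimers plus `|` and `<`).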
13 changes: 0 additions & 13 deletions gcgc/encoded_seq/encoded_seq.py

```diff
@@ -122,19 +122,6 @@ def integer_encoded(self):
         """Return the underlying sequence in its integer representation."""
         return self.alphabet.integer_encode(self)

-    @property
-    def one_hot_encoded(self) -> Sequence[Sequence[int]]:
-        """Encode into D x N matrix where D is the size of the alphabet and N is the padding."""
-
-        encoded_sequence = self.alphabet.integer_encode(self)
-        encoded_len = len(encoded_sequence)
-        letters_len = len(self.alphabet.letters_and_tokens)
-
-        one_hot_seq = np.zeros((encoded_len, letters_len), dtype=np.int)
-        one_hot_seq[np.arange(encoded_len), encoded_sequence] = 1
-
-        return one_hot_seq.tolist()
-
     @classmethod
     def from_integer_encoded_seq(
         cls, integer_encoded_seq: Iterable[int], alphabet: EncodingAlphabet
```
7 changes: 0 additions & 7 deletions gcgc/ml/pytorch_utils/parser.py

```diff
@@ -18,19 +18,12 @@ def parse_record(self, gcgc_record: GCGCRecord) -> Dict:

         parsed_features = super().parse_record(gcgc_record)
         parsed_features["seq_tensor"] = torch.LongTensor(parsed_features["seq_tensor"])
-        parsed_features["seq_tensor_one_hot"] = torch.LongTensor(
-            parsed_features["seq_tensor_one_hot"]
-        )

         if self.has_offset:
             parsed_features["offset_seq_tensor"] = torch.LongTensor(
                 parsed_features["offset_seq_tensor"]
             )

-            parsed_features["offset_seq_tensor_one_hot"] = torch.LongTensor(
-                parsed_features["offset_seq_tensor_one_hot"]
-            )
-
         if self.has_file_features:
             for file_feature in self.file_features:
                 parsed_features[file_feature.name] = torch.tensor(
```
4 changes: 1 addition & 3 deletions gcgc/parser/base.py

```diff
@@ -79,15 +79,13 @@ def parse_record(self, gcgc_record: GCGCRecord) -> Dict:
         parsed_features: Dict[str, Any] = {}

         parsed_features["seq_tensor"] = processed_seq.integer_encoded
-        parsed_features["seq_tensor_one_hot"] = processed_seq.one_hot_encoded

         if self.has_offset:
             offset_seq = processed_seq.shift(self.sequence_offset)

             parsed_features["offset_seq_tensor"] = offset_seq.integer_encoded
-            parsed_features["offset_seq_tensor_one_hot"] = offset_seq.one_hot_encoded

         parsed_features["id"] = gcgc_record.seq_record.id

         parsed_features.update(self._generate_file_features(gcgc_record.path))
         parsed_features.update(self._generate_annotation_features(gcgc_record.seq_record))
         parsed_features.update(self._generate_description_features(gcgc_record.seq_record))
```
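
Consumers that previously read `seq_tensor_one_hot` or `offset_seq_tensor_one_hot` from the parsed features now receive only the integer tensors, and can rebuild one-hot matrices themselves, e.g. with the `scatter_` sketch shown under the changelog above.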
25 changes: 19 additions & 6 deletions gcgc/tests/alphabet/test_alphabet.py

```diff
@@ -52,14 +52,15 @@ def test_biopython_from_seq(biopython_class, gcgc_class):
     assert isinstance(es.alphabet, gcgc_class)


-class TestAlphabet(unittest.TestCase):
-    def test_len(self):
-        dna = alphabet.IUPACUnambiguousDNAEncoding()
-        self.assertEqual(len(dna), len(dna.letters_and_tokens))
+@pytest.mark.parametrize("kmer_size,start,expected_len", [(1, True, 7), (2, False, 18)])
+def test_len(kmer_size, start, expected_len):
+    dna = alphabet.IUPACUnambiguousDNAEncoding(kmer_size=kmer_size, start_token=start)
+    assert len(dna) == expected_len


+class TestAlphabet(unittest.TestCase):
     def test_decoding_index(self):
         dna = alphabet.IUPACUnambiguousDNAEncoding()

         self.assertEqual(dna.decode_token(0), dna.decoding_index[0])

     def test_encoding_index(self):
@@ -102,7 +103,7 @@ def test_kmer_tokens_size(self):
     ],
 )
 def test_kmer_encoding(seq, kmer_size, expected_kmer):
-
+    """Test the kemrs are encoded as expected."""
     dna = alphabet.IUPACUnambiguousDNAEncoding(kmer_size=kmer_size)
     expected_integers = [dna.encode_token(t) for t in expected_kmer]
@@ -117,4 +118,16 @@ def test_special_token_integer_encoding():

     assert dna.encoded_start == dna.encode_token(dna.START)
     assert dna.encoded_end == dna.encode_token(dna.END)
-
     assert dna.encoded_padding == dna.encode_token(dna.PADDING)
+    assert dna.encoded_padding == 0
+
+
+@pytest.mark.parametrize(
+    "start_token,end_token,expected_tokens",
+    [(False, True, "|<"), (False, False, "|"), (True, False, "|>")],
+)
+def test_alphabet_configuration(start_token, end_token, expected_tokens):
+    """Test that we can selectively use start and end tokens."""
+    dna = alphabet.IUPACUnambiguousDNAEncoding(start_token=start_token, end_token=end_token)
+    assert dna.tokens == expected_tokens
```
16 changes: 2 additions & 14 deletions gcgc/tests/encoded_seq/test_encoded_seq.py

```diff
@@ -59,18 +59,6 @@ def test_conform(self):
         self.assertEqual(len(new_es), length)
         self.assertIsInstance(new_es, EncodedSeq)

-    def test_one_hot_encoding(self):
-
-        expected = [
-            [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
-            [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
-            [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
-            [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
-        ]
-
-        es = EncodedSeq("ATCG", ExtendedIUPACDNAEncoding())
-        assert_array_equal(es.one_hot_encoded, expected)
-
     def test_from_seq_bad_alphabet(self):
         seq = Seq("ATCG", None)
@@ -89,7 +77,7 @@ def test_from_seq(self):
 def test_decode_tokens():
     alphabet = ExtendedIUPACDNAEncoding()

-    new_seq = EncodedSeq.from_integer_encoded_seq([0, 4, 4, 1], alphabet)
+    new_seq = EncodedSeq.from_integer_encoded_seq([1, 4, 4, 2], alphabet)
     expected_seq = EncodedSeq(">AA<", alphabet)

     assert new_seq == expected_seq
@@ -141,7 +129,7 @@ def test_seq_shift(actual_seq, expected_seq, offset):

 @pytest.mark.parametrize(
-    "seq,kmer_size,integer_encoding", [(">ATCG<||", 2, [0, 13, 22, 27, 1, 2, 2])]
+    "seq,kmer_size,integer_encoding", [(">ATCG<||", 2, [1, 13, 22, 27, 2, 0, 0])]
 )
 def test_kmer_size_encoding(seq, kmer_size, integer_encoding):
     encoded_seq = EncodedSeq(seq, ExtendedIUPACDNAEncoding(kmer_size=kmer_size))
```
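
The updated expectations follow from the token reordering in `base.py`: with `PADDING` first in the token string, `|` now encodes to 0, `>` to 1, and `<` to 2, so `>AA<` round-trips through `[1, 4, 4, 2]` rather than `[0, 4, 4, 1]`, and the padded kmer encoding ends in `0, 0` instead of `2, 2`.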
4 changes: 2 additions & 2 deletions gcgc/tests/third_party/pytorch_utils/test_pytorch_utils.py

```diff
@@ -23,12 +23,12 @@ def yielder():

 def test_index_multiple_files():

-    glob = ECOLI_PATH.glob("*.fasta")
+    glob = sorted(ECOLI_PATH.glob("*.fasta"))

     pe = IUPACProteinEncoding()

     with tempfile.TemporaryDirectory() as tmpdir:
-        db_path = pathlib.Path(tmpdir) / 'test.db'
+        db_path = pathlib.Path(tmpdir) / "test.db"
         test_dataset = GenomicDataset.from_paths(glob, SP, "fasta", pe, str(db_path))

         assert len(test_dataset) == 25
```
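
This is the fix for the broken test noted in the changelog: `Path.glob` yields paths in a platform- and filesystem-dependent order, so the results are sorted before building the dataset to keep the assertion deterministic.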
2 changes: 1 addition & 1 deletion pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gcgc"
-version = "0.7.0"
+version = "0.8.0"
 description = "GCGC is a preprocessing library for biological sequence model development."
 authors = ["Trent Hauck <[email protected]>"]
 include = ["./gcgc/data/splice/*.fasta"]
```
