Merge branch 'release/0.8.0'
tshauck committed Jul 8, 2019
2 parents 9a9ba77 + 0e4f2ac commit 088504b
Showing 16 changed files with 93 additions and 65 deletions.
1 change: 0 additions & 1 deletion .yeyo.json

This file was deleted.

12 changes: 12 additions & 0 deletions .yeyo.yaml

```diff
@@ -0,0 +1,12 @@
+version: 0.8.0
+tag_template: v{{ yeyo_version }}
+commit_template: v{{ yeyo_version }}
+files:
+  - file_path: VERSION
+    match_template: yeyo_version
+  - file_path: docs/index.md
+    match_template: yeyo_version
+  - file_path: gcgc/__init__.py
+    match_template: yeyo_version
+  - file_path: pyproject.toml
+    match_template: yeyo_version
```
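
This YAML file appears to replace the deleted `.yeyo.json` above: the release tooling's configuration moves to YAML, recording the current version, the tag and commit message templates, and each file in which yeyo rewrites the version string it matches via `match_template`.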
17 changes: 17 additions & 0 deletions CHANGELOG.md

```diff
@@ -2,6 +2,23 @@

 ## Development

+## 0.8.0 (2019-07-04)
+
+### Fixed
+
+- Broken test due to platform differences in `Path.glob` sorting.
+
+### Added
+
+- User can specify to use start or end tokens optionally.
+
+### Removed
+
+- Removed one_hot_encoding. The user can do that pretty easily if needed. E.g.
+  see `scatter` in PyTorch.
+
 ## 0.7.0 (2019-06-22)

 ### Added

 - Properties to access the integer encodings of special tokens. (35cae2a)
```
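
Since the changelog points to `scatter` as the replacement for the removed one-hot encoding, here is a minimal sketch, assuming PyTorch is installed; the sequence values and alphabet size are illustrative, mirroring the `[1, 2, 3, 0]` example removed from the docs below:

```python
import torch

# Illustrative integer-encoded sequence and alphabet size (not from gcgc).
integer_encoded = torch.LongTensor([1, 2, 3, 0])
alphabet_size = 7

# One row per sequence position; scatter_ writes a 1 at each token's index.
one_hot = torch.zeros(len(integer_encoded), alphabet_size, dtype=torch.long)
one_hot.scatter_(1, integer_encoded.unsqueeze(1), 1)
```

This reproduces the matrix the old `one_hot_encoded` property returned for the same input.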
2 changes: 1 addition & 1 deletion VERSION

```diff
@@ -1 +1 @@
-0.7.0
+0.8.0
```
5 changes: 5 additions & 0 deletions docker-compose.yml

```diff
@@ -5,3 +5,8 @@ services:
     build:
       context: .
     image: tshauck/gcgc
+  yeyo:
+    image: docker.io/thauck/yeyo:0.3.0-dev.2
+    volumes:
+      - $PWD:/project
+      - $HOME/.gitconfig:/etc/gitconfig
```
7 changes: 0 additions & 7 deletions docs/concepts/encoding_seq.md

````diff
@@ -39,13 +39,6 @@ es.encapsulate().conform(7)

 After the sequence has been modified, integer encodings are available as properties.

 ```python
-es.one_hot_encoded
-# [[0, 1, 0, 0, 0, 0, 0],
-# [0, 0, 1, 0, 0, 0, 0],
-# [0, 0, 0, 1, 0, 0, 0],
-# [1, 0, 0, 0, 0, 0, 0]]
-
 es.integer_encoded
 # [1, 2, 3, 0]
 ```
````
2 changes: 1 addition & 1 deletion docs/index.md

```diff
@@ -39,4 +39,4 @@ $ pip install gcgc[torch]

 ## Documentation Version

-The documentation you're reading was build for version: `0.7.0`.
+The documentation you're reading was build for version: `0.8.0`.
```
2 changes: 1 addition & 1 deletion gcgc/__init__.py

```diff
@@ -2,4 +2,4 @@
 # All Rights Reserved
 """Top-level GCGC module."""

-__version__ = "0.7.0"
+__version__ = "0.8.0"
```
39 changes: 31 additions & 8 deletions gcgc/alphabet/base.py

```diff
@@ -12,25 +12,48 @@
 class EncodingAlphabet:
     """The Encoding Alphabet is meant to be a baseclass for other alphabets."""

-    PADDING: str = "|"
     START: str = ">"
     END: str = "<"
+    PADDING: str = "|"

     # Convince linting that EncodingAlphabet will have a letters attribute.
     letters: str

-    def __init__(self, kmer_size: int = 1):
+    def __init__(self, kmer_size: int = 1, start_token: bool = True, end_token: bool = True):
         """Create the EncodingAlphabet object."""

-        self.letters_and_tokens = self.START + self.END + self.PADDING + self.letters
+        self.start = start_token
+        self.end = end_token
         self.kmer_size = kmer_size

-        self.kmers = ["".join(kmer) for kmer in it.product(self.letters, repeat=self.kmer_size)]
-        self.kmers_and_tokens = list(self.START) + list(self.END) + list(self.PADDING) + self.kmers
-
         self.encoding_index = {letter: idx for idx, letter in enumerate(self.kmers_and_tokens)}
         self.decoding_index = {idx: letter for letter, idx in self.encoding_index.items()}

+    @property
+    def letters_and_tokens(self):
+        """Return the letters and tokens combined into a single string."""
+        return self.tokens + self.letters
+
+    @property
+    def tokens(self):
+        """Returns the token string given the start and end configuration."""
+        append_string = [self.PADDING]
+        if self.start:
+            append_string.append(self.START)
+        if self.end:
+            append_string.append(self.END)
+
+        return "".join(append_string)
+
+    @property
+    def kmers(self):
+        """Return the possible kmers given the letters and kmer size."""
+        return ["".join(kmer) for kmer in it.product(self.letters, repeat=self.kmer_size)]
+
+    @property
+    def kmers_and_tokens(self):
+        return list(self.tokens) + self.kmers
+
     @property
     def encoded_padding(self):
         """Get the integer for the padding character."""
@@ -47,8 +70,8 @@ def encoded_end(self):
         return self.encode_token(self.END)

     def __len__(self) -> int:
-        """Get the lenght of the Alphabet."""
-        return len(self.letters_and_tokens)
+        """Get the length of the Alphabet."""
+        return len(self.encoding_index)

     def encode_token(self, token: str) -> int:
         """Given a particular token, return the integer representation."""
```
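
To make the new constructor flags concrete, a small usage sketch follows, assuming the alphabet module is imported the way the tests import it; the lengths fall out of the four DNA letters plus whichever special tokens remain enabled:

```python
from gcgc import alphabet

# Defaults keep all three special tokens: 4 bases + "|", ">", "<" -> 7.
dna = alphabet.IUPACUnambiguousDNAEncoding()
assert dna.tokens == "|><"
assert len(dna) == 7

# With start/end disabled, only padding survives: 4 bases + "|" -> 5.
bare = alphabet.IUPACUnambiguousDNAEncoding(start_token=False, end_token=False)
assert bare.tokens == "|"
assert len(bare) == 5
```

The same arithmetic explains the parametrized lengths in the updated `test_len` below: 7 for `kmer_size=1` with all tokens, and 18 for `kmer_size=2` without a start token (16 dimers plus `|` and `<`).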
13 changes: 0 additions & 13 deletions gcgc/encoded_seq/encoded_seq.py

```diff
@@ -122,19 +122,6 @@ def integer_encoded(self):
         """Return the underlying sequence in its integer representation."""
         return self.alphabet.integer_encode(self)

-    @property
-    def one_hot_encoded(self) -> Sequence[Sequence[int]]:
-        """Encode into D x N matrix where D is the size of the alphabet and N is the padding."""
-
-        encoded_sequence = self.alphabet.integer_encode(self)
-        encoded_len = len(encoded_sequence)
-        letters_len = len(self.alphabet.letters_and_tokens)
-
-        one_hot_seq = np.zeros((encoded_len, letters_len), dtype=np.int)
-        one_hot_seq[np.arange(encoded_len), encoded_sequence] = 1
-
-        return one_hot_seq.tolist()
-
     @classmethod
     def from_integer_encoded_seq(
         cls, integer_encoded_seq: Iterable[int], alphabet: EncodingAlphabet
```
7 changes: 0 additions & 7 deletions gcgc/ml/pytorch_utils/parser.py

```diff
@@ -18,19 +18,12 @@ def parse_record(self, gcgc_record: GCGCRecord) -> Dict:

         parsed_features = super().parse_record(gcgc_record)
         parsed_features["seq_tensor"] = torch.LongTensor(parsed_features["seq_tensor"])
-        parsed_features["seq_tensor_one_hot"] = torch.LongTensor(
-            parsed_features["seq_tensor_one_hot"]
-        )

         if self.has_offset:
             parsed_features["offset_seq_tensor"] = torch.LongTensor(
                 parsed_features["offset_seq_tensor"]
             )

-            parsed_features["offset_seq_tensor_one_hot"] = torch.LongTensor(
-                parsed_features["offset_seq_tensor_one_hot"]
-            )
-
         if self.has_file_features:
             for file_feature in self.file_features:
                 parsed_features[file_feature.name] = torch.tensor(
```
4 changes: 1 addition & 3 deletions gcgc/parser/base.py

```diff
@@ -79,15 +79,13 @@ def parse_record(self, gcgc_record: GCGCRecord) -> Dict:
         parsed_features: Dict[str, Any] = {}

         parsed_features["seq_tensor"] = processed_seq.integer_encoded
-        parsed_features["seq_tensor_one_hot"] = processed_seq.one_hot_encoded

         if self.has_offset:
             offset_seq = processed_seq.shift(self.sequence_offset)

             parsed_features["offset_seq_tensor"] = offset_seq.integer_encoded
-            parsed_features["offset_seq_tensor_one_hot"] = offset_seq.one_hot_encoded

         parsed_features["id"] = gcgc_record.seq_record.id

         parsed_features.update(self._generate_file_features(gcgc_record.path))
         parsed_features.update(self._generate_annotation_features(gcgc_record.seq_record))
         parsed_features.update(self._generate_description_features(gcgc_record.seq_record))
```
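
Consumers that previously read `seq_tensor_one_hot` or `offset_seq_tensor_one_hot` from the parsed features now receive only the integer tensors, and can rebuild one-hot matrices themselves, e.g. with the `scatter_` sketch shown under the changelog above.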
25 changes: 19 additions & 6 deletions gcgc/tests/alphabet/test_alphabet.py

```diff
@@ -52,14 +52,15 @@ def test_biopython_from_seq(biopython_class, gcgc_class):
     assert isinstance(es.alphabet, gcgc_class)


-class TestAlphabet(unittest.TestCase):
-    def test_len(self):
-        dna = alphabet.IUPACUnambiguousDNAEncoding()
-        self.assertEqual(len(dna), len(dna.letters_and_tokens))
+@pytest.mark.parametrize("kmer_size,start,expected_len", [(1, True, 7), (2, False, 18)])
+def test_len(kmer_size, start, expected_len):
+    dna = alphabet.IUPACUnambiguousDNAEncoding(kmer_size=kmer_size, start_token=start)
+    assert len(dna) == expected_len


+class TestAlphabet(unittest.TestCase):
     def test_decoding_index(self):
         dna = alphabet.IUPACUnambiguousDNAEncoding()

         self.assertEqual(dna.decode_token(0), dna.decoding_index[0])

     def test_encoding_index(self):
@@ -102,7 +103,7 @@ def test_kmer_tokens_size(self):
     ],
 )
 def test_kmer_encoding(seq, kmer_size, expected_kmer):
-
+    """Test the kemrs are encoded as expected."""
     dna = alphabet.IUPACUnambiguousDNAEncoding(kmer_size=kmer_size)
     expected_integers = [dna.encode_token(t) for t in expected_kmer]
@@ -117,4 +118,16 @@ def test_special_token_integer_encoding():

     assert dna.encoded_start == dna.encode_token(dna.START)
     assert dna.encoded_end == dna.encode_token(dna.END)
-
     assert dna.encoded_padding == dna.encode_token(dna.PADDING)
+    assert dna.encoded_padding == 0
+
+
+@pytest.mark.parametrize(
+    "start_token,end_token,expected_tokens",
+    [(False, True, "|<"), (False, False, "|"), (True, False, "|>")],
+)
+def test_alphabet_configuration(start_token, end_token, expected_tokens):
+    """Test that we can selectively use start and end tokens."""
+    dna = alphabet.IUPACUnambiguousDNAEncoding(start_token=start_token, end_token=end_token)
+    assert dna.tokens == expected_tokens
```
16 changes: 2 additions & 14 deletions gcgc/tests/encoded_seq/test_encoded_seq.py

```diff
@@ -59,18 +59,6 @@ def test_conform(self):
         self.assertEqual(len(new_es), length)
         self.assertIsInstance(new_es, EncodedSeq)

-    def test_one_hot_encoding(self):
-
-        expected = [
-            [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
-            [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
-            [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
-            [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
-        ]
-
-        es = EncodedSeq("ATCG", ExtendedIUPACDNAEncoding())
-        assert_array_equal(es.one_hot_encoded, expected)
-
     def test_from_seq_bad_alphabet(self):
         seq = Seq("ATCG", None)
@@ -89,7 +77,7 @@ def test_from_seq(self):
 def test_decode_tokens():
     alphabet = ExtendedIUPACDNAEncoding()

-    new_seq = EncodedSeq.from_integer_encoded_seq([0, 4, 4, 1], alphabet)
+    new_seq = EncodedSeq.from_integer_encoded_seq([1, 4, 4, 2], alphabet)
     expected_seq = EncodedSeq(">AA<", alphabet)

     assert new_seq == expected_seq
@@ -141,7 +129,7 @@ def test_seq_shift(actual_seq, expected_seq, offset):

 @pytest.mark.parametrize(
-    "seq,kmer_size,integer_encoding", [(">ATCG<||", 2, [0, 13, 22, 27, 1, 2, 2])]
+    "seq,kmer_size,integer_encoding", [(">ATCG<||", 2, [1, 13, 22, 27, 2, 0, 0])]
 )
 def test_kmer_size_encoding(seq, kmer_size, integer_encoding):
     encoded_seq = EncodedSeq(seq, ExtendedIUPACDNAEncoding(kmer_size=kmer_size))
```
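
The updated expectations follow from the token reordering in `base.py`: with `PADDING` first in the token string, `|` now encodes to 0, `>` to 1, and `<` to 2, so `>AA<` round-trips through `[1, 4, 4, 2]` rather than `[0, 4, 4, 1]`, and the padded kmer encoding ends in `0, 0` instead of `2, 2`.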
4 changes: 2 additions & 2 deletions gcgc/tests/third_party/pytorch_utils/test_pytorch_utils.py

```diff
@@ -23,12 +23,12 @@ def yielder():

 def test_index_multiple_files():

-    glob = ECOLI_PATH.glob("*.fasta")
+    glob = sorted(ECOLI_PATH.glob("*.fasta"))

     pe = IUPACProteinEncoding()

     with tempfile.TemporaryDirectory() as tmpdir:
-        db_path = pathlib.Path(tmpdir) / 'test.db'
+        db_path = pathlib.Path(tmpdir) / "test.db"
         test_dataset = GenomicDataset.from_paths(glob, SP, "fasta", pe, str(db_path))

         assert len(test_dataset) == 25
```
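
This is the fix for the broken test noted in the changelog: `Path.glob` yields paths in a platform- and filesystem-dependent order, so the results are sorted before building the dataset to keep the assertion deterministic.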
2 changes: 1 addition & 1 deletion pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gcgc"
-version = "0.7.0"
+version = "0.8.0"
 description = "GCGC is a preprocessing library for biological sequence model development."
 authors = ["Trent Hauck <[email protected]>"]
 include = ["./gcgc/data/splice/*.fasta"]
```
