Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1617,6 +1617,11 @@ def _set_vocab_hybriddna(self):
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
# k-mer's own id (llama.cpp erases the marker from the token text when it
# loads the vocab, so k-mers still detokenize to the bare DNA sequence)
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]

Expand Down
50 changes: 30 additions & 20 deletions src/llama-vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1581,6 +1581,11 @@ struct llm_tokenizer_plamo2_session {
const llm_tokenizer_plamo2 & tokenizer;
};

// Marker appended to DNA k-mer token text: the UTF-8 encoding of the reserved
// code point U+E000. It keeps a k-mer distinct in token_to_id from a base-vocab
// BPE token that has the same text (e.g. CCCCCC); the marker is erased from the
// id_to_token text at load.
static const std::string dna_kmer_marker("\xee\x80\x80");

struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}

Expand Down Expand Up @@ -1636,34 +1641,22 @@ struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
c = char(c - 32);
}
}
auto is_valid_kmer = [](const std::string & s) {
for (char c : s) {
if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
return false;
}
}
return true;

// k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
// isn't in the vocab and falls back to <oov>
auto kmer_token = [&](const std::string & kmer) {
const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
};

size_t i = 0;
for (; i + k <= seq.size(); i += k) {
const std::string kmer = seq.substr(i, k);
if (is_valid_kmer(kmer)) {
const auto tok = vocab.text_to_token(kmer);
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
} else {
output.push_back(oov_id);
}
output.push_back(kmer_token(seq.substr(i, k)));
}
if (i < seq.size()) {
std::string kmer = seq.substr(i);
kmer.append(k - kmer.size(), 'A');
if (is_valid_kmer(kmer)) {
const auto tok = vocab.text_to_token(kmer);
output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
} else {
output.push_back(oov_id);
}
output.push_back(kmer_token(kmer));
}
}

Expand Down Expand Up @@ -2357,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
}
GGML_ASSERT(id_to_token.size() == token_to_id.size());

// hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
// it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
// k-mers are the block right after <oov>, so only scan from there.
if (tokenizer_model == "hybriddna") {
const auto idx = token_to_id.find("<oov>");
if (idx != token_to_id.end()) {
auto it = id_to_token.begin() + idx->second + 1;
for (; it != id_to_token.end(); ++it) {
std::string & text = it->text;
if (text.size() > dna_kmer_marker.size()
&& text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
text.erase(text.size() - dna_kmer_marker.size());
}
}
}
}

init_tokenizer(type);

// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
Expand Down
Loading