Skip to content

Commit

Permalink
unicode : switch to multimap based nfd_map (ggerganov#5799)
Browse files Browse the repository at this point in the history
* switch to multimap-based nfd_map due to compile-time issues

* simplify multimap keys

* don't construct a new locale every time
  • Loading branch information
iamlemec authored and hazelnutcloud committed Mar 10, 2024
1 parent 996a2ff commit 51a38fc
Show file tree
Hide file tree
Showing 2 changed files with 312 additions and 265 deletions.
11 changes: 6 additions & 5 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm {
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
std::vector<uint32_t> nfd_codepoints;
for (uint32_t code : codepoints) {
auto it = nfd_map.find(code);
if (it != nfd_map.end()) {
for (uint32_t c : it->second) {
nfd_codepoints.push_back(c);
auto it = nfd_map.equal_range(code);
if (it.first != it.second) {
for (auto jt = it.first; jt != it.second; jt++) {
nfd_codepoints.push_back(jt->second);
}
} else {
nfd_codepoints.push_back(code);
Expand Down Expand Up @@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm {
}

// Lower-cases a single code point via std::tolower with the "en_US.UTF-8" locale.
// `code` is narrowed to wchar_t for the call and the result is returned as uint32_t.
// NOTE(review): this listing is a commit diff rendered without +/- markers, so both
// the pre-commit and the post-commit `return` statements appear in the body below.
// NOTE(review): constructing std::locale from a name presumably throws if the system
// lacks "en_US.UTF-8" -- confirm callers tolerate that.
uint32_t to_lower(uint32_t code) {
// Function-local static (added by this commit): built once, then reused on later calls.
static const std::locale locale("en_US.UTF-8");
#if defined(_WIN32)
// On Windows builds, code points above 0xFFFF are returned unchanged
// (presumably because wchar_t is 16 bits there -- TODO confirm).
if (code > 0xFFFF) {
return code;
}
#endif
// Pre-commit line (removed by this commit): constructs a fresh std::locale on every call.
return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
// Post-commit replacement: reuses the cached `locale` declared above.
return std::tolower(wchar_t(code), locale);
}

bool is_ascii_punct(uint32_t code) {
Expand Down
Loading

0 comments on commit 51a38fc

Please sign in to comment.