Skip to content

Commit

Permalink
Merge pull request #265 from keymanapp/fix/sil.km.ggc-custom-breaker
Browse files Browse the repository at this point in the history
fix: fix custom wordbreaker output format for sil.km.ggc
  • Loading branch information
DavidLRowe authored Aug 26, 2024
2 parents 6588b4f + 6256f59 commit 9fac85c
Showing 1 changed file with 53 additions and 41 deletions.
94 changes: 53 additions & 41 deletions release/sil/sil.km.ggc/source/sil.km.ggc.model.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
Google Crawler 1.0 generated from template.
This is a minimal lexical model source that uses a tab delimited wordlist.
See documentation online at https://help.keyman.com/developer/ for
additional parameters.
Expand All @@ -10,59 +10,71 @@
// Lexical model definition: a 'trie-1.0' model built from a tab-delimited
// wordlist, using a custom word breaker instead of a built-in one.
const source: LexicalModelSource = {
  format: 'trie-1.0',
  sources: ['wordlist.tsv'],

  /**
   * Splits `str` into tokens.
   *
   * Text is first divided on whitespace and U+200B (zero-width space); the
   * separators themselves are discarded, as are the empty strings produced by
   * adjacent separators. Each remaining chunk is then divided further so that
   * every listed punctuation mark becomes a token of its own.
   *
   * @param str The text to tokenize.
   * @returns One span per token, in order of appearance. Offsets into `str`
   *          are reported under both `left`/`right` and `start`/`end`, along
   *          with the token's `length` and `text`.
   */
  wordBreaker: function (str: string) {
    const whitespaceRegex = /\s|\u200b|\n|\r/;

    // Certain punctuation marks should be considered a separate token from the word they're next to.
    const punctuationMarks = ['«', '»', '$', '#' /* add extras here */];

    const tokens: string[] = [];
    const chunks = str.split(whitespaceRegex);

    for (const chunk of chunks) {
      // `remainder` is the part of the chunk not yet emitted. Empty chunks
      // (from consecutive separators, or an empty input) emit nothing.
      let remainder = chunk;

      while (remainder.length > 0) {
        // Find the earliest mark within the remainder. The minimum is tracked
        // numerically: Array.prototype.sort() without a comparator orders
        // numbers as strings (10 sorts before 9), which would pick a later mark.
        let splitPoint = -1;
        let markLength = 0;
        for (const mark of punctuationMarks) {
          const found = remainder.indexOf(mark);
          if (found >= 0 && (splitPoint < 0 || found < splitPoint)) {
            splitPoint = found;
            markLength = mark.length;
          }
        }

        if (splitPoint < 0) {
          // No marks left: whatever remains is a single token.
          tokens.push(remainder);
          break;
        }

        if (splitPoint > 0) {
          tokens.push(remainder.substring(0, splitPoint));
        }
        tokens.push(remainder.substring(splitPoint, splitPoint + markLength));

        // Keep scanning past the mark; the rest may contain more marks that
        // also need to be spun off.
        remainder = remainder.substring(splitPoint + markLength);
      }
    }

    // Locate each token in the original string. Searching from the end of the
    // previous token keeps repeated tokens from all resolving to the first
    // occurrence.
    let latestIndex = 0;
    return tokens.map(function (token) {
      const start = str.indexOf(token, latestIndex);
      const end = start + token.length;
      latestIndex = end;
      return {
        left: start,
        start: start,
        right: end,
        end: end,
        length: token.length,
        text: token
      };
    });
  },

  punctuation: {
    insertAfterWord: "\u200B"
  }
};
Expand Down

0 comments on commit 9fac85c

Please sign in to comment.