Merge pull request #265 from keymanapp/fix/sil.km.ggc-custom-breaker

fix: fix custom wordbreaker output format for sil.km.ggc
keymanapp · Aug 26, 2024 · 9fac85c
2 parents 6588b4f + 6256f59
commit 9fac85c
Showing 1 changed file with 53 additions and 41 deletions.
diff --git a/release/sil/sil.km.ggc/source/sil.km.ggc.model.ts b/release/sil/sil.km.ggc/source/sil.km.ggc.model.ts
@@ -1,6 +1,6 @@
 /*
   Google Crawler 1.0 generated from template.
-  
+
   This is a minimal lexical model source that uses a tab delimited wordlist.
   See documentation online at https://help.keyman.com/developer/ for
   additional parameters.
@@ -10,59 +10,71 @@
 const source: LexicalModelSource = {
   format: 'trie-1.0',
   sources: ['wordlist.tsv'],
-        wordBreaker: function (str) {
-  const tokens = str.split(/\s|\u200b/);
+  wordBreaker: function (str) {
+    const whitespaceRegex = /\s|\u200b|\n|\r/;
+    const tokens = str.split(whitespaceRegex);
 
-  for(let i=0; i < tokens.length; i++) {
-    const token = tokens[i];
-    if(token.length == 1) {
-      continue;
-    }
+    for(let i=0; i < tokens.length; i++) {
+      const token = tokens[i];
+      if(token.length == 0) {
+        tokens.splice(i, 1);
+        i--;
+        continue;
+      } else if(token.length == 1 && whitespaceRegex.test(token)) {
+        tokens.splice(i, 1);
+        i--;
+        continue;
+      }
 
-    // Certain punctuation marks should be considered a separate token from the word they're next to.
-    const punctuationMarks = ['«', '»', '$', '#' /* add extras here */];
-    const punctSplitIndices = [];
+      // Certain punctuation marks should be considered a separate token from the word they're next to.
+      const punctuationMarks = ['«', '»', '$', '#' /* add extras here */];
+      const punctSplitIndices = [];
 
-    // Find if and where each mark exists within the token
-    for(let i = 0; i < punctuationMarks.length; i++) {
-      const split = token.indexOf(punctuationMarks[i]);
-      if(split >= 0) {
-        punctSplitIndices.push(split);
+      // Find if and where each mark exists within the token
+      for(let i = 0; i < punctuationMarks.length; i++) {
+        const split = token.indexOf(punctuationMarks[i]);
+        if(split >= 0) {
+          punctSplitIndices.push(split);
+        }
       }
-    }
 
-    // Sort and pick the earliest mark's location.  If none exists, use -1.
-    punctSplitIndices.sort();
-    const splitPoint = punctSplitIndices[0] === undefined ? -1 : punctSplitIndices[0];
+      // Sort and pick the earliest mark's location.  If none exists, use -1.
+      punctSplitIndices.sort();
+      const splitPoint = punctSplitIndices[0] === undefined ? -1 : punctSplitIndices[0];
 
-    if(splitPoint > -1) {
-      const left = token.substring(0, splitPoint);  // (0, -1) => ''
-      const punct = token.substring(splitPoint, splitPoint+1);
-      const right = token.substring(splitPoint+1);  // Starting past the end of the string => ''
+      if(splitPoint > -1) {
+        const left = token.substring(0, splitPoint);  // (0, -1) => ''
+        const punct = token.substring(splitPoint, splitPoint+1);
+        const right = token.substring(splitPoint+1);  // Starting past the end of the string => ''
 
-      if(left) {
-        tokens.splice(i++, 0, left);
-      }
-      tokens.splice(i++, 1, punct);
-      if(right) {
-        tokens.splice(i, 0, right);
+        if(left) {
+          tokens.splice(i++, 0, left);
+        }
+        tokens.splice(i++, 1, punct);
+        if(right) {
+          tokens.splice(i, 0, right);
+        }
+        // Ensure that the next iteration puts `i` immediately after the punctuation token... even if
+        // there was a `right` portion, as it may have extra marks that also need to be spun off.
+        i--;
       }
-      // Ensure that the next iteration puts `i` immediately after the punctuation token... even if
-      // there was a `right` portion, as it may have extra marks that also need to be spun off.
-      i--;
     }
-   }
-   return tokens.map(function(token) {
+
+    let latestIndex = 0;
+    return tokens.map(function(token) {
+      const start = str.indexOf(token, latestIndex);
+      latestIndex = start + token.length;
       return {
-        left: str.indexOf(token),
-        start: str.indexOf(token),
-        right: str.indexOf(token) + token.length,
-        end: str.indexOf(token) + token.length,
+        left: start,
+        start: start,
+        right: start + token.length,
+        end: start + token.length,
+        length: token.length,
         text: token
       }
     });
-},
-punctuation: {
+  },
+  punctuation: {
     insertAfterWord: "\u200B"
   }
 };