Skip to content

Commit

Permalink
Merge pull request #7205 from keymanapp/fix/web/non-whitespace-wordbr…
Browse files Browse the repository at this point in the history
…eaks

fix(common/models): fixes quote-adjacent pred-text suggestions
  • Loading branch information
jahorton authored Sep 15, 2022
2 parents af6cbc7 + 917fce0 commit ec53662
Show file tree
Hide file tree
Showing 7 changed files with 268 additions and 104 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ describe('ContextTracker', function() {
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);
});

it("properly matches and aligns when a 'wordbreak' is added'", function() {
it("properly matches and aligns when a 'wordbreak' is added", function() {
let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"];
let transform = {
insert: ' ',
Expand All @@ -56,7 +56,7 @@ describe('ContextTracker', function() {
let rawTokens = ["an", null, "apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""];

let existingState = ContextTracker.modelContextState(existingContext);
let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform));
let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
assert.isNotNull(state);
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

Expand All @@ -65,6 +65,26 @@ describe('ContextTracker', function() {
assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions);
});

it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() {
  // Context before the keystroke: a lone apostrophe token.
  const baseContext = ["'"];
  const inputTransform = {
    insert: 'a',
    deleteLeft: 0
  };
  // The incoming transform should produce a new token WITH TEXT.
  const updatedContext = baseContext.concat('a');
  const expectedRawTokens = ["'", null, "a"];

  const priorState = ContextTracker.modelContextState(baseContext);
  const matchedState = ContextTracker.attemptMatchContext(updatedContext, priorState, toWrapperDistribution(inputTransform));
  assert.isNotNull(matchedState);
  assert.deepEqual(matchedState.tokens.map(token => token.raw), expectedRawTokens);

  // The implied 'wordbreak' token carries no keystrokes; the new text token holds the input.
  assert.isEmpty(matchedState.tokens[matchedState.tokens.length - 2].transformDistributions);
  assert.isNotEmpty(matchedState.tokens[matchedState.tokens.length - 1].transformDistributions);
});

it("properly matches and aligns when lead token is removed AND a 'wordbreak' is added'", function() {
let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"];
let transform = {
Expand All @@ -77,7 +97,7 @@ describe('ContextTracker', function() {
let rawTokens = ["apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""];

let existingState = ContextTracker.modelContextState(existingContext);
let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform));
let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
assert.isNotNull(state);
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

Expand Down
58 changes: 58 additions & 0 deletions common/predictive-text/unit_tests/headless/transform-utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
var assert = require('chai').assert;

let TransformUtils = require('../../../web/lm-worker/build/intermediate.js').TransformUtils;

describe('TransformUtils', function () {
  describe('isWhitespace', function () {
    it("should not match a string containing standard alphabetic characters", function () {
      // Whitespace mixed with alphabetic characters must not qualify.
      let testTransforms = [{
        insert: "a ",
        deleteLeft: 0
      }, {
        insert: " a",
        deleteLeft: 0
      }, {
        insert: "ab",
        deleteLeft: 0
      }];

      testTransforms.forEach((transform) => assert.isFalse(TransformUtils.isWhitespace(transform), `failed with: '${transform.insert}'`));
    });

    it("should match a simple ' ' transform", function() {
      // FIX: declare with `let` — was assigned without declaration, creating an
      // implicit global that leaks between tests and throws under strict mode.
      let transform = {
        insert: " ",
        deleteLeft: 0
      };

      assert.isTrue(TransformUtils.isWhitespace(transform));
    });

    it("should match a simple ' ' transform with delete-left", function() {
      // deleteLeft should not affect the whitespace classification.
      let transform = {
        insert: " ",
        deleteLeft: 1
      };

      assert.isTrue(TransformUtils.isWhitespace(transform));
    });

    it("should match a transform consisting of multiple characters of only whitespace", function() {
      // Covers newline, carriage return, NBSP, tab, and an en-quad alongside spaces.
      let transform = {
        insert: " \n\r\u00a0\t\u2000 ",
        deleteLeft: 0
      };

      assert.isTrue(TransformUtils.isWhitespace(transform));
    });

    it("stress tests", function() {
      let transform = {
        insert: " \n\r\u00a0\ta\u2000 ", // the 'a' should cause failure.
        deleteLeft: 0
      };

      assert.isFalse(TransformUtils.isWhitespace(transform));
    });
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,53 @@ describe('ModelCompositor', function() {
// Suggestions always delete the full root of the suggestion.
//
// After a backspace, that means the text 'the' - 3 chars.
// Char 4 is for the original backspace, as suggstions are built
// Char 4 is for the original backspace, as suggestions are built
// based on the context state BEFORE the triggering input -
// here, a backspace.
assert.equal(suggestion.transform.deleteLeft, 4);
});
});

it('properly handles suggestions for the first letter after a ` `', function() {
  const compositor = new ModelCompositor(plainModel);
  const context = {
    left: 'the', startOfBuffer: true, endOfBuffer: true,
  };

  // A pure-whitespace keystroke: starts a fresh (empty) word root.
  const spaceTransform = {
    insert: ' ',
    deleteLeft: 0
  };

  const results = compositor.predict(spaceTransform, context);
  for(const suggestion of results) {
    // After a space, predictions are based on a new, zero-length root.
    // With nothing to replace, .deleteLeft should be zero.
    assert.equal(suggestion.transform.deleteLeft, 0);
  }
});

it('properly handles suggestions for the first letter after a `\'`', function() {
  const compositor = new ModelCompositor(plainModel);
  const context = {
    left: "the '", startOfBuffer: true, endOfBuffer: true,
  };

  // This results in a new word boundary (between the `'` and the `a`).
  // Basically, an implied (but nonexistent) ` `.
  const letterTransform = {
    insert: "a",
    deleteLeft: 0
  };

  const results = compositor.predict(letterTransform, context);
  for(const suggestion of results) {
    // Suggestions always delete the full root of the suggestion.
    // Which, here, didn't exist before the input. Nothing to
    // replace => nothing for the suggestion to delete.
    assert.equal(suggestion.transform.deleteLeft, 0);
  }
});
});

describe('applySuggestionCasing', function() {
Expand Down
87 changes: 49 additions & 38 deletions common/web/lm-worker/src/correction/context-tracker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@ namespace correction {
replacements: TrackedContextSuggestion[];
activeReplacementId: number = -1;

get isNew(): boolean {
return this.transformDistributions.length == 0;
}

get currentText(): string {
if(this.replacementText === undefined || this.replacementText === null) {
return this.raw;
Expand Down Expand Up @@ -89,7 +85,7 @@ namespace correction {
if(token.replacementText) {
copy.replacementText = token.replacementText;
}

return copy;
});
this.searchSpace = obj.searchSpace;
Expand Down Expand Up @@ -139,8 +135,8 @@ namespace correction {

// Track the Transform that resulted in the whitespace 'token'.
// Will be needed for phrase-level correction/prediction.
whitespaceToken.transformDistributions = [transformDistribution];
whitespaceToken.transformDistributions = transformDistribution ? [transformDistribution] : [];

whitespaceToken.raw = null;
this.tokens.push(whitespaceToken);
}
Expand All @@ -149,19 +145,19 @@ namespace correction {
* Used for 14.0's backspace workaround, which flattens all previous Distribution<Transform>
* entries because of limitations with direct use of backspace transforms.
* @param tokenText
* @param transformId
* @param transformId
*/
replaceTailForBackspace(tokenText: USVString, transformId: number) {
this.tokens.pop();

// It's a backspace transform; time for special handling!
//
// For now, with 14.0, we simply compress all remaining Transforms for the token into
// multiple single-char transforms. Probabalistically modeling BKSP is quite complex,
// For now, with 14.0, we simply compress all remaining Transforms for the token into
// multiple single-char transforms. Probabalistically modeling BKSP is quite complex,
// so we simplify by assuming everything remaining after a BKSP is 'true' and 'intended' text.
//
// Note that we cannot just use a single, monolithic transform at this point b/c
// of our current edit-distance optimization strategy; diagonalization is currently...
// of our current edit-distance optimization strategy; diagonalization is currently...
// not very compatible with that.
let backspacedTokenContext: Distribution<Transform>[] = textToCharTransforms(tokenText, transformId).map(function(transform) {
return [{sample: transform, p: 1.0}];
Expand All @@ -175,7 +171,7 @@ namespace correction {

updateTail(transformDistribution: Distribution<Transform>, tokenText?: USVString) {
let editedToken = this.tail;

// Preserve existing text if new text isn't specified.
tokenText = tokenText || (tokenText === '' ? '' : editedToken.raw);

Expand All @@ -191,7 +187,7 @@ namespace correction {

toRawTokenization() {
let sequence: USVString[] = [];

for(let token of this.tokens) {
// Hide any tokens representing wordbreaks. (Thinking ahead to phrase-level possibilities)
if(token.currentText !== null) {
Expand Down Expand Up @@ -281,7 +277,7 @@ namespace correction {
/**
* Returns items contained within the circular array, ordered from 'oldest' to 'newest' -
* the same order in which the items will be dequeued.
* @param index
* @param index
*/
item(index: number) {
if(index >= this.count) {
Expand All @@ -294,7 +290,7 @@ namespace correction {
}

export class ContextTracker extends CircularArray<TrackedContextState> {
static attemptMatchContext(tokenizedContext: USVString[],
static attemptMatchContext(tokenizedContext: USVString[],
matchState: TrackedContextState,
transformDistribution?: Distribution<Transform>,): TrackedContextState {
// Map the previous tokenized state to an edit-distance friendly version.
Expand Down Expand Up @@ -335,7 +331,7 @@ namespace correction {
}

// Can happen for the first text input after backspace deletes a wordbreaking character,
// thus the new input continues a previous word while dropping the empty word after
// thus the new input continues a previous word while dropping the empty word after
// that prior wordbreaking character.
//
// We can't handle it reliably from this match state, but a previous entry (without the empty token)
Expand All @@ -353,7 +349,7 @@ namespace correction {

// If we've made it here... success! We have a context match!
let state: TrackedContextState;

if(pushedTail) {
// On suggestion acceptance, we should update the previous final token.
// We do it first so that the acceptance is replicated in the new TrackedContextState
Expand All @@ -376,7 +372,9 @@ namespace correction {
if(primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft == 0 && !primaryInput.deleteRight) {
primaryInput = null;
}
const isBackspace = primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft > 0 && !primaryInput.deleteRight;

const isWhitespace = primaryInput && TransformUtils.isWhitespace(primaryInput);
const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput);
const finalToken = tokenizedContext[tokenizedContext.length-1];

/* Assumption: This is an adequate check for its two sub-branches.
Expand All @@ -388,7 +386,7 @@ namespace correction {
* - Assumption: one keystroke may only cause a single token to be appended to the context
* - That is, no "reasonable" keystroke would emit a Transform adding two separate word tokens
* - For languages using whitespace to word-break, said keystroke would have to include said whitespace to break the assumption.
*/
*/

// If there is/was more than one context token available...
if(editPath.length > 1) {
Expand All @@ -399,17 +397,29 @@ namespace correction {

// We're adding an additional context token.
if(pushedTail) {
// ASSUMPTION: any transform that triggers this case is a pure-whitespace Transform, as we
// need a word-break before beginning a new word's context.
// Worth note: when invalid, the lm-layer already has problems in other aspects too.
state.pushWhitespaceToTail(transformDistribution);

let emptyToken = new TrackedContextToken();
emptyToken.raw = '';
// Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters
// for the new word (token), so the input keystrokes do not correspond to the new text token.
emptyToken.transformDistributions = [];
state.pushTail(emptyToken);
const tokenizedTail = tokenizedContext[tokenizedContext.length - 1];
/*
* Common-case: most transforms that trigger this case are from pure-whitespace Transforms. MOST.
*
* Less-common, but noteworthy: some wordbreaks may occur without whitespace. Example:
* `"o` => ['"', 'o']. Make sure to double-check against `tokenizedContext`!
*/
let pushedToken = new TrackedContextToken();
pushedToken.raw = tokenizedTail;

if(isWhitespace || !primaryInput) {
state.pushWhitespaceToTail(transformDistribution ?? []);
// Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters
// for the new word (token), so the input keystrokes do not correspond to the new text token.
pushedToken.transformDistributions = [];
} else {
state.pushWhitespaceToTail();
// Assumption: Since we only allow one-transform-at-a-time changes between states, we shouldn't be missing
// any metadata used to construct the new context state token.
pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : [];
}

state.pushTail(pushedToken);
} else { // We're editing the final context token.
// TODO: Assumption: we didn't 'miss' any inputs somehow.
// As is, may be prone to fragility should the lm-layer's tracked context 'desync' from its host's.
Expand Down Expand Up @@ -442,7 +452,9 @@ namespace correction {
return state;
}

static modelContextState(tokenizedContext: USVString[], lexicalModel: LexicalModel): TrackedContextState {
static modelContextState(tokenizedContext: USVString[],
transformDistribution: Distribution<Transform>,
lexicalModel: LexicalModel): TrackedContextState {
let baseTokens = tokenizedContext.map(function(entry) {
let token = new TrackedContextToken();
token.raw = entry;
Expand Down Expand Up @@ -483,13 +495,12 @@ namespace correction {
* Compares the current, post-input context against the most recently-seen contexts from previous prediction calls, returning
* the most information-rich `TrackedContextState` possible. If a match is found, the state will be annotated with the
* input information provided to previous prediction calls and persisted correction-search calculations for re-use.
*
* @param model
* @param context
* @param mainTransform
* @param transformDistribution
*
* @param model
* @param context
* @param transformDistribution
*/
analyzeState(model: LexicalModel,
analyzeState(model: LexicalModel,
context: Context,
transformDistribution?: Distribution<Transform>): TrackedContextState {
if(!model.traverseFromRoot) {
Expand Down Expand Up @@ -519,7 +530,7 @@ namespace correction {
//
// Assumption: as a caret needs to move to context before any actual transform distributions occur,
// this state is only reached on caret moves; thus, transformDistribution is actually just a single null transform.
let state = ContextTracker.modelContextState(tokenizedContext.left, model);
let state = ContextTracker.modelContextState(tokenizedContext.left, transformDistribution, model);
state.taggedContext = context;
this.enqueue(state);
return state;
Expand Down
2 changes: 2 additions & 0 deletions common/web/lm-worker/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
/// <reference types="@keymanapp/lm-message-types" />
/// <reference path="./models/dummy-model.ts" />
/// <reference path="./model-compositor.ts" />
/// <reference path="./transformUtils.ts" />

/**
* Encapsulates all the state required for the LMLayer's worker thread.
Expand Down Expand Up @@ -407,6 +408,7 @@ if (typeof module !== 'undefined' && typeof module.exports !== 'undefined') {
module.exports['wordBreakers'] = wordBreakers;
/// XXX: export the ModelCompositor for testing.
module.exports['ModelCompositor'] = ModelCompositor;
module.exports['TransformUtils'] = TransformUtils;
} else if (typeof self !== 'undefined' && 'postMessage' in self && 'importScripts' in self) {
// Automatically install if we're in a Web Worker.
LMLayerWorker.install(self as any); // really, 'as typeof globalThis', but we're currently getting TS errors from use of that.
Expand Down
Loading

0 comments on commit ec53662

Please sign in to comment.