Skip to content

Commit

Permalink
Merge pull request #12927 from keymanapp/fix/web/context-match-applie…
Browse files Browse the repository at this point in the history
…d-suggestions

fix(web): correctly track context transitions caused by applying suggestions 🖲️
  • Loading branch information
jahorton authored Feb 3, 2025
2 parents 3c7ae5d + d88eb83 commit 0e7b47e
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 21 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { SENTINEL_CODE_UNIT } from '@keymanapp/models-templates';

type EditOperation = 'insert' | 'delete' | 'match' | 'substitute' | 'transpose-start' | 'transpose-end' | 'transpose-insert' | 'transpose-delete';
export type EditOperation = 'insert' | 'delete' | 'match' | 'substitute' | 'transpose-start' | 'transpose-end' | 'transpose-insert' | 'transpose-delete';

/**
* Represents the lowest-level unit for comparison during edit-distance calculations.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { applyTransform, buildMergedTransform, Token } from '@keymanapp/models-templates';

import { ClassicalDistanceCalculation } from './classical-calculation.js';
import { ClassicalDistanceCalculation, EditOperation } from './classical-calculation.js';
import { SearchSpace } from './distance-modeler.js';
import TransformUtils from '../transformUtils.js';
import { determineModelTokenizer } from '../model-helpers.js';
Expand Down Expand Up @@ -31,6 +31,18 @@ function textToCharTransforms(text: string, transformId?: number) {
return perCharTransforms;
}

/**
 * Finds the index of the last 'match' entry in an edit path produced by an
 * edit-distance calculation between two tokenized contexts.
 *
 * Special handling: when appending whitespace to whitespace with the default
 * wordbreaker, the wordbreaker currently adds an empty token after the
 * whitespace; this shows up as a trailing 'substitute', 'match' pair at the
 * end of the edit path.  That final 'match' corresponds to the synthetic
 * empty token, so it is skipped.  (This behavior should remain.)
 *
 * @param editPath  The sequence of edit operations to inspect; not mutated.
 * @returns The index of the last "real" 'match' operation, or -1 if the path
 *          contains no qualifying 'match'.
 */
export function getEditPathLastMatch(editPath: readonly EditOperation[]): number {
  const editLength = editPath.length;
  // Trailing ['substitute', 'match'] marks the default wordbreaker's empty
  // token after whitespace - search before it for the last genuine match.
  if(editLength >= 2 && editPath[editLength - 2] == 'substitute' && editPath[editLength - 1] == 'match') {
    return editPath.lastIndexOf('match', editLength - 2);
  } else {
    return editPath.lastIndexOf('match');
  }
}

export class TrackedContextSuggestion {
suggestion: Suggestion;
tokenWidth: number;
Expand Down Expand Up @@ -354,14 +366,7 @@ export class ContextTracker extends CircularArray<TrackedContextState> {

let editPath = mapping.editPath();
const firstMatch = editPath.indexOf('match');
let lastMatch = editPath.lastIndexOf('match');

// Special handling: appending whitespace to whitespace with the default wordbreaker.
// The default wordbreaker currently adds an empty token after whitespace; this would
// show up with 'substitute', 'match' at the end of the edit path.
if(editPath.length >= 2 && editPath[editPath.length - 2] == 'substitute' && editPath[editPath.length - 1] == 'match') {
lastMatch = editPath.lastIndexOf('match', editPath.length - 2);
}
const lastMatch = getEditPathLastMatch(editPath);

// Assertion: for a long context, the bulk of the edit path should be a
// continuous block of 'match' entries. If there's anything else in
Expand Down Expand Up @@ -451,17 +456,12 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
// transform as part of the input when doing correction-search.
let primaryInput = hasDistribution ? tokenDistribution[0]?.sample : null;

// If the incoming token has text but we have no transform to match it with,
// abort the matching attempt. We can't match this case well yet.
// These cases are generally infrequent enough to be low-ROI, not worth the
// investment to optimize further. (Doing so isn't the simplest.)
//
// This may happen if tokenization for pre-existing text changes due to incoming
// input - refer to #12494 for an example case.
// If the incoming token has text but we have no transform (or 'insert') to match
// it with, abort the matching attempt. We can't match this case well yet.
if(editPath[i] != 'delete') {
if(!incomingToken) {
return null;
} else if(!primaryInput && incomingToken?.text != '') {
} else if(!(primaryInput || editPath[i] == 'insert' ) && incomingToken?.text != '') {
return null;
}
}
Expand All @@ -477,7 +477,7 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
// Note: will need a either a different approach or more specialized
// handling if/when supporting phrase-level (multi-token) suggestions.
if(!isLastToken) {
preservationTransform = preservationTransform ? buildMergedTransform(preservationTransform, primaryInput) : primaryInput;
preservationTransform = preservationTransform && primaryInput ? buildMergedTransform(preservationTransform, primaryInput) : (primaryInput ?? preservationTransform);
}
const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput);

Expand Down Expand Up @@ -565,6 +565,11 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
// we know to properly relocate the `f` and `l` transforms?
if(primaryInput) {
pushedToken.transformDistributions = tokenDistribution ? [tokenDistribution] : [];
} else if(incomingToken.text) {
// We have no transform data to match against an inserted token with text; abort!
// Refer to #12494 for an example case; we currently can't map previously-committed
// input transforms to a newly split-off token.
return null;
}
pushedToken.isWhitespace = incomingToken.isWhitespace;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
export * from './classical-calculation.js';
export * from './context-tracker.js';
export * from './distance-modeler.js';
export * from './execution-timer.js';
export * from './execution-timer.js';
export * from './transform-tokenization.js';
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import { assert } from 'chai';

import { ContextTracker } from '#./correction/context-tracker.js';
import { tokenizeTransformDistribution } from '#./correction/transform-tokenization.js';
import ModelCompositor from '#./model-compositor.js';
import * as models from '#./models/index.js';
import { determineModelTokenizer } from '#./model-helpers.js';

import { default as defaultBreaker } from '@keymanapp/models-wordbreakers';
import { deepCopy } from '@keymanapp/web-utils';

import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs';

const tokenizer = determineModelTokenizer(new models.DummyModel({wordbreaker: defaultBreaker}));

describe('ContextTracker', function() {
function toWrapperDistribution(transforms) {
transforms = Array.isArray(transforms) ? transforms : [transforms];
Expand Down Expand Up @@ -190,6 +194,72 @@ describe('ContextTracker', function() {
assert.equal(newContextMatch.tailTokensAdded, 2);
});

it("properly matches and aligns when initial token is modified AND a 'wordbreak' is added'", function() {
  // Context before the input arrives: a single partial word.
  const priorContext = models.tokenize(defaultBreaker, { left: "an" });
  // The incoming input completes the word and appends a wordbreak.
  const inputTransform = { insert: 'd ', deleteLeft: 0 };
  // Context after the input is applied.
  const updatedContext = models.tokenize(defaultBreaker, { left: "and " });
  // Expected tokenization: completed word, whitespace, trailing empty token.
  const expectedRawTokens = ["and", " ", ""];

  const priorMatch = ContextTracker.modelContextState(priorContext.left);
  const contextMatch = ContextTracker.attemptMatchContext(
    updatedContext.left,
    priorMatch,
    tokenizeTransformDistribution(tokenizer, { left: "an" }, [{ sample: inputTransform, p: 1 }])
  );

  assert.isNotNull(contextMatch?.state);
  assert.deepEqual(contextMatch.state.tokens.map(token => token.raw), expectedRawTokens);
  // We want to preserve all text preceding the new token when applying a suggestion.
  assert.deepEqual(contextMatch.preservationTransform, { insert: 'd ', deleteLeft: 0, deleteRight: 0});

  // The 'wordbreak' transform lands on the whitespace token; the trailing
  // empty token carries no transform data.
  const tokens = contextMatch.state.tokens;
  assert.isNotEmpty(tokens[tokens.length - 2].transformDistributions);
  assert.isEmpty(tokens[tokens.length - 1].transformDistributions);

  assert.equal(contextMatch.headTokensRemoved, 0);
  assert.equal(contextMatch.tailTokensAdded, 2);
});

it("properly matches and aligns when tail token is modified AND a 'wordbreak' is added'", function() {
  // Context before the input arrives: several complete words plus a partial one.
  const priorContext = models.tokenize(defaultBreaker, { left: "apple a day keeps the doc" });
  // The incoming input completes the final word and appends a wordbreak.
  const inputTransform = { insert: 'tor ', deleteLeft: 0 };
  // Context after the input is applied.
  const updatedContext = models.tokenize(defaultBreaker, { left: "apple a day keeps the doctor " });
  // Expected tokenization, ending with whitespace and a trailing empty token.
  const expectedRawTokens = ["apple", " ", "a", " ", "day", " ", "keeps", " ", "the", " ", "doctor", " ", ""];

  const priorMatch = ContextTracker.modelContextState(priorContext.left);
  const contextMatch = ContextTracker.attemptMatchContext(
    updatedContext.left,
    priorMatch,
    tokenizeTransformDistribution(tokenizer, { left: "apple a day keeps the doc" }, [{ sample: inputTransform, p: 1 }])
  );

  assert.isNotNull(contextMatch?.state);
  assert.deepEqual(contextMatch.state.tokens.map(token => token.raw), expectedRawTokens);
  // We want to preserve all text preceding the new token when applying a suggestion.
  assert.deepEqual(contextMatch.preservationTransform, { insert: 'tor ', deleteLeft: 0, deleteRight: 0 });

  // The 'wordbreak' transform lands on the whitespace token; the trailing
  // empty token carries no transform data.
  const tokens = contextMatch.state.tokens;
  assert.isNotEmpty(tokens[tokens.length - 2].transformDistributions);
  assert.isEmpty(tokens[tokens.length - 1].transformDistributions);

  assert.equal(contextMatch.headTokensRemoved, 0);
  assert.equal(contextMatch.tailTokensAdded, 2);
});

it('rejects hard-to-handle case: tail token is split into three rather than two', function() {
let baseContext = models.tokenize(defaultBreaker, {
left: "text'"
Expand All @@ -209,7 +279,11 @@ describe('ContextTracker', function() {
insert: '\"',
deleteLeft: 0
}
let problemContextMatch = ContextTracker.attemptMatchContext(newContext.left, baseContextMatch, toWrapperDistribution(transform));
let problemContextMatch = ContextTracker.attemptMatchContext(
newContext.left,
baseContextMatch,
tokenizeTransformDistribution(tokenizer, {left: "text'"}, [{sample: transform, p: 1}])
);
assert.isNull(problemContextMatch);
});
});
Expand Down

0 comments on commit 0e7b47e

Please sign in to comment.