Skip to content

Commit

Permalink
Merge pull request #12927 from keymanapp/fix/web/context-match-applie…
Browse files Browse the repository at this point in the history
…d-suggestions

fix(web): correctly track context transitions caused by applying suggestions 🖲️
  • Loading branch information
jahorton authored Feb 3, 2025
2 parents 3c7ae5d + d88eb83 commit 0e7b47e
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 21 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { SENTINEL_CODE_UNIT } from '@keymanapp/models-templates';

type EditOperation = 'insert' | 'delete' | 'match' | 'substitute' | 'transpose-start' | 'transpose-end' | 'transpose-insert' | 'transpose-delete';
export type EditOperation = 'insert' | 'delete' | 'match' | 'substitute' | 'transpose-start' | 'transpose-end' | 'transpose-insert' | 'transpose-delete';

/**
* Represents the lowest-level unit for comparison during edit-distance calculations.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { applyTransform, buildMergedTransform, Token } from '@keymanapp/models-templates';

import { ClassicalDistanceCalculation } from './classical-calculation.js';
import { ClassicalDistanceCalculation, EditOperation } from './classical-calculation.js';
import { SearchSpace } from './distance-modeler.js';
import TransformUtils from '../transformUtils.js';
import { determineModelTokenizer } from '../model-helpers.js';
Expand Down Expand Up @@ -31,6 +31,18 @@ function textToCharTransforms(text: string, transformId?: number) {
return perCharTransforms;
}

/**
 * Finds the index of the last 'match' entry in an edit path produced by an
 * edit-distance calculation between two tokenized contexts.
 *
 * Special handling: when appending whitespace to whitespace with the default
 * wordbreaker, the wordbreaker currently adds an empty token after the
 * whitespace; this shows up as a trailing 'substitute', 'match' pair at the
 * end of the edit path.  That final 'match' corresponds to the synthetic
 * empty token, so it is skipped.  (This behavior should remain.)
 *
 * @param editPath  The sequence of edit operations to inspect; not mutated.
 * @returns The index of the last "real" 'match' operation, or -1 if the path
 *          contains no qualifying 'match'.
 */
export function getEditPathLastMatch(editPath: readonly EditOperation[]): number {
  const editLength = editPath.length;
  // Trailing ['substitute', 'match'] marks the default wordbreaker's empty
  // token after whitespace - search before it for the last genuine match.
  if(editLength >= 2 && editPath[editLength - 2] == 'substitute' && editPath[editLength - 1] == 'match') {
    return editPath.lastIndexOf('match', editLength - 2);
  } else {
    return editPath.lastIndexOf('match');
  }
}

export class TrackedContextSuggestion {
suggestion: Suggestion;
tokenWidth: number;
Expand Down Expand Up @@ -354,14 +366,7 @@ export class ContextTracker extends CircularArray<TrackedContextState> {

let editPath = mapping.editPath();
const firstMatch = editPath.indexOf('match');
let lastMatch = editPath.lastIndexOf('match');

// Special handling: appending whitespace to whitespace with the default wordbreaker.
// The default wordbreaker currently adds an empty token after whitespace; this would
// show up with 'substitute', 'match' at the end of the edit path.
if(editPath.length >= 2 && editPath[editPath.length - 2] == 'substitute' && editPath[editPath.length - 1] == 'match') {
lastMatch = editPath.lastIndexOf('match', editPath.length - 2);
}
const lastMatch = getEditPathLastMatch(editPath);

// Assertion: for a long context, the bulk of the edit path should be a
// continuous block of 'match' entries. If there's anything else in
Expand Down Expand Up @@ -451,17 +456,12 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
// transform as part of the input when doing correction-search.
let primaryInput = hasDistribution ? tokenDistribution[0]?.sample : null;

// If the incoming token has text but we have no transform to match it with,
// abort the matching attempt. We can't match this case well yet.
// These cases are generally infrequent enough to be low-ROI, not worth the
// investment to optimize further. (Doing so isn't the simplest.)
//
// This may happen if tokenization for pre-existing text changes due to incoming
// input - refer to #12494 for an example case.
// If the incoming token has text but we have no transform (or 'insert') to match
// it with, abort the matching attempt. We can't match this case well yet.
if(editPath[i] != 'delete') {
if(!incomingToken) {
return null;
} else if(!primaryInput && incomingToken?.text != '') {
} else if(!(primaryInput || editPath[i] == 'insert' ) && incomingToken?.text != '') {
return null;
}
}
Expand All @@ -477,7 +477,7 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
// Note: will need a either a different approach or more specialized
// handling if/when supporting phrase-level (multi-token) suggestions.
if(!isLastToken) {
preservationTransform = preservationTransform ? buildMergedTransform(preservationTransform, primaryInput) : primaryInput;
preservationTransform = preservationTransform && primaryInput ? buildMergedTransform(preservationTransform, primaryInput) : (primaryInput ?? preservationTransform);
}
const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput);

Expand Down Expand Up @@ -565,6 +565,11 @@ export class ContextTracker extends CircularArray<TrackedContextState> {
// we know to properly relocate the `f` and `l` transforms?
if(primaryInput) {
pushedToken.transformDistributions = tokenDistribution ? [tokenDistribution] : [];
} else if(incomingToken.text) {
// We have no transform data to match against an inserted token with text; abort!
// Refer to #12494 for an example case; we currently can't map previously-committed
// input transforms to a newly split-off token.
return null;
}
pushedToken.isWhitespace = incomingToken.isWhitespace;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
export * from './classical-calculation.js';
export * from './context-tracker.js';
export * from './distance-modeler.js';
export * from './execution-timer.js';
export * from './execution-timer.js';
export * from './transform-tokenization.js';
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import { assert } from 'chai';

import { ContextTracker } from '#./correction/context-tracker.js';
import { tokenizeTransformDistribution } from '#./correction/transform-tokenization.js';
import ModelCompositor from '#./model-compositor.js';
import * as models from '#./models/index.js';
import { determineModelTokenizer } from '#./model-helpers.js';

import { default as defaultBreaker } from '@keymanapp/models-wordbreakers';
import { deepCopy } from '@keymanapp/web-utils';

import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs';

const tokenizer = determineModelTokenizer(new models.DummyModel({wordbreaker: defaultBreaker}));

describe('ContextTracker', function() {
function toWrapperDistribution(transforms) {
transforms = Array.isArray(transforms) ? transforms : [transforms];
Expand Down Expand Up @@ -190,6 +194,72 @@ describe('ContextTracker', function() {
assert.equal(newContextMatch.tailTokensAdded, 2);
});

it("properly matches and aligns when initial token is modified AND a 'wordbreak' is added'", function() {
  // Context before the input arrives: a single partial word.
  const priorContext = models.tokenize(defaultBreaker, { left: "an" });
  // The incoming input completes the word and appends a wordbreak.
  const inputTransform = { insert: 'd ', deleteLeft: 0 };
  // Context after the input is applied.
  const updatedContext = models.tokenize(defaultBreaker, { left: "and " });
  // Expected tokenization: completed word, whitespace, trailing empty token.
  const expectedRawTokens = ["and", " ", ""];

  const priorMatch = ContextTracker.modelContextState(priorContext.left);
  const contextMatch = ContextTracker.attemptMatchContext(
    updatedContext.left,
    priorMatch,
    tokenizeTransformDistribution(tokenizer, { left: "an" }, [{ sample: inputTransform, p: 1 }])
  );

  assert.isNotNull(contextMatch?.state);
  assert.deepEqual(contextMatch.state.tokens.map(token => token.raw), expectedRawTokens);
  // We want to preserve all text preceding the new token when applying a suggestion.
  assert.deepEqual(contextMatch.preservationTransform, { insert: 'd ', deleteLeft: 0, deleteRight: 0});

  // The 'wordbreak' transform lands on the whitespace token; the trailing
  // empty token carries no transform data.
  const tokens = contextMatch.state.tokens;
  assert.isNotEmpty(tokens[tokens.length - 2].transformDistributions);
  assert.isEmpty(tokens[tokens.length - 1].transformDistributions);

  assert.equal(contextMatch.headTokensRemoved, 0);
  assert.equal(contextMatch.tailTokensAdded, 2);
});

it("properly matches and aligns when tail token is modified AND a 'wordbreak' is added'", function() {
  // Context before the input arrives: several complete words plus a partial one.
  const priorContext = models.tokenize(defaultBreaker, { left: "apple a day keeps the doc" });
  // The incoming input completes the final word and appends a wordbreak.
  const inputTransform = { insert: 'tor ', deleteLeft: 0 };
  // Context after the input is applied.
  const updatedContext = models.tokenize(defaultBreaker, { left: "apple a day keeps the doctor " });
  // Expected tokenization, ending with whitespace and a trailing empty token.
  const expectedRawTokens = ["apple", " ", "a", " ", "day", " ", "keeps", " ", "the", " ", "doctor", " ", ""];

  const priorMatch = ContextTracker.modelContextState(priorContext.left);
  const contextMatch = ContextTracker.attemptMatchContext(
    updatedContext.left,
    priorMatch,
    tokenizeTransformDistribution(tokenizer, { left: "apple a day keeps the doc" }, [{ sample: inputTransform, p: 1 }])
  );

  assert.isNotNull(contextMatch?.state);
  assert.deepEqual(contextMatch.state.tokens.map(token => token.raw), expectedRawTokens);
  // We want to preserve all text preceding the new token when applying a suggestion.
  assert.deepEqual(contextMatch.preservationTransform, { insert: 'tor ', deleteLeft: 0, deleteRight: 0 });

  // The 'wordbreak' transform lands on the whitespace token; the trailing
  // empty token carries no transform data.
  const tokens = contextMatch.state.tokens;
  assert.isNotEmpty(tokens[tokens.length - 2].transformDistributions);
  assert.isEmpty(tokens[tokens.length - 1].transformDistributions);

  assert.equal(contextMatch.headTokensRemoved, 0);
  assert.equal(contextMatch.tailTokensAdded, 2);
});

it('rejects hard-to-handle case: tail token is split into three rather than two', function() {
let baseContext = models.tokenize(defaultBreaker, {
left: "text'"
Expand All @@ -209,7 +279,11 @@ describe('ContextTracker', function() {
insert: '\"',
deleteLeft: 0
}
let problemContextMatch = ContextTracker.attemptMatchContext(newContext.left, baseContextMatch, toWrapperDistribution(transform));
let problemContextMatch = ContextTracker.attemptMatchContext(
newContext.left,
baseContextMatch,
tokenizeTransformDistribution(tokenizer, {left: "text'"}, [{sample: transform, p: 1}])
);
assert.isNull(problemContextMatch);
});
});
Expand Down

0 comments on commit 0e7b47e

Please sign in to comment.