Skip to content

Commit

Permalink
Merge pull request #7205 from keymanapp/fix/web/non-whitespace-wordbr…
Browse files Browse the repository at this point in the history
…eaks

fix(common/models): fixes quote-adjacent pred-text suggestions
  • Loading branch information
jahorton authored Sep 15, 2022
2 parents af6cbc7 + 917fce0 commit ec53662
Show file tree
Hide file tree
Showing 7 changed files with 268 additions and 104 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ describe('ContextTracker', function() {
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);
});

it("properly matches and aligns when a 'wordbreak' is added'", function() {
it("properly matches and aligns when a 'wordbreak' is added", function() {
let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"];
let transform = {
insert: ' ',
Expand All @@ -56,7 +56,7 @@ describe('ContextTracker', function() {
let rawTokens = ["an", null, "apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""];

let existingState = ContextTracker.modelContextState(existingContext);
let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform));
let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
assert.isNotNull(state);
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

Expand All @@ -65,6 +65,26 @@ describe('ContextTracker', function() {
assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions);
});

it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() {
  // Context before the keystroke: a lone apostrophe token.
  const baseContext = ["'"];
  const inputTransform = {
    insert: 'a',
    deleteLeft: 0
  };
  // The incoming transform should produce a new token WITH TEXT.
  const updatedContext = baseContext.concat('a');
  const expectedRawTokens = ["'", null, "a"];

  const priorState = ContextTracker.modelContextState(baseContext);
  const matchedState = ContextTracker.attemptMatchContext(updatedContext, priorState, toWrapperDistribution(inputTransform));
  assert.isNotNull(matchedState);
  assert.deepEqual(matchedState.tokens.map(token => token.raw), expectedRawTokens);

  // The implied 'wordbreak' token carries no keystrokes; the new text token holds the input.
  assert.isEmpty(matchedState.tokens[matchedState.tokens.length - 2].transformDistributions);
  assert.isNotEmpty(matchedState.tokens[matchedState.tokens.length - 1].transformDistributions);
});

it("properly matches and aligns when lead token is removed AND a 'wordbreak' is added'", function() {
let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"];
let transform = {
Expand All @@ -77,7 +97,7 @@ describe('ContextTracker', function() {
let rawTokens = ["apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""];

let existingState = ContextTracker.modelContextState(existingContext);
let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform));
let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
assert.isNotNull(state);
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

Expand Down
58 changes: 58 additions & 0 deletions common/predictive-text/unit_tests/headless/transform-utils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
var assert = require('chai').assert;

let TransformUtils = require('../../../web/lm-worker/build/intermediate.js').TransformUtils;

describe('TransformUtils', function () {
  describe('isWhitespace', function () {
    it("should not match a string containing standard alphabetic characters", function () {
      // Whitespace mixed with alphabetic characters must not qualify.
      let testTransforms = [{
        insert: "a ",
        deleteLeft: 0
      }, {
        insert: " a",
        deleteLeft: 0
      }, {
        insert: "ab",
        deleteLeft: 0
      }];

      testTransforms.forEach((transform) => assert.isFalse(TransformUtils.isWhitespace(transform), `failed with: '${transform.insert}'`));
    });

    it("should match a simple ' ' transform", function() {
      // FIX: declare with `let` — was assigned without declaration, creating an
      // implicit global that leaks between tests and throws under strict mode.
      let transform = {
        insert: " ",
        deleteLeft: 0
      };

      assert.isTrue(TransformUtils.isWhitespace(transform));
    });

    it("should match a simple ' ' transform with delete-left", function() {
      // deleteLeft should not affect the whitespace classification.
      let transform = {
        insert: " ",
        deleteLeft: 1
      };

      assert.isTrue(TransformUtils.isWhitespace(transform));
    });

    it("should match a transform consisting of multiple characters of only whitespace", function() {
      // Covers newline, carriage return, NBSP, tab, and an en-quad alongside spaces.
      let transform = {
        insert: " \n\r\u00a0\t\u2000 ",
        deleteLeft: 0
      };

      assert.isTrue(TransformUtils.isWhitespace(transform));
    });

    it("stress tests", function() {
      let transform = {
        insert: " \n\r\u00a0\ta\u2000 ", // the 'a' should cause failure.
        deleteLeft: 0
      };

      assert.isFalse(TransformUtils.isWhitespace(transform));
    });
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,53 @@ describe('ModelCompositor', function() {
// Suggestions always delete the full root of the suggestion.
//
// After a backspace, that means the text 'the' - 3 chars.
// Char 4 is for the original backspace, as suggstions are built
// Char 4 is for the original backspace, as suggestions are built
// based on the context state BEFORE the triggering input -
// here, a backspace.
assert.equal(suggestion.transform.deleteLeft, 4);
});
});

it('properly handles suggestions for the first letter after a ` `', function() {
  const compositor = new ModelCompositor(plainModel);
  const context = {
    left: 'the', startOfBuffer: true, endOfBuffer: true,
  };

  // A pure-whitespace keystroke: starts a fresh (empty) word root.
  const spaceTransform = {
    insert: ' ',
    deleteLeft: 0
  };

  const results = compositor.predict(spaceTransform, context);
  for(const suggestion of results) {
    // After a space, predictions are based on a new, zero-length root.
    // With nothing to replace, .deleteLeft should be zero.
    assert.equal(suggestion.transform.deleteLeft, 0);
  }
});

it('properly handles suggestions for the first letter after a `\'`', function() {
  const compositor = new ModelCompositor(plainModel);
  const context = {
    left: "the '", startOfBuffer: true, endOfBuffer: true,
  };

  // This results in a new word boundary (between the `'` and the `a`).
  // Basically, an implied (but nonexistent) ` `.
  const letterTransform = {
    insert: "a",
    deleteLeft: 0
  };

  const results = compositor.predict(letterTransform, context);
  for(const suggestion of results) {
    // Suggestions always delete the full root of the suggestion.
    // Which, here, didn't exist before the input. Nothing to
    // replace => nothing for the suggestion to delete.
    assert.equal(suggestion.transform.deleteLeft, 0);
  }
});
});

describe('applySuggestionCasing', function() {
Expand Down
87 changes: 49 additions & 38 deletions common/web/lm-worker/src/correction/context-tracker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,6 @@ namespace correction {
replacements: TrackedContextSuggestion[];
activeReplacementId: number = -1;

get isNew(): boolean {
return this.transformDistributions.length == 0;
}

get currentText(): string {
if(this.replacementText === undefined || this.replacementText === null) {
return this.raw;
Expand Down Expand Up @@ -89,7 +85,7 @@ namespace correction {
if(token.replacementText) {
copy.replacementText = token.replacementText;
}

return copy;
});
this.searchSpace = obj.searchSpace;
Expand Down Expand Up @@ -139,8 +135,8 @@ namespace correction {

// Track the Transform that resulted in the whitespace 'token'.
// Will be needed for phrase-level correction/prediction.
whitespaceToken.transformDistributions = [transformDistribution];
whitespaceToken.transformDistributions = transformDistribution ? [transformDistribution] : [];

whitespaceToken.raw = null;
this.tokens.push(whitespaceToken);
}
Expand All @@ -149,19 +145,19 @@ namespace correction {
* Used for 14.0's backspace workaround, which flattens all previous Distribution<Transform>
* entries because of limitations with direct use of backspace transforms.
* @param tokenText
* @param transformId
* @param transformId
*/
replaceTailForBackspace(tokenText: USVString, transformId: number) {
this.tokens.pop();

// It's a backspace transform; time for special handling!
//
// For now, with 14.0, we simply compress all remaining Transforms for the token into
// multiple single-char transforms. Probabalistically modeling BKSP is quite complex,
// For now, with 14.0, we simply compress all remaining Transforms for the token into
// multiple single-char transforms. Probabalistically modeling BKSP is quite complex,
// so we simplify by assuming everything remaining after a BKSP is 'true' and 'intended' text.
//
// Note that we cannot just use a single, monolithic transform at this point b/c
// of our current edit-distance optimization strategy; diagonalization is currently...
// of our current edit-distance optimization strategy; diagonalization is currently...
// not very compatible with that.
let backspacedTokenContext: Distribution<Transform>[] = textToCharTransforms(tokenText, transformId).map(function(transform) {
return [{sample: transform, p: 1.0}];
Expand All @@ -175,7 +171,7 @@ namespace correction {

updateTail(transformDistribution: Distribution<Transform>, tokenText?: USVString) {
let editedToken = this.tail;

// Preserve existing text if new text isn't specified.
tokenText = tokenText || (tokenText === '' ? '' : editedToken.raw);

Expand All @@ -191,7 +187,7 @@ namespace correction {

toRawTokenization() {
let sequence: USVString[] = [];

for(let token of this.tokens) {
// Hide any tokens representing wordbreaks. (Thinking ahead to phrase-level possibilities)
if(token.currentText !== null) {
Expand Down Expand Up @@ -281,7 +277,7 @@ namespace correction {
/**
* Returns items contained within the circular array, ordered from 'oldest' to 'newest' -
* the same order in which the items will be dequeued.
* @param index
* @param index
*/
item(index: number) {
if(index >= this.count) {
Expand All @@ -294,7 +290,7 @@ namespace correction {
}

export class ContextTracker extends CircularArray<TrackedContextState> {
static attemptMatchContext(tokenizedContext: USVString[],
static attemptMatchContext(tokenizedContext: USVString[],
matchState: TrackedContextState,
transformDistribution?: Distribution<Transform>,): TrackedContextState {
// Map the previous tokenized state to an edit-distance friendly version.
Expand Down Expand Up @@ -335,7 +331,7 @@ namespace correction {
}

// Can happen for the first text input after backspace deletes a wordbreaking character,
// thus the new input continues a previous word while dropping the empty word after
// thus the new input continues a previous word while dropping the empty word after
// that prior wordbreaking character.
//
// We can't handle it reliably from this match state, but a previous entry (without the empty token)
Expand All @@ -353,7 +349,7 @@ namespace correction {

// If we've made it here... success! We have a context match!
let state: TrackedContextState;

if(pushedTail) {
// On suggestion acceptance, we should update the previous final token.
// We do it first so that the acceptance is replicated in the new TrackedContextState
Expand All @@ -376,7 +372,9 @@ namespace correction {
if(primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft == 0 && !primaryInput.deleteRight) {
primaryInput = null;
}
const isBackspace = primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft > 0 && !primaryInput.deleteRight;

const isWhitespace = primaryInput && TransformUtils.isWhitespace(primaryInput);
const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput);
const finalToken = tokenizedContext[tokenizedContext.length-1];

/* Assumption: This is an adequate check for its two sub-branches.
Expand All @@ -388,7 +386,7 @@ namespace correction {
* - Assumption: one keystroke may only cause a single token to be appended to the context
* - That is, no "reasonable" keystroke would emit a Transform adding two separate word tokens
* - For languages using whitespace to word-break, said keystroke would have to include said whitespace to break the assumption.
*/
*/

// If there is/was more than one context token available...
if(editPath.length > 1) {
Expand All @@ -399,17 +397,29 @@ namespace correction {

// We're adding an additional context token.
if(pushedTail) {
// ASSUMPTION: any transform that triggers this case is a pure-whitespace Transform, as we
// need a word-break before beginning a new word's context.
// Worth note: when invalid, the lm-layer already has problems in other aspects too.
state.pushWhitespaceToTail(transformDistribution);

let emptyToken = new TrackedContextToken();
emptyToken.raw = '';
// Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters
// for the new word (token), so the input keystrokes do not correspond to the new text token.
emptyToken.transformDistributions = [];
state.pushTail(emptyToken);
const tokenizedTail = tokenizedContext[tokenizedContext.length - 1];
/*
* Common-case: most transforms that trigger this case are from pure-whitespace Transforms. MOST.
*
* Less-common, but noteworthy: some wordbreaks may occur without whitespace. Example:
* `"o` => ['"', 'o']. Make sure to double-check against `tokenizedContext`!
*/
let pushedToken = new TrackedContextToken();
pushedToken.raw = tokenizedTail;

if(isWhitespace || !primaryInput) {
state.pushWhitespaceToTail(transformDistribution ?? []);
// Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters
// for the new word (token), so the input keystrokes do not correspond to the new text token.
pushedToken.transformDistributions = [];
} else {
state.pushWhitespaceToTail();
// Assumption: Since we only allow one-transform-at-a-time changes between states, we shouldn't be missing
// any metadata used to construct the new context state token.
pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : [];
}

state.pushTail(pushedToken);
} else { // We're editing the final context token.
// TODO: Assumption: we didn't 'miss' any inputs somehow.
// As is, may be prone to fragility should the lm-layer's tracked context 'desync' from its host's.
Expand Down Expand Up @@ -442,7 +452,9 @@ namespace correction {
return state;
}

static modelContextState(tokenizedContext: USVString[], lexicalModel: LexicalModel): TrackedContextState {
static modelContextState(tokenizedContext: USVString[],
transformDistribution: Distribution<Transform>,
lexicalModel: LexicalModel): TrackedContextState {
let baseTokens = tokenizedContext.map(function(entry) {
let token = new TrackedContextToken();
token.raw = entry;
Expand Down Expand Up @@ -483,13 +495,12 @@ namespace correction {
* Compares the current, post-input context against the most recently-seen contexts from previous prediction calls, returning
* the most information-rich `TrackedContextState` possible. If a match is found, the state will be annotated with the
* input information provided to previous prediction calls and persisted correction-search calculations for re-use.
*
* @param model
* @param context
* @param mainTransform
* @param transformDistribution
*
* @param model
* @param context
* @param transformDistribution
*/
analyzeState(model: LexicalModel,
analyzeState(model: LexicalModel,
context: Context,
transformDistribution?: Distribution<Transform>): TrackedContextState {
if(!model.traverseFromRoot) {
Expand Down Expand Up @@ -519,7 +530,7 @@ namespace correction {
//
// Assumption: as a caret needs to move to context before any actual transform distributions occur,
// this state is only reached on caret moves; thus, transformDistribution is actually just a single null transform.
let state = ContextTracker.modelContextState(tokenizedContext.left, model);
let state = ContextTracker.modelContextState(tokenizedContext.left, transformDistribution, model);
state.taggedContext = context;
this.enqueue(state);
return state;
Expand Down
2 changes: 2 additions & 0 deletions common/web/lm-worker/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
/// <reference types="@keymanapp/lm-message-types" />
/// <reference path="./models/dummy-model.ts" />
/// <reference path="./model-compositor.ts" />
/// <reference path="./transformUtils.ts" />

/**
* Encapsulates all the state required for the LMLayer's worker thread.
Expand Down Expand Up @@ -407,6 +408,7 @@ if (typeof module !== 'undefined' && typeof module.exports !== 'undefined') {
module.exports['wordBreakers'] = wordBreakers;
/// XXX: export the ModelCompositor for testing.
module.exports['ModelCompositor'] = ModelCompositor;
module.exports['TransformUtils'] = TransformUtils;
} else if (typeof self !== 'undefined' && 'postMessage' in self && 'importScripts' in self) {
// Automatically install if we're in a Web Worker.
LMLayerWorker.install(self as any); // really, 'as typeof globalThis', but we're currently getting TS errors from use of that.
Expand Down
Loading

0 comments on commit ec53662

Please sign in to comment.