Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(common/models): fixes quote-adjacent pred-text suggestions #7205

Merged
merged 20 commits into from
Sep 15, 2022
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
ad8c30c
fix(web): pred-text context tracking when wordbreak not caused by whi…
jahorton Sep 5, 2022
be562b3
fix(web): missed method references
jahorton Sep 5, 2022
b0184ec
fix(web): adds unit test targeting issue, handler for edge case
jahorton Sep 5, 2022
0a237f8
chore(web): test name tweak
jahorton Sep 5, 2022
d33a9f0
docs(web): fixes missed doc update
jahorton Sep 5, 2022
57ce30e
fix(web): post-suggestion-apply error
jahorton Sep 5, 2022
b47596a
fix(common/models): context token .isNew maintenance
jahorton Sep 5, 2022
26c5150
fix(web): context-tracker newFlag management for new contexts
jahorton Sep 5, 2022
9634c76
chore(common/models): suggested tweak from review
jahorton Sep 6, 2022
9be8839
fix(common/models): undefined != 0
jahorton Sep 6, 2022
fb0cc50
fix(common/models): backspacing shouldn't make 'new' tokens
jahorton Sep 7, 2022
52cb9aa
change(common/models): drops .isNew, replaces with tokenized context …
jahorton Sep 8, 2022
3aed0bf
feat(common/models): also, unit tests
jahorton Sep 8, 2022
75e0b27
change(web): pushWhitespaceToTail tweak
jahorton Sep 8, 2022
6c50de9
change(web): conciser version of last commit
jahorton Sep 8, 2022
415201b
fix(common/models): isWhitespace, adds related unit tests
jahorton Sep 12, 2022
665b149
fix(common/models): needed export for unit tests
jahorton Sep 12, 2022
2f044c4
chore(web): Merge branch 'master' into fix/web/non-whitespace-wordbreaks
jahorton Sep 12, 2022
a390c2f
chore(common/models): Apply suggestions from code review
jahorton Sep 14, 2022
917fce0
chore(common/models): final requested tweak
jahorton Sep 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ describe('ContextTracker', function() {
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);
});

it("properly matches and aligns when a 'wordbreak' is added'", function() {
it("properly matches and aligns when a 'wordbreak' is added", function() {
let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"];
let transform = {
insert: ' ',
Expand All @@ -56,7 +56,7 @@ describe('ContextTracker', function() {
let rawTokens = ["an", null, "apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""];

let existingState = ContextTracker.modelContextState(existingContext);
let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform));
let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
assert.isNotNull(state);
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

Expand All @@ -65,6 +65,26 @@ describe('ContextTracker', function() {
assert.isEmpty(state.tokens[state.tokens.length - 1].transformDistributions);
});

it("properly matches and aligns when an implied 'wordbreak' occurs (as when following \"'\")", function() {
let existingContext = ["'"];
let transform = {
insert: 'a',
deleteLeft: 0
}
let newContext = Array.from(existingContext);
newContext.push('a'); // The incoming transform should produce a new token WITH TEXT.
let rawTokens = ["'", null, "a"];

let existingState = ContextTracker.modelContextState(existingContext);
let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
assert.isNotNull(state);
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

// The 'wordbreak' transform
assert.isEmpty(state.tokens[state.tokens.length - 2].transformDistributions);
assert.isNotEmpty(state.tokens[state.tokens.length - 1].transformDistributions);
});

it("properly matches and aligns when lead token is removed AND a 'wordbreak' is added", function() {
let existingContext = ["an", "apple", "a", "day", "keeps", "the", "doctor"];
let transform = {
Expand All @@ -77,7 +97,7 @@ describe('ContextTracker', function() {
let rawTokens = ["apple", null, "a", null, "day", null, "keeps", null, "the", null, "doctor", null, ""];

let existingState = ContextTracker.modelContextState(existingContext);
let state = ContextTracker.attemptMatchContext(newContext, existingState, null, toWrapperDistribution(transform));
let state = ContextTracker.attemptMatchContext(newContext, existingState, toWrapperDistribution(transform));
assert.isNotNull(state);
assert.deepEqual(state.tokens.map(token => token.raw), rawTokens);

Expand Down
98 changes: 63 additions & 35 deletions common/web/lm-worker/src/correction/context-tracker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ namespace correction {
export class TrackedContextToken {
raw: string;
replacementText: string;
newFlag: boolean = false;
jahorton marked this conversation as resolved.
Show resolved Hide resolved

transformDistributions: Distribution<Transform>[] = [];
replacements: TrackedContextSuggestion[];
activeReplacementId: number = -1;

get isNew(): boolean {
return this.transformDistributions.length == 0;
return this.newFlag;
}

get currentText(): string {
Expand Down Expand Up @@ -89,7 +90,7 @@ namespace correction {
if(token.replacementText) {
copy.replacementText = token.replacementText;
}

return copy;
});
this.searchSpace = obj.searchSpace;
Expand Down Expand Up @@ -139,8 +140,10 @@ namespace correction {

// Track the Transform that resulted in the whitespace 'token'.
// Will be needed for phrase-level correction/prediction.
whitespaceToken.transformDistributions = [transformDistribution];

if(transformDistribution) {
whitespaceToken.transformDistributions = [transformDistribution];
}
jahorton marked this conversation as resolved.
Show resolved Hide resolved

whitespaceToken.raw = null;
this.tokens.push(whitespaceToken);
}
Expand All @@ -149,19 +152,19 @@ namespace correction {
* Used for 14.0's backspace workaround, which flattens all previous Distribution<Transform>
* entries because of limitations with direct use of backspace transforms.
* @param tokenText
* @param transformId
* @param transformId
*/
replaceTailForBackspace(tokenText: USVString, transformId: number) {
this.tokens.pop();

// It's a backspace transform; time for special handling!
//
// For now, with 14.0, we simply compress all remaining Transforms for the token into
// multiple single-char transforms. Probabalistically modeling BKSP is quite complex,
// For now, with 14.0, we simply compress all remaining Transforms for the token into
// multiple single-char transforms. Probabalistically modeling BKSP is quite complex,
// so we simplify by assuming everything remaining after a BKSP is 'true' and 'intended' text.
//
// Note that we cannot just use a single, monolithic transform at this point b/c
// of our current edit-distance optimization strategy; diagonalization is currently...
// of our current edit-distance optimization strategy; diagonalization is currently...
// not very compatible with that.
let backspacedTokenContext: Distribution<Transform>[] = textToCharTransforms(tokenText, transformId).map(function(transform) {
return [{sample: transform, p: 1.0}];
Expand All @@ -175,7 +178,7 @@ namespace correction {

updateTail(transformDistribution: Distribution<Transform>, tokenText?: USVString) {
let editedToken = this.tail;

// Preserve existing text if new text isn't specified.
tokenText = tokenText || (tokenText === '' ? '' : editedToken.raw);

Expand All @@ -187,11 +190,12 @@ namespace correction {
}
// Replace old token's raw-text with new token's raw-text.
editedToken.raw = tokenText;
editedToken.newFlag = false;
}

toRawTokenization() {
let sequence: USVString[] = [];

for(let token of this.tokens) {
// Hide any tokens representing wordbreaks. (Thinking ahead to phrase-level possibilities)
if(token.currentText !== null) {
Expand Down Expand Up @@ -281,7 +285,7 @@ namespace correction {
/**
* Returns items contained within the circular array, ordered from 'oldest' to 'newest' -
* the same order in which the items will be dequeued.
* @param index
* @param index
*/
item(index: number) {
if(index >= this.count) {
Expand All @@ -294,7 +298,7 @@ namespace correction {
}

export class ContextTracker extends CircularArray<TrackedContextState> {
static attemptMatchContext(tokenizedContext: USVString[],
static attemptMatchContext(tokenizedContext: USVString[],
matchState: TrackedContextState,
transformDistribution?: Distribution<Transform>,): TrackedContextState {
// Map the previous tokenized state to an edit-distance friendly version.
Expand Down Expand Up @@ -335,7 +339,7 @@ namespace correction {
}

// Can happen for the first text input after backspace deletes a wordbreaking character,
// thus the new input continues a previous word while dropping the empty word after
// thus the new input continues a previous word while dropping the empty word after
// that prior wordbreaking character.
//
// We can't handle it reliably from this match state, but a previous entry (without the empty token)
Expand All @@ -353,7 +357,7 @@ namespace correction {

// If we've made it here... success! We have a context match!
let state: TrackedContextState;

if(pushedTail) {
// On suggestion acceptance, we should update the previous final token.
// We do it first so that the acceptance is replicated in the new TrackedContextState
Expand All @@ -376,7 +380,9 @@ namespace correction {
if(primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft == 0 && !primaryInput.deleteRight) {
primaryInput = null;
}
const isBackspace = primaryInput && primaryInput.insert == "" && primaryInput.deleteLeft > 0 && !primaryInput.deleteRight;

const isWhitespace = primaryInput && TransformUtils.isWhitespace(primaryInput);
const isBackspace = primaryInput && TransformUtils.isBackspace(primaryInput);
const finalToken = tokenizedContext[tokenizedContext.length-1];

/* Assumption: This is an adequate check for its two sub-branches.
Expand All @@ -388,7 +394,7 @@ namespace correction {
* - Assumption: one keystroke may only cause a single token to be appended to the context
* - That is, no "reasonable" keystroke would emit a Transform adding two separate word tokens
* - For languages using whitespace to word-break, said keystroke would have to include said whitespace to break the assumption.
*/
*/

// If there is/was more than one context token available...
if(editPath.length > 1) {
Expand All @@ -399,17 +405,30 @@ namespace correction {

// We're adding an additional context token.
if(pushedTail) {
// ASSUMPTION: any transform that triggers this case is a pure-whitespace Transform, as we
// need a word-break before beginning a new word's context.
// Worth note: when invalid, the lm-layer already has problems in other aspects too.
state.pushWhitespaceToTail(transformDistribution);

let emptyToken = new TrackedContextToken();
emptyToken.raw = '';
// Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters
// for the new word (token), so the input keystrokes do not correspond to the new text token.
emptyToken.transformDistributions = [];
state.pushTail(emptyToken);
const tokenizedTail = tokenizedContext[tokenizedContext.length - 1];
/*
* Common-case: most transforms that trigger this case are from pure-whitespace Transforms. MOST.
*
* Less-common, but noteworthy: some wordbreaks may occur without whitespace. Example:
* `"o` => ['"', 'o']. Make sure to double-check against `tokenizedContext`!
*/
let pushedToken = new TrackedContextToken();
pushedToken.raw = tokenizedTail;

if(isWhitespace || !primaryInput) {
state.pushWhitespaceToTail(transformDistribution ?? []);
// Continuing the earlier assumption, that 'pure-whitespace Transform' does not emit any initial characters
// for the new word (token), so the input keystrokes do not correspond to the new text token.
pushedToken.transformDistributions = [];
} else {
state.pushWhitespaceToTail();
// Assumption: Since we only allow one-transform-at-a-time changes between states, we shouldn't be missing
// any metadata used to construct the new context state token.
pushedToken.transformDistributions = transformDistribution ? [transformDistribution] : [];
}

pushedToken.newFlag = true;
state.pushTail(pushedToken);
} else { // We're editing the final context token.
// TODO: Assumption: we didn't 'miss' any inputs somehow.
// As is, may be prone to fragility should the lm-layer's tracked context 'desync' from its host's.
Expand All @@ -429,6 +448,7 @@ namespace correction {
let token = new TrackedContextToken();
token.raw = tokenizedContext[0];
token.transformDistributions = [transformDistribution];
token.newFlag = true;
state.pushTail(token);
} else { // Edit the lone context token.
// Consider backspace entry for this case?
Expand All @@ -442,7 +462,9 @@ namespace correction {
return state;
}

static modelContextState(tokenizedContext: USVString[], lexicalModel: LexicalModel): TrackedContextState {
static modelContextState(tokenizedContext: USVString[],
transformDistribution: Distribution<Transform>,
lexicalModel: LexicalModel): TrackedContextState {
let baseTokens = tokenizedContext.map(function(entry) {
let token = new TrackedContextToken();
token.raw = entry;
Expand Down Expand Up @@ -476,20 +498,26 @@ namespace correction {
state.pushTail(token);
}

const finalToken = state.tokens[state.tokens.length - 1];
const baseTransform = (transformDistribution && transformDistribution.length > 0) ? transformDistribution[0] : null;

if(baseTransform && baseTransform.sample.insert == finalToken.raw) {
finalToken.newFlag = true;
}

return state;
}

/**
* Compares the current, post-input context against the most recently-seen contexts from previous prediction calls, returning
* the most information-rich `TrackedContextState` possible. If a match is found, the state will be annotated with the
* input information provided to previous prediction calls and persisted correction-search calculations for re-use.
*
* @param model
* @param context
* @param mainTransform
* @param transformDistribution
*
* @param model
* @param context
* @param transformDistribution
*/
analyzeState(model: LexicalModel,
analyzeState(model: LexicalModel,
context: Context,
transformDistribution?: Distribution<Transform>): TrackedContextState {
if(!model.traverseFromRoot) {
Expand Down Expand Up @@ -519,7 +547,7 @@ namespace correction {
//
// Assumption: as a caret needs to move to context before any actual transform distributions occur,
// this state is only reached on caret moves; thus, transformDistribution is actually just a single null transform.
let state = ContextTracker.modelContextState(tokenizedContext.left, model);
let state = ContextTracker.modelContextState(tokenizedContext.left, transformDistribution, model);
state.taggedContext = context;
this.enqueue(state);
return state;
Expand Down
Loading