Skip to content

Commit

Permalink
fix: Try non-English suffix endings on word breaks (#6066)
Browse files Browse the repository at this point in the history
Co-authored-by: street-side-software-automation[bot] <74785433+street-side-software-automation[bot]@users.noreply.github.com>
  • Loading branch information
1 parent ec89e83 commit ecfa89a
Show file tree
Hide file tree
Showing 14 changed files with 181 additions and 66 deletions.
69 changes: 33 additions & 36 deletions packages/cspell-lib/api/api.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 0 additions & 4 deletions packages/cspell-lib/src/lib/__snapshots__/index.test.ts.snap
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,6 @@ exports[`Validate the cspell API > Verify API exports 1`] = `
"SuggestionError": [Function],
"SuggestionResult": undefined,
"Text": {
"__testing__": {
"regExWords": /\\\\p\\{L\\}\\\\p\\{M\\}\\?\\(\\?:\\(\\?:\\\\\\\\\\?\\['\\]\\)\\?\\\\p\\{L\\}\\\\p\\{M\\}\\?\\)\\*/gu,
"regExWordsAndDigits": /\\[\\\\p\\{L\\}\\\\w'\`\\.\\+-\\]\\(\\?:\\(\\?:\\\\\\\\\\(\\?=\\['\\]\\)\\)\\?\\[\\\\p\\{L\\}\\\\p\\{M\\}\\\\w'\`\\.\\+-\\]\\)\\*/gu,
},
"calculateTextDocumentOffsets": [Function],
"camelToSnake": [Function],
"cleanText": [Function],
Expand Down
2 changes: 1 addition & 1 deletion packages/cspell-lib/src/lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ export type { TraceOptions, TraceResult, TraceWordResult } from './trace.js';
export { traceWords, traceWordsAsync } from './trace.js';
export { getLogger, Logger, setLogger } from './util/logger.js';
export { resolveFile } from './util/resolveFile.js';
export * as Text from './util/text.js';
export * as Text from './util/textApi.js';
export {
checkText,
checkTextDocument,
Expand Down
52 changes: 45 additions & 7 deletions packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ import * as RxPat from '../Settings/RegExpPatterns.js';
import {
extractPossibleWordsFromTextOffset,
extractText,
extractWordsFromCodeTextOffset,
extractWordsFromTextOffset,
splitWordWithOffset,
} from '../util/text.js';
import { regExpCamelCaseWordBreaksWithEnglishSuffix } from '../util/textRegex.js';
import { split } from '../util/wordSplitter.js';
import { defaultMinWordLength } from './defaultConstants.js';
import { isWordValidWithEscapeRetry } from './isWordValid.js';
Expand Down Expand Up @@ -199,9 +200,51 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat
// English exceptions :-(
if (isAllCapsWithTrailingCommonEnglishSuffixOk(vr)) return [];

if (isWordIgnored(vr.text) || checkWord(vr).isFound) {
rememberFilter((_) => false)(vr);
return [];
}
if (vr.isFlagged) return [vr];

const codeWordResults: ValidationIssueRO[] = checkCamelCaseWord(vr);

if (!codeWordResults.length) {
rememberFilter((_) => false)(vr);
return [];
}

return codeWordResults;
}

/**
* Break a camel case word into its parts and check each part.
*
* There are two word break patterns:
* - `regExpCamelCaseWordBreaks`
* - `regExpCamelCaseWordBreaksWithEnglishSuffix` is the default pattern with English suffixes on ALL CAPS words.
*
* Note: See [#6066](https://github.com/streetsidesoftware/cspell/pull/6066)
* Using just `regExpCamelCaseWordBreaks` misses unknown 4-letter words.
*
* The code below was tried, but it missed words.
* - `LSTM` was caught. // cspell:disable-line
* - `LSTMs` was missed because it becomes `LST` and `Ms`. // cspell:disable-line
*
* ```ts
* const results = _checkCamelCaseWord(vr, regExpCamelCaseWordBreaks);
* if (!results.length) return results;
* const resultsEnglishBreaks = _checkCamelCaseWord(vr, regExpCamelCaseWordBreaksWithEnglishSuffix);
* return results.length < resultsEnglishBreaks.length ? results : resultsEnglishBreaks;
* ```
*/
function checkCamelCaseWord(vr: ValidationIssueRO): ValidationIssueRO[] {
    // Only the English-suffix aware break pattern is applied here.
    const wordBreaks = regExpCamelCaseWordBreaksWithEnglishSuffix;
    return _checkCamelCaseWord(vr, wordBreaks);
}

function _checkCamelCaseWord(vr: ValidationIssueRO, regExpWordBreaks: RegExp): ValidationIssueRO[] {
const codeWordResults: ValidationIssueRO[] = [];

for (const wo of extractWordsFromCodeTextOffset(vr)) {
for (const wo of splitWordWithOffset(vr, regExpWordBreaks)) {
if (setOfKnownSuccessfulWords.has(wo.text)) continue;
const issue = wo as ValidationIssue;
issue.line = vr.line;
Expand All @@ -215,11 +258,6 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat
codeWordResults.push(issue);
}

if (!codeWordResults.length || isWordIgnored(vr.text) || checkWord(vr).isFound) {
rememberFilter((_) => false)(vr);
return [];
}

return codeWordResults;
}

Expand Down
16 changes: 13 additions & 3 deletions packages/cspell-lib/src/lib/textValidation/textValidator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,21 @@ describe('Validate textValidator functions', () => {
});

test('tests trailing s, ed, ing, etc. are attached to the words', async () => {
const dictEmpty = await createSpellingDictionary([], 'empty', 'test', opts());
const text = 'We have PUBLISHed multiple FIXesToThePROBLEMs';
const dictEmpty = createSpellingDictionary([], 'empty', 'test', opts());
const text = 'We have PUBLISHed multiple FixesToThePROBLEMs';
const result = [...validateText(text, dictEmpty, sToV({}))];
const errors = result.map((wo) => wo.text);
expect(errors).toEqual(['have', 'PUBLISHed', 'multiple', 'Fixes', 'PROBLEMs']);
});

// cspell:ignore UI

test('words breaks', async () => {
const dictEmpty = createSpellingDictionary(['mark', 'as', 'ready'], 'sample', 'test', opts());
const text = 'markUIAsReady() ';
const result = [...validateText(text, dictEmpty, sToV({}))];
const errors = result.map((wo) => wo.text);
expect(errors).toEqual(['have', 'PUBLISHed', 'multiple', 'FIXes', 'PROBLEMs']);
expect(errors).toEqual(['UIAs']);
});

test('tests case in ignore words', async () => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,27 @@ exports[`Validate wordSplitter > Extract all possible word breaks to 'hello' 1`]
]
`;

exports[`Validate wordSplitter > Extract all possible word breaks to 'markUIAsReady' 1`] = `
[
"mark|UI|As|Ready",
"mark|UI|A|Ready",
"mark|UI|AsReady",
"mark|UIA|s|Ready",
"mark|UIA|Ready",
"mark|UIA|sReady",
"mark|UIAs|Ready",
"mark|UIAsReady",
"markUI|As|Ready",
"markUI|A|Ready",
"markUI|AsReady",
"markUIA|s|Ready",
"markUIA|Ready",
"markUIA|sReady",
"markUIAs|Ready",
"markUIAsReady",
]
`;

exports[`Validate wordSplitter > Extract all possible word breaks to 'well-educated' 1`] = `
[
"well|educated",
Expand Down
1 change: 1 addition & 0 deletions packages/cspell-lib/src/lib/util/text.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ describe('Util Text', () => {
${'ASCIIToUTF16'} | ${['ASCII', 'To', 'UTF16']}
${'URLsAndDBAs'} | ${['URLs', 'And', 'DBAs']}
${'WALKingRUNning'} | ${['WALKing', 'RUNning']}
${'c0de'} | ${['c0de']}
`('splitCamelCaseWord $word', ({ word, expected }) => {
expect(splitCamelCaseWord(word)).toEqual(expected);
});
Expand Down
22 changes: 19 additions & 3 deletions packages/cspell-lib/src/lib/util/text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import {
regExAllUpper,
regExFirstUpper,
regExIgnoreCharacters,
regExpSplitWordBreaks,
regExpCamelCaseWordBreaksWithEnglishSuffix,
regExWords,
regExWordsAndDigits,
} from './textRegex.js';
Expand All @@ -20,7 +20,7 @@ export { stringToRegExp } from './textRegex.js';

// CSpell:ignore ings ning gimuy tsmerge

export function splitCamelCaseWordWithOffset(wo: TextOffset): Array<TextOffset> {
export function splitCamelCaseWordWithOffset(wo: TextOffset): TextOffset[] {
return splitCamelCaseWord(wo.text).map(
scanMap<string, TextOffset>((last, text) => ({ text, offset: last.offset + last.text.length }), {
text: '',
Expand All @@ -33,7 +33,23 @@ export function splitCamelCaseWordWithOffset(wo: TextOffset): Array<TextOffset>
* Split camelCase words into an array of strings.
*/
export function splitCamelCaseWord(word: string): string[] {
return word.split(regExpSplitWordBreaks);
return splitWord(word, regExpCamelCaseWordBreaksWithEnglishSuffix);
}

/**
 * Split the text of `wo` with `regExpWordBreaks` and attach an offset to each piece.
 *
 * The first piece is seeded with `wo.offset`; every following offset is computed as the
 * previous piece's offset plus the previous piece's text length.
 * NOTE(review): this presumes `scanMap` threads the previous result into the callback and
 * that the break pattern matches are zero-width - confirm against `scanMap` and the regex.
 *
 * @param wo - the text and its starting offset.
 * @param regExpWordBreaks - pattern handed to `splitWord` to locate the breaks.
 * @returns one `TextOffset` per piece, in order.
 */
export function splitWordWithOffset(wo: TextOffset, regExpWordBreaks: RegExp): TextOffset[] {
    const pieces = splitWord(wo.text, regExpWordBreaks);
    const seed: TextOffset = { text: '', offset: wo.offset };
    const toTextOffset = scanMap<string, TextOffset>(
        (prev, text) => ({ text, offset: prev.offset + prev.text.length }),
        seed,
    );
    return pieces.map(toTextOffset);
}

/**
 * Split a word into pieces wherever `regExpWordBreaks` matches.
 *
 * @param word - the text to split.
 * @param regExpWordBreaks - pattern passed to `String.prototype.split` as the separator.
 * @returns the pieces in their original order; a word without a match is returned as a
 *   single-element array.
 */
export function splitWord(word: string, regExpWordBreaks: RegExp): string[] {
    const pieces = word.split(regExpWordBreaks);
    return pieces;
}

/**
Expand Down
Loading

0 comments on commit ecfa89a

Please sign in to comment.