From ecfa89a654eaec78e68e13b1be30a9cd109d3d0d Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Mon, 12 Aug 2024 13:05:19 +0200 Subject: [PATCH] fix: Try non-English suffix endings on word breaks (#6066) Co-authored-by: street-side-software-automation[bot] <74785433+street-side-software-automation[bot]@users.noreply.github.com> --- packages/cspell-lib/api/api.d.ts | 69 +++++++++---------- .../src/lib/__snapshots__/index.test.ts.snap | 4 -- packages/cspell-lib/src/lib/index.ts | 2 +- .../textValidation/lineValidatorFactory.ts | 52 ++++++++++++-- .../lib/textValidation/textValidator.test.ts | 16 ++++- .../__snapshots__/wordSplitter.test.ts.snap | 21 ++++++ packages/cspell-lib/src/lib/util/text.test.ts | 1 + packages/cspell-lib/src/lib/util/text.ts | 22 +++++- packages/cspell-lib/src/lib/util/textApi.ts | 29 ++++++++ .../cspell-lib/src/lib/util/textRegex.test.ts | 10 +-- packages/cspell-lib/src/lib/util/textRegex.ts | 5 +- .../src/lib/util/wordSplitter.test.ts | 4 +- .../cspell-lib/src/lib/util/wordSplitter.ts | 4 +- .../logging/dictionary-logging.csv | 8 ++- 14 files changed, 181 insertions(+), 66 deletions(-) create mode 100644 packages/cspell-lib/src/lib/util/textApi.ts diff --git a/packages/cspell-lib/api/api.d.ts b/packages/cspell-lib/api/api.d.ts index 13aa4c3ef23..eb4a962c20d 100644 --- a/packages/cspell-lib/api/api.d.ts +++ b/packages/cspell-lib/api/api.d.ts @@ -979,7 +979,7 @@ declare function getLogger(): Logger; declare function stringToRegExp(pattern: string | RegExp, defaultFlags?: string, forceFlags?: string): RegExp | undefined; -declare function splitCamelCaseWordWithOffset(wo: TextOffset): Array; +declare function splitCamelCaseWordWithOffset(wo: TextOffset): TextOffset[]; /** * Split camelCase words into an array of strings. */ @@ -1025,41 +1025,38 @@ declare function textOffset(text: string, offset?: number): TextOffset; declare function extractText(textOffset: TextOffset, startPos: number, endPos: number): string; declare function calculateTextDocumentOffsets(uri: string | Uri | URL, doc: string, wordOffsets: T[]): (TextDocumentOffset & T)[]; declare function removeAccents(text: string): string; -declare const __testing__: { - regExWords: RegExp; - regExWordsAndDigits: RegExp; -}; -declare const text_d___testing__: typeof __testing__; -declare const text_d_calculateTextDocumentOffsets: typeof calculateTextDocumentOffsets; -declare const text_d_camelToSnake: typeof camelToSnake; -declare const text_d_cleanText: typeof cleanText; -declare const text_d_cleanTextOffset: typeof cleanTextOffset; -declare const text_d_extractLinesOfText: typeof extractLinesOfText; -declare const text_d_extractPossibleWordsFromTextOffset: typeof extractPossibleWordsFromTextOffset; -declare const text_d_extractText: typeof extractText; -declare const text_d_extractWordsFromCode: typeof extractWordsFromCode; -declare const text_d_extractWordsFromCodeTextOffset: typeof extractWordsFromCodeTextOffset; -declare const text_d_extractWordsFromText: typeof extractWordsFromText; -declare const text_d_extractWordsFromTextOffset: typeof extractWordsFromTextOffset; -declare const text_d_isFirstCharacterLower: typeof isFirstCharacterLower; -declare const text_d_isFirstCharacterUpper: typeof isFirstCharacterUpper; -declare const text_d_isLowerCase: typeof isLowerCase; -declare const text_d_isUpperCase: typeof isUpperCase; -declare const text_d_lcFirst: typeof lcFirst; -declare const text_d_match: typeof match; -declare const text_d_matchCase: typeof matchCase; -declare const text_d_matchStringToTextOffset: typeof matchStringToTextOffset; -declare const text_d_matchToTextOffset: typeof matchToTextOffset; -declare const text_d_removeAccents: typeof removeAccents; -declare const text_d_snakeToCamel: typeof snakeToCamel; -declare const text_d_splitCamelCaseWord: typeof splitCamelCaseWord; -declare const text_d_splitCamelCaseWordWithOffset: typeof splitCamelCaseWordWithOffset; -declare const text_d_stringToRegExp: typeof stringToRegExp; -declare const text_d_textOffset: typeof textOffset; -declare const text_d_ucFirst: typeof ucFirst; -declare namespace text_d { - export { text_d___testing__ as __testing__, text_d_calculateTextDocumentOffsets as calculateTextDocumentOffsets, text_d_camelToSnake as camelToSnake, text_d_cleanText as cleanText, text_d_cleanTextOffset as cleanTextOffset, text_d_extractLinesOfText as extractLinesOfText, text_d_extractPossibleWordsFromTextOffset as extractPossibleWordsFromTextOffset, text_d_extractText as extractText, text_d_extractWordsFromCode as extractWordsFromCode, text_d_extractWordsFromCodeTextOffset as extractWordsFromCodeTextOffset, text_d_extractWordsFromText as extractWordsFromText, text_d_extractWordsFromTextOffset as extractWordsFromTextOffset, text_d_isFirstCharacterLower as isFirstCharacterLower, text_d_isFirstCharacterUpper as isFirstCharacterUpper, text_d_isLowerCase as isLowerCase, text_d_isUpperCase as isUpperCase, text_d_lcFirst as lcFirst, text_d_match as match, text_d_matchCase as matchCase, text_d_matchStringToTextOffset as matchStringToTextOffset, text_d_matchToTextOffset as matchToTextOffset, text_d_removeAccents as removeAccents, text_d_snakeToCamel as snakeToCamel, text_d_splitCamelCaseWord as splitCamelCaseWord, text_d_splitCamelCaseWordWithOffset as splitCamelCaseWordWithOffset, text_d_stringToRegExp as stringToRegExp, text_d_textOffset as textOffset, text_d_ucFirst as ucFirst }; +//# sourceMappingURL=textApi.d.ts.map + +declare const textApi_d_calculateTextDocumentOffsets: typeof calculateTextDocumentOffsets; +declare const textApi_d_camelToSnake: typeof camelToSnake; +declare const textApi_d_cleanText: typeof cleanText; +declare const textApi_d_cleanTextOffset: typeof cleanTextOffset; +declare const textApi_d_extractLinesOfText: typeof extractLinesOfText; +declare const textApi_d_extractPossibleWordsFromTextOffset: typeof extractPossibleWordsFromTextOffset; +declare const textApi_d_extractText: typeof extractText; +declare const textApi_d_extractWordsFromCode: typeof extractWordsFromCode; +declare const textApi_d_extractWordsFromCodeTextOffset: typeof extractWordsFromCodeTextOffset; +declare const textApi_d_extractWordsFromText: typeof extractWordsFromText; +declare const textApi_d_extractWordsFromTextOffset: typeof extractWordsFromTextOffset; +declare const textApi_d_isFirstCharacterLower: typeof isFirstCharacterLower; +declare const textApi_d_isFirstCharacterUpper: typeof isFirstCharacterUpper; +declare const textApi_d_isLowerCase: typeof isLowerCase; +declare const textApi_d_isUpperCase: typeof isUpperCase; +declare const textApi_d_lcFirst: typeof lcFirst; +declare const textApi_d_match: typeof match; +declare const textApi_d_matchCase: typeof matchCase; +declare const textApi_d_matchStringToTextOffset: typeof matchStringToTextOffset; +declare const textApi_d_matchToTextOffset: typeof matchToTextOffset; +declare const textApi_d_removeAccents: typeof removeAccents; +declare const textApi_d_snakeToCamel: typeof snakeToCamel; +declare const textApi_d_splitCamelCaseWord: typeof splitCamelCaseWord; +declare const textApi_d_splitCamelCaseWordWithOffset: typeof splitCamelCaseWordWithOffset; +declare const textApi_d_stringToRegExp: typeof stringToRegExp; +declare const textApi_d_textOffset: typeof textOffset; +declare const textApi_d_ucFirst: typeof ucFirst; +declare namespace textApi_d { + export { textApi_d_calculateTextDocumentOffsets as calculateTextDocumentOffsets, textApi_d_camelToSnake as camelToSnake, textApi_d_cleanText as cleanText, textApi_d_cleanTextOffset as cleanTextOffset, textApi_d_extractLinesOfText as extractLinesOfText, textApi_d_extractPossibleWordsFromTextOffset as extractPossibleWordsFromTextOffset, textApi_d_extractText as extractText, textApi_d_extractWordsFromCode as extractWordsFromCode, textApi_d_extractWordsFromCodeTextOffset as extractWordsFromCodeTextOffset, textApi_d_extractWordsFromText as extractWordsFromText, textApi_d_extractWordsFromTextOffset as extractWordsFromTextOffset, textApi_d_isFirstCharacterLower as isFirstCharacterLower, textApi_d_isFirstCharacterUpper as isFirstCharacterUpper, textApi_d_isLowerCase as isLowerCase, textApi_d_isUpperCase as isUpperCase, textApi_d_lcFirst as lcFirst, textApi_d_match as match, textApi_d_matchCase as matchCase, textApi_d_matchStringToTextOffset as matchStringToTextOffset, textApi_d_matchToTextOffset as matchToTextOffset, textApi_d_removeAccents as removeAccents, textApi_d_snakeToCamel as snakeToCamel, textApi_d_splitCamelCaseWord as splitCamelCaseWord, textApi_d_splitCamelCaseWordWithOffset as splitCamelCaseWordWithOffset, textApi_d_stringToRegExp as stringToRegExp, textApi_d_textOffset as textOffset, textApi_d_ucFirst as ucFirst }; } -export { type CheckTextInfo, type ConfigurationDependencies, type CreateTextDocumentParams, type DetermineFinalDocumentSettingsResult, type Document, DocumentValidator, type DocumentValidatorOptions, ENV_CSPELL_GLOB_ROOT, type ExcludeFilesGlobMap, type ExclusionFunction, exclusionHelper_d as ExclusionHelper, type FeatureFlag, FeatureFlags, ImportError, type ImportFileRefWithError$1 as ImportFileRefWithError, IncludeExcludeFlag, type IncludeExcludeOptions, index_link_d as Link, type Logger, type PerfTimer, type SpellCheckFileOptions, type SpellCheckFilePerf, type SpellCheckFileResult, SpellingDictionaryLoadError, type SuggestedWord, SuggestionError, type SuggestionOptions, type SuggestionsForWordResult, text_d as Text, type TextDocument, type TextDocumentLine, type TextDocumentRef, type TextInfoItem, type TraceOptions, type TraceResult, type TraceWordResult, UnknownFeatureFlagError, type ValidationIssue, calcOverrideSettings, checkFilenameMatchesExcludeGlob as checkFilenameMatchesGlob, checkText, checkTextDocument, clearCachedFiles, clearCaches, combineTextAndLanguageSettings, combineTextAndLanguageSettings as constructSettingsForText, createConfigLoader, createPerfTimer, createTextDocument, currentSettingsFileVersion, defaultConfigFilenames, defaultFileName, defaultFileName as defaultSettingsFilename, determineFinalDocumentSettings, extractDependencies, extractImportErrors, fileToDocument, fileToTextDocument, finalizeSettings, getCachedFileSize, getDefaultBundledSettingsAsync, getDefaultConfigLoader, getDefaultSettings, getDictionary, getGlobalSettings, getGlobalSettingsAsync, getLanguagesForBasename as getLanguageIdsForBaseFilename, getLanguagesForExt, getLogger, getSources, getSystemFeatureFlags, getVirtualFS, isBinaryFile, isSpellingDictionaryLoadError, loadConfig, loadPnP, mergeInDocSettings, mergeSettings, readRawSettings, readSettings, readSettingsFiles, refreshDictionaryCache, resolveFile, searchForConfig, sectionCSpell, setLogger, shouldCheckDocument, spellCheckDocument, spellCheckFile, suggestionsForWord, suggestionsForWords, traceWords, traceWordsAsync, updateTextDocument, validateText }; +export { type CheckTextInfo, type ConfigurationDependencies, type CreateTextDocumentParams, type DetermineFinalDocumentSettingsResult, type Document, DocumentValidator, type DocumentValidatorOptions, ENV_CSPELL_GLOB_ROOT, type ExcludeFilesGlobMap, type ExclusionFunction, exclusionHelper_d as ExclusionHelper, type FeatureFlag, FeatureFlags, ImportError, type ImportFileRefWithError$1 as ImportFileRefWithError, IncludeExcludeFlag, type IncludeExcludeOptions, index_link_d as Link, type Logger, type PerfTimer, type SpellCheckFileOptions, type SpellCheckFilePerf, type SpellCheckFileResult, SpellingDictionaryLoadError, type SuggestedWord, SuggestionError, type SuggestionOptions, type SuggestionsForWordResult, textApi_d as Text, type TextDocument, type TextDocumentLine, type TextDocumentRef, type TextInfoItem, type TraceOptions, type TraceResult, type TraceWordResult, UnknownFeatureFlagError, type ValidationIssue, calcOverrideSettings, checkFilenameMatchesExcludeGlob as checkFilenameMatchesGlob, checkText, checkTextDocument, clearCachedFiles, clearCaches, combineTextAndLanguageSettings, combineTextAndLanguageSettings as constructSettingsForText, createConfigLoader, createPerfTimer, createTextDocument, currentSettingsFileVersion, defaultConfigFilenames, defaultFileName, defaultFileName as defaultSettingsFilename, determineFinalDocumentSettings, extractDependencies, extractImportErrors, fileToDocument, fileToTextDocument, finalizeSettings, getCachedFileSize, getDefaultBundledSettingsAsync, getDefaultConfigLoader, getDefaultSettings, getDictionary, getGlobalSettings, getGlobalSettingsAsync, getLanguagesForBasename as getLanguageIdsForBaseFilename, getLanguagesForExt, getLogger, getSources, getSystemFeatureFlags, getVirtualFS, isBinaryFile, isSpellingDictionaryLoadError, loadConfig, loadPnP, mergeInDocSettings, mergeSettings, readRawSettings, readSettings, readSettingsFiles, refreshDictionaryCache, resolveFile, searchForConfig, sectionCSpell, setLogger, shouldCheckDocument, spellCheckDocument, spellCheckFile, suggestionsForWord, suggestionsForWords, traceWords, traceWordsAsync, updateTextDocument, validateText }; diff --git a/packages/cspell-lib/src/lib/__snapshots__/index.test.ts.snap b/packages/cspell-lib/src/lib/__snapshots__/index.test.ts.snap index 49bc633b7f9..2186256d0b2 100644 --- a/packages/cspell-lib/src/lib/__snapshots__/index.test.ts.snap +++ b/packages/cspell-lib/src/lib/__snapshots__/index.test.ts.snap @@ -127,10 +127,6 @@ exports[`Validate the cspell API > Verify API exports 1`] = ` "SuggestionError": [Function], "SuggestionResult": undefined, "Text": { - "__testing__": { - "regExWords": /\\\\p\\{L\\}\\\\p\\{M\\}\\?\\(\\?:\\(\\?:\\\\\\\\\\?\\['’\\]\\)\\?\\\\p\\{L\\}\\\\p\\{M\\}\\?\\)\\*/gu, - "regExWordsAndDigits": /\\[\\\\p\\{L\\}\\\\w'’\`\\.\\+-\\]\\(\\?:\\(\\?:\\\\\\\\\\(\\?=\\['\\]\\)\\)\\?\\[\\\\p\\{L\\}\\\\p\\{M\\}\\\\w'’\`\\.\\+-\\]\\)\\*/gu, - }, "calculateTextDocumentOffsets": [Function], "camelToSnake": [Function], "cleanText": [Function], diff --git a/packages/cspell-lib/src/lib/index.ts b/packages/cspell-lib/src/lib/index.ts index a764a0bd547..1832e1a1783 100644 --- a/packages/cspell-lib/src/lib/index.ts +++ b/packages/cspell-lib/src/lib/index.ts @@ -83,7 +83,7 @@ export type { TraceOptions, TraceResult, TraceWordResult } from './trace.js'; export { traceWords, traceWordsAsync } from './trace.js'; export { getLogger, Logger, setLogger } from './util/logger.js'; export { resolveFile } from './util/resolveFile.js'; -export * as Text from './util/text.js'; +export * as Text from './util/textApi.js'; export { checkText, checkTextDocument, diff --git a/packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts b/packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts index bbe9b820487..040500494dd 100644 --- a/packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts +++ b/packages/cspell-lib/src/lib/textValidation/lineValidatorFactory.ts @@ -10,9 +10,10 @@ import * as RxPat from '../Settings/RegExpPatterns.js'; import { extractPossibleWordsFromTextOffset, extractText, - extractWordsFromCodeTextOffset, extractWordsFromTextOffset, + splitWordWithOffset, } from '../util/text.js'; +import { regExpCamelCaseWordBreaksWithEnglishSuffix } from '../util/textRegex.js'; import { split } from '../util/wordSplitter.js'; import { defaultMinWordLength } from './defaultConstants.js'; import { isWordValidWithEscapeRetry } from './isWordValid.js'; @@ -199,9 +200,51 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat // English exceptions :-( if (isAllCapsWithTrailingCommonEnglishSuffixOk(vr)) return []; + if (isWordIgnored(vr.text) || checkWord(vr).isFound) { + rememberFilter((_) => false)(vr); + return []; + } + if (vr.isFlagged) return [vr]; + + const codeWordResults: ValidationIssueRO[] = checkCamelCaseWord(vr); + + if (!codeWordResults.length) { + rememberFilter((_) => false)(vr); + return []; + } + + return codeWordResults; + } + + /** + * Break a camel case word into its parts and check each part. + * + * There are two word break patterns: + * - `regExpCamelCaseWordBreaks` + * - `regExpCamelCaseWordBreaksWithEnglishSuffix` is the default pattern with English suffixes on ALL CAPS words. + * + * Note: See [#6066](https://github.com/streetsidesoftware/cspell/pull/6066) + * Using just `regExpCamelCaseWordBreaks` misses unknown 4-letter words. + * + * The code below was tried, but it missed words. + * - `LSTM` was caught. // cspell:disable-line + * - `LSTMs` was missed because it becomes `LST` and `Ms`. // cspell:disable-line + * + * ```ts + * const results = _checkCamelCaseWord(vr, regExpCamelCaseWordBreaks); + * if (!results.length) return results; + * const resultsEnglishBreaks = _checkCamelCaseWord(vr, regExpCamelCaseWordBreaksWithEnglishSuffix); + * return results.length < resultsEnglishBreaks.length ? results : resultsEnglishBreaks; + * ``` + */ + function checkCamelCaseWord(vr: ValidationIssueRO): ValidationIssueRO[] { + return _checkCamelCaseWord(vr, regExpCamelCaseWordBreaksWithEnglishSuffix); + } + + function _checkCamelCaseWord(vr: ValidationIssueRO, regExpWordBreaks: RegExp): ValidationIssueRO[] { const codeWordResults: ValidationIssueRO[] = []; - for (const wo of extractWordsFromCodeTextOffset(vr)) { + for (const wo of splitWordWithOffset(vr, regExpWordBreaks)) { if (setOfKnownSuccessfulWords.has(wo.text)) continue; const issue = wo as ValidationIssue; issue.line = vr.line; @@ -215,11 +258,6 @@ export function lineValidatorFactory(sDict: SpellingDictionary, options: Validat codeWordResults.push(issue); } - if (!codeWordResults.length || isWordIgnored(vr.text) || checkWord(vr).isFound) { - rememberFilter((_) => false)(vr); - return []; - } - return codeWordResults; } diff --git a/packages/cspell-lib/src/lib/textValidation/textValidator.test.ts b/packages/cspell-lib/src/lib/textValidation/textValidator.test.ts index 947631e0668..1ad4c385c18 100644 --- a/packages/cspell-lib/src/lib/textValidation/textValidator.test.ts +++ b/packages/cspell-lib/src/lib/textValidation/textValidator.test.ts @@ -43,11 +43,21 @@ describe('Validate textValidator functions', () => { }); test('tests trailing s, ed, ing, etc. are attached to the words', async () => { - const dictEmpty = await createSpellingDictionary([], 'empty', 'test', opts()); - const text = 'We have PUBLISHed multiple FIXesToThePROBLEMs'; + const dictEmpty = createSpellingDictionary([], 'empty', 'test', opts()); + const text = 'We have PUBLISHed multiple FixesToThePROBLEMs'; + const result = [...validateText(text, dictEmpty, sToV({}))]; + const errors = result.map((wo) => wo.text); + expect(errors).toEqual(['have', 'PUBLISHed', 'multiple', 'Fixes', 'PROBLEMs']); + }); + + // cspell:ignore UI + + test('words breaks', async () => { + const dictEmpty = createSpellingDictionary(['mark', 'as', 'ready'], 'sample', 'test', opts()); + const text = 'markUIAsReady() '; const result = [...validateText(text, dictEmpty, sToV({}))]; const errors = result.map((wo) => wo.text); - expect(errors).toEqual(['have', 'PUBLISHed', 'multiple', 'FIXes', 'PROBLEMs']); + expect(errors).toEqual(['UIAs']); }); test('tests case in ignore words', async () => { diff --git a/packages/cspell-lib/src/lib/util/__snapshots__/wordSplitter.test.ts.snap b/packages/cspell-lib/src/lib/util/__snapshots__/wordSplitter.test.ts.snap index 50531c21f79..811997331b8 100644 --- a/packages/cspell-lib/src/lib/util/__snapshots__/wordSplitter.test.ts.snap +++ b/packages/cspell-lib/src/lib/util/__snapshots__/wordSplitter.test.ts.snap @@ -91,6 +91,27 @@ exports[`Validate wordSplitter > Extract all possible word breaks to 'hello' 1`] ] `; +exports[`Validate wordSplitter > Extract all possible word breaks to 'markUIAsReady' 1`] = ` +[ + "mark|UI|As|Ready", + "mark|UI|A|Ready", + "mark|UI|AsReady", + "mark|UIA|s|Ready", + "mark|UIA|Ready", + "mark|UIA|sReady", + "mark|UIAs|Ready", + "mark|UIAsReady", + "markUI|As|Ready", + "markUI|A|Ready", + "markUI|AsReady", + "markUIA|s|Ready", + "markUIA|Ready", + "markUIA|sReady", + "markUIAs|Ready", + "markUIAsReady", +] +`; + exports[`Validate wordSplitter > Extract all possible word breaks to 'well-educated' 1`] = ` [ "well|educated", diff --git a/packages/cspell-lib/src/lib/util/text.test.ts b/packages/cspell-lib/src/lib/util/text.test.ts index cb81bebb526..e4e425d0dec 100644 --- a/packages/cspell-lib/src/lib/util/text.test.ts +++ b/packages/cspell-lib/src/lib/util/text.test.ts @@ -38,6 +38,7 @@ describe('Util Text', () => { ${'ASCIIToUTF16'} | ${['ASCII', 'To', 'UTF16']} ${'URLsAndDBAs'} | ${['URLs', 'And', 'DBAs']} ${'WALKingRUNning'} | ${['WALKing', 'RUNning']} + ${'c0de'} | ${['c0de']} `('splitCamelCaseWord $word', ({ word, expected }) => { expect(splitCamelCaseWord(word)).toEqual(expected); }); diff --git a/packages/cspell-lib/src/lib/util/text.ts b/packages/cspell-lib/src/lib/util/text.ts index 427aa3220f1..d0344e27eee 100644 --- a/packages/cspell-lib/src/lib/util/text.ts +++ b/packages/cspell-lib/src/lib/util/text.ts @@ -8,7 +8,7 @@ import { regExAllUpper, regExFirstUpper, regExIgnoreCharacters, - regExpSplitWordBreaks, + regExpCamelCaseWordBreaksWithEnglishSuffix, regExWords, regExWordsAndDigits, } from './textRegex.js'; @@ -20,7 +20,7 @@ export { stringToRegExp } from './textRegex.js'; // CSpell:ignore ings ning gimuy tsmerge -export function splitCamelCaseWordWithOffset(wo: TextOffset): Array { +export function splitCamelCaseWordWithOffset(wo: TextOffset): TextOffset[] { return splitCamelCaseWord(wo.text).map( scanMap((last, text) => ({ text, offset: last.offset + last.text.length }), { text: '', @@ -33,7 +33,23 @@ export function splitCamelCaseWordWithOffset(wo: TextOffset): Array * Split camelCase words into an array of strings. */ export function splitCamelCaseWord(word: string): string[] { - return word.split(regExpSplitWordBreaks); + return splitWord(word, regExpCamelCaseWordBreaksWithEnglishSuffix); +} + +export function splitWordWithOffset(wo: TextOffset, regExpWordBreaks: RegExp): TextOffset[] { + return splitWord(wo.text, regExpWordBreaks).map( + scanMap((last, text) => ({ text, offset: last.offset + last.text.length }), { + text: '', + offset: wo.offset, + }), + ); +} + +/** + * Split camelCase words into an array of strings. + */ +export function splitWord(word: string, regExpWordBreaks: RegExp): string[] { + return word.split(regExpWordBreaks); } /** diff --git a/packages/cspell-lib/src/lib/util/textApi.ts b/packages/cspell-lib/src/lib/util/textApi.ts new file mode 100644 index 00000000000..2626e08dacc --- /dev/null +++ b/packages/cspell-lib/src/lib/util/textApi.ts @@ -0,0 +1,29 @@ +export { + calculateTextDocumentOffsets, + camelToSnake, + cleanText, + cleanTextOffset, + extractLinesOfText, + extractPossibleWordsFromTextOffset, + extractText, + extractWordsFromCode, + extractWordsFromCodeTextOffset, + extractWordsFromText, + extractWordsFromTextOffset, + isFirstCharacterLower, + isFirstCharacterUpper, + isLowerCase, + isUpperCase, + lcFirst, + match, + matchCase, + matchStringToTextOffset, + matchToTextOffset, + removeAccents, + snakeToCamel, + splitCamelCaseWord, + splitCamelCaseWordWithOffset, + stringToRegExp, + textOffset, + ucFirst, +} from './text.js'; diff --git a/packages/cspell-lib/src/lib/util/textRegex.test.ts b/packages/cspell-lib/src/lib/util/textRegex.test.ts index c3e73ce7b1f..cbac6b5cd65 100644 --- a/packages/cspell-lib/src/lib/util/textRegex.test.ts +++ b/packages/cspell-lib/src/lib/util/textRegex.test.ts @@ -121,11 +121,11 @@ describe('Validate textRegex', () => { test.each` text | expected ${'hello'} | ${[]} - ${'ERRORCode'} | ${[['RCo', 'R', 'Co']]} - ${nfc('CAFÉStyle')} | ${[[nfc('ÉSt'), nfc('É'), 'St']]} - ${nfd('CAFÉStyle')} | ${[[nfd('ÉSt'), nfd('É'), 'St']]} - ${nfc('CODEÉrror')} | ${[[nfc('EÉr'), 'E', nfc('Ér')]]} - ${nfd('CODEÉrror')} | ${[[nfd('EÉr'), 'E', nfd('Ér')]]} + ${'ERRORCode'} | ${[['RCo', 'R', 'Co', 'C']]} + ${nfc('CAFÉStyle')} | ${[[nfc('ÉSt'), nfc('É'), 'St', 'S']]} + ${nfd('CAFÉStyle')} | ${[[nfd('ÉSt'), nfd('É'), 'St', 'S']]} + ${nfc('CODEÉrror')} | ${[[nfc('EÉr'), 'E', nfc('Ér'), nfc('É')]]} + ${nfd('CODEÉrror')} | ${[[nfd('EÉr'), 'E', nfd('Ér'), nfd('É')]]} ${'ERRORS'} | ${[]} `('regExSplitWords2 on "$text"', ({ text, expected }: { text: string; expected: string[] }) => { const m = [...text.matchAll(regExSplitWords2)].map((m) => [...m]); diff --git a/packages/cspell-lib/src/lib/util/textRegex.ts b/packages/cspell-lib/src/lib/util/textRegex.ts index 54894bfebf7..a6dc4a5faa5 100644 --- a/packages/cspell-lib/src/lib/util/textRegex.ts +++ b/packages/cspell-lib/src/lib/util/textRegex.ts @@ -2,9 +2,10 @@ export const regExUpperSOrIng = /([\p{Lu}\p{M}]+(?:\\?['’])?(?:s|ing|ies|es|ings|ed|ning))(?!\p{Ll})/gu; export const regExSplitWords = /(\p{Ll}\p{M}?)(\p{Lu})/gu; -export const regExSplitWords2 = /(\p{Lu}\p{M}?)(\p{Lu}\p{M}?\p{Ll})/gu; -export const regExpSplitWordBreaks = +export const regExSplitWords2 = /(\p{Lu}\p{M}?)((\p{Lu}\p{M}?)\p{Ll})/gu; +export const regExpCamelCaseWordBreaksWithEnglishSuffix = /(?<=\p{Ll}\p{M}?)(?=\p{Lu})|(?<=\p{Lu}\p{M}?)(?=\p{Lu}\p{M}?\p{Ll})(?!\p{Lu}\p{M}?(?:s|ing|ies|es|ings|ed|ning)(?!\p{Ll}))/gu; +export const regExpCamelCaseWordBreaks = /(?<=\p{Ll}\p{M}?)(?=\p{Lu})|(?<=\p{Lu}\p{M}?)(?=\p{Lu}\p{M}?\p{Ll})/gu; export const regExpAllPossibleWordBreaks = /(?<=\p{Ll}\p{M}?)(?=\p{Lu})|(?<=\p{Lu}\p{M}?)(?=\p{Lu}\p{M}?\p{Ll})|(?<=\p{Lu}\p{M}?\p{Lu}\p{M}?)(?=\p{Ll})|(?<=\p{L}\p{M}?)(?=\P{L})|(?<=\P{L})(?=\p{L})/gu; export const regExWords = /\p{L}\p{M}?(?:(?:\\?['’])?\p{L}\p{M}?)*/gu; diff --git a/packages/cspell-lib/src/lib/util/wordSplitter.test.ts b/packages/cspell-lib/src/lib/util/wordSplitter.test.ts index be902d41f6c..09385d70007 100644 --- a/packages/cspell-lib/src/lib/util/wordSplitter.test.ts +++ b/packages/cspell-lib/src/lib/util/wordSplitter.test.ts @@ -33,7 +33,7 @@ describe('Validate wordSplitter', () => { expect(result.words.filter((w) => !w.isFound).length).toBe(7); }); - // cspell:ignore MOVSX + // cspell:ignore MOVSX UI test.each` text | expected ${'hello'} | ${['hello']} @@ -43,6 +43,7 @@ describe('Validate wordSplitter', () => { ${'MOVSX_r_rm16'} | ${['MOVSX', 'r', 'rm']} ${'32bit-checksum'} | ${['bit', 'checksum']} ${'camelCase'} | ${['camel', 'Case']} + ${'markUIAsReady'} | ${['mark', 'UI', 'As', '', 'Ready']} `('Extract word breaks to $text', ({ text, expected }: TestApplyWordBreaks) => { const line = { text, @@ -83,6 +84,7 @@ describe('Validate wordSplitter', () => { ${'32bit-checksum'} | ${['bit|checksum']} ${'ERRORCode'} | ${['ERROR|Code']} ${'camelCase'} | ${['camel|Case', 'camelCase']} + ${'markUIAsReady'} | ${['mark|UI|As|Ready']} `('Extract all possible word breaks to $text', ({ text, expected }: TestApplyWordBreaks) => { const line = { text, diff --git a/packages/cspell-lib/src/lib/util/wordSplitter.ts b/packages/cspell-lib/src/lib/util/wordSplitter.ts index d106573e576..c42534e10f3 100644 --- a/packages/cspell-lib/src/lib/util/wordSplitter.ts +++ b/packages/cspell-lib/src/lib/util/wordSplitter.ts @@ -189,7 +189,7 @@ function genWordBreakCamel(line: LineSegment): SortedBreaks[] { // lower,Upper: camelCase -> camel|Case for (const m of text.matchAll(offsetRegEx(regExSplitWords, line.relStart))) { if (m.index === undefined) break; - const i = m.index + 1; + const i = m.index + m[1].length; breaksCamel1.push({ offset: m.index, breaks: [[i, i], ignoreBreak], @@ -203,7 +203,7 @@ function genWordBreakCamel(line: LineSegment): SortedBreaks[] { for (const m of text.matchAll(offsetRegEx(regExSplitWords2, line.relStart))) { if (m.index === undefined) break; const i = m.index + m[1].length; - const j = i + 1; + const j = i + m[3].length; breaksCamel2.push({ offset: m.index, breaks: [[i, i], [j, j], ignoreBreak], diff --git a/packages/cspell/src/app/lint/__snapshots__/logging/dictionary-logging.csv b/packages/cspell/src/app/lint/__snapshots__/logging/dictionary-logging.csv index 4cc79d08dc0..8ba5349a37c 100644 --- a/packages/cspell/src/app/lint/__snapshots__/logging/dictionary-logging.csv +++ b/packages/cspell/src/app/lint/__snapshots__/logging/dictionary-logging.csv @@ -1,5 +1,6 @@ word, value -Book, true +eBook, true +LaTeX, true Template, true Version, true This, true @@ -34,6 +35,7 @@ structure, true layout, true Fairy, true Tales, true +Book, true title, true Author, true edition, true @@ -1033,7 +1035,7 @@ wetted, true kingdom, true joyfully, true contented, true -Book, true +eBook, true Structural, true Definitions, true File, true @@ -1207,8 +1209,10 @@ output, true following, true shows, true power, true +LaTeX, true This, true sample, true +LaTeX, true input, true file, true Version, true