From a6873eaf009dceb6b33cc19980736ef17ae202e8 Mon Sep 17 00:00:00 2001 From: yumetodo Date: Wed, 14 Aug 2024 13:17:00 +0900 Subject: [PATCH] feat: count string by codepoint (#44) * feat: count string by codepoint Unicode says that there are 4 ways to count string length. https://unicode.org/faq/char_combmark.html#7 This commit supports counting by Code points. * refactor: cut-out to strLenByCodeUnits function ref: - https://github.com/textlint-rule/textlint-rule-sentence-length/pull/44#discussion_r1714720782 Co-authored-by: azu * chore: s/code units/codeunits/ ref: - https://github.com/textlint-rule/textlint-rule-sentence-length/pull/44#discussion_r1714717649 Co-authored-by: azu * chore: s/strLenByCodePoint/strLenByCodePoints/ --------- Co-authored-by: azu --- src/sentence-length.ts | 33 ++++++++++++++++++++++++++++++--- test/sentence-length-test.ts | 19 +++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/src/sentence-length.ts b/src/sentence-length.ts index 4bbd52f..25d30a2 100644 --- a/src/sentence-length.ts +++ b/src/sentence-length.ts @@ -37,6 +37,12 @@ export type Options = { * @deprecated use skipPatterns */ exclusionPatterns?: string[]; + /** + * Determine how to count string length. + * By default, or when set to "codeunits", the string is counted in UTF-16 code units (i.e. `String.prototype.length`). + * If set to "codepoints", the string is counted in Unicode code points. + */ + countBy?: "codeunits" | "codepoints"; }; const defaultOptions: Required<Options> = { max: 100, @@ -45,16 +51,37 @@ const defaultOptions: Required<Options> = { /** * @deprecated */ - exclusionPatterns: [] + exclusionPatterns: [], + countBy: "codeunits" }; const isSentenceNode = (node: TxtParentNodeWithSentenceNodeContent): node is TxtSentenceNode => { return node.type === SentenceSplitterSyntax.Sentence; }; + +/** + * A count of the number of code units currently in the string.
+ * @param s string + */ +const strLenByCodeUnits = (s: string): number => s.length; +/** + * A count of the number of code points currently in the string. + * + * Complexity: O(n) + * @param s string + */ +const strLenByCodePoints = (s: string): number => { + let i = 0; + for (const _ of s) { + ++i; + } + return i; +}; const reporter: TextlintRuleReporter = (context, options = {}) => { const maxLength = options.max ?? defaultOptions.max; const skipPatterns = options.skipPatterns ?? options.exclusionPatterns ?? defaultOptions.skipPatterns; const skipUrlStringLink = options.skipUrlStringLink ?? defaultOptions.skipUrlStringLink; + const strLen = options.countBy == null || options.countBy === "codeunits" ? strLenByCodeUnits : strLenByCodePoints; const helper = new RuleHelper(context); const { Syntax, RuleError, report } = context; const isUrlStringLink = (node: TxtSentenceNodeChildren): boolean => { @@ -96,8 +123,8 @@ const reporter: TextlintRuleReporter = (context, options = {}) => { const actualText = source.toString(); const sentenceText = removeRangeFromString(actualText, skipPatterns); // larger than > 100 - const actualTextLength = actualText.length; - const sentenceLength = sentenceText.length; + const actualTextLength = strLen(actualText); + const sentenceLength = strLen(sentenceText); if (sentenceLength > maxLength) { const startLine = filteredSentence.loc.start.line; report( diff --git a/test/sentence-length-test.ts b/test/sentence-length-test.ts index de27ff5..8a3d1bc 100644 --- a/test/sentence-length-test.ts +++ b/test/sentence-length-test.ts @@ -118,6 +118,13 @@ Shopify Functionで利用されるが、非同期処理の制限や5ms未満で max: 10, skipPatterns: ['/".*"/'] } + }, + { + text: "𦥑井と臼井", + options: { + max: 5, + countBy: "codepoints" + } } ], invalid: [ @@ -249,6 +256,18 @@ Over 18 characters.` max: 5, skipUrlStringLink: false } + }, + { + text: "𦥑井と臼井", + errors: [ + { + message: `Line 1 sentence length(6) exceeds the maximum sentence length of 5.
+Over 1 characters.` + } + ], + options: { + max: 5 + } } ] });