fix(text): unicode support and word splitting according to case (#5447)

Co-authored-by: Yoshiya Hinosawa <[email protected]>
Co-authored-by: Asher Gomez <[email protected]>
denoland · Jul 22, 2024 · 97c5596
1 parent e1935ec
commit 97c5596
Show file tree

Hide file tree

Showing 3 changed files with 122 additions and 3 deletions.
diff --git a/text/_util.ts b/text/_util.ts
@@ -1,9 +1,18 @@
 // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
 
+const CAPITALIZED_WORD_REGEXP = /\p{Lu}\p{Ll}+/u; // one uppercase letter followed by one or more lowercase letters, e.g. Apple
+const ACRONYM_REGEXP = /\p{Lu}+(?=(\p{Lu}\p{Ll})|\P{L}|\b)/u; // a run of uppercase letters, e.g. ID, URL; the lookahead ends the run before a capitalized word, a non-letter or a word boundary, e.g. HTML in HTMLElement
+const LOWERCASED_WORD_REGEXP = /(\p{Ll}+)/u; // a run of lowercase letters, e.g. apple
+const ANY_LETTERS = /\p{L}+/u; // fallback: matches any sequence of letters, including in languages without a concept of upper/lower case
+const DIGITS_REGEXP = /\p{N}+/u; // a run of Unicode number characters, e.g. 123
+
+const WORD_OR_NUMBER_REGEXP = new RegExp( // alternatives are tried left to right at each position, so the more specific patterns come first
+  `${CAPITALIZED_WORD_REGEXP.source}|${ACRONYM_REGEXP.source}|${LOWERCASED_WORD_REGEXP.source}|${ANY_LETTERS.source}|${DIGITS_REGEXP.source}`,
+  "gu", // "g" makes match() return every word; "u" is required for the \p{...} property escapes
+);
+
 export function splitToWords(input: string) {
-  input = input.replaceAll(/[^a-zA-Z0-9\s-_]/g, "");
-  if (/[\s-_]+/.test(input)) return input.split(/[\s-_]+/);
-  return input.split(/(?=[A-Z])+/);
+  return input.match(WORD_OR_NUMBER_REGEXP) || []; // match() returns null when nothing matches, so fall back to an empty array
 }
 
 export function capitalizeWord(word: string): string {

diff --git a/text/_util_test.ts b/text/_util_test.ts
@@ -3,6 +3,33 @@
 import { assertEquals } from "@std/assert";
 import { splitToWords } from "./_util.ts";
 
+Deno.test({
+  name: "split() returns an empty array for an empty string",
+  fn() {
+    const result = splitToWords("");
+    assertEquals(result.length, 0);
+  },
+});
+
+Deno.test({
+  name:
+    "split() returns an empty array when input has no alphanumeric characters",
+  fn() {
+    const result = splitToWords("🦕♥️ 🦕♥️ 🦕♥️");
+    assertEquals(result.length, 0);
+  },
+});
+
+Deno.test({
+  name: "split() ignores non-alphanumeric characters mixed with words",
+  fn() {
+    const result = splitToWords("🦕deno♥️wuv");
+    const expected = ["deno", "wuv"];
+
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles whitespace",
   fn() {
@@ -12,6 +39,15 @@ Deno.test({
   },
 });
 
+Deno.test({
+  name: "split() handles whitespace at string end and start",
+  fn() {
+    const result = splitToWords("  deno Is AWESOME ");
+    const expected = ["deno", "Is", "AWESOME"];
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles mixed delimiters",
   fn() {
@@ -21,6 +57,15 @@ Deno.test({
   },
 });
 
+Deno.test({
+  name: "split() handles a delimiter sequence",
+  fn() {
+    const result = splitToWords("I am   -> thirsty!");
+    const expected = ["I", "am", "thirsty"];
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles upper case delimiter",
   fn() {
@@ -39,6 +84,42 @@ Deno.test({
   },
 });
 
+Deno.test({
+  name: "split() handles casing",
+  fn() {
+    const result = splitToWords("denoIsAwesome");
+    const expected = ["deno", "Is", "Awesome"];
+    assertEquals(result, expected);
+  },
+});
+
+Deno.test({
+  name: "split() handles unicode",
+  fn() {
+    const result = splitToWords("шруберри IsAwesome");
+    const expected = ["шруберри", "Is", "Awesome"];
+    assertEquals(result, expected);
+  },
+});
+
+Deno.test({
+  name: "split() handles unicode casing",
+  fn() {
+    const result = splitToWords("шруберриШруберри");
+    const expected = ["шруберри", "Шруберри"];
+    assertEquals(result, expected);
+  },
+});
+
+Deno.test({
+  name: "split() handles languages without casing",
+  fn() {
+    const result = splitToWords("אין_על דינו");
+    const expected = ["אין", "על", "דינו"];
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles screaming snake case",
   fn() {
@@ -48,6 +129,15 @@ Deno.test({
   },
 });
 
+Deno.test({
+  name: "split() handles acronym followed by a capitalized word",
+  fn() {
+    const result = splitToWords("I Love HTMLDivElement");
+    const expected = ["I", "Love", "HTML", "Div", "Element"];
+    assertEquals(result, expected);
+  },
+});
+
 Deno.test({
   name: "split() handles underscore delimiter",
   fn() {
@@ -56,3 +146,12 @@ Deno.test({
     assertEquals(result, expected);
   },
 });
+
+Deno.test({
+  name: "split() handles acronym followed by a capitalized word",
+  fn() {
+    const result = splitToWords("I Love HTMLDivElement");
+    const expected = ["I", "Love", "HTML", "Div", "Element"];
+    assertEquals(result, expected);
+  },
+});
diff --git a/text/case_test.ts b/text/case_test.ts
@@ -93,6 +93,12 @@ Deno.test("toPascalCase() trims whitespace", () => {
   assertEquals(result, expected);
 });
 
+Deno.test("toPascalCase() converts a single word with Cyrillic letters", () => {
+  const input = "шруберри";
+  const expected = "Шруберри";
+  assertEquals(toPascalCase(input), expected);
+});
+
 Deno.test("toSnakeCase() handles an empty string", () => {
   assertEquals(toSnakeCase(""), "");
 });
@@ -121,6 +127,11 @@ Deno.test("toSnakeCase() trims whitespace", () => {
   assertEquals(result, expected);
 });
 
+Deno.test("toSnakeCase() splits words before and after the numbers", () => {
+  assertEquals(toSnakeCase("str2Num"), "str_2_num");
+  assertEquals(toSnakeCase("Str2Num"), "str_2_num");
+});
+
 Deno.test("toConstantCase() converts a single word", () => {
   const input = "shruberry";
   const expected = "SHRUBERRY";