Skip to content

Commit

Permalink
fix(text): unicode support and word splitting according to case (#5447)
Browse files Browse the repository at this point in the history
Co-authored-by: Yoshiya Hinosawa <[email protected]>
Co-authored-by: Asher Gomez <[email protected]>
  • Loading branch information
3 people authored Jul 22, 2024
1 parent e1935ec commit 97c5596
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 3 deletions.
15 changes: 12 additions & 3 deletions text/_util.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

// Matches a capitalized word: one uppercase letter followed by one or more
// lowercase letters, e.g. "Apple".
const CAPITALIZED_WORD_REGEXP = /\p{Lu}\p{Ll}+/u;
// Matches a run of uppercase letters, e.g. "ID" or "URL". The lookahead ends
// the run before a capitalized word, a non-letter or a word boundary, so an
// acronym followed by a capitalized word ("HTMLElement") yields "HTML".
const ACRONYM_REGEXP = /\p{Lu}+(?=(\p{Lu}\p{Ll})|\P{L}|\b)/u;
// Matches a run of lowercase letters, e.g. "apple".
const LOWERCASED_WORD_REGEXP = /(\p{Ll}+)/u;
// Matches any run of letters, including scripts that have no concept of
// upper/lower case.
const ANY_LETTERS = /\p{L}+/u;
// Matches a run of numeric characters, e.g. "123".
const DIGITS_REGEXP = /\p{N}+/u;

// Alternation of all the patterns above. Order matters: the case-aware
// patterns are tried before the generic letter fallback, and digits come last.
const WORD_OR_NUMBER_REGEXP = new RegExp(
  `${CAPITALIZED_WORD_REGEXP.source}|${ACRONYM_REGEXP.source}|${LOWERCASED_WORD_REGEXP.source}|${ANY_LETTERS.source}|${DIGITS_REGEXP.source}`,
  "gu",
);

/**
 * Splits a string into its words and numbers.
 *
 * Words are found by matching Unicode letter and number runs, so any other
 * character (whitespace, punctuation, emoji, `_`, `-`) acts as a separator and
 * is dropped. Case changes also split words: `"denoIsAwesome"` becomes
 * `["deno", "Is", "Awesome"]`, and `"HTMLDivElement"` becomes
 * `["HTML", "Div", "Element"]`.
 *
 * @param input The string to split.
 * @returns The matched words and numbers in order of appearance, or an empty
 * array when the input contains no letters or digits.
 */
export function splitToWords(input: string): string[] {
  // `match` with a global regexp returns `null` when nothing matches.
  return input.match(WORD_OR_NUMBER_REGEXP) ?? [];
}

export function capitalizeWord(word: string): string {
Expand Down
99 changes: 99 additions & 0 deletions text/_util_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,33 @@
import { assertEquals } from "@std/assert";
import { splitToWords } from "./_util.ts";

Deno.test({
  name: "split() returns an empty array for an empty string",
  fn: () => {
    // An empty input must produce no words at all.
    const words = splitToWords("");
    assertEquals(words.length, 0);
  },
});

Deno.test({
  name:
    "split() returns an empty array when input has no alphanumeric characters",
  fn: () => {
    // The input holds only emoji and spaces, so no words are expected.
    const words = splitToWords("🦕♥️ 🦕♥️ 🦕♥️");
    assertEquals(words.length, 0);
  },
});

Deno.test({
  name: "split() ignores non-alphanumeric characters mixed with words",
  fn: () => {
    // Emoji between the words act as separators and are dropped.
    const words = splitToWords("🦕deno♥️wuv");
    assertEquals(words, ["deno", "wuv"]);
  },
});

Deno.test({
name: "split() handles whitespace",
fn() {
Expand All @@ -12,6 +39,15 @@ Deno.test({
},
});

Deno.test({
  name: "split() handles whitespace at string end and start",
  fn: () => {
    // Leading and trailing whitespace must not produce empty words.
    const words = splitToWords(" deno Is AWESOME ");
    assertEquals(words, ["deno", "Is", "AWESOME"]);
  },
});

Deno.test({
name: "split() handles mixed delimiters",
fn() {
Expand All @@ -21,6 +57,15 @@ Deno.test({
},
});

Deno.test({
  name: "split() handles a delimiter sequence",
  fn: () => {
    // A run of several delimiter characters (" -> ") counts as one separator,
    // and the trailing "!" is dropped.
    const words = splitToWords("I am -> thirsty!");
    assertEquals(words, ["I", "am", "thirsty"]);
  },
});

Deno.test({
name: "split() handles upper case delimiter",
fn() {
Expand All @@ -39,6 +84,42 @@ Deno.test({
},
});

Deno.test({
  name: "split() handles casing",
  fn: () => {
    // Each lowercase-to-uppercase transition starts a new word.
    const words = splitToWords("denoIsAwesome");
    assertEquals(words, ["deno", "Is", "Awesome"]);
  },
});

Deno.test({
  name: "split() handles unicode",
  fn: () => {
    // A Cyrillic word is kept intact next to a camel-cased Latin part.
    const words = splitToWords("шруберри IsAwesome");
    assertEquals(words, ["шруберри", "Is", "Awesome"]);
  },
});

Deno.test({
  name: "split() handles unicode casing",
  fn: () => {
    // Case-based splitting also applies to non-Latin (Cyrillic) letters.
    const words = splitToWords("шруберриШруберри");
    assertEquals(words, ["шруберри", "Шруберри"]);
  },
});

Deno.test({
  name: "split() handles languages without casing",
  fn: () => {
    // Hebrew has no letter case; words are separated only by "_" and the space.
    const words = splitToWords("אין_על דינו");
    assertEquals(words, ["אין", "על", "דינו"]);
  },
});

Deno.test({
name: "split() handles screaming snake case",
fn() {
Expand All @@ -48,6 +129,15 @@ Deno.test({
},
});

Deno.test({
  name: "split() handles acronym followed by a capitalized word",
  fn: () => {
    // "HTML" is separated from the capitalized words "Div" and "Element".
    const words = splitToWords("I Love HTMLDivElement");
    assertEquals(words, ["I", "Love", "HTML", "Div", "Element"]);
  },
});

Deno.test({
name: "split() handles underscore delimiter",
fn() {
Expand All @@ -56,3 +146,12 @@ Deno.test({
assertEquals(result, expected);
},
});

Deno.test({
  name: "split() handles acronym at the end of the input",
  fn() {
    // This slot previously held an exact duplicate of the "acronym followed by
    // a capitalized word" test. It now covers an acronym that ends the input,
    // with no capitalized word after it.
    const result = splitToWords("I Love HTML");
    const expected = ["I", "Love", "HTML"];
    assertEquals(result, expected);
  },
});
11 changes: 11 additions & 0 deletions text/case_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ Deno.test("toPascalCase() trims whitespace", () => {
assertEquals(result, expected);
});

Deno.test("toPascalCase() converts a single word with Cyrillic letters", () => {
  // The first letter is expected to be upper-cased for a non-Latin word too.
  const actual = toPascalCase("шруберри");
  assertEquals(actual, "Шруберри");
});

Deno.test("toSnakeCase() handles an empty string", () => {
  // An empty input must be returned as an empty string.
  const actual = toSnakeCase("");
  assertEquals(actual, "");
});
Expand Down Expand Up @@ -121,6 +127,11 @@ Deno.test("toSnakeCase() trims whitespace", () => {
assertEquals(result, expected);
});

Deno.test("toSnakeCase() splits words before and after the numbers", () => {
  // Both spellings must give the same result: the digit becomes its own
  // segment regardless of the first letter's case.
  for (const input of ["str2Num", "Str2Num"]) {
    assertEquals(toSnakeCase(input), "str_2_num");
  }
});

Deno.test("toConstantCase() converts a single word", () => {
const input = "shruberry";
const expected = "SHRUBERRY";
Expand Down

0 comments on commit 97c5596

Please sign in to comment.