Skip to content

Commit

Permalink
Add NFD normalizer (#1211)
Browse files Browse the repository at this point in the history
* Add NFD normalizer

* Update test model ID
  • Loading branch information
xenova authored Feb 26, 2025
1 parent 591a112 commit 161237b
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 29 deletions.
70 changes: 42 additions & 28 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -995,6 +995,8 @@ class Normalizer extends Callable {
return new Replace(config);
case 'NFC':
return new NFC(config);
case 'NFD':
return new NFD(config);
case 'NFKC':
return new NFKC(config);
case 'NFKD':
Expand Down Expand Up @@ -1053,50 +1055,62 @@ class Replace extends Normalizer {
}

/**
* A normalizer that applies Unicode normalization form C (NFC) to the input text.
* A normalizer that applies Unicode normalization to the input text.
* @extends Normalizer
* @abstract
*/
class NFC extends Normalizer {
class UnicodeNormalizer extends Normalizer {
/**
* @type {string} The Unicode normalization form to apply.
* Should be one of: 'NFC', 'NFD', 'NFKC', or 'NFKD'.
*/
form = undefined;

/**
* Normalize the input text by applying Unicode normalization form C (NFC).
* Normalize the input text by applying Unicode normalization.
* @param {string} text The input text to be normalized.
* @returns {string} The normalized text.
*/
normalize(text) {
text = text.normalize('NFC')
text = text.normalize(this.form)
return text;
}
}

/**
* NFKC Normalizer.
* @extends Normalizer
* A normalizer that applies Unicode normalization form C (NFC) to the input text.
* Canonical Decomposition, followed by Canonical Composition.
* @extends UnicodeNormalizer
*/
class NFKC extends Normalizer {
/**
* Normalize text using NFKC normalization.
* @param {string} text The text to be normalized.
* @returns {string} The normalized text.
*/
normalize(text) {
text = text.normalize('NFKC')
return text;
}
class NFC extends UnicodeNormalizer {
// Form name used by the inherited `normalize` method, which calls `text.normalize(this.form)`.
// Replaces the base class's `undefined` placeholder.
form = 'NFC';
}

/**
* NFKD Normalizer.
* @extends Normalizer
* A normalizer that applies Unicode normalization form D (NFD) to the input text.
* Canonical Decomposition.
* @extends UnicodeNormalizer
*/
class NFKD extends Normalizer {
/**
* Normalize text using NFKD normalization.
* @param {string} text The text to be normalized.
* @returns {string} The normalized text.
*/
normalize(text) {
text = text.normalize('NFKD')
return text;
}
class NFD extends UnicodeNormalizer {
// Form name used by the inherited `normalize` method, which calls `text.normalize(this.form)`.
// Replaces the base class's `undefined` placeholder.
form = 'NFD';
}

/**
* A normalizer that applies Unicode normalization form KC (NFKC) to the input text.
* Compatibility Decomposition, followed by Canonical Composition.
*
* This class only selects the form; the `normalize` method inherited from
* `UnicodeNormalizer` does the work by calling `text.normalize(this.form)`.
* @extends UnicodeNormalizer
*/
class NFKC extends UnicodeNormalizer {
// Replaces the base class's `undefined` placeholder with the concrete form name.
form = 'NFKC';
}

/**
* A normalizer that applies Unicode normalization form KD (NFKD) to the input text.
* Compatibility Decomposition.
*
* This class only selects the form; the `normalize` method inherited from
* `UnicodeNormalizer` does the work by calling `text.normalize(this.form)`.
* @extends UnicodeNormalizer
*/
class NFKD extends UnicodeNormalizer {
// Replaces the base class's `undefined` placeholder with the concrete form name.
form = 'NFKD';
}

/**
Expand Down
29 changes: 28 additions & 1 deletion tests/models/bert/test_tokenization_bert.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { BertTokenizer } from "../../../src/tokenizers.js";
import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
import { BASE_TEST_STRINGS, BERT_TEST_STRINGS, NORMALIZATION_TEST_STRINGS } from "../test_strings.js";

export const TOKENIZER_CLASS = BertTokenizer;
export const TEST_CONFIG = {
Expand Down Expand Up @@ -1341,4 +1341,31 @@ export const TEST_CONFIG = {
decoded: "[CLS] ah [UNK] [UNK] zz [SEP]",
},
},
// NFD normalizer
"onnx-community/language_detection-ONNX": {
DEFAULT_EXAMPLE: {
text: NORMALIZATION_TEST_STRINGS.DEFAULT_EXAMPLE,
tokens: ["ame", "##lie", "|", "ame", "##lie"],
ids: [1, 21947, 31933, 70, 21947, 31933, 2],
decoded: "[CLS] amelie | amelie [SEP]",
},
CANONICAL_EQUIVALENCE_NORMALIZATION: {
text: NORMALIZATION_TEST_STRINGS.CANONICAL_EQUIVALENCE_NORMALIZATION,
tokens: ["n", "|", "n"],
ids: [1, 56, 70, 56, 2],
decoded: "[CLS] n | n [SEP]",
},
COMPATIBILITY_NORMALIZATION: {
text: NORMALIZATION_TEST_STRINGS.COMPATIBILITY_NORMALIZATION,
tokens: ["[UNK]", "|", "ff"],
ids: [1, 0, 70, 40133, 2],
decoded: "[CLS] [UNK] | ff [SEP]",
},
COMBINED_EXAMPLE: {
text: NORMALIZATION_TEST_STRINGS.COMBINED_EXAMPLE,
tokens: ["ſ", "|", "ſ", "|", "ſ", "|", "s", "|", "s"],
ids: [1, 121, 70, 121, 70, 121, 70, 61, 70, 61, 2],
decoded: "[CLS] ſ | ſ | ſ | s | s [SEP]",
},
},
};
10 changes: 10 additions & 0 deletions tests/models/test_strings.js
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,13 @@ export const M2M_100_TEST_STRINGS = {
HIDNI_TEXT: "जीवन एक चॉकलेट बॉक्स की तरह है।",
CHINESE_TEXT: "生活就像一盒巧克力。",
};

// Inputs for exercising Unicode normalizers. Each value lists several spellings of the
// same (or compatibility-equivalent) text, separated by " | ".
// Adapted from https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
export const NORMALIZATION_TEST_STRINGS = {
  // "Amélie" written with a precomposed é, then with e + combining acute accent.
  DEFAULT_EXAMPLE: [
    "\u0041\u006d\u00e9\u006c\u0069\u0065",
    "\u0041\u006d\u0065\u0301\u006c\u0069\u0065",
  ].join(" | "),

  // Precomposed ñ, then n + combining tilde.
  CANONICAL_EQUIVALENCE_NORMALIZATION: [
    "\u00F1",
    "\u006E\u0303",
  ].join(" | "),

  // The "ff" ligature code point, then two plain "f" letters.
  COMPATIBILITY_NORMALIZATION: [
    "\uFB00",
    "\u0066\u0066",
  ].join(" | "),

  // One sequence followed by its four normalized spellings.
  COMBINED_EXAMPLE: [
    "\u1E9B\u0323", // Original
    "\u1E9B\u0323", // NFC
    "\u017F\u0323\u0307", // NFD
    "\u1E69", // NFKC
    "\u0073\u0323\u0307", // NFKD
  ].join(" | "),
};

0 comments on commit 161237b

Please sign in to comment.