From d968ca7419e6f4e40685888c56d03bea50fd39d7 Mon Sep 17 00:00:00 2001 From: Reza Moradi Date: Tue, 22 Feb 2022 20:05:08 +0100 Subject: [PATCH] allow unicode (#111) * initial commit to allow unicode * update version and changelog * add the flag to the CLI * update README.md --- CHANGELOG.md | 4 + README.md | 16 ++- slugify/__main__.py | 5 +- slugify/__version__.py | 2 +- slugify/slugify.py | 20 ++- test.py | 288 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 328 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95ad243..49f88dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 6.1.0 + +- Add `allow_unicode` flag to allow unicode characters in the slug + ## 6.0.1 - Rework regex_pattern to mean the opposite (disallowed chars instead of allowed) diff --git a/README.md b/README.md index 11e20da..f93afee 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,8 @@ def slugify( stopwords=(), regex_pattern=None, lowercase=True, - replacements=() + replacements=(), + allow_unicode=False ): """ Make a slug from the given text. @@ -58,6 +59,7 @@ def slugify( :param regex_pattern (str): regex pattern for disallowed characters :param lowercase (bool): activate case sensitivity by setting it to False :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']] + :param allow_unicode (bool): allow unicode characters :return (str): slugify text """ ``` @@ -75,6 +77,10 @@ txt = '影師嗎' r = slugify(txt) self.assertEqual(r, "ying-shi-ma") +txt = '影師嗎' +r = slugify(txt, allow_unicode=True) +self.assertEqual(r, "影師嗎") + txt = 'C\'est déjà l\'été.' 
r = slugify(txt) self.assertEqual(r, "c-est-deja-l-ete") @@ -133,6 +139,14 @@ txt = 'ÜBER Über German Umlaut' r = slugify(txt, replacements=[['Ü', 'UE'], ['ü', 'ue']]) self.assertEqual(r, "ueber-ueber-german-umlaut") +txt = 'i love 🦄' +r = slugify(txt, allow_unicode=True) +self.assertEqual(r, "i-love") + +txt = 'i love 🦄' +r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+') +self.assertEqual(r, "🦄") + ``` For more examples, have a look at the [test.py](test.py) file. diff --git a/slugify/__main__.py b/slugify/__main__.py index 5a888fe..7dd6b01 100644 --- a/slugify/__main__.py +++ b/slugify/__main__.py @@ -36,6 +36,8 @@ def parse_args(argv): help="Activate case sensitivity") parser.add_argument("--replacements", nargs='+', help="""Additional replacement rules e.g. "|->or", "%%->percent".""") + parser.add_argument("--allow-unicode", action='store_true', default=False, + help="Allow unicode characters") args = parser.parse_args(argv[1:]) @@ -73,7 +75,8 @@ def slugify_params(args): separator=args.separator, stopwords=args.stopwords, lowercase=args.lowercase, - replacements=args.replacements + replacements=args.replacements, + allow_unicode=args.allow_unicode ) diff --git a/slugify/__version__.py b/slugify/__version__.py index 1eedf44..e14e887 100644 --- a/slugify/__version__.py +++ b/slugify/__version__.py @@ -5,4 +5,4 @@ __url__ = 'https://github.com/un33k/python-slugify' __license__ = 'MIT' __copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.' 
-__version__ = '6.0.1' +__version__ = '6.1.0' diff --git a/slugify/slugify.py b/slugify/slugify.py index 190ea92..ae6c9b6 100644 --- a/slugify/slugify.py +++ b/slugify/slugify.py @@ -17,6 +17,7 @@ HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);') QUOTE_PATTERN = re.compile(r'[\']+') DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+') +DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+') DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}') NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)') DEFAULT_SEPARATOR = '-' @@ -66,7 +67,8 @@ def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', sav def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True, - replacements: typing.Iterable[typing.Iterable[str]] = ()): + replacements: typing.Iterable[typing.Iterable[str]] = (), + allow_unicode=False): """ Make a slug from the given text. :param text (str): initial text @@ -81,6 +83,7 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w :param regex_pattern (str): regex pattern for disallowed characters :param lowercase (bool): activate case sensitivity by setting it to False :param replacements (iterable): list of replacement rules e.g. 
[['|', 'or'], ['%', 'percent']] + :param allow_unicode (bool): allow unicode characters :return (str): """ @@ -97,7 +100,8 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text) # decode unicode - text = unidecode.unidecode(text) + if not allow_unicode: + text = unidecode.unidecode(text) # ensure text is still in unicode if not isinstance(text, str): @@ -122,7 +126,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w pass # translate - text = unicodedata.normalize('NFKD', text) + if allow_unicode: + text = unicodedata.normalize('NFKC', text) + else: + text = unicodedata.normalize('NFKD', text) + if sys.version_info < (3,): text = text.encode('ascii', 'ignore') @@ -137,7 +145,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w text = NUMBERS_PATTERN.sub('', text) # replace all other unwanted characters - pattern = regex_pattern or DISALLOWED_CHARS_PATTERN + if allow_unicode: + pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN + else: + pattern = regex_pattern or DISALLOWED_CHARS_PATTERN + text = re.sub(pattern, DEFAULT_SEPARATOR, text) # remove redundant diff --git a/test.py b/test.py index 752c499..931f38f 100644 --- a/test.py +++ b/test.py @@ -233,6 +233,294 @@ def test_replacements_german_umlaut_custom(self): self.assertEqual(r, "ueber-ueber-german-umlaut") +class TestSlugifyUnicode(unittest.TestCase): + + def test_extraneous_seperators(self): + + txt = "This is a test ---" + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "this-is-a-test") + + txt = "___This is a test ---" + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "this-is-a-test") + + txt = "___This is a test___" + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "this-is-a-test") + + def test_non_word_characters(self): + txt = "This -- is a ## test ---" + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, 
"this-is-a-test") + + def test_phonetic_conversion_of_eastern_scripts(self): + txt = '影師嗎' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, txt) + + def test_accented_text(self): + txt = 'C\'est déjà l\'été.' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "c-est-déjà-l-été") + + txt = 'Nín hǎo. Wǒ shì zhōng guó rén' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "nín-hǎo-wǒ-shì-zhōng-guó-rén") + + def test_accented_text_with_non_word_characters(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "jaja-lol-méméméoo-a") + + def test_cyrillic_text(self): + txt = 'Компьютер' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "компьютер") + + def test_max_length(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=9) + self.assertEqual(r, "jaja-lol") + + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=15) + self.assertEqual(r, "jaja-lol-mémémé") + + def test_max_length_cutoff_not_required(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=50) + self.assertEqual(r, "jaja-lol-méméméoo-a") + + def test_word_boundary(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=15, word_boundary=True) + self.assertEqual(r, "jaja-lol-a") + + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=17, word_boundary=True) + self.assertEqual(r, "jaja-lol-méméméoo") + + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=18, word_boundary=True) + self.assertEqual(r, "jaja-lol-méméméoo") + + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=19, word_boundary=True) + self.assertEqual(r, "jaja-lol-méméméoo-a") + + def test_custom_separator(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=20, word_boundary=True, separator=".") 
+ self.assertEqual(r, "jaja.lol.méméméoo.a") + + def test_multi_character_separator(self): + txt = 'jaja---lol-méméméoo--a' + r = slugify(txt, allow_unicode=True, max_length=20, word_boundary=True, separator="ZZZZZZ") + self.assertEqual(r, "jajaZZZZZZlolZZZZZZméméméooZZZZZZa") + + def test_save_order(self): + txt = 'one two three four five' + r = slugify(txt, allow_unicode=True, max_length=13, word_boundary=True, save_order=True) + self.assertEqual(r, "one-two-three") + + txt = 'one two three four five' + r = slugify(txt, allow_unicode=True, max_length=13, word_boundary=True, save_order=False) + self.assertEqual(r, "one-two-three") + + txt = 'one two three four five' + r = slugify(txt, allow_unicode=True, max_length=12, word_boundary=True, save_order=False) + self.assertEqual(r, "one-two-four") + + txt = 'one two three four five' + r = slugify(txt, allow_unicode=True, max_length=12, word_boundary=True, save_order=True) + self.assertEqual(r, "one-two") + + def test_save_order_rtl(self): + """For right-to-left unicode languages""" + txt = 'دو سه چهار پنج' + r = slugify(txt, allow_unicode=True, max_length=10, word_boundary=True, save_order=True) + self.assertEqual(r, "دو-سه-چهار") + + txt = 'دو سه چهار پنج' + r = slugify(txt, allow_unicode=True, max_length=10, word_boundary=True, save_order=False) + self.assertEqual(r, "دو-سه-چهار") + + txt = 'دو سه چهار پنج' + r = slugify(txt, allow_unicode=True, max_length=9, word_boundary=True, save_order=False) + self.assertEqual(r, "دو-سه-پنج") + + txt = 'دو سه چهار پنج' + r = slugify(txt, allow_unicode=True, max_length=9, word_boundary=True, save_order=True) + self.assertEqual(r, "دو-سه") + + def test_stopword_removal(self): + txt = 'this has a stopword' + r = slugify(txt, allow_unicode=True, stopwords=['stopword']) + self.assertEqual(r, 'this-has-a') + + txt = 'this has a Öländ' + r = slugify(txt, allow_unicode=True, stopwords=['Öländ']) + self.assertEqual(r, 'this-has-a') + + def test_stopword_removal_casesensitive(self): + 
txt = 'thIs Has a stopword Stopword' + r = slugify(txt, allow_unicode=True, stopwords=['Stopword'], lowercase=False) + self.assertEqual(r, 'thIs-Has-a-stopword') + + txt = 'thIs Has a öländ Öländ' + r = slugify(txt, allow_unicode=True, stopwords=['Öländ'], lowercase=False) + self.assertEqual(r, 'thIs-Has-a-öländ') + + def test_multiple_stopword_occurances(self): + txt = 'the quick brown fox jumps over the lazy dog' + r = slugify(txt, allow_unicode=True, stopwords=['the']) + self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog') + + def test_differently_cased_stopword_match(self): + txt = 'Foo A FOO B foo C' + r = slugify(txt, allow_unicode=True, stopwords=['foo']) + self.assertEqual(r, 'a-b-c') + + txt = 'Foo A FOO B foo C' + r = slugify(txt, allow_unicode=True, stopwords=['FOO']) + self.assertEqual(r, 'a-b-c') + + def test_multiple_stopwords(self): + txt = 'the quick brown fox jumps over the lazy dog in a hurry' + r = slugify(txt, allow_unicode=True, stopwords=['the', 'in', 'a', 'hurry']) + self.assertEqual(r, 'quick-brown-fox-jumps-over-lazy-dog') + + def test_stopwords_with_different_separator(self): + txt = 'the quick brown fox jumps over the lazy dog' + r = slugify(txt, allow_unicode=True, stopwords=['the'], separator=' ') + self.assertEqual(r, 'quick brown fox jumps over lazy dog') + + def test_html_entities_on(self): + txt = 'foo &amp; bar' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, 'foo-bar') + + def test_html_entities_off(self): + txt = 'foo &amp; bår' + r = slugify(txt, allow_unicode=True, entities=False) + self.assertEqual(r, 'foo-amp-bår') + + def test_html_decimal_on(self): + txt = '&#381;' + r = slugify(txt, allow_unicode=True, decimal=True) + self.assertEqual(r, 'ž') + + def test_html_decimal_off(self): + txt = '&#381;' + r = slugify(txt, allow_unicode=True, entities=False, decimal=False) + self.assertEqual(r, '381') + + def test_html_hexadecimal_on(self): + txt = '&#x17D;' + r = slugify(txt, allow_unicode=True, hexadecimal=True) + self.assertEqual(r,
'ž') + + def test_html_hexadecimal_off(self): + txt = '&#x17D;' + r = slugify(txt, allow_unicode=True, hexadecimal=False) + self.assertEqual(r, 'x17d') + + def test_starts_with_number(self): + txt = '10 amazing secrets' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, '10-amazing-secrets') + + def test_contains_numbers(self): + txt = 'buildings with 1000 windows' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, 'buildings-with-1000-windows') + + def test_ends_with_number(self): + txt = 'recipe number 3' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, 'recipe-number-3') + + def test_numbers_only(self): + txt = '404' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, '404') + + def test_numbers_and_symbols(self): + txt = '1,000 reasons you are #1' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, '1000-reasons-you-are-1') + + txt = '۱,۰۰۰ reasons you are #۱' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, '۱۰۰۰-reasons-you-are-۱') + + def test_regex_pattern_keep_underscore(self): + """allowing unicode should not overrule the passed regex_pattern""" + txt = "___This is a test___" + regex_pattern = r'[^-a-z0-9_]+' + r = slugify(txt, allow_unicode=True, regex_pattern=regex_pattern) + self.assertEqual(r, "___this-is-a-test___") + + def test_regex_pattern_keep_underscore_with_underscore_as_separator(self): + """ + The regex_pattern turns the power to the caller. + Hence, the caller must ensure that a custom separator doesn't clash + with the regex_pattern.
+ """ + txt = "___This is a test___" + regex_pattern = r'[^-a-z0-9_]+' + r = slugify(txt, allow_unicode=True, separator='_', regex_pattern=regex_pattern) + self.assertNotEqual(r, "_this_is_a_test_") + + def test_replacements(self): + txt = '10 | 20 %' + r = slugify(txt, allow_unicode=True, replacements=[['|', 'or'], ['%', 'percent']]) + self.assertEqual(r, "10-or-20-percent") + + txt = 'I ♥ 🦄' + r = slugify(txt, allow_unicode=True, replacements=[['♥', 'amour'], ['🦄', 'licorne']]) + self.assertEqual(r, "i-amour-licorne") + + txt = 'I ♥ 🦄' + r = slugify(txt, allow_unicode=True, replacements=[['♥', 'სიყვარული'], ['🦄', 'licorne']]) + self.assertEqual(r, "i-სიყვარული-licorne") + + def test_replacements_german_umlaut_custom(self): + txt = 'ÜBER Über German Umlaut' + r = slugify(txt, allow_unicode=True, replacements=[['Ü', 'UE'], ['ü', 'ue']]) + self.assertEqual(r, "ueber-ueber-german-umlaut") + + def test_emojis(self): + """ + allowing unicode shouldn't allow emojis, even in replacements. + the only exception is when it is allowed by the regex_pattern. regex_pattern overrules all + """ + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True) + self.assertEqual(r, "i-love") + + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True, decimal=True) + self.assertEqual(r, "i-love") + + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True, hexadecimal=True) + self.assertEqual(r, "i-love") + + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True, entities=True) + self.assertEqual(r, "i-love") + + txt = 'i love you' + r = slugify(txt, allow_unicode=True, replacements=[['you', '🦄']]) + self.assertEqual(r, "i-love") + + txt = 'i love 🦄' + r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+') + self.assertEqual(r, "🦄") + + class TestUtils(unittest.TestCase): def test_smart_truncate_no_max_length(self):