allow unicode (#111)

* initial commit to allow unicode
* update version and changelog
* add the flag to the CLI
* update README.md
un33k · Feb 22, 2022 · d968ca7 · d968ca7
1 parent 07b87da
commit d968ca7
Show file tree

Hide file tree

Showing 6 changed files with 328 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 6.1.0
+
+- Add `allow_unicode` flag to allow unicode characters in the slug
+
 ## 6.0.1
 
 - Rework regex_pattern to mean the opposite (disallowed chars instead of allowed)

diff --git a/README.md b/README.md
@@ -42,7 +42,8 @@ def slugify(
     stopwords=(),
     regex_pattern=None,
     lowercase=True,
-    replacements=()
+    replacements=(),
+    allow_unicode=False
   ):
   """
   Make a slug from the given text.
@@ -58,6 +59,7 @@ def slugify(
   :param regex_pattern (str): regex pattern for disallowed characters
   :param lowercase (bool): activate case sensitivity by setting it to False
   :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+  :param allow_unicode (bool): allow unicode characters
   :return (str): slugified text
   """
 ```
@@ -75,6 +77,10 @@ txt = '影師嗎'
 r = slugify(txt)
 self.assertEqual(r, "ying-shi-ma")
 
+txt = '影師嗎'
+r = slugify(txt, allow_unicode=True)
+self.assertEqual(r, "影師嗎")
+
 txt = 'C\'est déjà l\'été.'
 r = slugify(txt)
 self.assertEqual(r, "c-est-deja-l-ete")
@@ -133,6 +139,14 @@ txt = 'ÜBER Über German Umlaut'
 r = slugify(txt, replacements=[['Ü', 'UE'], ['ü', 'ue']])
 self.assertEqual(r, "ueber-ueber-german-umlaut")
 
+txt = 'i love 🦄'
+r = slugify(txt, allow_unicode=True)
+self.assertEqual(r, "i-love")
+
+txt = 'i love 🦄'
+r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+')
+self.assertEqual(r, "🦄")
+
 ```
 
 For more examples, have a look at the [test.py](test.py) file.

diff --git a/slugify/__main__.py b/slugify/__main__.py
@@ -36,6 +36,8 @@ def parse_args(argv):
                         help="Activate case sensitivity")
     parser.add_argument("--replacements", nargs='+',
                         help="""Additional replacement rules e.g. "|->or", "%%->percent".""")
+    parser.add_argument("--allow-unicode", action='store_true', default=False,
+                        help="Allow unicode characters")
 
     args = parser.parse_args(argv[1:])
 
@@ -73,7 +75,8 @@ def slugify_params(args):
         separator=args.separator,
         stopwords=args.stopwords,
         lowercase=args.lowercase,
-        replacements=args.replacements
+        replacements=args.replacements,
+        allow_unicode=args.allow_unicode
     )
 
 

diff --git a/slugify/__version__.py b/slugify/__version__.py
@@ -5,4 +5,4 @@
 __url__ = 'https://github.com/un33k/python-slugify'
 __license__ = 'MIT'
 __copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.'
-__version__ = '6.0.1'
+__version__ = '6.1.0'
diff --git a/slugify/slugify.py b/slugify/slugify.py
@@ -17,6 +17,7 @@
 HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
 QUOTE_PATTERN = re.compile(r'[\']+')
 DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
+DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
 DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
 NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
 DEFAULT_SEPARATOR = '-'
@@ -66,7 +67,8 @@ def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', sav
 
 def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
             separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
-            replacements: typing.Iterable[typing.Iterable[str]] = ()):
+            replacements: typing.Iterable[typing.Iterable[str]] = (),
+            allow_unicode=False):
     """
     Make a slug from the given text.
     :param text (str): initial text
@@ -81,6 +83,7 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
     :param regex_pattern (str): regex pattern for disallowed characters
     :param lowercase (bool): activate case sensitivity by setting it to False
     :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
+    :param allow_unicode (bool): allow unicode characters
     :return (str):
     """
 
@@ -97,7 +100,8 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
     text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
 
     # transliterate unicode to ASCII (skipped when allow_unicode is set)
-    text = unidecode.unidecode(text)
+    if not allow_unicode:
+        text = unidecode.unidecode(text)
 
     # ensure text is still in unicode
     if not isinstance(text, str):
@@ -122,7 +126,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
             pass
 
     # translate
-    text = unicodedata.normalize('NFKD', text)
+    if allow_unicode:
+        text = unicodedata.normalize('NFKC', text)
+    else:
+        text = unicodedata.normalize('NFKD', text)
+
     if sys.version_info < (3,):
         text = text.encode('ascii', 'ignore')
 
@@ -137,7 +145,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
     text = NUMBERS_PATTERN.sub('', text)
 
     # replace all other unwanted characters
-    pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
+    if allow_unicode:
+        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
+    else:
+        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
+
     text = re.sub(pattern, DEFAULT_SEPARATOR, text)
 
     # remove redundant