Skip to content

Commit

Permalink
allow unicode (#111)
Browse files Browse the repository at this point in the history
* initial commit to allow unicode

* update version and changelog

* add the flag to the CLI

* update README.md
  • Loading branch information
mrezzamoradi authored Feb 22, 2022
1 parent 07b87da commit d968ca7
Show file tree
Hide file tree
Showing 6 changed files with 328 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 6.1.0

- Add `allow_unicode` flag to allow unicode characters in the slug

## 6.0.1

- Rework regex_pattern to mean the opposite (disallowed chars instead of allowed)
Expand Down
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def slugify(
stopwords=(),
regex_pattern=None,
lowercase=True,
replacements=()
replacements=(),
allow_unicode=False
):
"""
Make a slug from the given text.
Expand All @@ -58,6 +59,7 @@ def slugify(
:param regex_pattern (str): regex pattern for disallowed characters
:param lowercase (bool): convert the slug to lowercase; set to False to preserve the original case
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
:param allow_unicode (bool): allow unicode characters
:return (str): slugified text
"""
```
Expand All @@ -75,6 +77,10 @@ txt = '影師嗎'
r = slugify(txt)
self.assertEqual(r, "ying-shi-ma")

txt = '影師嗎'
r = slugify(txt, allow_unicode=True)
self.assertEqual(r, "影師嗎")

txt = 'C\'est déjà l\'été.'
r = slugify(txt)
self.assertEqual(r, "c-est-deja-l-ete")
Expand Down Expand Up @@ -133,6 +139,14 @@ txt = 'ÜBER Über German Umlaut'
r = slugify(txt, replacements=[['Ü', 'UE'], ['ü', 'ue']])
self.assertEqual(r, "ueber-ueber-german-umlaut")

txt = 'i love 🦄'
r = slugify(txt, allow_unicode=True)
self.assertEqual(r, "i-love")

txt = 'i love 🦄'
r = slugify(txt, allow_unicode=True, regex_pattern=r'[^🦄]+')
self.assertEqual(r, "🦄")

```

For more examples, have a look at the [test.py](test.py) file.
Expand Down
5 changes: 4 additions & 1 deletion slugify/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def parse_args(argv):
help="Activate case sensitivity")
parser.add_argument("--replacements", nargs='+',
help="""Additional replacement rules e.g. "|->or", "%%->percent".""")
parser.add_argument("--allow-unicode", action='store_true', default=False,
help="Allow unicode characters")

args = parser.parse_args(argv[1:])

Expand Down Expand Up @@ -73,7 +75,8 @@ def slugify_params(args):
separator=args.separator,
stopwords=args.stopwords,
lowercase=args.lowercase,
replacements=args.replacements
replacements=args.replacements,
allow_unicode=args.allow_unicode
)


Expand Down
2 changes: 1 addition & 1 deletion slugify/__version__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
__url__ = 'https://github.com/un33k/python-slugify'
__license__ = 'MIT'
__copyright__ = 'Copyright 2022 Val Neekman @ Neekware Inc.'
__version__ = '6.0.1'
__version__ = '6.1.0'
20 changes: 16 additions & 4 deletions slugify/slugify.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
QUOTE_PATTERN = re.compile(r'[\']+')
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
DEFAULT_SEPARATOR = '-'
Expand Down Expand Up @@ -66,7 +67,8 @@ def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', sav

def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
replacements: typing.Iterable[typing.Iterable[str]] = ()):
replacements: typing.Iterable[typing.Iterable[str]] = (),
allow_unicode=False):
"""
Make a slug from the given text.
:param text (str): initial text
Expand All @@ -81,6 +83,7 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
:param regex_pattern (str): regex pattern for disallowed characters
:param lowercase (bool): activate case sensitivity by setting it to False
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
:param allow_unicode (bool): allow unicode characters
:return (str):
"""

Expand All @@ -97,7 +100,8 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

# decode unicode
text = unidecode.unidecode(text)
if not allow_unicode:
text = unidecode.unidecode(text)

# ensure text is still in unicode
if not isinstance(text, str):
Expand All @@ -122,7 +126,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
pass

# translate
text = unicodedata.normalize('NFKD', text)
if allow_unicode:
text = unicodedata.normalize('NFKC', text)
else:
text = unicodedata.normalize('NFKD', text)

if sys.version_info < (3,):
text = text.encode('ascii', 'ignore')

Expand All @@ -137,7 +145,11 @@ def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, w
text = NUMBERS_PATTERN.sub('', text)

# replace all other unwanted characters
pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
if allow_unicode:
pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
else:
pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

text = re.sub(pattern, DEFAULT_SEPARATOR, text)

# remove redundant
Expand Down
Loading

0 comments on commit d968ca7

Please sign in to comment.