Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 67 additions & 86 deletions slugify/slugify.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import re
import unicodedata
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from html.entities import name2codepoint

try:
Expand Down Expand Up @@ -38,9 +38,8 @@ def smart_truncate(
:param word_boundary (bool):
:param save_order (bool): if True then word order of output string is like input string
:param separator (str): separator between words
:return:
:return: truncated string
"""

string = string.strip(separator)

if not max_length:
Expand Down Expand Up @@ -89,109 +88,91 @@ def slugify(
) -> str:
"""
Make a slug from the given text.
:param text (str): initial text
:param entities (bool): converts html entities to unicode
:param decimal (bool): converts html decimal to unicode
:param hexadecimal (bool): converts html hexadecimal to unicode
:param max_length (int): output string length
:param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
:param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
:param separator (str): separator between words
:param stopwords (iterable): words to discount
:param regex_pattern (str): regex pattern for disallowed characters
:param lowercase (bool): activate case sensitivity by setting it to False
:param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
:param allow_unicode (bool): allow unicode characters
:return (str):
"""
text = _apply_replacements(text, replacements)
text = _ensure_unicode(text)
text = _normalize_text(text, allow_unicode)
text = _process_html_entities(text, entities, decimal, hexadecimal)
text = _normalize_text(text, allow_unicode) # Re-normalize after entity processing
text = _adjust_case(text, lowercase)
text = _clean_special_chars(text, allow_unicode, regex_pattern)
text = _process_stopwords(text, stopwords, lowercase, DEFAULT_SEPARATOR)
text = _apply_replacements(text, replacements) # Final replacements
text = _truncate_text(text, max_length, word_boundary, separator, save_order, DEFAULT_SEPARATOR)
text = _adjust_separator(text, separator, DEFAULT_SEPARATOR)

return text

# user-specific replacements
if replacements:
for old, new in replacements:
text = text.replace(old, new)
def _apply_replacements(text: str, replacements: Iterable[Iterable[str]]) -> str:
if not replacements:
return text
for old, new in replacements:
text = text.replace(old, new)
return text

# ensure text is unicode
def _ensure_unicode(text: str) -> str:
if not isinstance(text, str):
text = str(text, 'utf-8', 'ignore')
return str(text, 'utf-8', 'ignore')
return text

# replace quotes with dashes - pre-process
def _normalize_text(text: str, allow_unicode: bool) -> str:
text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

# normalize text, convert to unicode if required
if allow_unicode:
text = unicodedata.normalize('NFKC', text)
else:
text = unicodedata.normalize('NFKD', text)
text = unidecode.unidecode(text)

# ensure text is still in unicode
if not isinstance(text, str):
text = str(text, 'utf-8', 'ignore')
return unicodedata.normalize('NFKC', text)
text = unicodedata.normalize('NFKD', text)
return unidecode.unidecode(text)

# character entity reference
def _process_html_entities(text: str, entities: bool, decimal: bool, hexadecimal: bool) -> str:
if entities:
text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

# decimal character reference

if decimal:
try:
text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
except Exception:
pass

# hexadecimal character reference
text = _safe_sub(DECIMAL_PATTERN, lambda m: chr(int(m.group(1))), text)

if hexadecimal:
try:
text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
except Exception:
pass
text = _safe_sub(HEX_PATTERN, lambda m: chr(int(m.group(1), 16)), text)

return text

# re normalize text
if allow_unicode:
text = unicodedata.normalize('NFKC', text)
else:
text = unicodedata.normalize('NFKD', text)
def _safe_sub(pattern: re.Pattern, repl: Callable, text: str) -> str:
try:
return pattern.sub(repl, text)
except Exception:
return text

# make the text lowercase (optional)
def _adjust_case(text: str, lowercase: bool) -> str:
if lowercase:
text = text.lower()
return text.lower()
return text

# remove generated quotes -- post-process
def _clean_special_chars(text: str, allow_unicode: bool, regex_pattern: re.Pattern | str | None) -> str:
text = QUOTE_PATTERN.sub('', text)

# cleanup numbers
text = NUMBERS_PATTERN.sub('', text)

# replace all other unwanted characters
if allow_unicode:
pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
else:
pattern = regex_pattern or DISALLOWED_CHARS_PATTERN


pattern = regex_pattern or (DISALLOWED_UNICODE_CHARS_PATTERN if allow_unicode else DISALLOWED_CHARS_PATTERN)
text = re.sub(pattern, DEFAULT_SEPARATOR, text)
return DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

# remove redundant
text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

# remove stopwords
if stopwords:
if lowercase:
stopwords_lower = [s.lower() for s in stopwords]
words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
else:
words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
text = DEFAULT_SEPARATOR.join(words)

# finalize user-specific replacements
if replacements:
for old, new in replacements:
text = text.replace(old, new)
def _process_stopwords(text: str, stopwords: Iterable[str], lowercase: bool, separator: str) -> str:
if not stopwords:
return text

words = text.split(separator)
if lowercase:
stopwords_set = {s.lower() for s in stopwords}
words = [w for w in words if w not in stopwords_set]
else:
words = [w for w in words if w not in stopwords]

return separator.join(words)

# smart truncate if requested
def _truncate_text(text: str, max_length: int, word_boundary: bool,
save_order: bool, default_separator: str) -> str:
if max_length > 0:
text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

if separator != DEFAULT_SEPARATOR:
text = text.replace(DEFAULT_SEPARATOR, separator)

return smart_truncate(text, max_length, word_boundary, default_separator, save_order)
return text

def _adjust_separator(text: str, separator: str, default_separator: str) -> str:
if separator != default_separator:
return text.replace(default_separator, separator)
return text