Browse Source

Support Unicode 16.0 emoji via unicodedata2 package

merge-requests/159/head
Andrew Shu 5 months ago
parent
commit
fab3abfb82
  1. 1
      tildes/requirements-dev.in
  2. 1
      tildes/requirements-dev.txt
  3. 1
      tildes/requirements.in
  4. 1
      tildes/requirements.txt
  5. 6
      tildes/tests/test_string.py
  6. 9
      tildes/tildes/lib/string.py

1
tildes/requirements-dev.in

@ -12,4 +12,5 @@ types-bleach
types-python-dateutil types-python-dateutil
types-redis types-redis
types-requests types-requests
unicodedata2
webtest webtest

1
tildes/requirements-dev.txt

@ -109,6 +109,7 @@ types-python-dateutil==0.1.4
types-redis==3.5.4 types-redis==3.5.4
types-requests==2.25.0 types-requests==2.25.0
typing-extensions==4.12.2 typing-extensions==4.12.2
unicodedata2==16.0.0
urllib3==1.26.6 urllib3==1.26.6
venusian==3.0.0 venusian==3.0.0
waitress==2.0.0 waitress==2.0.0

1
tildes/requirements.in

@ -36,6 +36,7 @@ SQLAlchemy<1.4
SQLAlchemy-Utils SQLAlchemy-Utils
stripe stripe
titlecase titlecase
unicodedata2
webargs webargs
wrapt wrapt
zope.sqlalchemy zope.sqlalchemy

1
tildes/requirements.txt

@ -67,6 +67,7 @@ tomli==1.2.3
traitlets==5.0.5 traitlets==5.0.5
transaction==3.0.1 transaction==3.0.1
translationstring==1.4 translationstring==1.4
unicodedata2==16.0.0
urllib3==1.26.6 urllib3==1.26.6
venusian==3.0.0 venusian==3.0.0
wcwidth==0.2.5 wcwidth==0.2.5

6
tildes/tests/test_string.py

@ -8,6 +8,7 @@ from tildes.lib.string import (
truncate_string_at_char, truncate_string_at_char,
word_count, word_count,
extract_text_from_html, extract_text_from_html,
_sanitize_characters,
) )
@ -173,3 +174,8 @@ def test_extract_text_from_html_exclude_details():
html = "<details><p>Hide me!</p></details>" html = "<details><p>Hide me!</p></details>"
text = extract_text_from_html(html, exclude_details_include_summary=True) text = extract_text_from_html(html, exclude_details_include_summary=True)
assert text == "Details" assert text == "Details"
def test_sanitize_characters_keeps_unicode_15_moose():
    """Check that the moose emoji (added in Unicode 15) survives sanitization."""
    moose = "🫎"
    sanitized = _sanitize_characters(moose)
    assert sanitized == moose

9
tildes/tildes/lib/string.py

@ -4,13 +4,13 @@
"""Functions related to processing/manipulating strings.""" """Functions related to processing/manipulating strings."""
import re import re
import unicodedata
from collections.abc import Iterator from collections.abc import Iterator
from typing import Optional from typing import Optional
from urllib.parse import quote from urllib.parse import quote
from xml.etree.ElementTree import Element from xml.etree.ElementTree import Element
from html5lib import HTMLParser from html5lib import HTMLParser
import unicodedata2
# regex for matching an entire word, handles words that include an apostrophe # regex for matching an entire word, handles words that include an apostrophe
@ -177,10 +177,11 @@ def simplify_string(original: str) -> str:
def _sanitize_characters(original: str) -> str: def _sanitize_characters(original: str) -> str:
"""Process a string and filter/replace problematic unicode.""" """Process a string and filter/replace problematic unicode."""
# pylint: disable=c-extension-no-member
final_characters = [] final_characters = []
for index, char in enumerate(original): for index, char in enumerate(original):
category = unicodedata.category(char)
category = unicodedata2.category(char)
if category.startswith("Z"): if category.startswith("Z"):
# "separator" chars - replace with a normal space # "separator" chars - replace with a normal space
@ -195,8 +196,8 @@ def _sanitize_characters(original: str) -> str:
# don't break certain emoji variants # don't break certain emoji variants
if char == "\u200D": if char == "\u200D":
try: try:
before_category = unicodedata.category(final_characters[-1])
after_category = unicodedata.category(original[index + 1])
before_category = unicodedata2.category(final_characters[-1])
after_category = unicodedata2.category(original[index + 1])
except IndexError: except IndexError:
continue continue

Loading…
Cancel
Save