Browse Source

Support Unicode 16.0 emoji via unicodedata2 package

See merge request tildes/tildes!159
staging-1.101
talklittle 5 months ago
committed by Andrew Shu
parent
commit
a8261cf441
  1. 1
      tildes/requirements-dev.in
  2. 1
      tildes/requirements-dev.txt
  3. 1
      tildes/requirements.in
  4. 1
      tildes/requirements.txt
  5. 5
      tildes/tests/test_simplestring_field.py
  6. 9
      tildes/tildes/lib/string.py

1
tildes/requirements-dev.in

@ -12,4 +12,5 @@ types-bleach
types-python-dateutil types-python-dateutil
types-redis types-redis
types-requests types-requests
unicodedata2
webtest webtest

1
tildes/requirements-dev.txt

@ -109,6 +109,7 @@ types-python-dateutil==0.1.4
types-redis==3.5.4 types-redis==3.5.4
types-requests==2.25.0 types-requests==2.25.0
typing-extensions==4.12.2 typing-extensions==4.12.2
unicodedata2==16.0.0
urllib3==1.26.6 urllib3==1.26.6
venusian==3.0.0 venusian==3.0.0
waitress==2.0.0 waitress==2.0.0

1
tildes/requirements.in

@ -36,6 +36,7 @@ SQLAlchemy<1.4
SQLAlchemy-Utils SQLAlchemy-Utils
stripe stripe
titlecase titlecase
unicodedata2
webargs webargs
wrapt wrapt
zope.sqlalchemy zope.sqlalchemy

1
tildes/requirements.txt

@ -67,6 +67,7 @@ tomli==1.2.3
traitlets==5.0.5 traitlets==5.0.5
transaction==3.0.1 transaction==3.0.1
translationstring==1.4 translationstring==1.4
unicodedata2==16.0.0
urllib3==1.26.6 urllib3==1.26.6
venusian==3.0.0 venusian==3.0.0
wcwidth==0.2.5 wcwidth==0.2.5

5
tildes/tests/test_simplestring_field.py

@ -99,3 +99,8 @@ def test_consecutive_spaces_collapsed():
"""Ensure runs of consecutive spaces are "collapsed" inside the string.""" """Ensure runs of consecutive spaces are "collapsed" inside the string."""
original = "I wanted to space this out" original = "I wanted to space this out"
assert process_string(original) == "I wanted to space this out" assert process_string(original) == "I wanted to space this out"
def test_unicode_15_moose_kept():
"""Ensure newer emoji introduced in Unicode 15 are kept."""
assert process_string("🫎") == "🫎"

9
tildes/tildes/lib/string.py

@ -4,13 +4,13 @@
"""Functions related to processing/manipulating strings.""" """Functions related to processing/manipulating strings."""
import re import re
import unicodedata
from collections.abc import Iterator from collections.abc import Iterator
from typing import Optional from typing import Optional
from urllib.parse import quote from urllib.parse import quote
from xml.etree.ElementTree import Element from xml.etree.ElementTree import Element
from html5lib import HTMLParser from html5lib import HTMLParser
import unicodedata2
# regex for matching an entire word, handles words that include an apostrophe # regex for matching an entire word, handles words that include an apostrophe
@ -177,10 +177,11 @@ def simplify_string(original: str) -> str:
def _sanitize_characters(original: str) -> str: def _sanitize_characters(original: str) -> str:
"""Process a string and filter/replace problematic unicode.""" """Process a string and filter/replace problematic unicode."""
# pylint: disable=c-extension-no-member
final_characters = [] final_characters = []
for index, char in enumerate(original): for index, char in enumerate(original):
category = unicodedata.category(char)
category = unicodedata2.category(char)
if category.startswith("Z"): if category.startswith("Z"):
# "separator" chars - replace with a normal space # "separator" chars - replace with a normal space
@ -195,8 +196,8 @@ def _sanitize_characters(original: str) -> str:
# don't break certain emoji variants # don't break certain emoji variants
if char == "\u200D": if char == "\u200D":
try: try:
before_category = unicodedata.category(final_characters[-1])
after_category = unicodedata.category(original[index + 1])
before_category = unicodedata2.category(final_characters[-1])
after_category = unicodedata2.category(original[index + 1])
except IndexError: except IndexError:
continue continue

Loading…
Cancel
Save