Browse Source

Support Unicode 16.0 emoji via unicodedata2 package

merge-requests/159/head
Andrew Shu 5 months ago
parent
commit
fab3abfb82
  1. 1
      tildes/requirements-dev.in
  2. 1
      tildes/requirements-dev.txt
  3. 1
      tildes/requirements.in
  4. 1
      tildes/requirements.txt
  5. 6
      tildes/tests/test_string.py
  6. 9
      tildes/tildes/lib/string.py

1
tildes/requirements-dev.in

@@ -12,4 +12,5 @@ types-bleach
types-python-dateutil
types-redis
types-requests
unicodedata2
webtest

1
tildes/requirements-dev.txt

@@ -109,6 +109,7 @@ types-python-dateutil==0.1.4
types-redis==3.5.4
types-requests==2.25.0
typing-extensions==4.12.2
unicodedata2==16.0.0
urllib3==1.26.6
venusian==3.0.0
waitress==2.0.0

1
tildes/requirements.in

@@ -36,6 +36,7 @@ SQLAlchemy<1.4
SQLAlchemy-Utils
stripe
titlecase
unicodedata2
webargs
wrapt
zope.sqlalchemy

1
tildes/requirements.txt

@@ -67,6 +67,7 @@ tomli==1.2.3
traitlets==5.0.5
transaction==3.0.1
translationstring==1.4
unicodedata2==16.0.0
urllib3==1.26.6
venusian==3.0.0
wcwidth==0.2.5

6
tildes/tests/test_string.py

@@ -8,6 +8,7 @@ from tildes.lib.string import (
truncate_string_at_char,
word_count,
extract_text_from_html,
_sanitize_characters,
)
@@ -173,3 +174,8 @@ def test_extract_text_from_html_exclude_details():
html = "<details><p>Hide me!</p></details>"
text = extract_text_from_html(html, exclude_details_include_summary=True)
assert text == "Details"
def test_sanitize_characters_keeps_unicode_15_moose():
"""Ensure _sanitize_characters keeps a newer emoji introduced in Unicode 15."""
assert _sanitize_characters("🫎") == "🫎"

9
tildes/tildes/lib/string.py

@@ -4,13 +4,13 @@
"""Functions related to processing/manipulating strings."""
import re
import unicodedata
from collections.abc import Iterator
from typing import Optional
from urllib.parse import quote
from xml.etree.ElementTree import Element
from html5lib import HTMLParser
import unicodedata2
# regex for matching an entire word, handles words that include an apostrophe
@@ -177,10 +177,11 @@ def simplify_string(original: str) -> str:
def _sanitize_characters(original: str) -> str:
"""Process a string and filter/replace problematic unicode."""
# pylint: disable=c-extension-no-member
final_characters = []
for index, char in enumerate(original):
category = unicodedata.category(char)
category = unicodedata2.category(char)
if category.startswith("Z"):
# "separator" chars - replace with a normal space
@@ -195,8 +196,8 @@ def _sanitize_characters(original: str) -> str:
# don't break certain emoji variants
if char == "\u200D":
try:
before_category = unicodedata.category(final_characters[-1])
after_category = unicodedata.category(original[index + 1])
before_category = unicodedata2.category(final_characters[-1])
after_category = unicodedata2.category(original[index + 1])
except IndexError:
continue

Loading…
Cancel
Save