From fab3abfb82e8590d48d434910a6e48e9f0313e72 Mon Sep 17 00:00:00 2001 From: Andrew Shu Date: Thu, 10 Jul 2025 12:15:51 -0700 Subject: [PATCH] Support Unicode 16.0 emoji via unicodedata2 package --- tildes/requirements-dev.in | 1 + tildes/requirements-dev.txt | 1 + tildes/requirements.in | 1 + tildes/requirements.txt | 1 + tildes/tests/test_string.py | 6 ++++++ tildes/tildes/lib/string.py | 9 +++++---- 6 files changed, 15 insertions(+), 4 deletions(-) diff --git a/tildes/requirements-dev.in b/tildes/requirements-dev.in index 0299e91..4bf3d4e 100644 --- a/tildes/requirements-dev.in +++ b/tildes/requirements-dev.in @@ -12,4 +12,5 @@ types-bleach types-python-dateutil types-redis types-requests +unicodedata2 webtest diff --git a/tildes/requirements-dev.txt b/tildes/requirements-dev.txt index 650f9f6..d63468e 100644 --- a/tildes/requirements-dev.txt +++ b/tildes/requirements-dev.txt @@ -109,6 +109,7 @@ types-python-dateutil==0.1.4 types-redis==3.5.4 types-requests==2.25.0 typing-extensions==4.12.2 +unicodedata2==16.0.0 urllib3==1.26.6 venusian==3.0.0 waitress==2.0.0 diff --git a/tildes/requirements.in b/tildes/requirements.in index cce55ce..5e0c28d 100644 --- a/tildes/requirements.in +++ b/tildes/requirements.in @@ -36,6 +36,7 @@ SQLAlchemy<1.4 SQLAlchemy-Utils stripe titlecase +unicodedata2 webargs wrapt zope.sqlalchemy diff --git a/tildes/requirements.txt b/tildes/requirements.txt index 8e7d847..86922d1 100644 --- a/tildes/requirements.txt +++ b/tildes/requirements.txt @@ -67,6 +67,7 @@ tomli==1.2.3 traitlets==5.0.5 transaction==3.0.1 translationstring==1.4 +unicodedata2==16.0.0 urllib3==1.26.6 venusian==3.0.0 wcwidth==0.2.5 diff --git a/tildes/tests/test_string.py b/tildes/tests/test_string.py index d2eb99d..9245a88 100644 --- a/tildes/tests/test_string.py +++ b/tildes/tests/test_string.py @@ -8,6 +8,7 @@ from tildes.lib.string import ( truncate_string_at_char, word_count, extract_text_from_html, + _sanitize_characters, ) @@ -173,3 +174,8 @@ def test_extract_text_from_html_exclude_details(): html = "

Hide me!

" text = extract_text_from_html(html, exclude_details_include_summary=True) assert text == "Details" + + +def test_sanitize_characters_keeps_unicode_15_moose(): + """Ensure _sanitize_characters keeps a newer emoji introduced in Unicode 15.""" + assert _sanitize_characters("🫎") == "🫎" diff --git a/tildes/tildes/lib/string.py b/tildes/tildes/lib/string.py index aba05af..c6e460a 100644 --- a/tildes/tildes/lib/string.py +++ b/tildes/tildes/lib/string.py @@ -4,13 +4,13 @@ """Functions related to processing/manipulating strings.""" import re -import unicodedata from collections.abc import Iterator from typing import Optional from urllib.parse import quote from xml.etree.ElementTree import Element from html5lib import HTMLParser +import unicodedata2 # regex for matching an entire word, handles words that include an apostrophe @@ -177,10 +177,11 @@ def simplify_string(original: str) -> str: def _sanitize_characters(original: str) -> str: """Process a string and filter/replace problematic unicode.""" + # pylint: disable=c-extension-no-member final_characters = [] for index, char in enumerate(original): - category = unicodedata.category(char) + category = unicodedata2.category(char) if category.startswith("Z"): # "separator" chars - replace with a normal space @@ -195,8 +196,8 @@ def _sanitize_characters(original: str) -> str: # don't break certain emoji variants if char == "\u200D": try: - before_category = unicodedata.category(final_characters[-1]) - after_category = unicodedata.category(original[index + 1]) + before_category = unicodedata2.category(final_characters[-1]) + after_category = unicodedata2.category(original[index + 1]) except IndexError: continue