Support Unicode 16.0 emoji via unicodedata2 package

See merge request tildes/tildes!159
6 months ago · 4a31380fad
6 changed files with 14 additions and 4 deletions
--- a/tildes/requirements-dev.in
+++ b/tildes/requirements-dev.in
@@ -12,4 +12,5 @@ types-bleach
 types-python-dateutil
 types-redis
 types-requests
+unicodedata2
 webtest
--- a/tildes/requirements-dev.txt
+++ b/tildes/requirements-dev.txt
@@ -109,6 +109,7 @@ types-python-dateutil==0.1.4
 types-redis==3.5.4
 types-requests==2.25.0
 typing-extensions==4.12.2
+unicodedata2==16.0.0
 urllib3==1.26.6
 venusian==3.0.0
 waitress==2.0.0
--- a/tildes/requirements.in
+++ b/tildes/requirements.in
@@ -36,6 +36,7 @@ SQLAlchemy<1.4
 SQLAlchemy-Utils
 stripe
 titlecase
+unicodedata2
 webargs
 wrapt
 zope.sqlalchemy
--- a/tildes/requirements.txt
+++ b/tildes/requirements.txt
@@ -67,6 +67,7 @@ tomli==1.2.3
 traitlets==5.0.5
 transaction==3.0.1
 translationstring==1.4
+unicodedata2==16.0.0
 urllib3==1.26.6
 venusian==3.0.0
 wcwidth==0.2.5
--- a/tildes/tests/test_simplestring_field.py
+++ b/tildes/tests/test_simplestring_field.py
@@ -99,3 +99,8 @@ def test_consecutive_spaces_collapsed():
    """Ensure runs of consecutive spaces are "collapsed" inside the string."""
    original = "I    wanted   to      space    this        out"
    assert process_string(original) == "I wanted to space this out"
+
+
+def test_unicode_15_moose_kept():
+    """Ensure newer emoji introduced in Unicode 15 are kept."""
+    assert process_string("🫎") == "🫎"
--- a/tildes/tildes/lib/string.py
+++ b/tildes/tildes/lib/string.py
@@ -4,13 +4,13 @@
 """Functions related to processing/manipulating strings."""

 import re
-import unicodedata
 from collections.abc import Iterator
 from typing import Optional
 from urllib.parse import quote
 from xml.etree.ElementTree import Element

 from html5lib import HTMLParser
+import unicodedata2


 # regex for matching an entire word, handles words that include an apostrophe
@@ -177,10 +177,11 @@ def simplify_string(original: str) -> str:

 def _sanitize_characters(original: str) -> str:
    """Process a string and filter/replace problematic unicode."""
+    # pylint: disable=c-extension-no-member
    final_characters = []

    for index, char in enumerate(original):
-        category = unicodedata.category(char)
+        category = unicodedata2.category(char)

        if category.startswith("Z"):
            # "separator" chars - replace with a normal space
@@ -195,8 +196,8 @@ def _sanitize_characters(original: str) -> str:
            # don't break certain emoji variants
            if char == "\u200D":
                try:
-                    before_category = unicodedata.category(final_characters[-1])
-                    after_category = unicodedata.category(original[index + 1])
+                    before_category = unicodedata2.category(final_characters[-1])
+                    after_category = unicodedata2.category(original[index + 1])
                except IndexError:
                    continue