Handle zero width joiner unicode chars for emoji

Some emoji variants are built with a zero-width joiner (U+200D), and they were being broken because the current code stripped the joiner out.
4 years ago · 70e570b77f
3 changed files with 36 additions and 1 deletion
--- a/tildes/tests/test_simplestring_field.py
+++ b/tildes/tests/test_simplestring_field.py
@@ -77,6 +77,18 @@ def test_control_chars_removed():
    assert result == "I can be sneaky and add problemchars."


+def test_zero_width_joiners_kept_and_collapsed():
+    """Ensure that multiple zero width joiners are collapsed like spaces."""
+    original = "🤷\u200D\u200D\u200D♀\u200d"
+    assert process_string(original) == "🤷\u200D♀"
+
+
+def test_zero_width_joiners_allowed_inside_emojis_and_not_other_words():
+    """Ensure the zero width joiner char is kept inside emojis but not in words."""
+    original = "🤷\u200D♀ foo\u200dbar"
+    assert process_string(original) == "🤷\u200D♀ foobar"
+
+
 def test_leading_trailing_spaces_removed():
    """Ensure leading/trailing spaces are removed from the string."""
    original = "          Centered!          "
--- a/tildes/tests/test_title.py
+++ b/tildes/tests/test_title.py
@@ -78,3 +78,13 @@ def test_unicode_control_chars_removed(title_schema):
    title = "nothing\u0000strange\u0085going\u009con\u007fhere"
    result = title_schema.load({"title": title})
    assert result["title"] == "nothingstrangegoingonhere"
+
+
+def test_zero_width_joiner_emojis_kept(title_schema):
+    """Test that emoji sequences with zero width joiners survive title loading."""
+    title = "🤷🤷‍♂️🤷‍♀️🤷🏻🤷🏻‍♀️🤷🏻‍♂️🤷🏼🤷🏼‍♀️🤷🏼‍♂️🤷🏽🤷🏽‍♀️🤷🏽‍♂️🤷🏾🤷🏾‍♀️🤷🏾‍♂️🤷🏿🤷🏿‍♀️🤷🏿‍♂️"
+    result = title_schema.load({"title": title})
+    assert (
+        result["title"]
+        == "🤷🤷‍♂️🤷‍♀️🤷🏻🤷🏻‍♀️🤷🏻‍♂️🤷🏼🤷🏼‍♀️🤷🏼‍♂️🤷🏽🤷🏽‍♀️🤷🏽‍♂️🤷🏾🤷🏾‍♀️🤷🏾‍♂️🤷🏿🤷🏿‍♀️🤷🏿‍♂️"
+    )
--- a/tildes/tildes/lib/string.py
+++ b/tildes/tildes/lib/string.py
@@ -178,7 +178,7 @@ def _sanitize_characters(original: str) -> str:
    """Process a string and filter/replace problematic unicode."""
    final_characters = []

-    for char in original:
+    for index, char in enumerate(original):
        category = unicodedata.category(char)

        if category.startswith("Z"):
@@ -189,6 +189,19 @@ def _sanitize_characters(original: str) -> str:
            # newlines, which are replaced with normal spaces
            if char == "\n":
                final_characters.append(" ")
+            elif char == "\u200D":
+                final_length = len(final_characters)
+                # only check for the ZWJ if it's between two characters
+                if final_length <= index < len(original) - 1:
+                    char_before_category = unicodedata.category(
+                        final_characters[final_length - 1]
+                    )
+                    char_after_category = unicodedata.category(original[index + 1])
+                    # only keep the ZWJ if it's between two symbol characters
+                    if char_before_category.startswith(
+                        "S"
+                    ) and char_after_category.startswith("S"):
+                        final_characters.append("\u200D")
        else:
            # any other type of character, just keep it
            final_characters.append(char)