Handle zero-width joiner Unicode characters in emoji

Some emoji variants require a zero-width joiner (U+200D) between their component characters, and the existing sanitization code broke those emoji by stripping the joiner out.
4 years ago · 70e570b77f
3 changed files with 36 additions and 1 deletions
--- a/tildes/tests/test_simplestring_field.py
+++ b/tildes/tests/test_simplestring_field.py
@ -77,6 +77,18 @@ def test_control_chars_removed():
    assert result == "I can be sneaky and add problemchars."
 def test_zero_width_joiners_kept_and_collapsed():
    """Ensure that multiple zero width joiners are collapsed like spaces."""
    # three consecutive joiners between the two symbols, plus one trailing joiner
    original = "🤷\u200D\u200D\u200D♀\u200d"
    # expect a single joiner left between the symbols and no trailing joiner
    assert process_string(original) == "🤷\u200D♀"
 def test_zero_width_joiners_allowed_inside_emojis_and_not_other_words():
    """Ensure zero width joiners are kept inside emojis but removed from words."""
    # one joiner sits between two emoji symbols, the other between "foo" and "bar"
    original = "🤷\u200D♀ foo\u200dbar"
    # only the joiner between the letters is expected to be dropped
    assert process_string(original) == "🤷\u200D♀ foobar"
 def test_leading_trailing_spaces_removed():
    """Ensure leading/trailing spaces are removed from the string."""
    original = "          Centered!          "
--- a/tildes/tests/test_title.py
+++ b/tildes/tests/test_title.py
@ -78,3 +78,13 @@ def test_unicode_control_chars_removed(title_schema):
    title = "nothing\u0000strange\u0085going\u009con\u007fhere"
    result = title_schema.load({"title": title})
    assert result["title"] == "nothingstrangegoingonhere"
 def test_zero_width_joiner_emojis_kept(title_schema):
    """Test that a title made of emoji sequences is loaded unchanged."""
    title = "🤷🤷‍♂️🤷‍♀️🤷🏻🤷🏻‍♀️🤷🏻‍♂️🤷🏼🤷🏼‍♀️🤷🏼‍♂️🤷🏽🤷🏽‍♀️🤷🏽‍♂️🤷🏾🤷🏾‍♀️🤷🏾‍♂️🤷🏿🤷🏿‍♀️🤷🏿‍♂️"
    result = title_schema.load({"title": title})
    # the expected value is the same emoji sequence that was passed in
    assert (
        result["title"]
        == "🤷🤷‍♂️🤷‍♀️🤷🏻🤷🏻‍♀️🤷🏻‍♂️🤷🏼🤷🏼‍♀️🤷🏼‍♂️🤷🏽🤷🏽‍♀️🤷🏽‍♂️🤷🏾🤷🏾‍♀️🤷🏾‍♂️🤷🏿🤷🏿‍♀️🤷🏿‍♂️"
    )
--- a/tildes/tildes/lib/string.py
+++ b/tildes/tildes/lib/string.py
@ -178,7 +178,7 @@ def _sanitize_characters(original: str) -> str:
    """Process a string and filter/replace problematic unicode."""
    final_characters = []
    for char in original:
    for index, char in enumerate(original):
        category = unicodedata.category(char)
        if category.startswith("Z"):
@ -189,6 +189,19 @@ def _sanitize_characters(original: str) -> str:
            # newlines, which are replaced with normal spaces
            if char == "\n":
                final_characters.append(" ")
            elif char == "\u200D":
                final_length = len(final_characters)
                # only check for the ZWJ if it's between two characters
                if final_length <= index < len(original) - 1:
                    char_before_category = unicodedata.category(
                        final_characters[final_length - 1]
                    )
                    char_after_category = unicodedata.category(original[index + 1])
                    # only keep the ZWJ if it's between two symbol characters
                    if char_before_category.startswith(
                        "S"
                    ) and char_after_category.startswith("S"):
                        final_characters.append("\u200D")
        else:
            # any other type of character, just keep it
            final_characters.append(char)