diff --git a/tildes/tests/test_simplestring_field.py b/tildes/tests/test_simplestring_field.py index bdf7e7b..da675b4 100644 --- a/tildes/tests/test_simplestring_field.py +++ b/tildes/tests/test_simplestring_field.py @@ -77,6 +77,18 @@ def test_control_chars_removed(): assert result == "I can be sneaky and add problemchars." +def test_zero_width_joiners_kept_and_collapsed(): + """Ensure that multiple zero width joiners are collapsed like spaces.""" + original = "🀷\u200D\u200D\u200D♀\u200d" + assert process_string(original) == "🀷\u200D♀" + + +def test_zero_width_joiners_allowed_inside_emojis_and_not_other_words(): + """Ensure the zero width joiner char is kept inside emojis.""" + original = "🀷\u200D♀ foo\u200dbar" + assert process_string(original) == "🀷\u200D♀ foobar" + + def test_leading_trailing_spaces_removed(): """Ensure leading/trailing spaces are removed from the string.""" original = " Centered! " diff --git a/tildes/tests/test_title.py b/tildes/tests/test_title.py index 4b8268b..f7afe0c 100644 --- a/tildes/tests/test_title.py +++ b/tildes/tests/test_title.py @@ -78,3 +78,13 @@ def test_unicode_control_chars_removed(title_schema): title = "nothing\u0000strange\u0085going\u009con\u007fhere" result = title_schema.load({"title": title}) assert result["title"] == "nothingstrangegoingonhere" + + +def test_zero_width_joiner_emojis_kept(title_schema): + """Test that emojis are parsed correctly.""" + title = "πŸ€·πŸ€·β€β™‚οΈπŸ€·β€β™€οΈπŸ€·πŸ»πŸ€·πŸ»β€β™€οΈπŸ€·πŸ»β€β™‚οΈπŸ€·πŸΌπŸ€·πŸΌβ€β™€οΈπŸ€·πŸΌβ€β™‚οΈπŸ€·πŸ½πŸ€·πŸ½β€β™€οΈπŸ€·πŸ½β€β™‚οΈπŸ€·πŸΎπŸ€·πŸΎβ€β™€οΈπŸ€·πŸΎβ€β™‚οΈπŸ€·πŸΏπŸ€·πŸΏβ€β™€οΈπŸ€·πŸΏβ€β™‚οΈ" + result = title_schema.load({"title": title}) + assert ( + result["title"] + == "πŸ€·πŸ€·β€β™‚οΈπŸ€·β€β™€οΈπŸ€·πŸ»πŸ€·πŸ»β€β™€οΈπŸ€·πŸ»β€β™‚οΈπŸ€·πŸΌπŸ€·πŸΌβ€β™€οΈπŸ€·πŸΌβ€β™‚οΈπŸ€·πŸ½πŸ€·πŸ½β€β™€οΈπŸ€·πŸ½β€β™‚οΈπŸ€·πŸΎπŸ€·πŸΎβ€β™€οΈπŸ€·πŸΎβ€β™‚οΈπŸ€·πŸΏπŸ€·πŸΏβ€β™€οΈπŸ€·πŸΏβ€β™‚οΈ" + 
) diff --git a/tildes/tildes/lib/string.py b/tildes/tildes/lib/string.py index 08bbb93..d012807 100644 --- a/tildes/tildes/lib/string.py +++ b/tildes/tildes/lib/string.py @@ -178,7 +178,7 @@ def _sanitize_characters(original: str) -> str: """Process a string and filter/replace problematic unicode.""" final_characters = [] - for char in original: + for index, char in enumerate(original): category = unicodedata.category(char) if category.startswith("Z"): @@ -189,6 +189,19 @@ def _sanitize_characters(original: str) -> str: # newlines, which are replaced with normal spaces if char == "\n": final_characters.append(" ") + elif char == "\u200D": + final_length = len(final_characters) + # only check the ZWJ if a kept character precedes it and one follows + if final_length > 0 and index < len(original) - 1: + char_before_category = unicodedata.category( + final_characters[final_length - 1] + ) + char_after_category = unicodedata.category(original[index + 1]) + # only keep the ZWJ if it's between two symbol characters + if char_before_category.startswith( + "S" + ) and char_after_category.startswith("S"): + final_characters.append("\u200D") else: # any other type of character, just keep it final_characters.append(char)