Browse Source

Handle zero width joiner unicode chars for emoji

Some emoji variants require a zero-width joiner, and they were being
broken by the current code that stripped them out.
merge-requests/135/head
Flashynuff 4 years ago
committed by Deimos
parent
commit
70e570b77f
  1. 12
      tildes/tests/test_simplestring_field.py
  2. 10
      tildes/tests/test_title.py
  3. 15
      tildes/tildes/lib/string.py

12
tildes/tests/test_simplestring_field.py

@ -77,6 +77,18 @@ def test_control_chars_removed():
assert result == "I can be sneaky and add problemchars." assert result == "I can be sneaky and add problemchars."
def test_zero_width_joiners_kept_and_collapsed():
    """Ensure that multiple zero width joiners are collapsed like spaces."""
    # The emoji is followed by a run of four ZWJs (U+200D, written with both
    # upper- and lower-case hex digits); exactly one is expected to remain.
    original = "🤷\u200D\u200D\u200D\u200d"
    assert process_string(original) == "🤷\u200D"
def test_zero_width_joiners_allowed_inside_emojis_and_not_other_words():
    """Ensure the zero width joiner is kept inside emojis but removed from words."""
    # The first ZWJ sits between two symbols (🤷 and ♀) and is expected to
    # survive; the second sits between the letters of "foo" and "bar" and is
    # expected to be stripped, leaving "foobar".
    original = "🤷\u200D♀ foo\u200dbar"
    assert process_string(original) == "🤷\u200D♀ foobar"
def test_leading_trailing_spaces_removed(): def test_leading_trailing_spaces_removed():
"""Ensure leading/trailing spaces are removed from the string.""" """Ensure leading/trailing spaces are removed from the string."""
original = " Centered! " original = " Centered! "

10
tildes/tests/test_title.py

@ -78,3 +78,13 @@ def test_unicode_control_chars_removed(title_schema):
title = "nothing\u0000strange\u0085going\u009con\u007fhere" title = "nothing\u0000strange\u0085going\u009con\u007fhere"
result = title_schema.load({"title": title}) result = title_schema.load({"title": title})
assert result["title"] == "nothingstrangegoingonhere" assert result["title"] == "nothingstrangegoingonhere"
def test_zero_width_joiner_emojis_kept(title_schema):
    """Ensure shrug-emoji variants joined with ZWJs load as the expected title."""
    # Input and expected value are spelled out separately so the comparison is
    # against a literal rather than against the input object itself.
    title = "🤷🤷‍♂️🤷‍♀️🤷🏻🤷🏻‍♀️🤷🏻‍♂️🤷🏼🤷🏼‍♀️🤷🏼‍♂️🤷🏽🤷🏽‍♀️🤷🏽‍♂️🤷🏾🤷🏾‍♀️🤷🏾‍♂️🤷🏿🤷🏿‍♀️🤷🏿‍♂️"
    expected = "🤷🤷‍♂️🤷‍♀️🤷🏻🤷🏻‍♀️🤷🏻‍♂️🤷🏼🤷🏼‍♀️🤷🏼‍♂️🤷🏽🤷🏽‍♀️🤷🏽‍♂️🤷🏾🤷🏾‍♀️🤷🏾‍♂️🤷🏿🤷🏿‍♀️🤷🏿‍♂️"

    loaded = title_schema.load({"title": title})

    assert loaded["title"] == expected

15
tildes/tildes/lib/string.py

@ -178,7 +178,7 @@ def _sanitize_characters(original: str) -> str:
"""Process a string and filter/replace problematic unicode.""" """Process a string and filter/replace problematic unicode."""
final_characters = [] final_characters = []
for char in original:
for index, char in enumerate(original):
category = unicodedata.category(char) category = unicodedata.category(char)
if category.startswith("Z"): if category.startswith("Z"):
@ -189,6 +189,19 @@ def _sanitize_characters(original: str) -> str:
# newlines, which are replaced with normal spaces # newlines, which are replaced with normal spaces
if char == "\n": if char == "\n":
final_characters.append(" ") final_characters.append(" ")
elif char == "\u200D":
final_length = len(final_characters)
# only check for the ZWJ if it's between two characters
if final_length <= index < len(original) - 1:
char_before_category = unicodedata.category(
final_characters[final_length - 1]
)
char_after_category = unicodedata.category(original[index + 1])
# only keep the ZWJ if it's between two symbol characters
if char_before_category.startswith(
"S"
) and char_after_category.startswith("S"):
final_characters.append("\u200D")
else: else:
# any other type of character, just keep it # any other type of character, just keep it
final_characters.append(char) final_characters.append(char)

Loading…
Cancel
Save