|
@@ -189,19 +189,18 @@ def _sanitize_characters(original: str) -> str: |
|
|
# newlines, which are replaced with normal spaces |
|
|
# newlines, which are replaced with normal spaces |
|
|
if char == "\n": |
|
|
if char == "\n": |
|
|
final_characters.append(" ") |
|
|
final_characters.append(" ") |
|
|
elif char == "\u200D": |
|
|
|
|
|
final_length = len(final_characters) |
|
|
|
|
|
# only check for the ZWJ if it's between two characters |
|
|
|
|
|
if final_length <= index < len(original) - 1: |
|
|
|
|
|
char_before_category = unicodedata.category( |
|
|
|
|
|
final_characters[final_length - 1] |
|
|
|
|
|
) |
|
|
|
|
|
char_after_category = unicodedata.category(original[index + 1]) |
|
|
|
|
|
# only keep the ZWJ if it's between two symbol characters |
|
|
|
|
|
if char_before_category.startswith( |
|
|
|
|
|
"S" |
|
|
|
|
|
) and char_after_category.startswith("S"): |
|
|
|
|
|
final_characters.append("\u200D") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Keep zero-width joiner only if it's between two symbol characters, so we |
|
|
|
|
|
# don't break certain emoji variants |
|
|
|
|
|
if char == "\u200D": |
|
|
|
|
|
try: |
|
|
|
|
|
before_category = unicodedata.category(final_characters[-1]) |
|
|
|
|
|
after_category = unicodedata.category(original[index + 1]) |
|
|
|
|
|
except IndexError: |
|
|
|
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
if before_category.startswith("S") and after_category.startswith("S"): |
|
|
|
|
|
final_characters.append(char) |
|
|
else: |
|
|
else: |
|
|
# any other type of character, just keep it |
|
|
# any other type of character, just keep it |
|
|
final_characters.append(char) |
|
|
final_characters.append(char) |
|
|