|
|
@ -4,13 +4,13 @@ |
|
|
|
"""Functions related to processing/manipulating strings.""" |
|
|
|
|
|
|
|
import re |
|
|
|
import unicodedata |
|
|
|
from collections.abc import Iterator |
|
|
|
from typing import Optional |
|
|
|
from urllib.parse import quote |
|
|
|
from xml.etree.ElementTree import Element |
|
|
|
|
|
|
|
from html5lib import HTMLParser |
|
|
|
import unicodedata2 |
|
|
|
|
|
|
|
|
|
|
|
# regex for matching an entire word, handles words that include an apostrophe |
|
|
@ -177,10 +177,11 @@ def simplify_string(original: str) -> str: |
|
|
|
|
|
|
|
def _sanitize_characters(original: str) -> str: |
|
|
|
"""Process a string and filter/replace problematic unicode.""" |
|
|
|
# pylint: disable=c-extension-no-member |
|
|
|
final_characters = [] |
|
|
|
|
|
|
|
for index, char in enumerate(original): |
|
|
|
category = unicodedata.category(char) |
|
|
|
category = unicodedata2.category(char) |
|
|
|
|
|
|
|
if category.startswith("Z"): |
|
|
|
# "separator" chars - replace with a normal space |
|
|
@ -195,8 +196,8 @@ def _sanitize_characters(original: str) -> str: |
|
|
|
# don't break certain emoji variants |
|
|
|
if char == "\u200D": |
|
|
|
try: |
|
|
|
before_category = unicodedata.category(final_characters[-1]) |
|
|
|
after_category = unicodedata.category(original[index + 1]) |
|
|
|
before_category = unicodedata2.category(final_characters[-1]) |
|
|
|
after_category = unicodedata2.category(original[index + 1]) |
|
|
|
except IndexError: |
|
|
|
continue |
|
|
|
|
|
|
|