mirror of https://gitlab.com/tildes/tildes.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
207 lines
7.6 KiB
207 lines
7.6 KiB
"""Functions related to processing/manipulating strings."""
|
|
|
|
import re
|
|
from typing import Optional
|
|
import unicodedata
|
|
from urllib.parse import quote
|
|
|
|
|
|
# Pattern for one whole word. Apostrophes (straight or curly) after the first character
# are treated as part of the word, so a contraction is matched as a single word.
WORD_REGEX = re.compile(r"\w[\w'’]*")


def word_count(string: str) -> int:
    """Return how many words (non-overlapping WORD_REGEX matches) the string has."""
    return sum(1 for _match in WORD_REGEX.finditer(string))
|
|
|
|
|
|
def convert_to_url_slug(original: str, max_length: int = 100) -> str:
    """Turn a string (usually a title) into a form that can be used as a url slug.

    The text is lowercased, apostrophes are dropped, every run of non-word characters
    becomes a single underscore, underscores on either end are trimmed, and the result
    is url-encoded. An encoded slug longer than `max_length` is truncated.
    """
    # applied in order; each entry is (regex pattern, replacement)
    substitutions = (
        # drop apostrophes so contractions aren't split apart by underscores
        ("['’]", ""),
        # every remaining run of non-word characters becomes an underscore
        (r"\W+", "_"),
        # collapse any repeated underscores that are left
        ("_{2,}", "_"),
    )

    slug = original.lower()
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)

    # trim underscores left "hanging" at the start and/or end
    slug = slug.strip("_")

    encoded_slug = quote(slug)

    # already short enough, so there's no need to think about truncation at all
    if len(encoded_slug) <= max_length:
        return encoded_slug

    # Two %-escapes in a row suggest a multi-byte unicode character. Their encoded
    # forms can be quite long and a plain cut could land in the middle of one, so those
    # slugs are handed off to the more careful truncation routine.
    if re.search("%..%", encoded_slug) is not None:
        return _truncate_multibyte_slug(slug, max_length)

    # simple case: cut to length, breaking at an underscore if there is one, and
    # without appending any overflow string
    return truncate_string(
        encoded_slug, max_length, truncate_at_chars="_", overflow_str=None
    )
|
|
|
|
|
|
def _truncate_multibyte_slug(original: str, max_length: int) -> str:
|
|
"""URL-encode and truncate a slug known to contain multi-byte chars."""
|
|
# instead of the normal method of truncating "backwards" from the end of the string,
|
|
# build it up one encoded character at a time from the start until it's too long
|
|
encoded_slug = ""
|
|
for character in original:
|
|
encoded_char = quote(character)
|
|
|
|
# if adding this onto the string would make it too long, stop here
|
|
if len(encoded_slug) + len(encoded_char) > max_length:
|
|
break
|
|
|
|
encoded_slug += encoded_char
|
|
|
|
# Now we know that the string is made up of "whole" characters and is close to the
|
|
# maximum length. We'd still like to truncate it at an underscore if possible, but
|
|
# some languages like Japanese and Chinese won't have many (or any) underscores in
|
|
# the slug, and we could end up losing a lot of the characters. So try breaking it
|
|
# at an underscore, but if it means more than 30% of the slug gets cut off, just
|
|
# leave it alone. This means that some url slugs in other languages will end in
|
|
# partial words, but determining the word edges is not simple.
|
|
acceptable_truncation = 0.7
|
|
|
|
truncated_slug = truncate_string_at_char(encoded_slug, "_")
|
|
|
|
if len(truncated_slug) / len(encoded_slug) >= acceptable_truncation:
|
|
return truncated_slug
|
|
|
|
return encoded_slug
|
|
|
|
|
|
def truncate_string(
    original: str,
    length: int,
    truncate_at_chars: Optional[str] = None,
    overflow_str: Optional[str] = "...",
) -> str:
    """Truncate a string to be no longer than a specified length.

    If `truncate_at_chars` is specified (as a string, one or more characters), the
    truncation will happen at the last occurrence of any of those chars inside the
    remaining string after it has been initially cut down to the desired length.

    `overflow_str` will be appended to the result, and its length is included in the
    final string length. So for example, if `overflow_str` has a length of 3 and the
    target length is 10, at most 7 characters of the original string will be kept.
    Passing None for `overflow_str` is the same as passing an empty string.

    If `overflow_str` is longer than `length`, it can't be included without exceeding
    the limit, so it is left off and the string is simply cut to `length` characters.
    A negative `length` is treated as 0.
    """
    if overflow_str is None:
        overflow_str = ""

    # no need to do anything if the string is already short enough
    if len(original) <= length:
        return original

    # a negative length can never be satisfied, so treat it as "keep nothing"
    length = max(length, 0)

    # If the overflow string doesn't fit, drop it. Without this, the slice bound below
    # would be negative and count from the END of the string, producing a result longer
    # than the requested length.
    if len(overflow_str) > length:
        overflow_str = ""

    # cut the string down to the max desired length (leaving space for the overflow
    # string if one is being used)
    truncated = original[: length - len(overflow_str)]

    # break the string at one of the requested chars instead, if possible
    if truncate_at_chars:
        truncated = truncate_string_at_char(truncated, truncate_at_chars)

    return truncated + overflow_str
|
|
|
|
|
|
def truncate_string_at_char(original: str, valid_chars: str) -> str:
    """Cut a string off at the right-most occurrence of any of the given characters.

    `valid_chars` is a string of one or more candidate characters, for example
    valid_chars='.?!' cuts at whichever of those 3 characters appears last in the
    string. The matched character itself is not included in the result. If none of the
    characters appear in the string, it is returned unchanged.
    """
    # scan positions from the end towards the start and take the first one that holds
    # one of the wanted characters
    backwards_positions = range(len(original) - 1, -1, -1)
    cut_position = next(
        (
            position
            for position in backwards_positions
            if original[position] in valid_chars
        ),
        None,
    )

    # none of the characters are present anywhere, so there's nowhere to cut
    if cut_position is None:
        return original

    # keep everything before the matched character
    return original[:cut_position]
|
|
|
|
|
|
def simplify_string(original: str) -> str:
    """Clean up a string for use somewhere that needs a "simple" one.

    Intended for text like topic titles, message subjects, and so on.

    The returned string has:

    * unicode "separator" characters turned into regular spaces
    * unicode "other" characters removed, apart from newlines, which turn into spaces
    * runs of consecutive whitespace collapsed down to one space
    * no leading or trailing whitespace
    """
    sanitized = _sanitize_characters(original)

    # squash each run of 2+ whitespace characters down to one space, then trim the ends
    collapsed = re.sub(r"\s{2,}", " ", sanitized)
    return collapsed.strip()
|
|
|
|
|
|
def _sanitize_characters(original: str) -> str:
|
|
"""Process a string and filter/replace problematic unicode."""
|
|
final_characters = []
|
|
|
|
for char in original:
|
|
category = unicodedata.category(char)
|
|
|
|
if category.startswith("Z"):
|
|
# "separator" chars - replace with a normal space
|
|
final_characters.append(" ")
|
|
elif category.startswith("C"):
|
|
# "other" chars (control, formatting, etc.) - filter them out except for
|
|
# newlines, which are replaced with normal spaces
|
|
if char == "\n":
|
|
final_characters.append(" ")
|
|
else:
|
|
# any other type of character, just keep it
|
|
final_characters.append(char)
|
|
|
|
return "".join(final_characters)
|
|
|
|
|
|
def separate_string(original: str, separator: str, segment_size: int) -> str:
    """Split a string into "segments" by inserting a separator every X chars.

    Meant for strings used as "codes", such as invite codes and 2FA backup codes, so
    they can be shown in a format that is easier to read. The separator is only placed
    between segments, never at the start or end of the result.
    """
    pieces = [
        # a character that begins a new segment (other than the very first segment)
        # gets the separator put in front of it
        separator + char if position > 0 and position % segment_size == 0 else char
        for position, char in enumerate(original)
    ]

    return "".join(pieces)
|