diff --git a/tildes/tildes/lib/url_transform.py b/tildes/tildes/lib/url_transform.py index 51f4794..825aaf3 100644 --- a/tildes/tildes/lib/url_transform.py +++ b/tildes/tildes/lib/url_transform.py @@ -3,7 +3,16 @@ """Functions related to transforming URLs (sanitization, cleanup, etc.).""" -from urllib.parse import parse_qs, urlencode, urlparse, urlunparse +from abc import ABC, abstractmethod +from collections import Counter +import logging +from urllib.parse import ParseResult, parse_qs, urlencode, urlparse, urlunparse + + +class UrlTransformationLoopError(Exception): + """Exception to raise if transformations seem to be in an infinite loop.""" + + pass def apply_url_transformations(url: str) -> str: @@ -12,23 +21,97 @@ def apply_url_transformations(url: str) -> str: This method should generally be the only one imported/used from this module, unless there is a specific reason for needing to apply a subset of transformations. """ - url = remove_utm_query_params(url) + parsed_url = urlparse(url) - return url + try: + parsed_url = _apply_all_transformations(parsed_url) + except UrlTransformationLoopError: + # if an infinite loop was caused, log an error and return the unchanged original + logging.error(f"Transformations went into a loop on url {url}") + return urlunparse(parsed_url) -def remove_utm_query_params(url: str) -> str: - """Remove any utm_* query parameters from a url.""" - parsed = urlparse(url) - query_params = parse_qs(parsed.query) +def _apply_all_transformations(parsed_url: ParseResult) -> ParseResult: + """Apply all relevant UrlTransformer transformations to the url.""" + # Used to keep track of which transformations are restarting the process, so we + # can detect cases where the process has gone into a loop (such as if two of them + # are reversing each other's changes repeatedly) + restarted_by: Counter = Counter() + + # This structure is confusing - its purpose is to restart the full set of + # transformations whenever any of them cause the url to be changed, and only stop + # when all of the transformations run without any effects. This is because some + # transformations may modify a url in a way that makes a previous transformation + # applicable when it wasn't originally, so it will need to be run again. + while True: + for cls in UrlTransformer.__subclasses__(): + if not cls.is_applicable(parsed_url): + continue + + before = parsed_url + parsed_url = cls.apply_transformation(parsed_url) + + # if the url was changed, break the for loop (effectively restarting it) + if before != parsed_url: + # first, check how many times this transformation has caused the loop to + # restart - if it's 3 or more, it's likely we're in an infinite loop and + # need to bail out entirely + restarted_by[cls] += 1 + if restarted_by[cls] >= 3: + raise UrlTransformationLoopError + + break + else: + # all transformations ran with no changes, can stop working on this url + break + + return parsed_url + + +class UrlTransformer(ABC): + """Abstract base class for url transformers. + + This is a bit of an unusual usage of ABC, since normally it would only cause errors + when trying to instantiate the subclasses, which will never happen here since the + methods are classmethods. However, prospector is still able to recognize when the + abstract method(s) haven't been overridden. + """ + + @classmethod + def is_applicable(cls, parsed_url: ParseResult) -> bool: + """Return whether this transformation should be applied to the url. + + This can be used for cases like a transformation that only applies to specific + domains. If not overridden, the transformation will apply to all urls. + """ + # pylint: disable=unused-argument + return True + + @classmethod + @abstractmethod + def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult: + """Apply the actual transformation process to the url. + + Subclass implementations of this method should be idempotent, since a + transformation may be applied to a particular url any number of times until + the process completes. + """ + return parsed_url + + +class UtmQueryParamRemover(UrlTransformer): + """Remove any utm_* query parameters from a url.""" - cleaned_params = { - param: value - for param, value in query_params.items() - if not param.startswith("utm_") - } + @classmethod + def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult: + """Apply the actual transformation process to the url.""" + query_params = parse_qs(parsed_url.query) - parsed = parsed._replace(query=urlencode(cleaned_params, doseq=True)) + cleaned_params = { + param: value + for param, value in query_params.items() + if not param.startswith("utm_") + } - return urlunparse(parsed) + return parsed_url._replace(query=urlencode(cleaned_params, doseq=True))