Refactor url transformation process

This is (extreme) overkill at this point, but I'm thinking ahead to some of the transformations that I want to be able to support, and this refactor should make it much more feasible to have a good process. The main changes here are using a class-based system where each transformation can also define a method for checking whether it should apply to any given url (probably often based on domain), as well as a new process that will repeatedly apply all transformations until the url passes through all of them unchanged.
7 years ago · 08b714bae8
1 changed files with 97 additions and 14 deletions
--- a/tildes/tildes/lib/url_transform.py
+++ b/tildes/tildes/lib/url_transform.py
@ -3,7 +3,16 @@
 """Functions related to transforming URLs (sanitization, cleanup, etc.)."""
 from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
 from abc import ABC, abstractmethod
 from collections import Counter
 import logging
 from urllib.parse import ParseResult, parse_qs, urlencode, urlparse, urlunparse
 class UrlTransformationLoopError(Exception):
    """Exception to raise if transformations seem to be in an infinite loop."""
    pass
 def apply_url_transformations(url: str) -> str:
@ -12,16 +21,92 @@ def apply_url_transformations(url: str) -> str:
    This method should generally be the only one imported/used from this module, unless
    there is a specific reason for needing to apply a subset of transformations.
    """
    url = remove_utm_query_params(url)
    parsed_url = urlparse(url)
    try:
        parsed_url = _apply_all_transformations(parsed_url)
    except UrlTransformationLoopError:
        # if an infinite loop was caused, log an error and return the unchanged original
        logging.error(f"Transformations went into a loop on url {url}")
    return urlunparse(parsed_url)
 def _apply_all_transformations(parsed_url: ParseResult) -> ParseResult:
    """Apply all relevant UrlTransformer transformations to the url."""
    # Used to keep track of which transformations are restarting the process, so we
    # can detect cases where the process has gone into a loop (such as if two of them
    # are reversing each other's changes repeatedly)
    restarted_by: Counter = Counter()
    # This structure is confusing - its purpose is to restart the full set of
    # transformations whenever any of them cause the url to be changed, and only stop
    # when all of the transformations run without any effects. This is because some
    # transformations may modify a url in a way that makes a previous transformation
    # applicable when it wasn't originally, so it will need to be run again.
    while True:
        for cls in UrlTransformer.__subclasses__():
            if not cls.is_applicable(parsed_url):
                continue
            before = parsed_url
            parsed_url = cls.apply_transformation(parsed_url)
            # if the url was changed, break the for loop (effectively restarting it)
            if before != parsed_url:
                # first, check how many times this transformation has caused the loop to
                # restart - if it's 3 or more, it's likely we're in an infinite loop and
                # need to bail out entirely
                restarted_by[cls] += 1
                if restarted_by[cls] >= 3:
                    raise UrlTransformationLoopError
                break
        else:
            # all transformations ran with no changes, can stop working on this url
            break
    return parsed_url
 class UrlTransformer(ABC):
    """Abstract base class for url transformers.
    This is a bit of an unusual usage of ABC, since normally it would only cause errors
    when trying to instantiate the subclasses, which will never happen here since the
    methods are classmethods. However, prospector is still able to recognize when the
    abstract method(s) haven't been overridden.
    """
    return url
    @classmethod
    def is_applicable(cls, parsed_url: ParseResult) -> bool:
        """Return whether this transformation should be applied to the url.
        This can be used for cases like a transformation that only applies to specific
        domains. If not overridden, the transformation will apply to all urls.
        """
        # pylint: disable=unused-argument
        return True
 def remove_utm_query_params(url: str) -> str:
    @classmethod
    @abstractmethod
    def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
        """Apply the actual transformation process to the url.
        Subclass implementations of this method should be idempotent, since a
        transformation may be applied to a particular url any number of times until
        the process completes.
        """
        return parsed_url
 class UtmQueryParamRemover(UrlTransformer):
    """Remove any utm_* query parameters from a url."""
    parsed = urlparse(url)
    query_params = parse_qs(parsed.query)
    @classmethod
    def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
        """Apply the actual transformation process to the url."""
        query_params = parse_qs(parsed_url.query)
        cleaned_params = {
            param: value
@ -29,6 +114,4 @@ def remove_utm_query_params(url: str) -> str:
            if not param.startswith("utm_")
        }
    parsed = parsed._replace(query=urlencode(cleaned_params, doseq=True))
    return urlunparse(parsed)
        return parsed_url._replace(query=urlencode(cleaned_params, doseq=True))