Browse Source

Refactor url transformation process

This is (extreme) overkill at this point, but I'm thinking ahead to some
of the transformations that I want to be able to support, and this
refactor should make it much more feasible to have a good process.

The main changes here are using a class-based system where each
transformation can also define a method for checking whether it should
apply to any given url (probably often based on domain), as well as a
new process that will repeatedly apply all transformations until the url
passes through all of them unchanged.
merge-requests/55/head
Deimos 6 years ago
parent
commit
08b714bae8
  1. 111
      tildes/tildes/lib/url_transform.py

111
tildes/tildes/lib/url_transform.py

@ -3,7 +3,16 @@
"""Functions related to transforming URLs (sanitization, cleanup, etc.).""" """Functions related to transforming URLs (sanitization, cleanup, etc.)."""
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
from abc import ABC, abstractmethod
from collections import Counter
import logging
from urllib.parse import ParseResult, parse_qs, urlencode, urlparse, urlunparse
class UrlTransformationLoopError(Exception):
"""Exception to raise if transformations seem to be in an infinite loop."""
pass
def apply_url_transformations(url: str) -> str: def apply_url_transformations(url: str) -> str:
@ -12,23 +21,97 @@ def apply_url_transformations(url: str) -> str:
This method should generally be the only one imported/used from this module, unless This method should generally be the only one imported/used from this module, unless
there is a specific reason for needing to apply a subset of transformations. there is a specific reason for needing to apply a subset of transformations.
""" """
url = remove_utm_query_params(url)
parsed_url = urlparse(url)
return url
try:
parsed_url = _apply_all_transformations(parsed_url)
except UrlTransformationLoopError:
# if an infinite loop was caused, log an error and return the unchanged original
logging.error(f"Transformations went into a loop on url {url}")
return urlunparse(parsed_url)
def remove_utm_query_params(url: str) -> str:
"""Remove any utm_* query parameters from a url."""
parsed = urlparse(url)
query_params = parse_qs(parsed.query)
def _apply_all_transformations(parsed_url: ParseResult) -> ParseResult:
"""Apply all relevant UrlTransformer transformations to the url."""
# Used to keep track of which transformations are restarting the process, so we
# can detect cases where the process has gone into a loop (such as if two of them
# are reversing each other's changes repeatedly)
restarted_by: Counter = Counter()
# This structure is confusing - its purpose is to restart the full set of
# transformations whenever any of them cause the url to be changed, and only stop
# when all of the transformations run without any effects. This is because some
# transformations may modify a url in a way that makes a previous transformation
# applicable when it wasn't originally, so it will need to be run again.
while True:
for cls in UrlTransformer.__subclasses__():
if not cls.is_applicable(parsed_url):
continue
before = parsed_url
parsed_url = cls.apply_transformation(parsed_url)
# if the url was changed, break the for loop (effectively restarting it)
if before != parsed_url:
# first, check how many times this transformation has caused the loop to
# restart - if it's 3 or more, it's likely we're in an infinite loop and
# need to bail out entirely
restarted_by[cls] += 1
if restarted_by[cls] >= 3:
raise UrlTransformationLoopError
break
else:
# all transformations ran with no changes, can stop working on this url
break
return parsed_url
class UrlTransformer(ABC):
"""Abstract base class for url transformers.
This is a bit of an unusual usage of ABC, since normally it would only cause errors
when trying to instantiate the subclasses, which will never happen here since the
methods are classmethods. However, prospector is still able to recognize when the
abstract method(s) haven't been overridden.
"""
@classmethod
def is_applicable(cls, parsed_url: ParseResult) -> bool:
"""Return whether this transformation should be applied to the url.
This can be used for cases like a transformation that only applies to specific
domains. If not overridden, the transformation will apply to all urls.
"""
# pylint: disable=unused-argument
return True
@classmethod
@abstractmethod
def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
"""Apply the actual transformation process to the url.
Subclass implementations of this method should be idempotent, since a
transformation may be applied to a particular url any number of times until
the process completes.
"""
return parsed_url
class UtmQueryParamRemover(UrlTransformer):
"""Remove any utm_* query parameters from a url."""
cleaned_params = {
param: value
for param, value in query_params.items()
if not param.startswith("utm_")
}
@classmethod
def apply_transformation(cls, parsed_url: ParseResult) -> ParseResult:
"""Apply the actual transformation process to the url."""
query_params = parse_qs(parsed_url.query)
parsed = parsed._replace(query=urlencode(cleaned_params, doseq=True))
cleaned_params = {
param: value
for param, value in query_params.items()
if not param.startswith("utm_")
}
return urlunparse(parsed)
return parsed_url._replace(query=urlencode(cleaned_params, doseq=True))
Loading…
Cancel
Save