@ -3,7 +3,16 @@
""" Functions related to transforming URLs (sanitization, cleanup, etc.). """
from urllib.parse import parse_qs , urlencode , urlparse , urlunparse
from abc import ABC , abstractmethod
from collections import Counter
import logging
from urllib.parse import ParseResult , parse_qs , urlencode , urlparse , urlunparse
class UrlTransformationLoopError ( Exception ) :
""" Exception to raise if transformations seem to be in an infinite loop. """
pass
def apply_url_transformations ( url : str ) - > str :
@ -12,16 +21,92 @@ def apply_url_transformations(url: str) -> str:
This method should generally be the only one imported / used from this module , unless
there is a specific reason for needing to apply a subset of transformations .
"""
url = remove_utm_query_params ( url )
parsed_url = urlparse ( url )
try :
parsed_url = _apply_all_transformations ( parsed_url )
except UrlTransformationLoopError :
# if an infinite loop was caused, log an error and return the unchanged original
logging . error ( f " Transformations went into a loop on url {url} " )
return urlunparse ( parsed_url )
def _apply_all_transformations ( parsed_url : ParseResult ) - > ParseResult :
""" Apply all relevant UrlTransformer transformations to the url. """
# Used to keep track of which transformations are restarting the process, so we
# can detect cases where the process has gone into a loop (such as if two of them
# are reversing each other's changes repeatedly)
restarted_by : Counter = Counter ( )
# This structure is confusing - its purpose is to restart the full set of
# transformations whenever any of them cause the url to be changed, and only stop
# when all of the transformations run without any effects. This is because some
# transformations may modify a url in a way that makes a previous transformation
# applicable when it wasn't originally, so it will need to be run again.
while True :
for cls in UrlTransformer . __subclasses__ ( ) :
if not cls . is_applicable ( parsed_url ) :
continue
before = parsed_url
parsed_url = cls . apply_transformation ( parsed_url )
# if the url was changed, break the for loop (effectively restarting it)
if before != parsed_url :
# first, check how many times this transformation has caused the loop to
# restart - if it's 3 or more, it's likely we're in an infinite loop and
# need to bail out entirely
restarted_by [ cls ] + = 1
if restarted_by [ cls ] > = 3 :
raise UrlTransformationLoopError
break
else :
# all transformations ran with no changes, can stop working on this url
break
return parsed_url
class UrlTransformer ( ABC ) :
""" Abstract base class for url transformers.
This is a bit of an unusual usage of ABC , since normally it would only cause errors
when trying to instantiate the subclasses , which will never happen here since the
methods are classmethods . However , prospector is still able to recognize when the
abstract method ( s ) haven ' t been overridden.
"""
return url
@classmethod
def is_applicable ( cls , parsed_url : ParseResult ) - > bool :
""" Return whether this transformation should be applied to the url.
This can be used for cases like a transformation that only applies to specific
domains . If not overridden , the transformation will apply to all urls .
"""
# pylint: disable=unused-argument
return True
def remove_utm_query_params ( url : str ) - > str :
@classmethod
@abstractmethod
def apply_transformation ( cls , parsed_url : ParseResult ) - > ParseResult :
""" Apply the actual transformation process to the url.
Subclass implementations of this method should be idempotent , since a
transformation may be applied to a particular url any number of times until
the process completes .
"""
return parsed_url
class UtmQueryParamRemover ( UrlTransformer ) :
""" Remove any utm_* query parameters from a url. """
parsed = urlparse ( url )
query_params = parse_qs ( parsed . query )
@classmethod
def apply_transformation ( cls , parsed_url : ParseResult ) - > ParseResult :
""" Apply the actual transformation process to the url. """
query_params = parse_qs ( parsed_url . query )
cleaned_params = {
param : value
@ -29,6 +114,4 @@ def remove_utm_query_params(url: str) -> str:
if not param . startswith ( " utm_ " )
}
parsed = parsed . _replace ( query = urlencode ( cleaned_params , doseq = True ) )
return urlunparse ( parsed )
return parsed_url . _replace ( query = urlencode ( cleaned_params , doseq = True ) )