def test_remove_utm_query_params():
    """Check that utm_* parameters are stripped while other parameters survive."""
    original = "http://example.com/path?utm_source=tildes&utm_campaign=test&something=ok"
    expected = "http://example.com/path?something=ok"

    assert apply_url_transformations(original) == expected


def test_non_utm_params_unaffected():
    """Check that a url without any utm_* parameters comes back unchanged."""
    original = "http://example.com/path?one=x&two=y&three=z"

    assert apply_url_transformations(original) == original
def remove_utm_query_params(url: str) -> str:
    """Remove any utm_* query parameters from a url.

    Only the query string is examined. Parameters that are kept are left exactly as
    they were written: their relative order, repeated keys, blank values (``a=`` or a
    bare ``flag``) and percent-encoding are all preserved. A url whose query string
    has no utm_* parameters is returned unchanged.
    """
    # Imported here so the function carries its own dependency rather than relying on
    # the module-level import list being extended.
    from urllib.parse import unquote_plus

    parsed = urlparse(url)
    segments = parsed.query.split("&")

    # Filter the raw "key=value" segments instead of round-tripping through
    # parse_qs/urlencode: that round trip drops blank values, regroups repeated keys
    # and re-encodes every remaining parameter. The key is decoded only for the
    # comparison, so an encoded name such as "utm%5Fsource" is still recognized.
    kept = [
        segment
        for segment in segments
        if not unquote_plus(segment.partition("=")[0]).startswith("utm_")
    ]

    # Nothing to strip: hand back the caller's url without touching it.
    if len(kept) == len(segments):
        return url

    # Skip empty segments (from "&&" or a trailing "&") so that removing a parameter
    # can't leave a stray separator behind.
    cleaned_query = "&".join(segment for segment in kept if segment)

    return urlunparse(parsed._replace(query=cleaned_query))