From 369f273f8ed56c9c44ab0bc4b968319d60eb9110 Mon Sep 17 00:00:00 2001 From: Deimos Date: Wed, 12 Sep 2018 13:04:52 -0600 Subject: [PATCH] Use Embedly result to canonicalize link topics As part of scraping a link, Embedly will often remove tracking variables from the query string, follow redirects, and so on. This change starts using the URL returned in an Embedly result to replace the originally submitted one when the two differ (though the original one will still be kept in the original_url column). --- tildes/consumers/topic_embedly_extractor.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tildes/consumers/topic_embedly_extractor.py b/tildes/consumers/topic_embedly_extractor.py index 68ca723..5f1b8b9 100644 --- a/tildes/consumers/topic_embedly_extractor.py +++ b/tildes/consumers/topic_embedly_extractor.py @@ -66,6 +66,10 @@ class TopicEmbedlyExtractor(PgsqlQueueConsumer): self.db_session.add(result) + # update the topic's link if embedly says the final url is different + if topic.link != result.data["url"]: + topic.link = result.data["url"] + new_metadata = EmbedlyScraper.get_metadata_from_result(result) if new_metadata: