From 206606fd599f274abd11320f5032518d3e23dc59 Mon Sep 17 00:00:00 2001 From: Deimos Date: Sun, 9 Sep 2018 19:53:04 -0600 Subject: [PATCH] Topic metadata generator: update, don't replace Previously, any topic processed by this consumer would have its content_metadata completely replaced. This won't work once other consumers or processes start being able to set that data, since we don't know that this one will always run first. This commit updates the method the consumer uses so that it will keep any data that's already in the topic's content_metadata column if necessary. It would probably be good to generalize this method out somehow so that it can be used in other places more easily. --- tildes/consumers/topic_metadata_generator.py | 37 +++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/tildes/consumers/topic_metadata_generator.py b/tildes/consumers/topic_metadata_generator.py index d1710ef..1b04ab7 100644 --- a/tildes/consumers/topic_metadata_generator.py +++ b/tildes/consumers/topic_metadata_generator.py @@ -3,10 +3,12 @@ """Consumer that generates content_metadata for topics.""" -from typing import Sequence +from typing import Any, Dict, Sequence from amqpy import Message import publicsuffix +from sqlalchemy import cast, func +from sqlalchemy.dialects.postgresql import JSONB from tildes.lib.amqp import PgsqlQueueConsumer from tildes.lib.string import extract_text_from_html, truncate_string, word_count @@ -32,32 +34,41 @@ class TopicMetadataGenerator(PgsqlQueueConsumer): ) if topic.is_text_type: - self._generate_text_metadata(topic) + new_metadata = self._generate_text_metadata(topic) elif topic.is_link_type: - self._generate_link_metadata(topic) + new_metadata = self._generate_link_metadata(topic) + + # update the topic's content_metadata in a way that won't wipe out any existing + # values, and can handle the column being null + ( + self.db_session.query(Topic) + .filter(Topic.topic_id == topic.topic_id) + .update( + { + "content_metadata": func.coalesce( + Topic.content_metadata, cast({}, JSONB) + ).op("||")(new_metadata) + }, + synchronize_session=False, + ) + ) @staticmethod - def _generate_text_metadata(topic: Topic) -> None: + def _generate_text_metadata(topic: Topic) -> Dict[str, Any]: """Generate metadata for a text topic (word count and excerpt).""" extracted_text = extract_text_from_html(topic.rendered_html) # create a short excerpt by truncating the extracted string excerpt = truncate_string(extracted_text, length=200, truncate_at_chars=" ") - topic.content_metadata = { - "word_count": word_count(extracted_text), - "excerpt": excerpt, - } + return {"word_count": word_count(extracted_text), "excerpt": excerpt} - def _generate_link_metadata(self, topic: Topic) -> None: + def _generate_link_metadata(self, topic: Topic) -> Dict[str, Any]: """Generate metadata for a link topic (domain).""" - if not topic.link: - return - parsed_domain = get_domain_from_url(topic.link) domain = self.public_suffix_list.get_public_suffix(parsed_domain) - topic.content_metadata = {"domain": domain} + return {"domain": domain} if __name__ == "__main__":