Browse Source

Topic metadata generator: update, don't replace

Previously, any topic processed by this consumer would have its
content_metadata completely replaced. This won't work once other
consumers or processes start being able to set that data, since we don't
know that this one will always run first.

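This commit changes how the consumer writes its results: instead of
assigning a new dict to the topic, it issues an UPDATE that merges the
generated values into whatever is already in the topic's
content_metadata column (treating a null column as an empty object), so
keys set by other consumers are kept. It would probably be good to
generalize this merge-style update somehow so that it can be used in
other places more easily.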
merge-requests/37/head
Deimos 6 years ago
parent
commit
206606fd59
  1. 37
      tildes/consumers/topic_metadata_generator.py

37
tildes/consumers/topic_metadata_generator.py

@ -3,10 +3,12 @@
"""Consumer that generates content_metadata for topics.""" """Consumer that generates content_metadata for topics."""
from typing import Sequence
from typing import Any, Dict, Sequence
from amqpy import Message from amqpy import Message
import publicsuffix import publicsuffix
from sqlalchemy import cast, func
from sqlalchemy.dialects.postgresql import JSONB
from tildes.lib.amqp import PgsqlQueueConsumer from tildes.lib.amqp import PgsqlQueueConsumer
from tildes.lib.string import extract_text_from_html, truncate_string, word_count from tildes.lib.string import extract_text_from_html, truncate_string, word_count
@ -32,32 +34,41 @@ class TopicMetadataGenerator(PgsqlQueueConsumer):
) )
if topic.is_text_type: if topic.is_text_type:
self._generate_text_metadata(topic)
new_metadata = self._generate_text_metadata(topic)
elif topic.is_link_type: elif topic.is_link_type:
self._generate_link_metadata(topic)
new_metadata = self._generate_link_metadata(topic)
# update the topic's content_metadata in a way that won't wipe out any existing
# values, and can handle the column being null
(
self.db_session.query(Topic)
.filter(Topic.topic_id == topic.topic_id)
.update(
{
"content_metadata": func.coalesce(
Topic.content_metadata, cast({}, JSONB)
).op("||")(new_metadata)
},
synchronize_session=False,
)
)
@staticmethod @staticmethod
def _generate_text_metadata(topic: Topic) -> None:
def _generate_text_metadata(topic: Topic) -> Dict[str, Any]:
"""Generate metadata for a text topic (word count and excerpt).""" """Generate metadata for a text topic (word count and excerpt)."""
extracted_text = extract_text_from_html(topic.rendered_html) extracted_text = extract_text_from_html(topic.rendered_html)
# create a short excerpt by truncating the extracted string # create a short excerpt by truncating the extracted string
excerpt = truncate_string(extracted_text, length=200, truncate_at_chars=" ") excerpt = truncate_string(extracted_text, length=200, truncate_at_chars=" ")
topic.content_metadata = {
"word_count": word_count(extracted_text),
"excerpt": excerpt,
}
return {"word_count": word_count(extracted_text), "excerpt": excerpt}
def _generate_link_metadata(self, topic: Topic) -> None:
def _generate_link_metadata(self, topic: Topic) -> Dict[str, Any]:
"""Generate metadata for a link topic (domain).""" """Generate metadata for a link topic (domain)."""
if not topic.link:
return
parsed_domain = get_domain_from_url(topic.link) parsed_domain = get_domain_from_url(topic.link)
domain = self.public_suffix_list.get_public_suffix(parsed_domain) domain = self.public_suffix_list.get_public_suffix(parsed_domain)
topic.content_metadata = {"domain": domain}
return {"domain": domain}
if __name__ == "__main__": if __name__ == "__main__":

Loading…
Cancel
Save