You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

67 lines
2.3 KiB

# Copyright (c) 2018 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Consumer that generates content_metadata for topics."""
from typing import Sequence
from amqpy import Message
import publicsuffix
from tildes.lib.amqp import PgsqlQueueConsumer
from tildes.lib.string import extract_text_from_html, truncate_string, word_count
from tildes.lib.url import get_domain_from_url
from tildes.models.topic import Topic
class TopicMetadataGenerator(PgsqlQueueConsumer):
    """Consumer that generates content_metadata for topics.

    For each delivered message the referenced topic is loaded and its
    ``content_metadata`` is replaced: text topics get a word count and a short
    excerpt, link topics get the domain of the linked URL.
    """

    def __init__(self, queue_name: str, routing_keys: Sequence[str]) -> None:
        """Initialize the consumer, including the public suffix list.

        Args:
            queue_name: Passed through to ``PgsqlQueueConsumer``.
            routing_keys: Passed through to ``PgsqlQueueConsumer``.
        """
        super().__init__(queue_name, routing_keys)

        # download the public suffix list (would be good to add caching here)
        psl_file = publicsuffix.fetch()
        try:
            self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
        finally:
            # Release the download handle instead of leaking it for the whole
            # lifetime of the consumer process.
            # NOTE(review): assumes PublicSuffixList reads the file eagerly in
            # its constructor - confirm against the installed publicsuffix.
            psl_file.close()

    def run(self, msg: Message) -> None:
        """Process a delivered message.

        Looks up the topic whose id is in ``msg.body["topic_id"]`` and
        generates the metadata matching its type. Topics that are neither
        text nor link type are left untouched.
        """
        # .one() is used here, so the lookup is expected to fail loudly rather
        # than return None when no single matching topic exists.
        topic = (
            self.db_session.query(Topic).filter_by(topic_id=msg.body["topic_id"]).one()
        )

        if topic.is_text_type:
            self._generate_text_metadata(topic)
        elif topic.is_link_type:
            self._generate_link_metadata(topic)

    @staticmethod
    def _generate_text_metadata(topic: Topic) -> None:
        """Generate metadata for a text topic (word count and excerpt).

        Overwrites ``topic.content_metadata`` with a dict containing the keys
        ``"word_count"`` and ``"excerpt"``, both derived from the text
        extracted from ``topic.rendered_html``.
        """
        extracted_text = extract_text_from_html(topic.rendered_html)

        # create a short excerpt by truncating the extracted string
        excerpt = truncate_string(extracted_text, length=200, truncate_at_chars=" ")

        topic.content_metadata = {
            "word_count": word_count(extracted_text),
            "excerpt": excerpt,
        }

    def _generate_link_metadata(self, topic: Topic) -> None:
        """Generate metadata for a link topic (domain).

        Does nothing when the topic has no link. Otherwise overwrites
        ``topic.content_metadata`` with a dict holding a single ``"domain"``
        key, computed by passing the URL's domain through the public suffix
        list loaded in ``__init__``.
        """
        if not topic.link:
            return

        parsed_domain = get_domain_from_url(topic.link)
        domain = self.public_suffix_list.get_public_suffix(parsed_domain)

        topic.content_metadata = {"domain": domain}
if __name__ == "__main__":
    # Script entry point: build the consumer bound to the topic.created and
    # topic.edited routing keys, then start consuming its queue.
    consumer = TopicMetadataGenerator(
        queue_name="topic_metadata_generator.q",
        routing_keys=["topic.created", "topic.edited"],
    )
    consumer.consume_queue()