You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

105 lines
3.4 KiB

# Copyright (c) 2018 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Consumer that fetches data from Embedly's Extract API for link topics."""
import os
from datetime import timedelta
from typing import Sequence
from pyramid.paster import get_appsettings
from requests.exceptions import HTTPError, Timeout
from sqlalchemy import cast, desc, func
from sqlalchemy.dialects.postgresql import JSONB
from tildes.enums import ScraperType
from tildes.lib.datetime import utc_now
from tildes.lib.event_stream import EventStreamConsumer, Message
from tildes.models.scraper import ScraperResult
from tildes.models.topic import Topic
from tildes.scrapers import EmbedlyScraper
# don't rescrape the same url inside this time period
RESCRAPE_DELAY = timedelta(hours=24)
class TopicEmbedlyExtractor(EventStreamConsumer):
    """Consumer that fetches data from Embedly's Extract API for link topics."""

    METRICS_PORT = 25012

    def __init__(
        self, api_key: str, consumer_group: str, source_streams: Sequence[str]
    ):
        """Initialize the consumer, including creating a scraper instance."""
        super().__init__(consumer_group, source_streams)
        self.scraper = EmbedlyScraper(api_key)

    def process_message(self, message: Message) -> None:
        """Process a message from the stream."""
        topic = (
            self.db_session.query(Topic)
            .filter_by(topic_id=message.fields["topic_id"])
            .one()
        )

        # only link topics with a url this scraper can handle are relevant
        if not (topic.is_link_type and self.scraper.is_applicable(topic.link)):
            return

        # look for an existing scrape of this url inside the rescrape window
        cutoff = utc_now() - RESCRAPE_DELAY
        scrape = (
            self.db_session.query(ScraperResult)
            .filter(
                ScraperResult.url == topic.link,
                ScraperResult.scraper_type == ScraperType.EMBEDLY,
                ScraperResult.scrape_time > cutoff,
            )
            .order_by(desc(ScraperResult.scrape_time))
            .first()
        )

        # no recent result, so scrape the url now and store what comes back
        if not scrape:
            try:
                scrape = self.scraper.scrape_url(topic.link)
            except (HTTPError, Timeout):
                return

            self.db_session.add(scrape)

        new_metadata = EmbedlyScraper.get_metadata_from_result(scrape)
        if not new_metadata:
            return

        # merge the new values into the topic's content_metadata without
        # wiping out existing keys, coalescing a NULL column to an empty
        # JSONB object first
        (
            self.db_session.query(Topic)
            .filter(Topic.topic_id == topic.topic_id)
            .update(
                {
                    "content_metadata": func.coalesce(
                        Topic.content_metadata, cast({}, JSONB)
                    ).op("||")(new_metadata)
                },
                synchronize_session=False,
            )
        )
if __name__ == "__main__":
# pylint: disable=invalid-name
settings = get_appsettings(os.environ["INI_FILE"])
embedly_api_key = settings.get("api_keys.embedly")
if not embedly_api_key:
raise RuntimeError("No embedly API key available in INI file")
TopicEmbedlyExtractor(
embedly_api_key,
"topic_embedly_extractor",
source_streams=["topics.insert", "topics.update.link"],
).consume_streams()