Browse Source

Add the Embedly Extract scraper and consumer

This adds a consumer (in prod only) that uses Embedly's Extract API to
scrape the links from all new link topics and stores some of the data in
the topic's content_metadata column.
merge-requests/37/head
Deimos 6 years ago
parent
commit
db9d485cc5
  1. 14
      salt/salt/consumers/init.sls
  2. 16
      salt/salt/consumers/topic_embedly_extractor.service.jinja2
  3. 1
      tildes/alembic/env.py
  4. 52
      tildes/alembic/versions/09cfb27cc90e_add_scraper_results_table.py
  5. 99
      tildes/consumers/topic_embedly_extractor.py
  6. 1
      tildes/production.ini.example
  7. 6
      tildes/tildes/enums.py
  8. 3
      tildes/tildes/models/scraper/__init__.py
  9. 37
      tildes/tildes/models/scraper/scraper_result.py
  10. 3
      tildes/tildes/scrapers/__init__.py
  11. 61
      tildes/tildes/scrapers/embedly_scraper.py

14
salt/salt/consumers/init.sls

@@ -21,3 +21,17 @@ consumer-topic_metadata_generator.service:
consumer-comment_user_mentions_generator.service:
service.running:
- enable: True
{% if grains['id'] == 'prod' %}
/etc/systemd/system/consumer-topic_embedly_extractor.service:
file.managed:
- source: salt://consumers/topic_embedly_extractor.service.jinja2
- template: jinja
- user: root
- group: root
- mode: 644
consumer-topic_embedly_extractor.service:
service.running:
- enable: True
{% endif %}

16
salt/salt/consumers/topic_embedly_extractor.service.jinja2

@@ -0,0 +1,16 @@
{% from 'common.jinja2' import app_dir, bin_dir -%}
[Unit]
Description=Topic Embedly Extractor (Queue Consumer)
Requires=rabbitmq-server.service
After=rabbitmq-server.service
PartOf=rabbitmq-server.service
[Service]
WorkingDirectory={{ app_dir }}/consumers
Environment="INI_FILE={{ app_dir }}/{{ pillar['ini_file'] }}"
ExecStart={{ bin_dir }}/python topic_embedly_extractor.py
Restart=always
RestartSec=5
[Install]
WantedBy=multi-user.target

1
tildes/alembic/env.py

@@ -16,6 +16,7 @@ from tildes.models.comment import Comment, CommentNotification, CommentTag, Comm
from tildes.models.group import Group, GroupSubscription
from tildes.models.log import Log
from tildes.models.message import MessageConversation, MessageReply
from tildes.models.scraper import ScraperResult
from tildes.models.topic import Topic, TopicVisit, TopicVote
from tildes.models.user import User, UserGroupSettings, UserInviteCode

52
tildes/alembic/versions/09cfb27cc90e_add_scraper_results_table.py

@@ -0,0 +1,52 @@
"""Add scraper_results table
Revision ID: 09cfb27cc90e
Revises: 04fd898de0db
Create Date: 2018-09-09 21:22:32.769786
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "09cfb27cc90e"
down_revision = "04fd898de0db"
branch_labels = None
depends_on = None
def upgrade():
    """Create the scraper_results table, its enum type, and its indexes."""
    scraper_type_enum = postgresql.ENUM("EMBEDLY", name="scrapertype")

    op.create_table(
        "scraper_results",
        sa.Column("result_id", sa.Integer(), nullable=False),
        sa.Column("url", sa.Text(), nullable=False),
        sa.Column("scraper_type", scraper_type_enum, nullable=False),
        sa.Column(
            "scrape_time",
            sa.TIMESTAMP(timezone=True),
            server_default=sa.text("NOW()"),
            nullable=False,
        ),
        sa.Column("data", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
        sa.PrimaryKeyConstraint("result_id", name=op.f("pk_scraper_results")),
    )

    # indexed so recent results can be looked up by url and ordered/expired
    # by when they were scraped
    op.create_index(
        op.f("ix_scraper_results_scrape_time"),
        "scraper_results",
        ["scrape_time"],
        unique=False,
    )
    op.create_index(
        op.f("ix_scraper_results_url"), "scraper_results", ["url"], unique=False
    )
def downgrade():
    """Drop the scraper_results table along with both of its indexes."""
    # drop the indexes first (same order as the reverse of upgrade)
    for index_name in ("ix_scraper_results_url", "ix_scraper_results_scrape_time"):
        op.drop_index(op.f(index_name), table_name="scraper_results")

    op.drop_table("scraper_results")

99
tildes/consumers/topic_embedly_extractor.py

@@ -0,0 +1,99 @@
# Copyright (c) 2018 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Consumer that fetches data from Embedly's Extract API for link topics."""
from datetime import timedelta
import os
from typing import Sequence
from amqpy import Message
from pyramid.paster import bootstrap
from requests.exceptions import HTTPError
from sqlalchemy import cast, desc, func
from sqlalchemy.dialects.postgresql import JSONB
from tildes.enums import ScraperType
from tildes.lib.amqp import PgsqlQueueConsumer
from tildes.lib.datetime import utc_now
from tildes.models.scraper import ScraperResult
from tildes.models.topic import Topic
from tildes.scrapers import EmbedlyScraper
# don't rescrape the same url inside this time period
RESCRAPE_DELAY = timedelta(hours=24)
class TopicEmbedlyExtractor(PgsqlQueueConsumer):
    """Consumer that fetches data from Embedly's Extract API for link topics."""

    def __init__(
        self, api_key: str, queue_name: str, routing_keys: Sequence[str]
    ) -> None:
        """Initialize the consumer, including creating a scraper instance.

        api_key is the Embedly API key that the scraper will authenticate with.
        """
        super().__init__(queue_name, routing_keys)

        self.scraper = EmbedlyScraper(api_key)

    def run(self, msg: Message) -> None:
        """Process a delivered message.

        Expects msg.body to contain a "topic_id" key identifying the topic
        whose link should be scraped.
        """
        topic = (
            self.db_session.query(Topic).filter_by(topic_id=msg.body["topic_id"]).one()
        )

        # only link topics have a url to scrape
        if not topic.is_link_type:
            return

        # see if we already have a recent scrape result from the same url
        result = (
            self.db_session.query(ScraperResult)
            .filter(
                ScraperResult.url == topic.link,
                ScraperResult.scraper_type == ScraperType.EMBEDLY,
                ScraperResult.scrape_time > utc_now() - RESCRAPE_DELAY,
            )
            .order_by(desc(ScraperResult.scrape_time))
            .first()
        )

        # if not, scrape the url and store the result
        if not result:
            try:
                result = self.scraper.scrape_url(topic.link)
            except HTTPError:
                # scrape failed — skip this topic. NOTE(review): assumes the
                # consumer framework acks the message after run() returns and
                # an occasional miss is acceptable; confirm no retry is needed
                return

            self.db_session.add(result)

        new_metadata = EmbedlyScraper.get_metadata_from_result(result)

        if new_metadata:
            # update the topic's content_metadata in a way that won't wipe out any
            # existing values, and can handle the column being null
            (
                self.db_session.query(Topic)
                .filter(Topic.topic_id == topic.topic_id)
                .update(
                    {
                        # JSONB "||" concatenation merges the new keys over
                        # whatever is already stored (NULL treated as {})
                        "content_metadata": func.coalesce(
                            Topic.content_metadata, cast({}, JSONB)
                        ).op("||")(new_metadata)
                    },
                    synchronize_session=False,
                )
            )
if __name__ == "__main__":
    # pylint: disable=invalid-name
    # bootstrap the pyramid environment from the INI file named in the
    # INI_FILE environment variable (set by the systemd unit)
    env = bootstrap(os.environ["INI_FILE"])
    settings = env["registry"].settings

    embedly_api_key = settings.get("api_keys.embedly")
    if not embedly_api_key:
        raise RuntimeError("No embedly API key available in INI file")

    # start consuming new-topic messages until terminated
    TopicEmbedlyExtractor(
        embedly_api_key,
        queue_name="topic_embedly_extractor.q",
        routing_keys=["topic.created"],
    ).consume_queue()

1
tildes/production.ini.example

@@ -29,6 +29,7 @@ webassets.cache = false
webassets.manifest = json
# API keys for external APIs
api_keys.embedly = embedlykeygoeshere
api_keys.stripe = sk_live_ActualKeyShouldGoHere
[server:main]

6
tildes/tildes/enums.py

@@ -65,6 +65,12 @@ class LogEventType(enum.Enum):
TOPIC_UNREMOVE = enum.auto()
class ScraperType(enum.Enum):
    """Enum listing the scraper implementations that can produce results."""

    EMBEDLY = enum.auto()
class TopicSortOption(enum.Enum):
"""Enum for the different methods topics can be sorted by."""

3
tildes/tildes/models/scraper/__init__.py

@@ -0,0 +1,3 @@
"""Contains models related to scrapers."""
from .scraper_result import ScraperResult

37
tildes/tildes/models/scraper/scraper_result.py

@@ -0,0 +1,37 @@
# Copyright (c) 2018 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Contains the ScraperResult class."""
from datetime import datetime
from typing import Any
from sqlalchemy import Column, Integer, Text, TIMESTAMP
from sqlalchemy.dialects.postgresql import ENUM, JSONB
from sqlalchemy.sql.expression import text
from tildes.enums import ScraperType
from tildes.models import DatabaseModel
class ScraperResult(DatabaseModel):
    """Model for the result from a scraper processing a url."""

    __tablename__ = "scraper_results"

    # auto-incrementing surrogate key
    result_id: int = Column(Integer, primary_key=True)

    # the url that was scraped (indexed so recent results can be found by url)
    url: str = Column(Text, index=True, nullable=False)

    # which scraper implementation produced this result
    scraper_type: ScraperType = Column(ENUM(ScraperType), nullable=False)

    # when the scrape happened — defaults to the insertion time
    scrape_time: datetime = Column(
        TIMESTAMP(timezone=True),
        index=True,
        nullable=False,
        server_default=text("NOW()"),
    )

    # raw data captured by the scraper (may be absent)
    data: Any = Column(JSONB)

    def __init__(self, url: str, scraper_type: ScraperType, data: Any) -> None:
        """Create a new ScraperResult."""
        self.url = url
        self.scraper_type = scraper_type
        self.data = data

3
tildes/tildes/scrapers/__init__.py

@@ -0,0 +1,3 @@
"""Contains scrapers."""
from .embedly_scraper import EmbedlyScraper

61
tildes/tildes/scrapers/embedly_scraper.py

@@ -0,0 +1,61 @@
# Copyright (c) 2018 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Contains the EmbedlyScraper class."""
from typing import Any, Dict
import requests
from tildes.enums import ScraperType
from tildes.lib.string import extract_text_from_html, word_count
from tildes.models.scraper import ScraperResult
class EmbedlyScraper:
    """Scraper that uses Embedly's "Extract" API."""

    def __init__(self, api_key: str) -> None:
        """Create a new scraper using the specified Embedly API key."""
        self.api_key = api_key

    def scrape_url(self, url: str) -> ScraperResult:
        """Scrape a url and return the result.

        Raises requests.exceptions.HTTPError if the API responds with an
        error status, and requests.exceptions.Timeout if it doesn't respond
        in time.
        """
        params: Dict[str, Any] = {"key": self.api_key, "format": "json", "url": url}

        response = requests.get(
            "https://api.embedly.com/1/extract",
            params=params,
            # a timeout is essential here - without one, an unresponsive API
            # call would hang the consumer process indefinitely
            timeout=5,
        )
        response.raise_for_status()

        return ScraperResult(url, ScraperType.EMBEDLY, response.json())

    @staticmethod
    def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
        """Get the metadata that we're interested in out of a scrape result.

        Raises ValueError if the result came from a different scraper type.
        """
        # pylint: disable=too-many-branches
        if result.scraper_type != ScraperType.EMBEDLY:
            raise ValueError("Can't process a result from a different scraper.")

        metadata: Dict[str, Any] = {}

        # the data column is nullable, so guard against a result with no data
        if not result.data:
            return metadata

        if result.data.get("title"):
            metadata["title"] = result.data["title"]

        if result.data.get("description"):
            metadata["description"] = result.data["description"]

        content = result.data.get("content")
        if content:
            metadata["word_count"] = word_count(extract_text_from_html(content))

        if result.data.get("published"):
            # the field's value is in milliseconds, store it in seconds instead
            metadata["published"] = result.data["published"] // 1000

        authors = result.data.get("authors")
        if authors:
            # each author entry should have a "name" key; skip the authors
            # field entirely if any entry is missing it
            try:
                metadata["authors"] = [author["name"] for author in authors]
            except KeyError:
                pass

        return metadata
Loading…
Cancel
Save