
Add scraper for YouTube Data API

A lot of the code in common between this and the EmbedlyScraper should
probably be generalized out to a base class soon, but let's make sure
this works first.
Deimos committed 6 years ago
commit 845281796a
8 changed files:

  1. salt/salt/consumers/init.sls (+12)
  2. salt/salt/consumers/topic_youtube_scraper.service.jinja2 (+16)
  3. tildes/alembic/versions/61f43e57679a_add_youtube_scraper_result.py (+35)
  4. tildes/consumers/topic_youtube_scraper.py (+100)
  5. tildes/production.ini.example (+1)
  6. tildes/tildes/enums.py (+1)
  7. tildes/tildes/scrapers/__init__.py (+1)
  8. tildes/tildes/scrapers/youtube_scraper.py (+123)
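
The commit message mentions pulling the code shared with EmbedlyScraper into a base class. As a rough sketch of what that might look like (this hypothetical Scraper base class is not part of the commit; the two method signatures are taken from YoutubeScraper below):

# Hypothetical sketch only, not part of this commit: the interface that
# EmbedlyScraper and YoutubeScraper could both inherit from.
from abc import ABC, abstractmethod

from tildes.models.scraper import ScraperResult


class Scraper(ABC):
    """Base class for scrapers (hypothetical)."""

    @abstractmethod
    def is_applicable(self, url: str) -> bool:
        """Return whether this scraper is applicable to a particular url."""

    @abstractmethod
    def scrape_url(self, url: str) -> ScraperResult:
        """Scrape a url and return the result."""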

salt/salt/consumers/init.sls (+12)

@@ -35,6 +35,18 @@ consumer-topic_embedly_extractor.service:
   service.running:
     - enable: True
 
+/etc/systemd/system/consumer-topic_youtube_scraper.service:
+  file.managed:
+    - source: salt://consumers/topic_youtube_scraper.service.jinja2
+    - template: jinja
+    - user: root
+    - group: root
+    - mode: 644
+
+consumer-topic_youtube_scraper.service:
+  service.running:
+    - enable: True
+
 /etc/systemd/system/consumer-site_icon_downloader.service:
   file.managed:
     - source: salt://consumers/site_icon_downloader.service.jinja2

salt/salt/consumers/topic_youtube_scraper.service.jinja2 (+16)

@@ -0,0 +1,16 @@
{% from 'common.jinja2' import app_dir, bin_dir -%}
[Unit]
Description=Topic Youtube Scraper (Queue Consumer)
Requires=rabbitmq-server.service
After=rabbitmq-server.service
PartOf=rabbitmq-server.service

[Service]
WorkingDirectory={{ app_dir }}/consumers
Environment="INI_FILE={{ app_dir }}/{{ pillar['ini_file'] }}"
ExecStart={{ bin_dir }}/python topic_youtube_scraper.py
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target

tildes/alembic/versions/61f43e57679a_add_youtube_scraper_result.py (+35)

@@ -0,0 +1,35 @@
"""Add youtube scraper result

Revision ID: 61f43e57679a
Revises: a0e0b6206146
Create Date: 2019-01-26 20:02:27.642583

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "61f43e57679a"
down_revision = "a0e0b6206146"
branch_labels = None
depends_on = None


def upgrade():
    # ALTER TYPE ... ADD VALUE doesn't work from inside a transaction, so
    # temporarily switch the connection to autocommit
    connection = None
    if not op.get_context().as_sql:
        connection = op.get_bind()
        connection.execution_options(isolation_level="AUTOCOMMIT")

    op.execute("ALTER TYPE scrapertype ADD VALUE IF NOT EXISTS 'YOUTUBE'")

    # re-activate the transaction for any future migrations
    if connection is not None:
        connection.execution_options(isolation_level="READ_COMMITTED")


def downgrade():
    # can't remove values from enums, so do nothing
    pass
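
After running the migration, the new enum value can be verified directly in PostgreSQL. A hypothetical check, assuming psycopg2 is installed and a local tildes database (neither assumption comes from this commit):

import psycopg2

conn = psycopg2.connect("dbname=tildes")
cur = conn.cursor()

# enum_range() returns all values of a PostgreSQL enum type
cur.execute("SELECT unnest(enum_range(NULL::scrapertype))")
print([row[0] for row in cur.fetchall()])  # should include 'YOUTUBE'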

tildes/consumers/topic_youtube_scraper.py (+100)

@@ -0,0 +1,100 @@
# Copyright (c) 2019 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Consumer that fetches data from YouTube's data API for relevant link topics."""

from datetime import timedelta
import os
from typing import Sequence

from amqpy import Message
from pyramid.paster import get_appsettings
from requests.exceptions import HTTPError, Timeout
from sqlalchemy import cast, desc, func
from sqlalchemy.dialects.postgresql import JSONB

from tildes.enums import ScraperType
from tildes.lib.amqp import PgsqlQueueConsumer
from tildes.lib.datetime import utc_now
from tildes.models.scraper import ScraperResult
from tildes.models.topic import Topic
from tildes.scrapers import YoutubeScraper


# don't rescrape the same url inside this time period
RESCRAPE_DELAY = timedelta(hours=24)


class TopicYoutubeScraper(PgsqlQueueConsumer):
    """Consumer that fetches data from YouTube's data API for relevant link topics."""

    def __init__(self, api_key: str, queue_name: str, routing_keys: Sequence[str]):
        """Initialize the consumer, including creating a scraper instance."""
        super().__init__(queue_name, routing_keys)

        self.scraper = YoutubeScraper(api_key)

    def run(self, msg: Message) -> None:
        """Process a delivered message."""
        topic = (
            self.db_session.query(Topic).filter_by(topic_id=msg.body["topic_id"]).one()
        )

        if not topic.is_link_type:
            return

        if not self.scraper.is_applicable(topic.link):
            return

        # see if we already have a recent scrape result from the same url
        result = (
            self.db_session.query(ScraperResult)
            .filter(
                ScraperResult.url == topic.link,
                ScraperResult.scraper_type == ScraperType.YOUTUBE,
                ScraperResult.scrape_time > utc_now() - RESCRAPE_DELAY,
            )
            .order_by(desc(ScraperResult.scrape_time))
            .first()
        )

        # if not, scrape the url and store the result
        if not result:
            try:
                result = self.scraper.scrape_url(topic.link)
            except (HTTPError, Timeout):
                return

            self.db_session.add(result)

        new_metadata = YoutubeScraper.get_metadata_from_result(result)

        if new_metadata:
            # update the topic's content_metadata in a way that won't wipe out any
            # existing values, and can handle the column being null
            (
                self.db_session.query(Topic)
                .filter(Topic.topic_id == topic.topic_id)
                .update(
                    {
                        "content_metadata": func.coalesce(
                            Topic.content_metadata, cast({}, JSONB)
                        ).op("||")(new_metadata)
                    },
                    synchronize_session=False,
                )
            )


if __name__ == "__main__":
    # pylint: disable=invalid-name
    settings = get_appsettings(os.environ["INI_FILE"])
    youtube_api_key = settings.get("api_keys.youtube")
    if not youtube_api_key:
        raise RuntimeError("No YouTube API key available in INI file")

    TopicYoutubeScraper(
        youtube_api_key,
        queue_name="topic_youtube_scraper.q",
        routing_keys=["topic.created"],
    ).consume_queue()
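
The coalesce-and-concatenate update above merges the new metadata into content_metadata entirely inside PostgreSQL, without wiping out existing keys and while tolerating a NULL column. In Python-dict terms the effect is roughly this (an analogy only, not the actual mechanism):

# what coalesce(content_metadata, '{}'::jsonb) || new_metadata amounts to:
existing = topic.content_metadata or {}  # treat a NULL column as an empty object
existing.update(new_metadata)            # the JSONB || operator merges keys
topic.content_metadata = existing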

tildes/production.ini.example (+1)

@@ -35,6 +35,7 @@ webassets.manifest = json
 # API keys for external APIs
 api_keys.embedly = embedlykeygoeshere
 api_keys.stripe = sk_live_ActualKeyShouldGoHere
+api_keys.youtube = youtubekeygoeshere
 
 [server:main]
 use = egg:gunicorn#main

tildes/tildes/enums.py (+1)

@@ -88,6 +88,7 @@ class ScraperType(enum.Enum):
     """Enum for the types of scrapers available."""
 
     EMBEDLY = enum.auto()
+    YOUTUBE = enum.auto()
 
 
 class TopicSortOption(enum.Enum):

tildes/tildes/scrapers/__init__.py (+1)

@@ -1,3 +1,4 @@
 """Contains scrapers."""
 
 from .embedly_scraper import EmbedlyScraper
+from .youtube_scraper import YoutubeScraper

tildes/tildes/scrapers/youtube_scraper.py (+123)

@@ -0,0 +1,123 @@
# Copyright (c) 2019 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Contains the YoutubeScraper class."""

from datetime import timedelta
import re
from typing import Any, Dict
from urllib.parse import parse_qs, urlparse

from dateutil import parser
import requests

from tildes.enums import ScraperType
from tildes.models.scraper import ScraperResult


# Only parses the subset of ISO8601 durations that YouTube uses
# fmt: off
YOUTUBE_DURATION_REGEX = re.compile(
    "P"
    r"(?:(?P<days>\d+)D)?"
    "T"
    r"(?:(?P<hours>\d+)H)?"
    r"(?:(?P<minutes>\d+)M)?"
    r"(?:(?P<seconds>\d+)S)?"
)
# fmt: on


class YoutubeScraper:
    """Scraper that uses the YouTube Data API."""

    def __init__(self, api_key: str):
        """Create a new scraper using the specified YouTube API key."""
        self.api_key = api_key

    def is_applicable(self, url: str) -> bool:
        """Return whether this scraper is applicable to a particular url."""
        parsed_url = urlparse(url)

        if parsed_url.hostname not in ("www.youtube.com", "youtube.com"):
            return False

        if parsed_url.path != "/watch":
            return False

        return True

    def scrape_url(self, url: str) -> ScraperResult:
        """Scrape a url and return the result."""
        parsed_url = urlparse(url)
        query_params = parse_qs(parsed_url.query)

        # parse_qs maps each key to a list of values (and omits keys whose value
        # is empty), so use .get() and take the first element
        video_id = query_params.get("v")
        if not video_id:
            raise ValueError("Invalid url, no video ID found.")

        params: Dict[str, Any] = {
            "key": self.api_key,
            "id": video_id[0],
            "part": "snippet,contentDetails,statistics",
        }

        response = requests.get(
            "https://www.googleapis.com/youtube/v3/videos", params=params, timeout=5
        )
        response.raise_for_status()

        return ScraperResult(url, ScraperType.YOUTUBE, response.json()["items"][0])

    @staticmethod
    def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
        """Get the metadata that we're interested in out of a scrape result."""
        if result.scraper_type != ScraperType.YOUTUBE:
            raise ValueError("Can't process a result from a different scraper.")

        metadata = {}

        # fall back to empty dicts so a missing section doesn't crash the checks
        snippet = result.data.get("snippet", {})

        if snippet.get("title"):
            metadata["title"] = snippet["title"]

        if snippet.get("description"):
            metadata["description"] = snippet["description"]

        if snippet.get("publishedAt"):
            published = parser.parse(snippet["publishedAt"], ignoretz=True)
            metadata["published"] = int(published.timestamp())

        if snippet.get("channelTitle"):
            metadata["authors"] = [snippet["channelTitle"]]

        content_details = result.data.get("contentDetails", {})

        if content_details.get("duration"):
            match = YOUTUBE_DURATION_REGEX.match(content_details["duration"])
            if not match:
                raise ValueError("Unable to parse duration")

            duration_components = {}

            # convert None to zero and all strings to integers
            for key, value in match.groupdict().items():
                if value is None:
                    duration_components[key] = 0
                else:
                    duration_components[key] = int(value)

            delta = timedelta(
                days=duration_components["days"],
                hours=duration_components["hours"],
                minutes=duration_components["minutes"],
                seconds=duration_components["seconds"],
            )

            # str(timedelta) always includes the hours, so strip off the leading
            # "0:" for videos under an hour (lstrip("0:") would also eat leading
            # zeros from the minutes, e.g. "0:05:10" -> "5:10")
            duration = str(delta)
            if duration.startswith("0:"):
                duration = duration[2:]

            metadata["duration"] = duration

        return metadata
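
To illustrate the duration handling, here is a hypothetical interactive session using the regex defined above (not part of the commit):

from datetime import timedelta

match = YOUTUBE_DURATION_REGEX.match("PT1H23M45S")
components = {k: int(v) if v else 0 for k, v in match.groupdict().items()}
print(timedelta(**components))  # 1:23:45

# for a video under an hour, the leading "0:" gets stripped off
match = YOUTUBE_DURATION_REGEX.match("PT5M10S")
components = {k: int(v) if v else 0 for k, v in match.groupdict().items()}
print(str(timedelta(**components))[2:])  # 05:10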