mirror of https://gitlab.com/tildes/tildes.git
Add scraper for YouTube Data API
A lot of the code in common between this and the EmbedlyScraper should probably be generalized out to a base class soon, but let's make sure this works first.

merge-requests/55/head
Deimos
6 years ago
8 changed files with 289 additions and 0 deletions
salt/salt/consumers/init.sls (+12)
salt/salt/consumers/topic_youtube_scraper.service.jinja2 (+16)
tildes/alembic/versions/61f43e57679a_add_youtube_scraper_result.py (+35)
tildes/consumers/topic_youtube_scraper.py (+100)
tildes/production.ini.example (+1)
tildes/tildes/enums.py (+1)
tildes/tildes/scrapers/__init__.py (+1)
tildes/tildes/scrapers/youtube_scraper.py (+123)
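The commit message notes that this scraper and the existing EmbedlyScraper share a fair amount of code. Purely as a hypothetical illustration of where that generalization could go (no such class exists in this changeset), a shared base would only need to pin down the three methods both scrapers already expose:

# Hypothetical sketch only -- this base class is NOT part of this commit.
from abc import ABC, abstractmethod
from typing import Any, Dict

from tildes.models.scraper import ScraperResult


class Scraper(ABC):
    """Possible shared interface for EmbedlyScraper and YoutubeScraper."""

    @abstractmethod
    def is_applicable(self, url: str) -> bool:
        """Return whether this scraper is applicable to a particular url."""

    @abstractmethod
    def scrape_url(self, url: str) -> ScraperResult:
        """Scrape a url and return the result."""

    @staticmethod
    @abstractmethod
    def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
        """Get the metadata of interest out of a scrape result."""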
salt/salt/consumers/topic_youtube_scraper.service.jinja2
@@ -0,0 +1,16 @@
{% from 'common.jinja2' import app_dir, bin_dir -%}
[Unit]
Description=Topic Youtube Scraper (Queue Consumer)
Requires=rabbitmq-server.service
After=rabbitmq-server.service
PartOf=rabbitmq-server.service

[Service]
WorkingDirectory={{ app_dir }}/consumers
Environment="INI_FILE={{ app_dir }}/{{ pillar['ini_file'] }}"
ExecStart={{ bin_dir }}/python topic_youtube_scraper.py
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
tildes/alembic/versions/61f43e57679a_add_youtube_scraper_result.py
@@ -0,0 +1,35 @@
"""Add youtube scraper result

Revision ID: 61f43e57679a
Revises: a0e0b6206146
Create Date: 2019-01-26 20:02:27.642583

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "61f43e57679a"
down_revision = "a0e0b6206146"
branch_labels = None
depends_on = None


def upgrade():
    # ALTER TYPE doesn't work from inside a transaction, disable it
    connection = None
    if not op.get_context().as_sql:
        connection = op.get_bind()
        connection.execution_options(isolation_level="AUTOCOMMIT")

    op.execute("ALTER TYPE scrapertype ADD VALUE IF NOT EXISTS 'YOUTUBE'")

    # re-activate the transaction for any future migrations
    if connection is not None:
        connection.execution_options(isolation_level="READ_COMMITTED")


def downgrade():
    # can't remove from enums, do nothing
    pass
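The AUTOCOMMIT dance is needed because older PostgreSQL versions refuse to run ALTER TYPE ... ADD VALUE inside a transaction block. A quick way to confirm the new label landed, sketched here with SQLAlchemy against an assumed local database (not part of the commit):

# Illustrative check only (not in the commit): list the labels on the
# scrapertype enum after the migration has run.  The connection URL is assumed.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://tildes@localhost/tildes")
with engine.connect() as conn:
    rows = conn.execute(text(
        "SELECT enumlabel FROM pg_enum"
        " JOIN pg_type ON pg_enum.enumtypid = pg_type.oid"
        " WHERE pg_type.typname = 'scrapertype'"
        " ORDER BY enumsortorder"
    ))
    print([row[0] for row in rows])  # should include 'YOUTUBE'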
tildes/consumers/topic_youtube_scraper.py
@@ -0,0 +1,100 @@
# Copyright (c) 2019 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Consumer that fetches data from YouTube's data API for relevant link topics."""

from datetime import timedelta
import os
from typing import Sequence

from amqpy import Message
from pyramid.paster import get_appsettings
from requests.exceptions import HTTPError, Timeout
from sqlalchemy import cast, desc, func
from sqlalchemy.dialects.postgresql import JSONB

from tildes.enums import ScraperType
from tildes.lib.amqp import PgsqlQueueConsumer
from tildes.lib.datetime import utc_now
from tildes.models.scraper import ScraperResult
from tildes.models.topic import Topic
from tildes.scrapers import YoutubeScraper


# don't rescrape the same url inside this time period
RESCRAPE_DELAY = timedelta(hours=24)


class TopicYoutubeScraper(PgsqlQueueConsumer):
    """Consumer that fetches data from YouTube's data API for relevant link topics."""

    def __init__(self, api_key: str, queue_name: str, routing_keys: Sequence[str]):
        """Initialize the consumer, including creating a scraper instance."""
        super().__init__(queue_name, routing_keys)

        self.scraper = YoutubeScraper(api_key)

    def run(self, msg: Message) -> None:
        """Process a delivered message."""
        topic = (
            self.db_session.query(Topic).filter_by(topic_id=msg.body["topic_id"]).one()
        )

        if not topic.is_link_type:
            return

        if not self.scraper.is_applicable(topic.link):
            return

        # see if we already have a recent scrape result from the same url
        result = (
            self.db_session.query(ScraperResult)
            .filter(
                ScraperResult.url == topic.link,
                ScraperResult.scraper_type == ScraperType.YOUTUBE,
                ScraperResult.scrape_time > utc_now() - RESCRAPE_DELAY,
            )
            .order_by(desc(ScraperResult.scrape_time))
            .first()
        )

        # if not, scrape the url and store the result
        if not result:
            try:
                result = self.scraper.scrape_url(topic.link)
            except (HTTPError, Timeout):
                return

            self.db_session.add(result)

        new_metadata = YoutubeScraper.get_metadata_from_result(result)

        if new_metadata:
            # update the topic's content_metadata in a way that won't wipe out any
            # existing values, and can handle the column being null
            (
                self.db_session.query(Topic)
                .filter(Topic.topic_id == topic.topic_id)
                .update(
                    {
                        "content_metadata": func.coalesce(
                            Topic.content_metadata, cast({}, JSONB)
                        ).op("||")(new_metadata)
                    },
                    synchronize_session=False,
                )
            )


if __name__ == "__main__":
    # pylint: disable=invalid-name
    settings = get_appsettings(os.environ["INI_FILE"])
    youtube_api_key = settings.get("api_keys.youtube")
    if not youtube_api_key:
        raise RuntimeError("No YouTube API key available in INI file")

    TopicYoutubeScraper(
        youtube_api_key,
        queue_name="topic_youtube_scraper.q",
        routing_keys=["topic.created"],
    ).consume_queue()
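The content_metadata update above merges the new keys into whatever is already stored instead of replacing it: COALESCE turns a NULL column into an empty JSONB object, and the || operator then overlays the new metadata. A small standalone sketch (using a throwaway table rather than the real Topic model, purely for illustration) of the expression SQLAlchemy generates:

# Stand-in illustration only; the real consumer uses the Topic model instead
# of this throwaway table definition.
from sqlalchemy import Column, MetaData, Table, cast, func
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import JSONB

topics = Table("topics", MetaData(), Column("content_metadata", JSONB))

new_metadata = {"title": "Example video", "duration": "4:13"}
merge_expr = func.coalesce(topics.c.content_metadata, cast({}, JSONB)).op("||")(
    new_metadata
)

# Compiles to something like:
#   coalesce(topics.content_metadata, CAST(%(param_1)s AS JSONB)) || %(param_2)s
print(merge_expr.compile(dialect=postgresql.dialect()))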
tildes/tildes/scrapers/__init__.py
@@ -1,3 +1,4 @@
"""Contains scrapers."""

from .embedly_scraper import EmbedlyScraper
from .youtube_scraper import YoutubeScraper
tildes/tildes/scrapers/youtube_scraper.py
@@ -0,0 +1,123 @@
# Copyright (c) 2019 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Contains the YoutubeScraper class."""

from datetime import timedelta
import re
from typing import Any, Dict
from urllib.parse import parse_qs, urlparse

from dateutil import parser
import requests

from tildes.enums import ScraperType
from tildes.models.scraper import ScraperResult


# Only parses the subset of ISO8601 durations that YouTube uses
# fmt: off
YOUTUBE_DURATION_REGEX = re.compile(
    "P"
    r"(?:(?P<days>\d+)D)?"
    "T"
    r"(?:(?P<hours>\d+)H)?"
    r"(?:(?P<minutes>\d+)M)?"
    r"(?:(?P<seconds>\d+)S)?"
)
# fmt: on


class YoutubeScraper:
    """Scraper that uses the YouTube Data API."""

    def __init__(self, api_key: str):
        """Create a new scraper using the specified YouTube API key."""
        self.api_key = api_key

    def is_applicable(self, url: str) -> bool:
        """Return whether this scraper is applicable to a particular url."""
        parsed_url = urlparse(url)

        if parsed_url.hostname not in ("www.youtube.com", "youtube.com"):
            return False

        if parsed_url.path != "/watch":
            return False

        return True

    def scrape_url(self, url: str) -> ScraperResult:
        """Scrape a url and return the result."""
        parsed_url = urlparse(url)
        query_params = parse_qs(parsed_url.query)
        video_id = query_params["v"]

        if not video_id:
            raise ValueError("Invalid url, no video ID found.")

        params: Dict[str, Any] = {
            "key": self.api_key,
            "id": video_id,
            "part": "snippet,contentDetails,statistics",
        }

        response = requests.get(
            "https://www.googleapis.com/youtube/v3/videos", params=params, timeout=5
        )
        response.raise_for_status()

        return ScraperResult(url, ScraperType.YOUTUBE, response.json()["items"][0])

    @staticmethod
    def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
        """Get the metadata that we're interested in out of a scrape result."""
        if result.scraper_type != ScraperType.YOUTUBE:
            raise ValueError("Can't process a result from a different scraper.")

        metadata = {}

        snippet = result.data.get("snippet")

        if snippet.get("title"):
            metadata["title"] = snippet["title"]

        if snippet.get("description"):
            metadata["description"] = snippet["description"]

        if snippet.get("publishedAt"):
            published = parser.parse(snippet["publishedAt"], ignoretz=True)
            metadata["published"] = int(published.timestamp())

        if snippet.get("channelTitle"):
            metadata["authors"] = [snippet["channelTitle"]]

        content_details = result.data.get("contentDetails")

        if content_details.get("duration"):
            match = YOUTUBE_DURATION_REGEX.match(content_details["duration"])
            if not match:
                raise ValueError("Unable to parse duration")

            duration_components = {}

            # convert None to zero and all strings to integers
            for key, value in match.groupdict().items():
                if value is None:
                    duration_components[key] = 0
                else:
                    duration_components[key] = int(value)

            delta = timedelta(
                days=duration_components["days"],
                hours=duration_components["hours"],
                minutes=duration_components["minutes"],
                seconds=duration_components["seconds"],
            )

            # string version of timedelta always has hours, strip it off when it's zero
            duration = str(delta).lstrip("0:")

            metadata["duration"] = duration

        return metadata
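For reference, a hypothetical end-to-end use of the scraper outside the consumer (the API key and video URL below are made up), exercising is_applicable, scrape_url, and get_metadata_from_result:

# Hypothetical usage sketch, not part of the commit.  Requires a real
# YouTube Data API key and network access.
from tildes.scrapers import YoutubeScraper

scraper = YoutubeScraper("YOUR_YOUTUBE_API_KEY")  # assumed key
url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # example video url

if scraper.is_applicable(url):
    result = scraper.scrape_url(url)  # GET to the Data API "videos" endpoint
    metadata = YoutubeScraper.get_metadata_from_result(result)
    # e.g. {"title": ..., "authors": [...], "published": ..., "duration": "3:33"}
    print(metadata.get("title"), metadata.get("duration"))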