mirror of https://gitlab.com/tildes/tildes.git
Add scraper for YouTube Data API
A lot of the code in common between this and the EmbedlyScraper should probably be generalized out to a base class soon, but let's make sure this works first.
merge-requests/55/head
Deimos
6 years ago
8 changed files with 289 additions and 0 deletions
salt/salt/consumers/init.sls (+12)
salt/salt/consumers/topic_youtube_scraper.service.jinja2 (+16)
tildes/alembic/versions/61f43e57679a_add_youtube_scraper_result.py (+35)
tildes/consumers/topic_youtube_scraper.py (+100)
tildes/production.ini.example (+1)
tildes/tildes/enums.py (+1)
tildes/tildes/scrapers/__init__.py (+1)
tildes/tildes/scrapers/youtube_scraper.py (+123)
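
The commit message notes that the code shared with EmbedlyScraper should eventually be pulled into a base class. A minimal sketch of what that shared interface might look like (the Scraper class and its name are assumptions for illustration, not part of this commit):

# Hypothetical sketch only: a possible shared base class, not code from this commit.
from abc import ABC, abstractmethod

from tildes.models.scraper import ScraperResult


class Scraper(ABC):
    """Interface that EmbedlyScraper and YoutubeScraper could share."""

    @abstractmethod
    def is_applicable(self, url: str) -> bool:
        """Return whether this scraper is applicable to a particular url."""

    @abstractmethod
    def scrape_url(self, url: str) -> ScraperResult:
        """Scrape a url and return the result."""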
salt/salt/consumers/topic_youtube_scraper.service.jinja2
@@ -0,0 +1,16 @@
{% from 'common.jinja2' import app_dir, bin_dir -%}
[Unit]
Description=Topic Youtube Scraper (Queue Consumer)
Requires=rabbitmq-server.service
After=rabbitmq-server.service
PartOf=rabbitmq-server.service

[Service]
WorkingDirectory={{ app_dir }}/consumers
Environment="INI_FILE={{ app_dir }}/{{ pillar['ini_file'] }}"
ExecStart={{ bin_dir }}/python topic_youtube_scraper.py
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
tildes/alembic/versions/61f43e57679a_add_youtube_scraper_result.py
@@ -0,0 +1,35 @@
"""Add youtube scraper result

Revision ID: 61f43e57679a
Revises: a0e0b6206146
Create Date: 2019-01-26 20:02:27.642583

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "61f43e57679a"
down_revision = "a0e0b6206146"
branch_labels = None
depends_on = None


def upgrade():
    # ALTER TYPE doesn't work from inside a transaction, disable it
    connection = None
    if not op.get_context().as_sql:
        connection = op.get_bind()
        connection.execution_options(isolation_level="AUTOCOMMIT")

    op.execute("ALTER TYPE scrapertype ADD VALUE IF NOT EXISTS 'YOUTUBE'")

    # re-activate the transaction for any future migrations
    if connection is not None:
        connection.execution_options(isolation_level="READ_COMMITTED")


def downgrade():
    # can't remove from enums, do nothing
    pass
tildes/consumers/topic_youtube_scraper.py
@@ -0,0 +1,100 @@
# Copyright (c) 2019 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Consumer that fetches data from YouTube's data API for relevant link topics."""

from datetime import timedelta
import os
from typing import Sequence

from amqpy import Message
from pyramid.paster import get_appsettings
from requests.exceptions import HTTPError, Timeout
from sqlalchemy import cast, desc, func
from sqlalchemy.dialects.postgresql import JSONB

from tildes.enums import ScraperType
from tildes.lib.amqp import PgsqlQueueConsumer
from tildes.lib.datetime import utc_now
from tildes.models.scraper import ScraperResult
from tildes.models.topic import Topic
from tildes.scrapers import YoutubeScraper


# don't rescrape the same url inside this time period
RESCRAPE_DELAY = timedelta(hours=24)


class TopicYoutubeScraper(PgsqlQueueConsumer):
    """Consumer that fetches data from YouTube's data API for relevant link topics."""

    def __init__(self, api_key: str, queue_name: str, routing_keys: Sequence[str]):
        """Initialize the consumer, including creating a scraper instance."""
        super().__init__(queue_name, routing_keys)

        self.scraper = YoutubeScraper(api_key)

    def run(self, msg: Message) -> None:
        """Process a delivered message."""
        topic = (
            self.db_session.query(Topic).filter_by(topic_id=msg.body["topic_id"]).one()
        )

        if not topic.is_link_type:
            return

        if not self.scraper.is_applicable(topic.link):
            return

        # see if we already have a recent scrape result from the same url
        result = (
            self.db_session.query(ScraperResult)
            .filter(
                ScraperResult.url == topic.link,
                ScraperResult.scraper_type == ScraperType.YOUTUBE,
                ScraperResult.scrape_time > utc_now() - RESCRAPE_DELAY,
            )
            .order_by(desc(ScraperResult.scrape_time))
            .first()
        )

        # if not, scrape the url and store the result
        if not result:
            try:
                result = self.scraper.scrape_url(topic.link)
            except (HTTPError, Timeout):
                return

            self.db_session.add(result)

        new_metadata = YoutubeScraper.get_metadata_from_result(result)

        if new_metadata:
            # update the topic's content_metadata in a way that won't wipe out any
            # existing values, and can handle the column being null
            (
                self.db_session.query(Topic)
                .filter(Topic.topic_id == topic.topic_id)
                .update(
                    {
                        "content_metadata": func.coalesce(
                            Topic.content_metadata, cast({}, JSONB)
                        ).op("||")(new_metadata)
                    },
                    synchronize_session=False,
                )
            )


if __name__ == "__main__":
    # pylint: disable=invalid-name
    settings = get_appsettings(os.environ["INI_FILE"])
    youtube_api_key = settings.get("api_keys.youtube")
    if not youtube_api_key:
        raise RuntimeError("No YouTube API key available in INI file")

    TopicYoutubeScraper(
        youtube_api_key,
        queue_name="topic_youtube_scraper.q",
        routing_keys=["topic.created"],
    ).consume_queue()
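
The coalesce/"||" update in run() merges the new metadata into Topic.content_metadata without overwriting unrelated keys and without failing when the column is still NULL. A minimal sketch (using a simplified stand-in Topic model, not the project's real one) of roughly what that SQLAlchemy expression compiles to on the PostgreSQL dialect:

# Minimal sketch, not part of the commit: a simplified stand-in Topic model, used only
# to show roughly what the coalesce/"||" update expression compiles to for PostgreSQL.
from sqlalchemy import Column, Integer, cast, func
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Topic(Base):  # hypothetical stand-in for tildes.models.topic.Topic
    __tablename__ = "topics"

    topic_id = Column(Integer, primary_key=True)
    content_metadata = Column(JSONB)


new_metadata = {"title": "Example video", "duration": "4:13"}

# NULL-safe merge: start from an empty JSONB object when content_metadata is NULL
expression = func.coalesce(Topic.content_metadata, cast({}, JSONB)).op("||")(new_metadata)

print(expression.compile(dialect=postgresql.dialect()))
# roughly: coalesce(topics.content_metadata, CAST(%(param_1)s AS JSONB)) || %(param_2)s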
tildes/tildes/scrapers/__init__.py
@@ -1,3 +1,4 @@
"""Contains scrapers."""

from .embedly_scraper import EmbedlyScraper
from .youtube_scraper import YoutubeScraper
tildes/tildes/scrapers/youtube_scraper.py
@@ -0,0 +1,123 @@
# Copyright (c) 2019 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Contains the YoutubeScraper class."""

from datetime import timedelta
import re
from typing import Any, Dict
from urllib.parse import parse_qs, urlparse

from dateutil import parser
import requests

from tildes.enums import ScraperType
from tildes.models.scraper import ScraperResult


# Only parses the subset of ISO8601 durations that YouTube uses
# fmt: off
YOUTUBE_DURATION_REGEX = re.compile(
    "P"
    r"(?:(?P<days>\d+)D)?"
    "T"
    r"(?:(?P<hours>\d+)H)?"
    r"(?:(?P<minutes>\d+)M)?"
    r"(?:(?P<seconds>\d+)S)?"
)
# fmt: on


class YoutubeScraper:
    """Scraper that uses the YouTube Data API."""

    def __init__(self, api_key: str):
        """Create a new scraper using the specified YouTube API key."""
        self.api_key = api_key

    def is_applicable(self, url: str) -> bool:
        """Return whether this scraper is applicable to a particular url."""
        parsed_url = urlparse(url)

        if parsed_url.hostname not in ("www.youtube.com", "youtube.com"):
            return False

        if parsed_url.path != "/watch":
            return False

        return True

    def scrape_url(self, url: str) -> ScraperResult:
        """Scrape a url and return the result."""
        parsed_url = urlparse(url)
        query_params = parse_qs(parsed_url.query)
        video_id = query_params["v"]

        if not video_id:
            raise ValueError("Invalid url, no video ID found.")

        params: Dict[str, Any] = {
            "key": self.api_key,
            "id": video_id,
            "part": "snippet,contentDetails,statistics",
        }

        response = requests.get(
            "https://www.googleapis.com/youtube/v3/videos", params=params, timeout=5
        )
        response.raise_for_status()

        return ScraperResult(url, ScraperType.YOUTUBE, response.json()["items"][0])

    @staticmethod
    def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
        """Get the metadata that we're interested in out of a scrape result."""
        if result.scraper_type != ScraperType.YOUTUBE:
            raise ValueError("Can't process a result from a different scraper.")

        metadata = {}

        snippet = result.data.get("snippet")

        if snippet.get("title"):
            metadata["title"] = snippet["title"]

        if snippet.get("description"):
            metadata["description"] = snippet["description"]

        if snippet.get("publishedAt"):
            published = parser.parse(snippet["publishedAt"], ignoretz=True)
            metadata["published"] = int(published.timestamp())

        if snippet.get("channelTitle"):
            metadata["authors"] = [snippet["channelTitle"]]

        content_details = result.data.get("contentDetails")

        if content_details.get("duration"):
            match = YOUTUBE_DURATION_REGEX.match(content_details["duration"])
            if not match:
                raise ValueError("Unable to parse duration")

            duration_components = {}

            # convert None to zero and all strings to integers
            for key, value in match.groupdict().items():
                if value is None:
                    duration_components[key] = 0
                else:
                    duration_components[key] = int(value)

            delta = timedelta(
                days=duration_components["days"],
                hours=duration_components["hours"],
                minutes=duration_components["minutes"],
                seconds=duration_components["seconds"],
            )

            # string version of timedelta always has hours, strip it off when it's zero
            duration = str(delta).lstrip("0:")

            metadata["duration"] = duration

        return metadata
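
For reference, a small sketch of how YOUTUBE_DURATION_REGEX plus the timedelta/lstrip logic above turns the API's ISO8601 durations into the stored strings; the format_duration helper is hypothetical, not part of the commit:

# Hypothetical helper, not part of the commit: mirrors the duration handling in
# get_metadata_from_result() using the same regex.
import re
from datetime import timedelta

YOUTUBE_DURATION_REGEX = re.compile(
    "P"
    r"(?:(?P<days>\d+)D)?"
    "T"
    r"(?:(?P<hours>\d+)H)?"
    r"(?:(?P<minutes>\d+)M)?"
    r"(?:(?P<seconds>\d+)S)?"
)


def format_duration(duration: str) -> str:
    """Convert an ISO8601 duration like 'PT4M13S' to a display string like '4:13'."""
    match = YOUTUBE_DURATION_REGEX.match(duration)
    if not match:
        raise ValueError("Unable to parse duration")

    # convert missing components to zero and the rest to integers
    components = {key: int(value or 0) for key, value in match.groupdict().items()}
    delta = timedelta(**components)

    # str(timedelta) always includes hours (e.g. "0:04:13"); strip the leading zeros/colons
    return str(delta).lstrip("0:")


print(format_duration("PT4M13S"))   # 4:13
print(format_duration("PT1H2M3S"))  # 1:02:03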