From 1537785c2df61bf6a2a11a1b73bcb043d887e5e7 Mon Sep 17 00:00:00 2001 From: Deimos Date: Mon, 28 Jan 2019 16:38:06 -0700 Subject: [PATCH] YoutubeScraper: handle API returning blank result --- tildes/consumers/topic_youtube_scraper.py | 4 ++-- tildes/tildes/scrapers/__init__.py | 1 + tildes/tildes/scrapers/exceptions.py | 10 ++++++++++ tildes/tildes/scrapers/youtube_scraper.py | 8 +++++++- 4 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tildes/tildes/scrapers/exceptions.py diff --git a/tildes/consumers/topic_youtube_scraper.py b/tildes/consumers/topic_youtube_scraper.py index d140eb6..4089a28 100644 --- a/tildes/consumers/topic_youtube_scraper.py +++ b/tildes/consumers/topic_youtube_scraper.py @@ -18,7 +18,7 @@ from tildes.lib.amqp import PgsqlQueueConsumer from tildes.lib.datetime import utc_now from tildes.models.scraper import ScraperResult from tildes.models.topic import Topic -from tildes.scrapers import YoutubeScraper +from tildes.scrapers import ScraperError, YoutubeScraper # don't rescrape the same url inside this time period @@ -62,7 +62,7 @@ class TopicYoutubeScraper(PgsqlQueueConsumer): if not result: try: result = self.scraper.scrape_url(topic.link) - except (HTTPError, Timeout): + except (HTTPError, ScraperError, Timeout): return self.db_session.add(result) diff --git a/tildes/tildes/scrapers/__init__.py b/tildes/tildes/scrapers/__init__.py index f1f4d07..f251a73 100644 --- a/tildes/tildes/scrapers/__init__.py +++ b/tildes/tildes/scrapers/__init__.py @@ -1,4 +1,5 @@ """Contains scrapers.""" from .embedly_scraper import EmbedlyScraper +from .exceptions import ScraperError from .youtube_scraper import YoutubeScraper diff --git a/tildes/tildes/scrapers/exceptions.py b/tildes/tildes/scrapers/exceptions.py new file mode 100644 index 0000000..ed152d5 --- /dev/null +++ b/tildes/tildes/scrapers/exceptions.py @@ -0,0 +1,10 @@ +# Copyright (c) 2019 Tildes contributors +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Exception classes related to scraping.""" + + +class ScraperError(Exception): + """Exception class for an error while scraping.""" + + pass diff --git a/tildes/tildes/scrapers/youtube_scraper.py b/tildes/tildes/scrapers/youtube_scraper.py index 4d7f4a4..a227dba 100644 --- a/tildes/tildes/scrapers/youtube_scraper.py +++ b/tildes/tildes/scrapers/youtube_scraper.py @@ -13,6 +13,7 @@ import requests from tildes.enums import ScraperType from tildes.models.scraper import ScraperResult +from .exceptions import ScraperError # Only parses the subset of ISO8601 durations that YouTube uses @@ -67,7 +68,12 @@ class YoutubeScraper: ) response.raise_for_status() - return ScraperResult(url, ScraperType.YOUTUBE, response.json()["items"][0]) + try: + video_data = response.json()["items"][0] + except (KeyError, IndexError): + raise ScraperError(f"No data returned for video with ID {video_id}") + + return ScraperResult(url, ScraperType.YOUTUBE, video_data) @staticmethod def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]: