diff --git a/tildes/tests/test_scraper.py b/tildes/tests/test_scraper.py
new file mode 100644
index 0000000..4b13086
--- /dev/null
+++ b/tildes/tests/test_scraper.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2019 Tildes contributors
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+from datetime import timedelta
+
+from pytest import raises
+
+from tildes.scrapers import YoutubeScraper
+
+
+def test_youtube_duration_parsing():
+    """Ensure a simple Youtube duration parses successfully."""
+    duration = "PT8M14S"
+
+    expected_seconds = int(timedelta(minutes=8, seconds=14).total_seconds())
+
+    assert YoutubeScraper.parse_duration(duration) == expected_seconds
+
+
+def test_youtube_very_long_duration_parsing():
+    """Ensure a strange, extremely long YouTube duration parses successfully."""
+    duration = "P30W2DT8H2M32S"
+
+    expected_delta = timedelta(weeks=30, days=2, hours=8, minutes=2, seconds=32)
+    expected_seconds = int(expected_delta.total_seconds())
+
+    assert YoutubeScraper.parse_duration(duration) == expected_seconds
+
+
+def test_youtube_duration_parsing_invalid():
+    """Ensure an invalid duration raises a ValueError."""
+    with raises(ValueError):
+        YoutubeScraper.parse_duration("18:15")
diff --git a/tildes/tildes/scrapers/youtube_scraper.py b/tildes/tildes/scrapers/youtube_scraper.py
index a227dba..beb287f 100644
--- a/tildes/tildes/scrapers/youtube_scraper.py
+++ b/tildes/tildes/scrapers/youtube_scraper.py
@@ -20,6 +20,7 @@ from .exceptions import ScraperError
 # fmt: off
 YOUTUBE_DURATION_REGEX = re.compile(
     "P"
+    r"(?:(?P<weeks>\d+)W)?"
     r"(?:(?P<days>\d+)D)?"
     "T"
     r"(?:(?P<hours>\d+)H)?"
@@ -75,8 +76,8 @@ class YoutubeScraper:
 
         return ScraperResult(url, ScraperType.YOUTUBE, video_data)
 
-    @staticmethod
-    def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
+    @classmethod
+    def get_metadata_from_result(cls, result: ScraperResult) -> Dict[str, Any]:
         """Get the metadata that we're interested in out of a scrape result."""
         if result.scraper_type != ScraperType.YOUTUBE:
             raise ValueError("Can't process a result from a different scraper.")
@@ -101,26 +102,35 @@ class YoutubeScraper:
 
         content_details = result.data.get("contentDetails")
         if content_details.get("duration"):
-            match = YOUTUBE_DURATION_REGEX.match(content_details["duration"])
-            if not match:
-                raise ValueError("Unable to parse duration")
+            try:
+                metadata["duration"] = cls.parse_duration(content_details["duration"])
+            except ValueError:
+                pass
 
-            duration_components = {}
-
-            # convert None to zero and all strings to integers
-            for key, value in match.groupdict().items():
-                if value is None:
-                    duration_components[key] = 0
-                else:
-                    duration_components[key] = int(value)
-
-            delta = timedelta(
-                days=duration_components["days"],
-                hours=duration_components["hours"],
-                minutes=duration_components["minutes"],
-                seconds=duration_components["seconds"],
-            )
+        return metadata
 
-            metadata["duration"] = int(delta.total_seconds())
+    @classmethod
+    def parse_duration(cls, duration: str) -> int:
+        """Convert a YouTube duration (subset of ISO8601 duration) to seconds."""
+        match = YOUTUBE_DURATION_REGEX.match(duration)
+        if not match:
+            raise ValueError("Unable to parse duration")
+
+        duration_components = {}
+
+        # convert None to zero and all strings to integers
+        for key, value in match.groupdict().items():
+            if value is None:
+                duration_components[key] = 0
+            else:
+                duration_components[key] = int(value)
+
+        delta = timedelta(
+            weeks=duration_components["weeks"],
+            days=duration_components["days"],
+            hours=duration_components["hours"],
+            minutes=duration_components["minutes"],
+            seconds=duration_components["seconds"],
+        )
 
-        return metadata
+        return int(delta.total_seconds())