Browse Source

YoutubeScraper: improve duration-parsing, test

YouTube scraping broke earlier on a crazy duration of "P30W2DT8H2M32S"
(30 weeks?!), so I updated the parsing a little to handle a weeks
component, and also to avoid crashing the consumer when it hits a
duration that it can't parse.
merge-requests/55/head
Deimos 6 years ago
parent
commit
d5f3a40404
  1. 33
      tildes/tests/test_scraper.py
  2. 54
      tildes/tildes/scrapers/youtube_scraper.py

33
tildes/tests/test_scraper.py

@ -0,0 +1,33 @@
# Copyright (c) 2019 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later
from datetime import timedelta
from pytest import raises
from tildes.scrapers import YoutubeScraper
def test_youtube_duration_parsing():
    """Check that a short, ordinary YouTube duration converts to seconds."""
    raw_duration = "PT8M14S"
    expected = timedelta(minutes=8, seconds=14)

    parsed_seconds = YoutubeScraper.parse_duration(raw_duration)

    assert parsed_seconds == int(expected.total_seconds())
def test_youtube_very_long_duration_parsing():
    """Check that an unusually long duration, including weeks, converts to seconds."""
    raw_duration = "P30W2DT8H2M32S"
    expected = timedelta(weeks=30, days=2, hours=8, minutes=2, seconds=32)

    parsed_seconds = YoutubeScraper.parse_duration(raw_duration)

    assert parsed_seconds == int(expected.total_seconds())
def test_youtube_duration_parsing_invalid():
    """Check that a string that is not a YouTube duration raises ValueError."""
    unparseable_duration = "18:15"

    with raises(ValueError):
        YoutubeScraper.parse_duration(unparseable_duration)

54
tildes/tildes/scrapers/youtube_scraper.py

@ -20,6 +20,7 @@ from .exceptions import ScraperError
# fmt: off # fmt: off
YOUTUBE_DURATION_REGEX = re.compile( YOUTUBE_DURATION_REGEX = re.compile(
"P" "P"
r"(?:(?P<weeks>\d+)W)?"
r"(?:(?P<days>\d+)D)?" r"(?:(?P<days>\d+)D)?"
"T" "T"
r"(?:(?P<hours>\d+)H)?" r"(?:(?P<hours>\d+)H)?"
@ -75,8 +76,8 @@ class YoutubeScraper:
return ScraperResult(url, ScraperType.YOUTUBE, video_data) return ScraperResult(url, ScraperType.YOUTUBE, video_data)
@staticmethod
def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
@classmethod
def get_metadata_from_result(cls, result: ScraperResult) -> Dict[str, Any]:
"""Get the metadata that we're interested in out of a scrape result.""" """Get the metadata that we're interested in out of a scrape result."""
if result.scraper_type != ScraperType.YOUTUBE: if result.scraper_type != ScraperType.YOUTUBE:
raise ValueError("Can't process a result from a different scraper.") raise ValueError("Can't process a result from a different scraper.")
@ -101,26 +102,35 @@ class YoutubeScraper:
content_details = result.data.get("contentDetails") content_details = result.data.get("contentDetails")
if content_details.get("duration"): if content_details.get("duration"):
match = YOUTUBE_DURATION_REGEX.match(content_details["duration"])
if not match:
raise ValueError("Unable to parse duration")
try:
metadata["duration"] = cls.parse_duration(content_details["duration"])
except ValueError:
pass
duration_components = {}
# convert None to zero and all strings to integers
for key, value in match.groupdict().items():
if value is None:
duration_components[key] = 0
else:
duration_components[key] = int(value)
delta = timedelta(
days=duration_components["days"],
hours=duration_components["hours"],
minutes=duration_components["minutes"],
seconds=duration_components["seconds"],
)
return metadata
metadata["duration"] = int(delta.total_seconds())
@classmethod
def parse_duration(cls, duration: str) -> int:
    """Convert a YouTube duration (subset of ISO8601 duration) to seconds.

    The string is matched against YOUTUBE_DURATION_REGEX, and the captured
    weeks/days/hours/minutes/seconds components are summed into a single
    number of seconds.

    Args:
        duration: The duration string to parse, e.g. "PT8M14S".

    Returns:
        The total length of the duration in whole seconds.

    Raises:
        ValueError: If the string does not match YOUTUBE_DURATION_REGEX.
    """
    match = YOUTUBE_DURATION_REGEX.match(duration)
    if not match:
        raise ValueError("Unable to parse duration")

    # Groups that did not participate in the match come back as None; treat
    # those as zero and convert every captured string to an integer.
    duration_components = {
        key: 0 if value is None else int(value)
        for key, value in match.groupdict().items()
    }

    delta = timedelta(
        weeks=duration_components["weeks"],
        days=duration_components["days"],
        hours=duration_components["hours"],
        minutes=duration_components["minutes"],
        seconds=duration_components["seconds"],
    )

    return int(delta.total_seconds())
Loading…
Cancel
Save