From c10746a69f0f6f95fb104fa0ae4f1da7a1a1e08f Mon Sep 17 00:00:00 2001
From: Deimos
Date: Mon, 29 Jul 2019 15:12:48 -0600
Subject: [PATCH] Exclude YouTube videos from Embedly scraper

There are occasionally some data conflicts between the Embedly scraper
and the dedicated YouTube one (which uses their API). The Embedly
scraper has also recently started supplying "content" for YouTube
videos, which ends up with Tildes displaying a word count for it. That
doesn't make much sense.

Just leaving the YouTube-scraping to the API client should be fine.
---
 tildes/tildes/scrapers/embedly_scraper.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tildes/tildes/scrapers/embedly_scraper.py b/tildes/tildes/scrapers/embedly_scraper.py
index 667e612..3213ad0 100644
--- a/tildes/tildes/scrapers/embedly_scraper.py
+++ b/tildes/tildes/scrapers/embedly_scraper.py
@@ -4,6 +4,7 @@
 """Contains the EmbedlyScraper class."""
 
 from typing import Any, Dict
+from urllib.parse import urlparse
 
 import requests
 
@@ -19,6 +20,19 @@ class EmbedlyScraper:
         """Create a new scraper using the specified Embedly API key."""
         self.api_key = api_key
 
+    def is_applicable(self, url: str) -> bool:
+        """Return whether this scraper is applicable to a particular url."""
+        parsed_url = urlparse(url)
+
+        # exclude links to YouTube videos, since we have a dedicated scraper for those
+        if (
+            parsed_url.hostname in ("www.youtube.com", "youtube.com")
+            and parsed_url.path == "/watch"
+        ):
+            return False
+
+        return True
+
     def scrape_url(self, url: str) -> ScraperResult:
         """Scrape a url and return the result."""
         params: Dict[str, Any] = {"key": self.api_key, "format": "json", "url": url}