Exclude YouTube videos from Embedly scraper

There are occasionally some data conflicts between the Embedly scraper and the dedicated YouTube one (which uses YouTube's API). The Embedly scraper has also recently started supplying "content" for YouTube videos, which causes Tildes to display a word count for them, and a word count doesn't make much sense for a video. Just leaving the YouTube scraping to the API client should be fine.
6 years ago · c10746a69f
1 changed files with 14 additions and 0 deletions
--- a/tildes/tildes/scrapers/embedly_scraper.py
+++ b/tildes/tildes/scrapers/embedly_scraper.py
@@ -4,6 +4,7 @@
 """Contains the EmbedlyScraper class."""

 from typing import Any, Dict
+from urllib.parse import urlparse

 import requests

@@ -19,6 +20,19 @@ class EmbedlyScraper:
        """Create a new scraper using the specified Embedly API key."""
        self.api_key = api_key

+    def is_applicable(self, url: str) -> bool:
+        """Return True unless the url is a YouTube "/watch" (video) link."""
+        parsed_url = urlparse(url)  # .hostname is lowercased by urlparse
+
+        # YouTube video pages are left to the dedicated YouTube scraper, so skip them
+        if (
+            parsed_url.hostname in ("www.youtube.com", "youtube.com")
+            and parsed_url.path == "/watch"  # the query string is not inspected
+        ):
+            return False
+
+        return True  # every other url is considered applicable
+
    def scrape_url(self, url: str) -> ScraperResult:
        """Scrape a url and return the result."""
        params: Dict[str, Any] = {"key": self.api_key, "format": "json", "url": url}