From c10746a69f0f6f95fb104fa0ae4f1da7a1a1e08f Mon Sep 17 00:00:00 2001
From: Deimos <deimos@tildes.net>
Date: Mon, 29 Jul 2019 15:12:48 -0600
Subject: [PATCH] Exclude YouTube videos from Embedly scraper

There are occasionally some data conflicts between the Embedly scraper
and the dedicated YouTube one (which uses their API). The Embedly
scraper has also recently started supplying "content" for YouTube
videos, which ends up with Tildes displaying a word count for it. That
doesn't make much sense.

Just leaving the YouTube-scraping to the API client should be fine.
---
 tildes/tildes/scrapers/embedly_scraper.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tildes/tildes/scrapers/embedly_scraper.py b/tildes/tildes/scrapers/embedly_scraper.py
index 667e612..3213ad0 100644
--- a/tildes/tildes/scrapers/embedly_scraper.py
+++ b/tildes/tildes/scrapers/embedly_scraper.py
@@ -4,6 +4,7 @@
 """Contains the EmbedlyScraper class."""
 
 from typing import Any, Dict
+from urllib.parse import urlparse
 
 import requests
 
@@ -19,6 +20,19 @@ class EmbedlyScraper:
         """Create a new scraper using the specified Embedly API key."""
         self.api_key = api_key
 
+    def is_applicable(self, url: str) -> bool:
+        """Return False for YouTube "/watch" urls, True for any other url."""
+        parsed_url = urlparse(url)
+
+        # exclude links to YouTube videos, since we have a dedicated scraper for those
+        if (
+            parsed_url.hostname in ("www.youtube.com", "youtube.com")
+            and parsed_url.path == "/watch"  # only the video page path is excluded
+        ):
+            return False
+
+        return True
+
     def scrape_url(self, url: str) -> ScraperResult:
         """Scrape a url and return the result."""
         params: Dict[str, Any] = {"key": self.api_key, "format": "json", "url": url}