
Add timeout to Embedly scraper

merge-requests/40/head
Deimos committed 6 years ago
commit 32bcbf1f95

Changed files:
  1. tildes/consumers/topic_embedly_extractor.py (4 changed lines: +2 −2)
  2. tildes/tildes/scrapers/embedly_scraper.py (4 changed lines: +3 −1)

tildes/consumers/topic_embedly_extractor.py

@@ -9,7 +9,7 @@ from typing import Sequence
 from amqpy import Message
 from pyramid.paster import bootstrap
-from requests.exceptions import HTTPError
+from requests.exceptions import HTTPError, Timeout
 from sqlalchemy import cast, desc, func
 from sqlalchemy.dialects.postgresql import JSONB
@@ -61,7 +61,7 @@ class TopicEmbedlyExtractor(PgsqlQueueConsumer):
         if not result:
             try:
                 result = self.scraper.scrape_url(topic.link)
-            except HTTPError:
+            except (HTTPError, Timeout):
                 return

             self.db_session.add(result)

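Background on the consumer change above: requests.exceptions.Timeout is the parent class of both ConnectTimeout and ReadTimeout, so catching it next to HTTPError covers either way the scraper's new timeout can expire, and the consumer simply skips the topic instead of crashing. Below is a minimal sketch of the same skip-on-failure pattern in isolation; fetch_extract, process_link, and the EMBEDLY_KEY placeholder are hypothetical stand-ins for the scraper call and consumer logic, not code from the commit.

import requests
from requests.exceptions import HTTPError, Timeout

def fetch_extract(url):
    """Hypothetical stand-in for EmbedlyScraper.scrape_url()."""
    response = requests.get(
        "https://api.embedly.com/1/extract",
        params={"key": "EMBEDLY_KEY", "format": "json", "url": url},
        timeout=5,  # seconds; raises a Timeout subclass if the API is too slow
    )
    response.raise_for_status()  # raises HTTPError on 4xx/5xx responses
    return response.json()

def process_link(url):
    """Skip the link quietly if Embedly errors out or times out."""
    try:
        return fetch_extract(url)
    except (HTTPError, Timeout):
        return None  # mirrors the consumer's early return
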
tildes/tildes/scrapers/embedly_scraper.py

@@ -23,7 +23,9 @@ class EmbedlyScraper:
         """Scrape a url and return the result."""
         params: Dict[str, Any] = {"key": self.api_key, "format": "json", "url": url}

-        response = requests.get("https://api.embedly.com/1/extract", params=params)
+        response = requests.get(
+            "https://api.embedly.com/1/extract", params=params, timeout=5
+        )
         response.raise_for_status()

         return ScraperResult(url, ScraperType.EMBEDLY, response.json())
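
A note on the scraper change: in requests, timeout=5 does not cap the total request time; it limits how long to wait when establishing the connection and how long to wait for the server between bytes of the response. The parameter also accepts a (connect, read) tuple when the two limits should differ. A small sketch of that variant follows; the 3.05/10 values and the EMBEDLY_KEY placeholder are illustrative assumptions, not part of the commit.

import requests
from requests.exceptions import ConnectTimeout, ReadTimeout

try:
    response = requests.get(
        "https://api.embedly.com/1/extract",
        params={"key": "EMBEDLY_KEY", "format": "json", "url": "https://example.com"},
        # (connect timeout, read timeout) in seconds; either one expiring
        # raises a subclass of requests.exceptions.Timeout
        timeout=(3.05, 10),
    )
    response.raise_for_status()
except (ConnectTimeout, ReadTimeout) as exc:
    print(f"Embedly request timed out: {exc}")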
