mirror of https://gitlab.com/tildes/tildes.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
101 lines
3.3 KiB
101 lines
3.3 KiB
# Copyright (c) 2018 Tildes contributors <code@tildes.net>
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
|
"""Consumer that downloads site icons using Embedly scraper data."""
|
|
|
|
from io import BytesIO
|
|
from os import path
|
|
from typing import Optional, Sequence
|
|
|
|
import publicsuffix
|
|
import requests
|
|
from amqpy import Message
|
|
from PIL import Image
|
|
|
|
from tildes.enums import ScraperType
|
|
from tildes.lib.amqp import PgsqlQueueConsumer
|
|
from tildes.lib.url import get_domain_from_url
|
|
from tildes.models.scraper import ScraperResult
|
|
|
|
|
|
class SiteIconDownloader(PgsqlQueueConsumer):
|
|
"""Consumer that generates content_metadata for topics."""
|
|
|
|
ICON_FOLDER = "/opt/tildes/static/images/site-icons"
|
|
|
|
def __init__(self, queue_name: str, routing_keys: Sequence[str]):
|
|
"""Initialize the consumer, including the public suffix list."""
|
|
super().__init__(queue_name, routing_keys)
|
|
|
|
# download the public suffix list (would be good to add caching here)
|
|
psl_file = publicsuffix.fetch()
|
|
self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
|
|
|
|
def run(self, msg: Message) -> None:
|
|
"""Process a delivered message."""
|
|
result = (
|
|
self.db_session.query(ScraperResult)
|
|
.filter_by(result_id=msg.body["result_id"])
|
|
.one()
|
|
)
|
|
|
|
# Check if we already have an icon for this domain, and skip if we do. This
|
|
# currently uses the ScraperResult's url, but it might be better to use the
|
|
# Embedly url data, since that will be after any redirects
|
|
parsed_domain = get_domain_from_url(result.url)
|
|
domain = self.public_suffix_list.get_public_suffix(parsed_domain)
|
|
|
|
filename = domain.replace(".", "_") + ".png"
|
|
filename = path.join(self.ICON_FOLDER, filename)
|
|
if path.exists(filename):
|
|
return
|
|
|
|
if result.scraper_type != ScraperType.EMBEDLY:
|
|
return
|
|
|
|
favicon_url = result.data.get("favicon_url")
|
|
if not favicon_url:
|
|
return
|
|
|
|
try:
|
|
response = requests.get(favicon_url, timeout=5)
|
|
except requests.exceptions.RequestException:
|
|
return
|
|
|
|
if response.status_code != 200:
|
|
return
|
|
|
|
icon = self._get_icon_from_response(response)
|
|
if icon:
|
|
icon.save(filename)
|
|
|
|
@staticmethod
|
|
def _get_icon_from_response(response: requests.Response) -> Optional[Image.Image]:
|
|
"""Return a properly-sized icon Image extracted from a Response."""
|
|
try:
|
|
favicon = Image.open(BytesIO(response.content))
|
|
except (OSError, ValueError):
|
|
return None
|
|
|
|
if favicon.format == "ICO":
|
|
# get the 32x32 size if it's present, otherwise resize the largest one
|
|
if (32, 32) in favicon.ico.sizes():
|
|
return favicon.ico.getimage((32, 32))
|
|
|
|
image = favicon.ico.getimage(max(favicon.ico.sizes()))
|
|
return image.resize((32, 32))
|
|
elif favicon.format in ("JPEG", "PNG"):
|
|
image = favicon
|
|
if image.size != (32, 32):
|
|
image = image.resize((32, 32))
|
|
|
|
return image
|
|
|
|
# any other formats aren't handled
|
|
return None
|
|
|
|
|
|
if __name__ == "__main__":
|
|
SiteIconDownloader(
|
|
queue_name="site_icon_downloader.q", routing_keys=["scraper_result.created"]
|
|
).consume_queue()
|