From 2a19aa20ce2b799f93b0705a03c70cbd8a55ef9a Mon Sep 17 00:00:00 2001 From: Deimos Date: Sun, 30 Sep 2018 22:56:47 -0600 Subject: [PATCH] Add a consumer to automatically download favicons This adds a trigger to the scraper_results table which will add rabbitmq messages whenever a scrape finishes, as well as a consumer that picks up these messages, and uses Embedly data to download (and resize if necessary) the favicons from any sites that are scraped. These are downloaded into the input folder for the site-icons-spriter, so it should be able to use these to generate spritesheets. --- salt/salt/consumers/init.sls | 12 +++ .../site_icon_downloader.service.jinja2 | 17 ++++ ...9_send_rabbitmq_message_on_new_scraper_.py | 58 +++++++++++ tildes/consumers/site_icon_downloader.py | 98 +++++++++++++++++++ .../triggers/scraper_results/rabbitmq.sql | 27 +++++ 5 files changed, 212 insertions(+) create mode 100644 salt/salt/consumers/site_icon_downloader.service.jinja2 create mode 100644 tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py create mode 100644 tildes/consumers/site_icon_downloader.py create mode 100644 tildes/sql/init/triggers/scraper_results/rabbitmq.sql diff --git a/salt/salt/consumers/init.sls b/salt/salt/consumers/init.sls index 4e26637..01b33a1 100644 --- a/salt/salt/consumers/init.sls +++ b/salt/salt/consumers/init.sls @@ -34,4 +34,16 @@ consumer-comment_user_mentions_generator.service: consumer-topic_embedly_extractor.service: service.running: - enable: True + +/etc/systemd/system/consumer-site_icon_downloader.service: + file.managed: + - source: salt://consumers/site_icon_downloader.service.jinja2 + - template: jinja + - user: root + - group: root + - mode: 644 + +consumer-site_icon_downloader.service: + service.running: + - enable: True {% endif %} diff --git a/salt/salt/consumers/site_icon_downloader.service.jinja2 b/salt/salt/consumers/site_icon_downloader.service.jinja2 new file mode 100644 index 0000000..faae373 --- 
/dev/null +++ b/salt/salt/consumers/site_icon_downloader.service.jinja2 @@ -0,0 +1,17 @@ +{% from 'common.jinja2' import app_dir, app_username, bin_dir -%} +[Unit] +Description=Site Icon Downloader (Queue Consumer) +Requires=rabbitmq-server.service +After=rabbitmq-server.service +PartOf=rabbitmq-server.service + +[Service] +User={{ app_username }} +WorkingDirectory={{ app_dir }}/consumers +Environment="INI_FILE={{ app_dir }}/{{ pillar['ini_file'] }}" +ExecStart={{ bin_dir }}/python site_icon_downloader.py +Restart=always +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py b/tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py new file mode 100644 index 0000000..51592a7 --- /dev/null +++ b/tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py @@ -0,0 +1,58 @@ +"""Send rabbitmq message on new scraper result + +Revision ID: 22a8ed36a3c9 +Revises: 8e54f422541c +Create Date: 2018-09-30 21:14:29.265490 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "22a8ed36a3c9" +down_revision = "8e54f422541c" +branch_labels = None +depends_on = None + + +def upgrade(): + op.execute( + """ + CREATE OR REPLACE FUNCTION send_rabbitmq_message_for_scraper_result() RETURNS TRIGGER AS $$ + DECLARE + affected_row RECORD; + payload TEXT; + BEGIN + IF (TG_OP = 'INSERT' OR TG_OP = 'UPDATE') THEN + affected_row := NEW; + ELSIF (TG_OP = 'DELETE') THEN + affected_row := OLD; + END IF; + + payload := json_build_object('result_id', affected_row.result_id); + + PERFORM send_rabbitmq_message('scraper_result.' 
|| TG_ARGV[0], payload); + + RETURN NULL; + END; + $$ LANGUAGE plpgsql; + """ + ) + + op.execute( + """ + CREATE TRIGGER send_rabbitmq_message_for_scraper_result_insert + AFTER INSERT ON scraper_results + FOR EACH ROW + EXECUTE PROCEDURE send_rabbitmq_message_for_scraper_result('created'); + """ + ) + + +def downgrade(): + op.execute( + "DROP TRIGGER send_rabbitmq_message_for_scraper_result_insert ON scraper_results" + ) + + op.execute("DROP FUNCTION send_rabbitmq_message_for_scraper_result()") diff --git a/tildes/consumers/site_icon_downloader.py b/tildes/consumers/site_icon_downloader.py new file mode 100644 index 0000000..bbf0eae --- /dev/null +++ b/tildes/consumers/site_icon_downloader.py @@ -0,0 +1,98 @@ +# Copyright (c) 2018 Tildes contributors +# SPDX-License-Identifier: AGPL-3.0-or-later + +"""Consumer that downloads site icons using Embedly scraper data.""" + +from io import BytesIO +from os import path +from typing import Optional, Sequence + +from amqpy import Message +from PIL import Image +import publicsuffix +import requests + +from tildes.enums import ScraperType +from tildes.lib.amqp import PgsqlQueueConsumer +from tildes.lib.url import get_domain_from_url +from tildes.models.scraper import ScraperResult + + +class SiteIconDownloader(PgsqlQueueConsumer): + """Consumer that downloads site icons using Embedly scraper data.""" + + ICON_FOLDER = "/var/lib/site-icons-spriter/site-icons" + + def __init__(self, queue_name: str, routing_keys: Sequence[str]) -> None: + """Initialize the consumer, including the public suffix list.""" + super().__init__(queue_name, routing_keys) + + # download the public suffix list (would be good to add caching here) + psl_file = publicsuffix.fetch() + self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file) + + def run(self, msg: Message) -> None: + """Process a delivered message.""" + result = ( + self.db_session.query(ScraperResult) + .filter_by(result_id=msg.body["result_id"]) + .one() + ) + + # Check if we already have an
icon for this domain, and skip if we do. This + # currently uses the ScraperResult's url, but it might be better to use the + # Embedly url data, since that will be after any redirects + parsed_domain = get_domain_from_url(result.url) + domain = self.public_suffix_list.get_public_suffix(parsed_domain) + + filename = domain.replace(".", "_") + ".png" + filename = path.join(self.ICON_FOLDER, filename) + if path.exists(filename): + return + + if result.scraper_type != ScraperType.EMBEDLY: + return + + favicon_url = result.data.get("favicon_url") + if not favicon_url: + return + + try: + response = requests.get(favicon_url, timeout=5) + except requests.exceptions.Timeout: + return + + if response.status_code != 200: + return + + icon = self._get_icon_from_response(response) + if icon: + icon.save(filename) + + @staticmethod + def _get_icon_from_response(response: requests.Response) -> Optional[Image.Image]: + """Return a properly-sized icon Image extracted from a Response.""" + favicon = Image.open(BytesIO(response.content)) + + if favicon.format == "ICO": + # get the 32x32 size if it's present, otherwise resize the largest one + if (32, 32) in favicon.ico.sizes(): + return favicon.ico.getimage((32, 32)) + + image = favicon.ico.getimage(max(favicon.ico.sizes())) + return image.resize((32, 32)) + elif favicon.format == "PNG": + image = favicon + if image.size != (32, 32): + image = image.resize((32, 32)) + + return image + + # formats other than ICO or PNG aren't handled + return None + + +if __name__ == "__main__": + SiteIconDownloader( + queue_name="site_icon_downloader.q", routing_keys=["scraper_result.created"] + ).consume_queue() diff --git a/tildes/sql/init/triggers/scraper_results/rabbitmq.sql b/tildes/sql/init/triggers/scraper_results/rabbitmq.sql new file mode 100644 index 0000000..657c18b --- /dev/null +++ b/tildes/sql/init/triggers/scraper_results/rabbitmq.sql @@ -0,0 +1,27 @@ +-- Copyright (c) 2018 Tildes contributors +-- SPDX-License-Identifier:
AGPL-3.0-or-later + +CREATE OR REPLACE FUNCTION send_rabbitmq_message_for_scraper_result() RETURNS TRIGGER AS $$ +DECLARE + affected_row RECORD; + payload TEXT; +BEGIN + IF (TG_OP = 'INSERT' OR TG_OP = 'UPDATE') THEN + affected_row := NEW; + ELSIF (TG_OP = 'DELETE') THEN + affected_row := OLD; + END IF; + + payload := json_build_object('result_id', affected_row.result_id); + + PERFORM send_rabbitmq_message('scraper_result.' || TG_ARGV[0], payload); + + RETURN NULL; +END; +$$ LANGUAGE plpgsql; + + +CREATE TRIGGER send_rabbitmq_message_for_scraper_result_insert + AFTER INSERT ON scraper_results + FOR EACH ROW + EXECUTE PROCEDURE send_rabbitmq_message_for_scraper_result('created');