diff --git a/salt/salt/consumers/init.sls b/salt/salt/consumers/init.sls
index 4e26637..01b33a1 100644
--- a/salt/salt/consumers/init.sls
+++ b/salt/salt/consumers/init.sls
@@ -34,4 +34,16 @@ consumer-comment_user_mentions_generator.service:
 consumer-topic_embedly_extractor.service:
   service.running:
     - enable: True
+
+/etc/systemd/system/consumer-site_icon_downloader.service:
+  file.managed:
+    - source: salt://consumers/site_icon_downloader.service.jinja2
+    - template: jinja
+    - user: root
+    - group: root
+    - mode: 644
+
+consumer-site_icon_downloader.service:
+  service.running:
+    - enable: True
 {% endif %}
diff --git a/salt/salt/consumers/site_icon_downloader.service.jinja2 b/salt/salt/consumers/site_icon_downloader.service.jinja2
new file mode 100644
index 0000000..faae373
--- /dev/null
+++ b/salt/salt/consumers/site_icon_downloader.service.jinja2
@@ -0,0 +1,17 @@
+{% from 'common.jinja2' import app_dir, app_username, bin_dir -%}
+[Unit]
+Description=Site Icon Downloader (Queue Consumer)
+Requires=rabbitmq-server.service
+After=rabbitmq-server.service
+PartOf=rabbitmq-server.service
+
+[Service]
+User={{ app_username }}
+WorkingDirectory={{ app_dir }}/consumers
+Environment="INI_FILE={{ app_dir }}/{{ pillar['ini_file'] }}"
+ExecStart={{ bin_dir }}/python site_icon_downloader.py
+Restart=always
+RestartSec=5
+
+[Install]
+WantedBy=multi-user.target
diff --git a/tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py b/tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py
new file mode 100644
index 0000000..51592a7
--- /dev/null
+++ b/tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py
@@ -0,0 +1,58 @@
+"""Send rabbitmq message on new scraper result
+
+Revision ID: 22a8ed36a3c9
+Revises: 8e54f422541c
+Create Date: 2018-09-30 21:14:29.265490
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "22a8ed36a3c9"
+down_revision = "8e54f422541c"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.execute(
+        """
+        CREATE OR REPLACE FUNCTION send_rabbitmq_message_for_scraper_result() RETURNS TRIGGER AS $$
+        DECLARE
+            affected_row RECORD;
+            payload TEXT;
+        BEGIN
+            IF (TG_OP = 'INSERT' OR TG_OP = 'UPDATE') THEN
+                affected_row := NEW;
+            ELSIF (TG_OP = 'DELETE') THEN
+                affected_row := OLD;
+            END IF;
+
+            payload := json_build_object('result_id', affected_row.result_id);
+
+            PERFORM send_rabbitmq_message('scraper_result.' || TG_ARGV[0], payload);
+
+            RETURN NULL;
+        END;
+        $$ LANGUAGE plpgsql;
+        """
+    )
+
+    op.execute(
+        """
+        CREATE TRIGGER send_rabbitmq_message_for_scraper_result_insert
+            AFTER INSERT ON scraper_results
+            FOR EACH ROW
+            EXECUTE PROCEDURE send_rabbitmq_message_for_scraper_result('created');
+        """
+    )
+
+
+def downgrade():
+    op.execute(
+        "DROP TRIGGER send_rabbitmq_message_for_scraper_result_insert ON scraper_results"
+    )
+
+    op.execute("DROP FUNCTION send_rabbitmq_message_for_scraper_result()")
diff --git a/tildes/consumers/site_icon_downloader.py b/tildes/consumers/site_icon_downloader.py
new file mode 100644
index 0000000..bbf0eae
--- /dev/null
+++ b/tildes/consumers/site_icon_downloader.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2018 Tildes contributors
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+"""Consumer that downloads site icons using Embedly scraper data."""
+
+from io import BytesIO
+from os import path
+from typing import Optional, Sequence
+
+from amqpy import Message
+from PIL import Image
+import publicsuffix
+import requests
+
+from tildes.enums import ScraperType
+from tildes.lib.amqp import PgsqlQueueConsumer
+from tildes.lib.url import get_domain_from_url
+from tildes.models.scraper import ScraperResult
+
+
+class SiteIconDownloader(PgsqlQueueConsumer):
+    """Consumer that downloads site icons using Embedly scraper data."""
+
+    ICON_FOLDER = "/var/lib/site-icons-spriter/site-icons"
+
+    def __init__(self, queue_name: str, routing_keys: Sequence[str]) -> None:
+        """Initialize the consumer, including the public suffix list."""
+        super().__init__(queue_name, routing_keys)
+
+        # download the public suffix list (would be good to add caching here)
+        psl_file = publicsuffix.fetch()
+        self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
+
+    def run(self, msg: Message) -> None:
+        """Process a delivered message."""
+        result = (
+            self.db_session.query(ScraperResult)
+            .filter_by(result_id=msg.body["result_id"])
+            .one()
+        )
+
+        # Check if we already have an icon for this domain, and skip if we do. This
+        # currently uses the ScraperResult's url, but it might be better to use the
+        # Embedly url data, since that will be after any redirects
+        parsed_domain = get_domain_from_url(result.url)
+        domain = self.public_suffix_list.get_public_suffix(parsed_domain)
+
+        filename = domain.replace(".", "_") + ".png"
+        filename = path.join(self.ICON_FOLDER, filename)
+        if path.exists(filename):
+            return
+
+        if result.scraper_type != ScraperType.EMBEDLY:
+            return
+
+        favicon_url = result.data.get("favicon_url")
+        if not favicon_url:
+            return
+
+        try:
+            response = requests.get(favicon_url, timeout=5)
+        except requests.exceptions.Timeout:
+            return
+
+        if response.status_code != 200:
+            return
+
+        icon = self._get_icon_from_response(response)
+        if icon:
+            icon.save(filename)
+
+    @staticmethod
+    def _get_icon_from_response(response: requests.Response) -> Optional[Image.Image]:
+        """Return a properly-sized icon Image extracted from a Response."""
+        favicon = Image.open(BytesIO(response.content))
+
+        if favicon.format == "ICO":
+            # get the 32x32 size if it's present, otherwise resize the largest one
+            if (32, 32) in favicon.ico.sizes():
+                return favicon.ico.getimage((32, 32))
+
+            image = favicon.ico.getimage(max(favicon.ico.sizes()))
+            return image.resize((32, 32))
+        elif favicon.format == "PNG":
+            image = favicon
+            if image.size != (32, 32):
+                image = image.resize((32, 32))
+
+            return image
+
+        # formats other than ICO or PNG aren't handled
+        return None
+
+
+if __name__ == "__main__":
+    SiteIconDownloader(
+        queue_name="site_icon_downloader.q", routing_keys=["scraper_result.created"]
+    ).consume_queue()
diff --git a/tildes/sql/init/triggers/scraper_results/rabbitmq.sql b/tildes/sql/init/triggers/scraper_results/rabbitmq.sql
new file mode 100644
index 0000000..657c18b
--- /dev/null
+++ b/tildes/sql/init/triggers/scraper_results/rabbitmq.sql
@@ -0,0 +1,27 @@
+-- Copyright (c) 2018 Tildes contributors
+-- SPDX-License-Identifier: AGPL-3.0-or-later
+
+CREATE OR REPLACE FUNCTION send_rabbitmq_message_for_scraper_result() RETURNS TRIGGER AS $$
+DECLARE
+    affected_row RECORD;
+    payload TEXT;
+BEGIN
+    IF (TG_OP = 'INSERT' OR TG_OP = 'UPDATE') THEN
+        affected_row := NEW;
+    ELSIF (TG_OP = 'DELETE') THEN
+        affected_row := OLD;
+    END IF;
+
+    payload := json_build_object('result_id', affected_row.result_id);
+
+    PERFORM send_rabbitmq_message('scraper_result.' || TG_ARGV[0], payload);
+
+    RETURN NULL;
+END;
+$$ LANGUAGE plpgsql;
+
+
+CREATE TRIGGER send_rabbitmq_message_for_scraper_result_insert
+    AFTER INSERT ON scraper_results
+    FOR EACH ROW
+    EXECUTE PROCEDURE send_rabbitmq_message_for_scraper_result('created');
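
Note for reviewers (not part of the patch): a minimal, standalone sketch of the icon-naming convention the new consumer uses, for anyone checking the site-icons-spriter integration. urllib.parse.urlsplit stands in here for tildes.lib.url.get_domain_from_url, the example URL is illustrative only, and publicsuffix.fetch() needs network access.

# Standalone sketch: how site_icon_downloader.py derives an icon filename from a
# scraped URL (registered domain via the public suffix list, dots replaced with
# underscores, saved as a 32x32 PNG under ICON_FOLDER).
from os import path
from urllib.parse import urlsplit  # stand-in for tildes.lib.url.get_domain_from_url

import publicsuffix

ICON_FOLDER = "/var/lib/site-icons-spriter/site-icons"

# fetch() downloads the public suffix list each time (the consumer notes caching
# would be a good addition)
public_suffix_list = publicsuffix.PublicSuffixList(publicsuffix.fetch())

url = "https://news.example.co.uk/some/article"  # illustrative URL only
domain = public_suffix_list.get_public_suffix(urlsplit(url).hostname)

filename = path.join(ICON_FOLDER, domain.replace(".", "_") + ".png")
print(filename)  # /var/lib/site-icons-spriter/site-icons/example_co_uk.png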