
Add a consumer to automatically download favicons

This adds a trigger to the scraper_results table which sends rabbitmq
messages whenever a scrape finishes, as well as a consumer that picks up
these messages and uses Embedly data to download (and resize if
necessary) the favicons from any sites that are scraped. These are
downloaded into the input folder for the site-icons-spriter, so it
should be able to use them to generate spritesheets.
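
End to end, the flow is: the trigger publishes a message whose routing key is
scraper_result.created and whose JSON body identifies the new row, and the
consumer binds its queue to that routing key. For testing the consumer without
touching the database, a message of the same shape can be published by hand; a
minimal sketch using pika, where the exchange name and broker address are
assumptions (the broker configuration isn't part of this commit):

    # hypothetical test publish; exchange and host are assumed, not from this commit
    import json
    import pika

    connection = pika.BlockingConnection(pika.ConnectionParameters("localhost"))
    channel = connection.channel()
    channel.basic_publish(
        exchange="amq.topic",  # assumed topic exchange
        routing_key="scraper_result.created",  # matches the consumer's binding
        body=json.dumps({"result_id": 1234}),  # same shape as the trigger's payload
    )
    connection.close()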
Deimos · commit 2a19aa20ce · merge-requests/40/head
5 changed files with 212 additions and 0 deletions:
  1. salt/salt/consumers/init.sls (12 additions)
  2. salt/salt/consumers/site_icon_downloader.service.jinja2 (17 additions)
  3. tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py (58 additions)
  4. tildes/consumers/site_icon_downloader.py (98 additions)
  5. tildes/sql/init/triggers/scraper_results/rabbitmq.sql (27 additions)

salt/salt/consumers/init.sls

@@ -34,4 +34,16 @@ consumer-comment_user_mentions_generator.service:
consumer-topic_embedly_extractor.service:
  service.running:
    - enable: True

/etc/systemd/system/consumer-site_icon_downloader.service:
  file.managed:
    - source: salt://consumers/site_icon_downloader.service.jinja2
    - template: jinja
    - user: root
    - group: root
    - mode: 644

consumer-site_icon_downloader.service:
  service.running:
    - enable: True
{% endif %}

salt/salt/consumers/site_icon_downloader.service.jinja2

@@ -0,0 +1,17 @@
{% from 'common.jinja2' import app_dir, app_username, bin_dir -%}
[Unit]
Description=Site Icon Downloader (Queue Consumer)
Requires=rabbitmq-server.service
After=rabbitmq-server.service
PartOf=rabbitmq-server.service

[Service]
User={{ app_username }}
WorkingDirectory={{ app_dir }}/consumers
Environment="INI_FILE={{ app_dir }}/{{ pillar['ini_file'] }}"
ExecStart={{ bin_dir }}/python site_icon_downloader.py
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target

tildes/alembic/versions/22a8ed36a3c9_send_rabbitmq_message_on_new_scraper_.py

@@ -0,0 +1,58 @@
"""Send rabbitmq message on new scraper result

Revision ID: 22a8ed36a3c9
Revises: 8e54f422541c
Create Date: 2018-09-30 21:14:29.265490

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "22a8ed36a3c9"
down_revision = "8e54f422541c"
branch_labels = None
depends_on = None


def upgrade():
    op.execute(
        """
        CREATE OR REPLACE FUNCTION send_rabbitmq_message_for_scraper_result() RETURNS TRIGGER AS $$
        DECLARE
            affected_row RECORD;
            payload TEXT;
        BEGIN
            IF (TG_OP = 'INSERT' OR TG_OP = 'UPDATE') THEN
                affected_row := NEW;
            ELSIF (TG_OP = 'DELETE') THEN
                affected_row := OLD;
            END IF;

            payload := json_build_object('result_id', affected_row.result_id);

            PERFORM send_rabbitmq_message('scraper_result.' || TG_ARGV[0], payload);

            RETURN NULL;
        END;
        $$ LANGUAGE plpgsql;
        """
    )

    op.execute(
        """
        CREATE TRIGGER send_rabbitmq_message_for_scraper_result_insert
            AFTER INSERT ON scraper_results
            FOR EACH ROW
            EXECUTE PROCEDURE send_rabbitmq_message_for_scraper_result('created');
        """
    )


def downgrade():
    op.execute(
        "DROP TRIGGER send_rabbitmq_message_for_scraper_result_insert ON scraper_results"
    )

    op.execute("DROP FUNCTION send_rabbitmq_message_for_scraper_result()")

tildes/consumers/site_icon_downloader.py

@@ -0,0 +1,98 @@
# Copyright (c) 2018 Tildes contributors <code@tildes.net>
# SPDX-License-Identifier: AGPL-3.0-or-later

"""Consumer that downloads site icons using Embedly scraper data."""

from io import BytesIO
from os import path
from typing import Optional, Sequence

from amqpy import Message
from PIL import Image
import publicsuffix
import requests

from tildes.enums import ScraperType
from tildes.lib.amqp import PgsqlQueueConsumer
from tildes.lib.url import get_domain_from_url
from tildes.models.scraper import ScraperResult


class SiteIconDownloader(PgsqlQueueConsumer):
    """Consumer that downloads site icons using Embedly scraper data."""

    ICON_FOLDER = "/var/lib/site-icons-spriter/site-icons"

    def __init__(self, queue_name: str, routing_keys: Sequence[str]) -> None:
        """Initialize the consumer, including the public suffix list."""
        super().__init__(queue_name, routing_keys)

        # download the public suffix list (would be good to add caching here)
        psl_file = publicsuffix.fetch()
        self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)

    def run(self, msg: Message) -> None:
        """Process a delivered message."""
        result = (
            self.db_session.query(ScraperResult)
            .filter_by(result_id=msg.body["result_id"])
            .one()
        )

        # Check if we already have an icon for this domain, and skip if we do. This
        # currently uses the ScraperResult's url, but it might be better to use the
        # Embedly url data, since that will be after any redirects
        parsed_domain = get_domain_from_url(result.url)
        domain = self.public_suffix_list.get_public_suffix(parsed_domain)
        filename = domain.replace(".", "_") + ".png"
        filename = path.join(self.ICON_FOLDER, filename)
        if path.exists(filename):
            return

        if result.scraper_type != ScraperType.EMBEDLY:
            return

        favicon_url = result.data.get("favicon_url")
        if not favicon_url:
            return

        try:
            response = requests.get(favicon_url, timeout=5)
        except requests.exceptions.Timeout:
            return

        if response.status_code != 200:
            return

        icon = self._get_icon_from_response(response)
        if icon:
            icon.save(filename)

    @staticmethod
    def _get_icon_from_response(response: requests.Response) -> Optional[Image.Image]:
        """Return a properly-sized icon Image extracted from a Response."""
        favicon = Image.open(BytesIO(response.content))

        if favicon.format == "ICO":
            # get the 32x32 size if it's present, otherwise resize the largest one
            if (32, 32) in favicon.ico.sizes():
                return favicon.ico.getimage((32, 32))

            image = favicon.ico.getimage(max(favicon.ico.sizes()))
            return image.resize((32, 32))
        elif favicon.format == "PNG":
            image = favicon
            if image.size != (32, 32):
                image = image.resize((32, 32))

            return image

        # formats other than ICO or PNG aren't handled
        return None


if __name__ == "__main__":
    SiteIconDownloader(
        queue_name="site_icon_downloader.q", routing_keys=["scraper_result.created"]
    ).consume_queue()
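
Since _get_icon_from_response() is a staticmethod, the resizing logic can be
exercised locally without a broker or a real HTTP request; a minimal sketch
using a mock in place of a requests.Response (hypothetical, not part of the
commit):

    # hypothetical check: a 64x64 PNG should come back resized to 32x32
    from io import BytesIO
    from unittest.mock import Mock

    from PIL import Image

    buffer = BytesIO()
    Image.new("RGB", (64, 64), "white").save(buffer, format="PNG")
    fake_response = Mock(content=buffer.getvalue())  # stands in for requests.Response

    icon = SiteIconDownloader._get_icon_from_response(fake_response)
    assert icon is not None and icon.size == (32, 32)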

tildes/sql/init/triggers/scraper_results/rabbitmq.sql

@@ -0,0 +1,27 @@
-- Copyright (c) 2018 Tildes contributors <code@tildes.net>
-- SPDX-License-Identifier: AGPL-3.0-or-later

CREATE OR REPLACE FUNCTION send_rabbitmq_message_for_scraper_result() RETURNS TRIGGER AS $$
DECLARE
    affected_row RECORD;
    payload TEXT;
BEGIN
    IF (TG_OP = 'INSERT' OR TG_OP = 'UPDATE') THEN
        affected_row := NEW;
    ELSIF (TG_OP = 'DELETE') THEN
        affected_row := OLD;
    END IF;

    payload := json_build_object('result_id', affected_row.result_id);

    PERFORM send_rabbitmq_message('scraper_result.' || TG_ARGV[0], payload);

    RETURN NULL;
END;
$$ LANGUAGE plpgsql;


CREATE TRIGGER send_rabbitmq_message_for_scraper_result_insert
    AFTER INSERT ON scraper_results
    FOR EACH ROW
    EXECUTE PROCEDURE send_rabbitmq_message_for_scraper_result('created');
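
To watch the trigger fire, it's enough to insert a row directly; a rough
sketch, where the column list, the 'EMBEDLY' enum label, and the DSN are all
assumptions (the table definition isn't part of this commit):

    # hypothetical smoke test: the AFTER INSERT trigger should call
    # send_rabbitmq_message('scraper_result.created', '{"result_id": ...}')
    from sqlalchemy import create_engine, text

    engine = create_engine("postgresql+psycopg2:///tildes")  # assumed DSN
    with engine.begin() as conn:
        conn.execute(
            text(
                "INSERT INTO scraper_results (url, scraper_type, data) "
                "VALUES ('https://example.com', 'EMBEDLY', '{}')"
            )
        )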