Add better control of content metadata by type

Previously, the content metadata displayed next to a topic's content type (like "Article: 1800 words") was fairly generic and could result in strange data being displayed if a scraper fetched it for an inappropriate type (for example, displaying word count for videos). This commit creates an enum to hold all the different content metadata fields, and moves two pieces of logic into that class: deciding which fields to show for each content type, and formatting the stored values for display.
6 years ago · af66a76054
4 changed files with 83 additions and 62 deletions
--- a/tildes/tildes/enums.py
+++ b/tildes/tildes/enums.py
@ -4,7 +4,10 @@
 """Contains Enum classes."""
 import enum
 from typing import Optional
 from datetime import timedelta
 from typing import Any, List, Optional
 from tildes.lib.datetime import utc_from_timestamp
 class CommentNotificationType(enum.Enum):
@ -80,6 +83,67 @@ class CommentLabelOption(enum.Enum):
        return None
 class ContentMetadataFields(enum.Enum):
    """Enum for the fields of content metadata stored and used (for topics).

    Besides naming the fields, this class decides which fields are shown as
    detail for a given content type and how a stored value is formatted.
    """

    AUTHORS = enum.auto()
    DESCRIPTION = enum.auto()
    DOMAIN = enum.auto()
    DURATION = enum.auto()
    EXCERPT = enum.auto()
    PUBLISHED = enum.auto()
    TITLE = enum.auto()
    WORD_COUNT = enum.auto()

    @property
    def key(self) -> str:
        """Return the key to store this field under (the lowercased member name)."""
        return self.name.lower()

    @classmethod
    def detail_fields_for_content_type(
        cls, content_type: "TopicContentType",
    ) -> List["ContentMetadataFields"]:
        """Return a list of fields to display for detail about a particular type.

        Content types that are not handled explicitly below get an empty list.
        """
        if content_type is TopicContentType.ARTICLE:
            return [cls.WORD_COUNT, cls.PUBLISHED]

        if content_type is TopicContentType.TEXT:
            return [cls.WORD_COUNT]

        if content_type is TopicContentType.VIDEO:
            return [cls.DURATION, cls.PUBLISHED]

        return []

    def format_value(self, value: Any) -> str:
        """Format a value stored in this field into a string for display.

        WORD_COUNT becomes "1 word" / "N words", DURATION (a number of seconds)
        becomes "M:SS" or "H:MM:SS", and PUBLISHED (a timestamp) becomes
        "published <Mon> <day> <year>". Every other field falls back to
        ``str(value)``.
        """
        if self is ContentMetadataFields.WORD_COUNT:
            if value == 1:
                return "1 word"

            return f"{value} words"

        if self is ContentMetadataFields.DURATION:
            # str(timedelta) is "H:MM:SS" (prefixed with "N day(s), " for a day or
            # more). Drop a zero hours part and then a single leading zero of the
            # minutes, so short durations read "M:SS". Stripping only these keeps
            # the seconds zero-padded: 5 seconds is "0:05", not "0:5".
            duration_str = str(timedelta(seconds=value))
            if duration_str.startswith("0:"):
                duration_str = duration_str[2:]
                if duration_str.startswith("0"):
                    duration_str = duration_str[1:]

            return duration_str

        if self is ContentMetadataFields.PUBLISHED:
            published = utc_from_timestamp(value)
            # NOTE: "%-d" (day of month without zero-padding) is a platform-specific
            # strftime extension; it is not available on every platform.
            date_str = published.strftime("%b %-d %Y")
            return f"published {date_str}"

        return str(value)
 class FinancialEntryType(enum.Enum):
    """Enum for entry types in the Financials table."""
--- a/tildes/tildes/lib/link_metadata.py
+++ b/tildes/tildes/lib/link_metadata.py
@ -1,13 +0,0 @@
 # Copyright (c) 2019 Tildes contributors <code@tildes.net>
 # SPDX-License-Identifier: AGPL-3.0-or-later
 """Constants/classes/functions related to metadata generated from links."""
 # Names of the metadata keys generated from links.
 # NOTE(review): the topic-link patch view shown later in this change set removes
 # exactly these keys from Topic.content_metadata — confirm no other users remain.
 METADATA_KEYS = [
    "authors",
    "description",
    "duration",
    "published",
    "title",
    "word_count",
 ]
--- a/tildes/tildes/models/topic/topic.py
+++ b/tildes/tildes/models/topic/topic.py
@ -26,7 +26,7 @@ from sqlalchemy.orm import deferred, relationship
 from sqlalchemy.sql.expression import text
 from titlecase import titlecase
 from tildes.enums import TopicContentType, TopicType
 from tildes.enums import ContentMetadataFields, TopicContentType, TopicType
 from tildes.lib.database import TagList
 from tildes.lib.datetime import utc_from_timestamp, utc_now
 from tildes.lib.id import id_to_id36
@ -461,38 +461,25 @@ class Topic(DatabaseModel):
    @property
    def content_metadata_for_display(self) -> str:
        """Return a string of the content's metadata, suitable for display.

        Returns an empty string when the topic has no content type; otherwise the
        collected metadata strings are joined with ", ".
        """
        # NOTE(review): this span looks like a diff hunk rendered without +/- markers,
        # so the older hard-coded implementation and the newer enum-driven one appear
        # interleaved below — confirm against the real file before editing.
        if not self.content_type:
            return ""
        metadata_strings = []
        # display word count (if we have it) with either type of topic
        # NOTE(review): presumably the pre-change code path; the same "1 word" /
        # "N words" formatting exists in ContentMetadataFields.format_value.
        word_count = self.get_content_metadata("word_count")
        if word_count is not None:
            if word_count == 1:
                metadata_strings.append("1 word")
            else:
                metadata_strings.append(f"{word_count} words")
        # the fields to show are chosen by the topic's content type
        fields = ContentMetadataFields.detail_fields_for_content_type(self.content_type)
        # NOTE(review): presumably the pre-change code path for link topics.
        if self.is_link_type:
            # display the duration if we have it
            duration = self.get_content_metadata("duration")
            if duration:
                duration_delta = timedelta(seconds=duration)
                # When converted to str, timedelta always includes hours and minutes,
                # so we want to strip off all the excess zeros and/or colons. However,
                # if it's less than a minute we'll need to add one back.
                duration_str = str(duration_delta).lstrip("0:")
                if duration < 60:
                    duration_str = f"0:{duration_str}"
                metadata_strings.append(duration_str)
            # display the published date if it's more than 3 days before the topic
            published_timestamp = self.get_content_metadata("published")
            if published_timestamp:
                published = utc_from_timestamp(published_timestamp)
                if self.created_time - published > timedelta(days=3):
                    date_str = published.strftime("%b %-d %Y")
                    metadata_strings.append(f"published {date_str}")
        # enum-driven path: format each field selected for this content type
        for field in fields:
            value = self.get_content_metadata(field.key)
            # skip fields with no stored value (any falsy value, including 0)
            if not value:
                continue
            # only show published date if it's more than 3 days before the topic
            if field is ContentMetadataFields.PUBLISHED:
                published = utc_from_timestamp(value)
                if self.created_time - published < timedelta(days=3):
                    continue
            metadata_strings.append(field.format_value(value))
        return ", ".join(metadata_strings)
--- a/tildes/tildes/views/api/web/topic.py
+++ b/tildes/tildes/views/api/web/topic.py
@ -8,13 +8,10 @@ from marshmallow.fields import String
 from pyramid.httpexceptions import HTTPNotFound
 from pyramid.request import Request
 from pyramid.response import Response
 from sqlalchemy import cast, Text
 from sqlalchemy.dialects.postgresql import ARRAY
 from sqlalchemy.exc import IntegrityError
 from webargs.pyramidparser import use_kwargs
 from tildes.enums import LogEventType
 from tildes.lib.link_metadata import METADATA_KEYS
 from tildes.models.group import Group
 from tildes.models.log import LogTopic
 from tildes.models.topic import Topic, TopicBookmark, TopicVote
@ -393,22 +390,8 @@ def patch_topic_link(request: Request, link: str) -> dict:
        )
    )
    # Wipe any old metadata from scrapers so we don't leave behind remnants
    # (this probably really shouldn't be done here, but it's fine for now)
    (
        request.query(Topic)
        .filter(Topic.topic_id == topic.topic_id)
        .update(
            {
                "content_metadata": Topic.content_metadata.op("-")(  # type: ignore
                    cast(METADATA_KEYS, ARRAY(Text))
                )
            },
            synchronize_session=False,
        )
    )
    topic.link = link
    topic.content_metadata = None
    return Response(f'<a href="{topic.link}">{topic.link}</a>')