From af66a76054e7c157202c5d82238c2b1ef906f66b Mon Sep 17 00:00:00 2001
From: Deimos <deimos@tildes.net>
Date: Tue, 12 Nov 2019 23:16:25 -0700
Subject: [PATCH] Add better control of content metadata by type

Previously, the content metadata displayed next to a topic's content
type (like "Article: 1800 words") was fairly generic and could result in
strange data being displayed if a scraper fetched it for an
inappropriate type (for example, displaying word count for videos).

This creates an enum to hold all the different content metadata fields,
and moves some logic into that class to handle deciding which fields to
show for different types, and the formatting logic for values.
---
 tildes/tildes/enums.py               | 66 +++++++++++++++++++++++++++-
 tildes/tildes/lib/link_metadata.py   | 13 ------
 tildes/tildes/models/topic/topic.py  | 47 +++++++-------------
 tildes/tildes/views/api/web/topic.py | 19 +-------
 4 files changed, 83 insertions(+), 62 deletions(-)
 delete mode 100644 tildes/tildes/lib/link_metadata.py

diff --git a/tildes/tildes/enums.py b/tildes/tildes/enums.py
index af440fa..57a66ae 100644
--- a/tildes/tildes/enums.py
+++ b/tildes/tildes/enums.py
@@ -4,7 +4,10 @@
 """Contains Enum classes."""
 
 import enum
-from typing import Optional
+from datetime import timedelta
+from typing import Any, List, Optional
+
+from tildes.lib.datetime import utc_from_timestamp
 
 
 class CommentNotificationType(enum.Enum):
@@ -80,6 +83,67 @@ class CommentLabelOption(enum.Enum):
         return None
 
 
+class ContentMetadataFields(enum.Enum):
+    """Enum for the fields of content metadata stored and used (for topics)."""
+
+    AUTHORS = enum.auto()
+    DESCRIPTION = enum.auto()
+    DOMAIN = enum.auto()
+    DURATION = enum.auto()
+    EXCERPT = enum.auto()
+    PUBLISHED = enum.auto()
+    TITLE = enum.auto()
+    WORD_COUNT = enum.auto()
+
+    @property
+    def key(self) -> str:
+        """Return the key to store this field under."""
+        return self.name.lower()
+
+    @classmethod
+    def detail_fields_for_content_type(
+        cls, content_type: "TopicContentType",
+    ) -> List["ContentMetadataFields"]:
+        """Return a list of fields to display for detail about a particular type."""
+        if content_type is TopicContentType.ARTICLE:
+            return [cls.WORD_COUNT, cls.PUBLISHED]
+
+        if content_type is TopicContentType.TEXT:
+            return [cls.WORD_COUNT]
+
+        if content_type is TopicContentType.VIDEO:
+            return [cls.DURATION, cls.PUBLISHED]
+
+        return []
+
+    def format_value(self, value: Any) -> str:
+        """Format a value stored in this field into a string for display."""
+        if self.name == "WORD_COUNT":
+            if value == 1:
+                return "1 word"
+
+            return f"{value} words"
+
+        if self.name == "DURATION":
+            delta = timedelta(seconds=value)
+
+            # When converted to str, timedelta always includes hours and minutes,
+            # so we want to strip off all the excess zeros and/or colons. However,
+            # if it's less than a minute we'll need to add one back.
+            duration_str = str(delta).lstrip("0:")
+            if value < 60:
+                duration_str = f"0:{duration_str}"
+
+            return duration_str
+
+        if self.name == "PUBLISHED":
+            published = utc_from_timestamp(value)
+            date_str = published.strftime("%b %-d %Y")
+            return f"published {date_str}"
+
+        return str(value)
+
+
 class FinancialEntryType(enum.Enum):
     """Enum for entry types in the Financials table."""
 
diff --git a/tildes/tildes/lib/link_metadata.py b/tildes/tildes/lib/link_metadata.py
deleted file mode 100644
index ce6d0c5..0000000
--- a/tildes/tildes/lib/link_metadata.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2019 Tildes contributors <code@tildes.net>
-# SPDX-License-Identifier: AGPL-3.0-or-later
-
-"""Constants/classes/functions related to metadata generated from links."""
-
-METADATA_KEYS = [
-    "authors",
-    "description",
-    "duration",
-    "published",
-    "title",
-    "word_count",
-]
diff --git a/tildes/tildes/models/topic/topic.py b/tildes/tildes/models/topic/topic.py
index ca9acc6..e3feb3a 100644
--- a/tildes/tildes/models/topic/topic.py
+++ b/tildes/tildes/models/topic/topic.py
@@ -26,7 +26,7 @@ from sqlalchemy.orm import deferred, relationship
 from sqlalchemy.sql.expression import text
 from titlecase import titlecase
 
-from tildes.enums import TopicContentType, TopicType
+from tildes.enums import ContentMetadataFields, TopicContentType, TopicType
 from tildes.lib.database import TagList
 from tildes.lib.datetime import utc_from_timestamp, utc_now
 from tildes.lib.id import id_to_id36
@@ -461,38 +461,25 @@ class Topic(DatabaseModel):
     @property
     def content_metadata_for_display(self) -> str:
         """Return a string of the content's metadata, suitable for display."""
+        if not self.content_type:
+            return ""
+
         metadata_strings = []
 
-        # display word count (if we have it) with either type of topic
-        word_count = self.get_content_metadata("word_count")
-        if word_count is not None:
-            if word_count == 1:
-                metadata_strings.append("1 word")
-            else:
-                metadata_strings.append(f"{word_count} words")
+        fields = ContentMetadataFields.detail_fields_for_content_type(self.content_type)
 
-        if self.is_link_type:
-            # display the duration if we have it
-            duration = self.get_content_metadata("duration")
-            if duration:
-                duration_delta = timedelta(seconds=duration)
-
-                # When converted to str, timedelta always includes hours and minutes,
-                # so we want to strip off all the excess zeros and/or colons. However,
-                # if it's less than a minute we'll need to add one back.
-                duration_str = str(duration_delta).lstrip("0:")
-                if duration < 60:
-                    duration_str = f"0:{duration_str}"
-
-                metadata_strings.append(duration_str)
-
-            # display the published date if it's more than 3 days before the topic
-            published_timestamp = self.get_content_metadata("published")
-            if published_timestamp:
-                published = utc_from_timestamp(published_timestamp)
-                if self.created_time - published > timedelta(days=3):
-                    date_str = published.strftime("%b %-d %Y")
-                    metadata_strings.append(f"published {date_str}")
+        for field in fields:
+            value = self.get_content_metadata(field.key)
+            if not value:
+                continue
+
+            # only show published date if it's more than 3 days before the topic
+            if field is ContentMetadataFields.PUBLISHED:
+                published = utc_from_timestamp(value)
+                if self.created_time - published < timedelta(days=3):
+                    continue
+
+            metadata_strings.append(field.format_value(value))
 
         return ", ".join(metadata_strings)
 
diff --git a/tildes/tildes/views/api/web/topic.py b/tildes/tildes/views/api/web/topic.py
index 6708ea6..bdbcdc7 100644
--- a/tildes/tildes/views/api/web/topic.py
+++ b/tildes/tildes/views/api/web/topic.py
@@ -8,13 +8,10 @@ from marshmallow.fields import String
 from pyramid.httpexceptions import HTTPNotFound
 from pyramid.request import Request
 from pyramid.response import Response
-from sqlalchemy import cast, Text
-from sqlalchemy.dialects.postgresql import ARRAY
 from sqlalchemy.exc import IntegrityError
 from webargs.pyramidparser import use_kwargs
 
 from tildes.enums import LogEventType
-from tildes.lib.link_metadata import METADATA_KEYS
 from tildes.models.group import Group
 from tildes.models.log import LogTopic
 from tildes.models.topic import Topic, TopicBookmark, TopicVote
@@ -393,22 +390,8 @@ def patch_topic_link(request: Request, link: str) -> dict:
         )
     )
 
-    # Wipe any old metadata from scrapers so we don't leave behind remnants
-    # (this probably really shouldn't be done here, but it's fine for now)
-    (
-        request.query(Topic)
-        .filter(Topic.topic_id == topic.topic_id)
-        .update(
-            {
-                "content_metadata": Topic.content_metadata.op("-")(  # type: ignore
-                    cast(METADATA_KEYS, ARRAY(Text))
-                )
-            },
-            synchronize_session=False,
-        )
-    )
-
     topic.link = link
+    topic.content_metadata = None
 
     return Response(f'<a href="{topic.link}">{topic.link}</a>')