From 3ea3b161c564d80bd30e92dea9b554ed20089443 Mon Sep 17 00:00:00 2001
From: Deimos <deimos@tildes.net>
Date: Fri, 7 Sep 2018 23:43:53 -0600
Subject: [PATCH] Exclude blockquotes from comment excerpts

This required adding a helper inside extract_text_from_html() that can
skip particular tags, and their children, while collecting text.
---
 tildes/tests/test_comment.py            |  8 ++++++
 tildes/tildes/lib/string.py             | 35 ++++++++++++++++++++++---
 tildes/tildes/models/comment/comment.py |  4 ++-
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/tildes/tests/test_comment.py b/tildes/tests/test_comment.py
index 1f93889..24b057f 100644
--- a/tildes/tests/test_comment.py
+++ b/tildes/tests/test_comment.py
@@ -122,6 +122,14 @@ def test_multiple_edits_update_time(comment):
             assert comment.last_edited_time == utc_now()
 
 
+def test_comment_excerpt_excludes_blockquote(topic, session_user):
+    """Check that quoted text is left out of a comment's generated excerpt."""
+    quote_then_reply = "> Something you said\n\nYeah, I agree."
+    comment = Comment(topic, session_user, quote_then_reply)
+
+    assert comment.excerpt == "Yeah, I agree."
+
+
 def test_comment_tree(db, topic, session_user):
     """Ensure that building and pruning a comment tree works."""
     all_comments = []
diff --git a/tildes/tildes/lib/string.py b/tildes/tildes/lib/string.py
index b61f6c0..4492307 100644
--- a/tildes/tildes/lib/string.py
+++ b/tildes/tildes/lib/string.py
@@ -3,8 +3,10 @@
 
 """Functions related to processing/manipulating strings."""
 
+from xml.etree.ElementTree import Element
+
 import re
-from typing import Optional
+from typing import Iterator, List, Optional
 import unicodedata
 from urllib.parse import quote
 
@@ -212,12 +214,37 @@ def separate_string(original: str, separator: str, segment_size: int) -> str:
     return separated
 
 
-def extract_text_from_html(html: str) -> str:
+def extract_text_from_html(html: str, skip_tags: Optional[List[str]] = None) -> str:
     """Extract plain text content from the elements inside an HTML string."""
-    html_tree = HTMLParser().parseFragment(html)
+
+    def extract_text(element: Element, skip_tags: List[str]) -> Iterator[str]:
+        """Yield text recursively from elements, optionally skipping some tags.
+
+        Equivalent to xml.etree.ElementTree.Element.itertext(), except that elements
+        whose tag is in skip_tags contribute none of their own or their children's
+        text. Tail text after a skipped element is still yielded by its parent.
+        """
+        if not isinstance(element.tag, str) and element.tag is not None:
+            return  # non-element nodes (e.g. comments) have a non-str tag
+
+        if element.tag in skip_tags:
+            return  # skipped tag: drop its text and all of its descendants'
+
+        if element.text:
+            yield element.text
+
+        for subelement in element:
+            yield from extract_text(subelement, skip_tags)
+
+            if subelement.tail:
+                yield subelement.tail  # outside the child, so never skipped
+
+    skip_tags = skip_tags or []
+
+    html_tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
 
     # extract the text from all of the HTML elements
-    extracted_text = "".join([element_text for element_text in html_tree.itertext()])
+    extracted_text = "".join(extract_text(html_tree, skip_tags))
 
     # sanitize unicode, remove leading/trailing whitespace, etc.
     return simplify_string(extracted_text)
diff --git a/tildes/tildes/models/comment/comment.py b/tildes/tildes/models/comment/comment.py
index 7a6478e..8da0d4b 100644
--- a/tildes/tildes/models/comment/comment.py
+++ b/tildes/tildes/models/comment/comment.py
@@ -105,7 +105,9 @@ class Comment(DatabaseModel):
         self._markdown = new_markdown
         self.rendered_html = convert_markdown_to_safe_html(new_markdown)
 
-        extracted_text = extract_text_from_html(self.rendered_html)
+        extracted_text = extract_text_from_html(
+            self.rendered_html, skip_tags=["blockquote"]
+        )
         self.excerpt = truncate_string(
             extracted_text, length=200, truncate_at_chars=" "
         )