Exclude blockquotes from comment excerpts

This required implementing a custom text-extraction function inside extract_text_from_html() that can skip particular tags, along with all of the text nested inside them.
6 years ago · 3ea3b161c5
3 changed files with 42 additions and 5 deletions
--- a/tildes/tests/test_comment.py
+++ b/tildes/tests/test_comment.py
@ -122,6 +122,14 @@ def test_multiple_edits_update_time(comment):
            assert comment.last_edited_time == utc_now()
 def test_comment_excerpt_excludes_blockquote(topic, session_user):
    """Verify that quoted (blockquote) text is left out of a comment's excerpt."""
    # a quoted line, followed by the commenter's own reply in a separate paragraph
    quote_then_reply = "> Something you said\n\nYeah, I agree."
    reply = Comment(topic, session_user, quote_then_reply)

    # only the reply paragraph should appear in the excerpt
    expected_excerpt = "Yeah, I agree."
    assert reply.excerpt == expected_excerpt
 def test_comment_tree(db, topic, session_user):
    """Ensure that building and pruning a comment tree works."""
    all_comments = []
--- a/tildes/tildes/lib/string.py
+++ b/tildes/tildes/lib/string.py
@ -3,8 +3,10 @@
 """Functions related to processing/manipulating strings."""
 from xml.etree.ElementTree import Element
 import re
 from typing import Optional
 from typing import Iterator, List, Optional
 import unicodedata
 from urllib.parse import quote
@ -212,12 +214,37 @@ def separate_string(original: str, separator: str, segment_size: int) -> str:
    return separated
 def extract_text_from_html(html: str, skip_tags: Optional[List[str]] = None) -> str:
    """Extract plain text content from the elements inside an HTML string.

    If skip_tags is given, text inside any element whose tag is in that list
    (including the text of all of its descendants) is left out of the result.
    Text that follows a skipped element (its "tail") is still included.
    """

    def extract_text(element: Element, skip_tags: List[str]) -> Iterator[str]:
        """Extract text recursively from elements, optionally skipping some tags.

        This function is Python's xml.etree.ElementTree.Element.itertext() but with the
        added ability to skip over particular tags and not include the text from inside
        them or any of their children.
        """
        # same guard as itertext(): ignore nodes whose tag is neither a string
        # nor None (e.g. comments / processing instructions)
        if not isinstance(element.tag, str) and element.tag is not None:
            return

        # skip this element entirely, including everything nested inside it
        if element.tag in skip_tags:
            return

        if element.text:
            yield element.text

        for subelement in element:
            yield from extract_text(subelement, skip_tags)

            # the tail belongs to the parent's content, so it is yielded even when
            # the subelement itself was skipped
            if subelement.tail:
                yield subelement.tail

    skip_tags = skip_tags or []

    # parse without namespaced tag names so that tags can be compared against
    # plain names like "blockquote"
    html_tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)

    # extract the text from all of the HTML elements
    extracted_text = "".join(extract_text(html_tree, skip_tags))

    # sanitize unicode, remove leading/trailing whitespace, etc.
    return simplify_string(extracted_text)
--- a/tildes/tildes/models/comment/comment.py
+++ b/tildes/tildes/models/comment/comment.py
@ -105,7 +105,9 @@ class Comment(DatabaseModel):
        self._markdown = new_markdown
        self.rendered_html = convert_markdown_to_safe_html(new_markdown)
        extracted_text = extract_text_from_html(self.rendered_html)
        extracted_text = extract_text_from_html(
            self.rendered_html, skip_tags=["blockquote"]
        )
        self.excerpt = truncate_string(
            extracted_text, length=200, truncate_at_chars=" "
        )