From 3ea3b161c564d80bd30e92dea9b554ed20089443 Mon Sep 17 00:00:00 2001
From: Deimos
Date: Fri, 7 Sep 2018 23:43:53 -0600
Subject: [PATCH] Exclude blockquotes from comment excerpts

This included implementing a custom function that will allow skipping particular
tags inside extract_text_from_html().
---
 tildes/tests/test_comment.py            |  8 ++++++
 tildes/tildes/lib/string.py             | 35 ++++++++++++++++++++++---
 tildes/tildes/models/comment/comment.py |  4 ++-
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/tildes/tests/test_comment.py b/tildes/tests/test_comment.py
index 1f93889..24b057f 100644
--- a/tildes/tests/test_comment.py
+++ b/tildes/tests/test_comment.py
@@ -122,6 +122,14 @@ def test_multiple_edits_update_time(comment):
     assert comment.last_edited_time == utc_now()
 
 
+def test_comment_excerpt_excludes_blockquote(topic, session_user):
+    """Ensure that comment excerpts don't include text from blockquotes."""
+    markdown = "> Something you said\n\nYeah, I agree."
+    comment = Comment(topic, session_user, markdown)
+
+    assert comment.excerpt == "Yeah, I agree."
+
+
 def test_comment_tree(db, topic, session_user):
     """Ensure that building and pruning a comment tree works."""
     all_comments = []
diff --git a/tildes/tildes/lib/string.py b/tildes/tildes/lib/string.py
index b61f6c0..4492307 100644
--- a/tildes/tildes/lib/string.py
+++ b/tildes/tildes/lib/string.py
@@ -3,8 +3,10 @@
 
 """Functions related to processing/manipulating strings."""
 
+from xml.etree.ElementTree import Element
+
 import re
-from typing import Optional
+from typing import Iterator, List, Optional
 import unicodedata
 from urllib.parse import quote
 
@@ -212,12 +214,37 @@ def separate_string(original: str, separator: str, segment_size: int) -> str:
     return separated
 
 
-def extract_text_from_html(html: str) -> str:
+def extract_text_from_html(html: str, skip_tags: Optional[List[str]] = None) -> str:
     """Extract plain text content from the elements inside an HTML string."""
-    html_tree = HTMLParser().parseFragment(html)
+
+    def extract_text(element: Element, skip_tags: List[str]) -> Iterator[str]:
+        """Extract text recursively from elements, optionally skipping some tags.
+
+        This function is Python's xml.etree.ElementTree.Element.itertext() but with the
+        added ability to skip over particular tags and not include the text from inside
+        them or any of their children.
+        """
+        if not isinstance(element.tag, str) and element.tag is not None:
+            return
+
+        if element.tag in skip_tags:
+            return
+
+        if element.text:
+            yield element.text
+
+        for subelement in element:
+            yield from extract_text(subelement, skip_tags)
+
+            if subelement.tail:
+                yield subelement.tail
+
+    skip_tags = skip_tags or []
+
+    html_tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
 
     # extract the text from all of the HTML elements
-    extracted_text = "".join([element_text for element_text in html_tree.itertext()])
+    extracted_text = "".join([text for text in extract_text(html_tree, skip_tags)])
 
     # sanitize unicode, remove leading/trailing whitespace, etc.
     return simplify_string(extracted_text)
diff --git a/tildes/tildes/models/comment/comment.py b/tildes/tildes/models/comment/comment.py
index 7a6478e..8da0d4b 100644
--- a/tildes/tildes/models/comment/comment.py
+++ b/tildes/tildes/models/comment/comment.py
@@ -105,7 +105,9 @@ class Comment(DatabaseModel):
         self._markdown = new_markdown
         self.rendered_html = convert_markdown_to_safe_html(new_markdown)
 
-        extracted_text = extract_text_from_html(self.rendered_html)
+        extracted_text = extract_text_from_html(
+            self.rendered_html, skip_tags=["blockquote"]
+        )
         self.excerpt = truncate_string(
             extracted_text, length=200, truncate_at_chars=" "
         )