Browse Source

Exclude blockquotes from comment excerpts

This included implementing a custom function that will allow skipping
particular tags inside extract_text_from_html().
merge-requests/37/head
Deimos 6 years ago
parent
commit
3ea3b161c5
  1. 8
      tildes/tests/test_comment.py
  2. 35
      tildes/tildes/lib/string.py
  3. 4
      tildes/tildes/models/comment/comment.py

8
tildes/tests/test_comment.py

@ -122,6 +122,14 @@ def test_multiple_edits_update_time(comment):
assert comment.last_edited_time == utc_now() assert comment.last_edited_time == utc_now()
def test_comment_excerpt_excludes_blockquote(topic, session_user):
    """Ensure that comment excerpts don't include text from blockquotes."""
    # A quoted line followed by the author's own reply: only the reply should
    # end up in the excerpt.
    markdown = "> Something you said\n\nYeah, I agree."
    comment = Comment(topic, session_user, markdown)

    assert comment.excerpt == "Yeah, I agree."
def test_comment_tree(db, topic, session_user): def test_comment_tree(db, topic, session_user):
"""Ensure that building and pruning a comment tree works.""" """Ensure that building and pruning a comment tree works."""
all_comments = [] all_comments = []

35
tildes/tildes/lib/string.py

@ -3,8 +3,10 @@
"""Functions related to processing/manipulating strings.""" """Functions related to processing/manipulating strings."""
from xml.etree.ElementTree import Element
import re import re
from typing import Optional
from typing import Iterator, List, Optional
import unicodedata import unicodedata
from urllib.parse import quote from urllib.parse import quote
@ -212,12 +214,37 @@ def separate_string(original: str, separator: str, segment_size: int) -> str:
return separated return separated
def extract_text_from_html(html: str) -> str:
def extract_text_from_html(html: str, skip_tags: Optional[List[str]] = None) -> str:
"""Extract plain text content from the elements inside an HTML string.""" """Extract plain text content from the elements inside an HTML string."""
html_tree = HTMLParser().parseFragment(html)
def extract_text(element: Element, skip_tags: List[str]) -> Iterator[str]:
    """Extract text from an element tree, optionally skipping some tags.

    This function is Python's xml.etree.ElementTree.Element.itertext() but with the
    added ability to skip over particular tags and not include the text from inside
    them or any of their children.

    The text following a skipped element (its "tail") is still yielded, since that
    text belongs to the parent's content and not to the skipped element. The tail of
    the starting element itself is never yielded.

    The tree is walked with an explicit stack instead of recursion, so the nesting
    depth of the markup is not limited by the interpreter's recursion limit.
    """

    def is_skipped(node: Element) -> bool:
        """Return whether a node and its whole subtree should be left out."""
        tag = node.tag
        # Non-string tags (e.g. comments or processing instructions) carry no
        # document text; a tag of None is still treated as a regular container.
        if not isinstance(tag, str) and tag is not None:
            return True
        return tag in skip_tags

    if is_skipped(element):
        return

    if element.text:
        yield element.text

    # Each entry pairs an element with an iterator over its remaining children.
    pending = [(element, iter(element))]
    while pending:
        parent, children = pending[-1]
        child = next(children, None)

        if child is None:
            # All children of `parent` are done; its tail comes next, except for
            # the starting element (the only one left with an empty stack).
            pending.pop()
            if pending and parent.tail:
                yield parent.tail
            continue

        if is_skipped(child):
            # Drop the subtree, but keep the text that follows it.
            if child.tail:
                yield child.tail
            continue

        if child.text:
            yield child.text
        pending.append((child, iter(child)))
skip_tags = skip_tags or []
html_tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)
# extract the text from all of the HTML elements # extract the text from all of the HTML elements
extracted_text = "".join([element_text for element_text in html_tree.itertext()])
extracted_text = "".join([text for text in extract_text(html_tree, skip_tags)])
# sanitize unicode, remove leading/trailing whitespace, etc. # sanitize unicode, remove leading/trailing whitespace, etc.
return simplify_string(extracted_text) return simplify_string(extracted_text)

4
tildes/tildes/models/comment/comment.py

@ -105,7 +105,9 @@ class Comment(DatabaseModel):
self._markdown = new_markdown self._markdown = new_markdown
self.rendered_html = convert_markdown_to_safe_html(new_markdown) self.rendered_html = convert_markdown_to_safe_html(new_markdown)
extracted_text = extract_text_from_html(self.rendered_html)
extracted_text = extract_text_from_html(
self.rendered_html, skip_tags=["blockquote"]
)
self.excerpt = truncate_string( self.excerpt = truncate_string(
extracted_text, length=200, truncate_at_chars=" " extracted_text, length=200, truncate_at_chars=" "
) )

Loading…
Cancel
Save