diff --git a/tildes/consumers/topic_metadata_generator.py b/tildes/consumers/topic_metadata_generator.py
index b88a11d..3a527e7 100644
--- a/tildes/consumers/topic_metadata_generator.py
+++ b/tildes/consumers/topic_metadata_generator.py
@@ -69,7 +69,9 @@ class TopicMetadataGenerator(EventStreamConsumer):
         if not topic.rendered_html:
             return {}
 
-        extracted_text = extract_text_from_html(topic.rendered_html)
+        extracted_text = extract_text_from_html(
+            topic.rendered_html, exclude_details_include_summary=True
+        )
 
         # create a short excerpt by truncating the extracted string
         excerpt = truncate_string(extracted_text, length=200, truncate_at_chars=" ")
diff --git a/tildes/tests/test_comment.py b/tildes/tests/test_comment.py
index 7114500..f6ba0e0 100644
--- a/tildes/tests/test_comment.py
+++ b/tildes/tests/test_comment.py
@@ -154,6 +154,16 @@ def test_comment_excerpt_excludes_del(topic, session_user):
     assert comment.excerpt == "I really love it."
 
 
+def test_comment_excerpt_excludes_details(topic, session_user):
+    """Ensure that comment excerpts don't include text from <details> elements.
+
+    But ensure that the inner <summary> text *is* included.
+    """
+    markdown = "<details>\n<summary>Spoilers!</summary>\n\nHide me!\n</details>"
+    comment = Comment(topic, session_user, markdown)
+    assert comment.excerpt == "Spoilers!"
+
+
 def test_comment_tree(db, topic, session_user):
     """Ensure that building and pruning a comment tree works."""
     all_comments = []
diff --git a/tildes/tests/test_string.py b/tildes/tests/test_string.py
index fec1ce9..d2eb99d 100644
--- a/tildes/tests/test_string.py
+++ b/tildes/tests/test_string.py
@@ -7,6 +7,7 @@ from tildes.lib.string import (
     truncate_string,
     truncate_string_at_char,
     word_count,
+    extract_text_from_html,
 )
 
 
@@ -152,3 +153,23 @@ def test_basic_camelcase_to_snakecase():
 def test_camelcase_to_snakecase_with_acronym():
     """Ensure CamelCase->snake_case works as expected with an acronym."""
     assert camelcase_to_snakecase("SomeHTTPThing") == "some_http_thing"
+
+
+def test_extract_text_from_html_include_details():
+    """Ensure extract_text_from_html behavior includes <details> elements by default."""
+    html = "<details><summary>Spoilers!</summary><p>Don't hide me!</p></details>"
+    assert extract_text_from_html(html) == "Spoilers! Don't hide me!"
+
+    html = "<details><p>Don't hide me!</p></details>"
+    assert extract_text_from_html(html) == "Don't hide me!"
+
+
+def test_extract_text_from_html_exclude_details():
+    """Ensure extract_text_from_html behavior excludes <details> elements when specified."""
+    html = "<details><summary>Spoilers!</summary><p>Hide me!</p></details>"
+    text = extract_text_from_html(html, exclude_details_include_summary=True)
+    assert text == "Spoilers!"
+
+    html = "<details><p>Hide me!</p></details>"
+    text = extract_text_from_html(html, exclude_details_include_summary=True)
+    assert text == "Details"
diff --git a/tildes/tildes/lib/string.py b/tildes/tildes/lib/string.py
index 51c5e62..aba05af 100644
--- a/tildes/tildes/lib/string.py
+++ b/tildes/tildes/lib/string.py
@@ -226,7 +226,11 @@ def separate_string(original: str, separator: str, segment_size: int) -> str:
     return separated
 
 
-def extract_text_from_html(html: str, skip_tags: Optional[list[str]] = None) -> str:
+def extract_text_from_html(
+    html: str,
+    skip_tags: Optional[list[str]] = None,
+    exclude_details_include_summary: bool = False,
+) -> str:
     """Extract plain text content from the elements inside an HTML string."""
 
     def extract_text(element: Element, skip_tags: list[str]) -> Iterator[str]:
@@ -242,6 +246,14 @@ def extract_text_from_html(html: str, skip_tags: Optional[list[str]] = None) ->
         if element.tag in skip_tags:
             return
 
+        if element.tag == "details" and exclude_details_include_summary:
+            for subelement in element:
+                if subelement.tag == "summary":
+                    yield from extract_text(subelement, skip_tags)
+                    return
+            yield "Details"
+            return
+
         if element.text:
             yield element.text
 
diff --git a/tildes/tildes/models/comment/comment.py b/tildes/tildes/models/comment/comment.py
index a4bc90b..b298181 100644
--- a/tildes/tildes/models/comment/comment.py
+++ b/tildes/tildes/models/comment/comment.py
@@ -138,7 +138,9 @@ class Comment(DatabaseModel):
         self.rendered_html = convert_markdown_to_safe_html(new_markdown)
 
         extracted_text = extract_text_from_html(
-            self.rendered_html, skip_tags=["blockquote", "del"]
+            self.rendered_html,
+            skip_tags=["blockquote", "del"],
+            exclude_details_include_summary=True,
         )
         self.excerpt = truncate_string(
             extracted_text, length=200, truncate_at_chars=" "