From 23e2a1b65c8dc485871d0e41f876251994410dce Mon Sep 17 00:00:00 2001 From: Deimos Date: Fri, 26 Oct 2018 22:55:38 -0600 Subject: [PATCH] Update Bleach, add linkification to clean() Bleach supports adding html5lib filters to the cleaning process, which is how its linkify() process works, as well as being how I implemented the custom Tildes linkification for usernames and group names. This commit switches to adding those filters into the clean() call, which makes it so everything is now done in a single pass instead of three. In addition, this also fixes the issues we were having with bleach "fixing" things that it thought were invalid HTML (when they were just people writing things inside angle brackets). --- tildes/requirements-to-freeze.txt | 2 +- tildes/requirements.txt | 2 +- tildes/tests/test_markdown.py | 21 ++++++++++-- tildes/tildes/lib/markdown.py | 56 ++++++++++--------------------- 4 files changed, 39 insertions(+), 42 deletions(-) diff --git a/tildes/requirements-to-freeze.txt b/tildes/requirements-to-freeze.txt index 5730976..7639d93 100644 --- a/tildes/requirements-to-freeze.txt +++ b/tildes/requirements-to-freeze.txt @@ -3,7 +3,7 @@ alembic amqpy argon2_cffi black -bleach<3.0 # bleach 3.0.0 has issues, can't upgrade yet +bleach click cornice freezegun diff --git a/tildes/requirements.txt b/tildes/requirements.txt index 11f8754..cf50ae1 100644 --- a/tildes/requirements.txt +++ b/tildes/requirements.txt @@ -9,7 +9,7 @@ attrs==18.2.0 backcall==0.1.0 beautifulsoup4==4.6.3 black==18.9b0 -bleach==2.1.4 +bleach==3.0.2 certifi==2018.10.15 cffi==1.11.5 chardet==3.0.4 diff --git a/tildes/tests/test_markdown.py b/tildes/tests/test_markdown.py index 9b500cf..2d5b8d4 100644 --- a/tildes/tests/test_markdown.py +++ b/tildes/tests/test_markdown.py @@ -174,14 +174,31 @@ def test_other_protocol_urls_not_linkified(): def test_html_attr_whitelist_violation(): - """Ensure using non-whitelisted HTML attributes removes the tag.""" + """Ensure non-whitelisted HTML attributes are removed.""" markdown = ( 'test link' ) processed = convert_markdown_to_safe_html(markdown) - assert processed == "

test link

\n" + assert processed == '

test link

\n' + + +def test_html_lookalike_not_closed(): + """Ensure text that looks like an HTML tag isn't "fixed" by adding a closing tag.""" + markdown = "I can't believe it's not !" + processed = convert_markdown_to_safe_html(markdown) + + assert "<blank>" in processed + assert "</blank>" not in processed + + +def test_html_lookalike_closing_not_removed(): + """Ensure text that looks like an HTML close tag isn't removed without an opener.""" + markdown = "Well, that's just great." + processed = convert_markdown_to_safe_html(markdown) + + assert "</sarcasm>" in processed def test_a_href_protocol_violation(): diff --git a/tildes/tildes/lib/markdown.py b/tildes/tildes/lib/markdown.py index 53f9214..2c97d00 100644 --- a/tildes/tildes/lib/markdown.py +++ b/tildes/tildes/lib/markdown.py @@ -3,6 +3,7 @@ """Functions/constants related to markdown handling.""" +from functools import partial import re from typing import ( Any, @@ -20,10 +21,7 @@ from urllib.parse import urlparse from bs4 import BeautifulSoup import bleach -import html5lib -from html5lib import HTMLParser from html5lib.filters.base import Filter -from html5lib.serializer import HTMLSerializer from html5lib.treewalkers.base import NonRecursiveTreeWalker from pygments import highlight from pygments.formatters import HtmlFormatter @@ -170,8 +168,8 @@ def convert_markdown_to_safe_html(markdown: str) -> str: # apply custom post-processing to HTML html = postprocess_markdown_html(html) - # sanitize the final HTML before returning it - return sanitize_html(html) + # add linkification and sanitize the final HTML before returning it + return linkify_and_sanitize_html(html) def preprocess_markdown(markdown: str) -> str: @@ -200,17 +198,6 @@ def escape_accidental_ordered_lists(markdown: str) -> str: def postprocess_markdown_html(html: str) -> str: """Apply post-processing to HTML generated by markdown parser.""" - # list of tag names to exclude from linkification - linkify_skipped_tags = ["code", "pre"] - - # search for text that looks like urls and convert to actual links - html = bleach.linkify( - html, callbacks=[linkify_protocol_whitelist], skip_tags=linkify_skipped_tags - ) - - # run the HTML through our custom linkification process as well - html = apply_linkification(html, skip_tags=linkify_skipped_tags) - # apply syntax highlighting to code blocks html = apply_syntax_highlighting(html) @@ -255,24 +242,6 @@ def apply_syntax_highlighting(html: str) -> str: return html -def apply_linkification(html: str, skip_tags: Optional[List[str]] = None) -> str: - """Apply custom linkification filter to convert text patterns to links.""" - parser = HTMLParser(namespaceHTMLElements=False) - - html_tree = parser.parseFragment(html) - walker_stream = html5lib.getTreeWalker("etree")(html_tree) - - filtered_html_tree = LinkifyFilter(walker_stream, skip_tags) - - serializer = HTMLSerializer( - quote_attr_values="always", - omit_optional_tags=False, - sanitize=False, - alphabetical_attributes=False, - ) - return serializer.render(filtered_html_tree) - - class LinkifyFilter(Filter): """html5lib Filter to convert custom text patterns to links. @@ -459,11 +428,22 @@ class LinkifyFilter(Filter): return [{"type": "Characters", "data": match[0]}] -def sanitize_html(html: str) -> str: - """Sanitize HTML by escaping/stripping undesirable elements.""" - return bleach.clean( - html, +def linkify_and_sanitize_html(html: str) -> str: + """Use bleach and html5lib filters to linkify and sanitize HTML.""" + # list of tag names to exclude from linkification + linkify_skipped_tags = ["code", "pre"] + + bleach_linkifier = partial( + bleach.linkifier.LinkifyFilter, + callbacks=[linkify_protocol_whitelist], + skip_tags=linkify_skipped_tags, + ) + tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags) + + cleaner = bleach.Cleaner( tags=HTML_TAG_WHITELIST, attributes=HTML_ATTRIBUTE_WHITELIST, protocols=PROTOCOL_WHITELIST, + filters=[bleach_linkifier, tildes_linkifier], ) + return cleaner.clean(html)