Browse Source

Update Bleach, add linkification to clean()

Bleach supports adding html5lib filters to the cleaning process, which
is how its linkify() process works, as well as being how I implemented
the custom Tildes linkification for usernames and group names.

This commit switches to adding those filters into the clean() call,
which makes it so everything is now done in a single pass instead of
three. In addition, this also fixes the issues we were having with
bleach "fixing" things that it thought were invalid HTML (when they were
just people writing things inside angle brackets).
merge-requests/51/head
Deimos 6 years ago
parent
commit
23e2a1b65c
  1. 2
      tildes/requirements-to-freeze.txt
  2. 2
      tildes/requirements.txt
  3. 21
      tildes/tests/test_markdown.py
  4. 56
      tildes/tildes/lib/markdown.py

2
tildes/requirements-to-freeze.txt

@ -3,7 +3,7 @@ alembic
amqpy
argon2_cffi
black
bleach<3.0 # bleach 3.0.0 has issues, can't upgrade yet
bleach
click
cornice
freezegun

2
tildes/requirements.txt

@ -9,7 +9,7 @@ attrs==18.2.0
backcall==0.1.0
beautifulsoup4==4.6.3
black==18.9b0
bleach==2.1.4
bleach==3.0.2
certifi==2018.10.15
cffi==1.11.5
chardet==3.0.4

21
tildes/tests/test_markdown.py

@ -174,14 +174,31 @@ def test_other_protocol_urls_not_linkified():
def test_html_attr_whitelist_violation():
"""Ensure using non-whitelisted HTML attributes removes the tag."""
"""Ensure non-whitelisted HTML attributes are removed."""
markdown = (
'<a href="example.com" title="example" target="_blank" '
'referrerpolicy="unsafe-url">test link</a>'
)
processed = convert_markdown_to_safe_html(markdown)
assert processed == "<p>test link</p>\n"
assert processed == '<p><a href="example.com" title="example">test link</a></p>\n'
def test_html_lookalike_not_closed():
"""Ensure text that looks like an HTML tag isn't "fixed" by adding a closing tag."""
markdown = "I can't believe it's not <blank>!"
processed = convert_markdown_to_safe_html(markdown)
assert "&lt;blank&gt;" in processed
assert "&lt;/blank&gt;" not in processed
def test_html_lookalike_closing_not_removed():
"""Ensure text that looks like an HTML close tag isn't removed without an opener."""
markdown = "Well, that's just great.</sarcasm>"
processed = convert_markdown_to_safe_html(markdown)
assert "&lt;/sarcasm&gt;" in processed
def test_a_href_protocol_violation():

56
tildes/tildes/lib/markdown.py

@ -3,6 +3,7 @@
"""Functions/constants related to markdown handling."""
from functools import partial
import re
from typing import (
Any,
@ -20,10 +21,7 @@ from urllib.parse import urlparse
from bs4 import BeautifulSoup
import bleach
import html5lib
from html5lib import HTMLParser
from html5lib.filters.base import Filter
from html5lib.serializer import HTMLSerializer
from html5lib.treewalkers.base import NonRecursiveTreeWalker
from pygments import highlight
from pygments.formatters import HtmlFormatter
@ -170,8 +168,8 @@ def convert_markdown_to_safe_html(markdown: str) -> str:
# apply custom post-processing to HTML
html = postprocess_markdown_html(html)
# sanitize the final HTML before returning it
return sanitize_html(html)
# add linkification and sanitize the final HTML before returning it
return linkify_and_sanitize_html(html)
def preprocess_markdown(markdown: str) -> str:
@ -200,17 +198,6 @@ def escape_accidental_ordered_lists(markdown: str) -> str:
def postprocess_markdown_html(html: str) -> str:
"""Apply post-processing to HTML generated by markdown parser."""
# list of tag names to exclude from linkification
linkify_skipped_tags = ["code", "pre"]
# search for text that looks like urls and convert to actual links
html = bleach.linkify(
html, callbacks=[linkify_protocol_whitelist], skip_tags=linkify_skipped_tags
)
# run the HTML through our custom linkification process as well
html = apply_linkification(html, skip_tags=linkify_skipped_tags)
# apply syntax highlighting to code blocks
html = apply_syntax_highlighting(html)
@ -255,24 +242,6 @@ def apply_syntax_highlighting(html: str) -> str:
return html
def apply_linkification(html: str, skip_tags: Optional[List[str]] = None) -> str:
"""Apply custom linkification filter to convert text patterns to links."""
parser = HTMLParser(namespaceHTMLElements=False)
html_tree = parser.parseFragment(html)
walker_stream = html5lib.getTreeWalker("etree")(html_tree)
filtered_html_tree = LinkifyFilter(walker_stream, skip_tags)
serializer = HTMLSerializer(
quote_attr_values="always",
omit_optional_tags=False,
sanitize=False,
alphabetical_attributes=False,
)
return serializer.render(filtered_html_tree)
class LinkifyFilter(Filter):
"""html5lib Filter to convert custom text patterns to links.
@ -459,11 +428,22 @@ class LinkifyFilter(Filter):
return [{"type": "Characters", "data": match[0]}]
def sanitize_html(html: str) -> str:
"""Sanitize HTML by escaping/stripping undesirable elements."""
return bleach.clean(
html,
def linkify_and_sanitize_html(html: str) -> str:
"""Use bleach and html5lib filters to linkify and sanitize HTML."""
# list of tag names to exclude from linkification
linkify_skipped_tags = ["code", "pre"]
bleach_linkifier = partial(
bleach.linkifier.LinkifyFilter,
callbacks=[linkify_protocol_whitelist],
skip_tags=linkify_skipped_tags,
)
tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags)
cleaner = bleach.Cleaner(
tags=HTML_TAG_WHITELIST,
attributes=HTML_ATTRIBUTE_WHITELIST,
protocols=PROTOCOL_WHITELIST,
filters=[bleach_linkifier, tildes_linkifier],
)
return cleaner.clean(html)
Loading…
Cancel
Save