From 23e2a1b65c8dc485871d0e41f876251994410dce Mon Sep 17 00:00:00 2001
From: Deimos <deimos@tildes.net>
Date: Fri, 26 Oct 2018 22:55:38 -0600
Subject: [PATCH] Update Bleach, add linkification to clean()

Bleach supports adding html5lib filters to the cleaning process, which
is how its linkify() process works, as well as being how I implemented
the custom Tildes linkification for usernames and group names.

This commit switches to adding those filters into the clean() call,
which makes it so everything is now done in a single pass instead of
three. In addition, this also fixes the issues we were having with
bleach "fixing" things that it thought were invalid HTML (when they were
just people writing things inside angle brackets).
---
 tildes/requirements-to-freeze.txt |  2 +-
 tildes/requirements.txt           |  2 +-
 tildes/tests/test_markdown.py     | 21 ++++++++++--
 tildes/tildes/lib/markdown.py     | 56 ++++++++++---------------------
 4 files changed, 39 insertions(+), 42 deletions(-)
diff --git a/tildes/requirements-to-freeze.txt b/tildes/requirements-to-freeze.txt
index 5730976..7639d93 100644
--- a/tildes/requirements-to-freeze.txt
+++ b/tildes/requirements-to-freeze.txt
@@ -3,7 +3,7 @@ alembic
 amqpy
 argon2_cffi
 black
-bleach<3.0  # bleach 3.0.0 has issues, can't upgrade yet
+bleach
 click
 cornice
 freezegun
diff --git a/tildes/requirements.txt b/tildes/requirements.txt
index 11f8754..cf50ae1 100644
--- a/tildes/requirements.txt
+++ b/tildes/requirements.txt
@@ -9,7 +9,7 @@ attrs==18.2.0
 backcall==0.1.0
 beautifulsoup4==4.6.3
 black==18.9b0
-bleach==2.1.4
+bleach==3.0.2
 certifi==2018.10.15
 cffi==1.11.5
 chardet==3.0.4
diff --git a/tildes/tests/test_markdown.py b/tildes/tests/test_markdown.py
index 9b500cf..2d5b8d4 100644
--- a/tildes/tests/test_markdown.py
+++ b/tildes/tests/test_markdown.py
@@ -174,14 +174,31 @@ def test_other_protocol_urls_not_linkified():
 
 
 def test_html_attr_whitelist_violation():
-    """Ensure using non-whitelisted HTML attributes removes the tag."""
+    """Ensure non-whitelisted HTML attributes are removed."""
     markdown = (
         '<a href="example.com" title="example" target="_blank" '
         'referrerpolicy="unsafe-url">test link</a>'
     )
     processed = convert_markdown_to_safe_html(markdown)
 
-    assert processed == "<p>test link</p>\n"
+    assert processed == '<p><a href="example.com" title="example">test link</a></p>\n'
+
+
+def test_html_lookalike_not_closed():
+    """Ensure text that looks like an HTML tag isn't "fixed" by adding a closing tag."""
+    markdown = "I can't believe it's not <blank>!"
+    processed = convert_markdown_to_safe_html(markdown)
+
+    assert "&lt;blank&gt;" in processed
+    assert "&lt;/blank&gt;" not in processed
+
+
+def test_html_lookalike_closing_not_removed():
+    """Ensure text that looks like an HTML close tag isn't removed without an opener."""
+    markdown = "Well, that's just great.</sarcasm>"
+    processed = convert_markdown_to_safe_html(markdown)
+
+    assert "&lt;/sarcasm&gt;" in processed
 
 
 def test_a_href_protocol_violation():
diff --git a/tildes/tildes/lib/markdown.py b/tildes/tildes/lib/markdown.py
index 53f9214..2c97d00 100644
--- a/tildes/tildes/lib/markdown.py
+++ b/tildes/tildes/lib/markdown.py
@@ -3,6 +3,7 @@
 
 """Functions/constants related to markdown handling."""
 
+from functools import partial
 import re
 from typing import (
     Any,
@@ -20,10 +21,7 @@ from urllib.parse import urlparse
 
 from bs4 import BeautifulSoup
 import bleach
-import html5lib
-from html5lib import HTMLParser
 from html5lib.filters.base import Filter
-from html5lib.serializer import HTMLSerializer
 from html5lib.treewalkers.base import NonRecursiveTreeWalker
 from pygments import highlight
 from pygments.formatters import HtmlFormatter
@@ -170,8 +168,8 @@ def convert_markdown_to_safe_html(markdown: str) -> str:
     # apply custom post-processing to HTML
     html = postprocess_markdown_html(html)
 
-    # sanitize the final HTML before returning it
-    return sanitize_html(html)
+    # add linkification and sanitize the final HTML before returning it
+    return linkify_and_sanitize_html(html)
 
 
 def preprocess_markdown(markdown: str) -> str:
@@ -200,17 +198,6 @@ def escape_accidental_ordered_lists(markdown: str) -> str:
 
 def postprocess_markdown_html(html: str) -> str:
     """Apply post-processing to HTML generated by markdown parser."""
-    # list of tag names to exclude from linkification
-    linkify_skipped_tags = ["code", "pre"]
-
-    # search for text that looks like urls and convert to actual links
-    html = bleach.linkify(
-        html, callbacks=[linkify_protocol_whitelist], skip_tags=linkify_skipped_tags
-    )
-
-    # run the HTML through our custom linkification process as well
-    html = apply_linkification(html, skip_tags=linkify_skipped_tags)
-
     # apply syntax highlighting to code blocks
     html = apply_syntax_highlighting(html)
 
@@ -255,24 +242,6 @@ def apply_syntax_highlighting(html: str) -> str:
     return html
 
 
-def apply_linkification(html: str, skip_tags: Optional[List[str]] = None) -> str:
-    """Apply custom linkification filter to convert text patterns to links."""
-    parser = HTMLParser(namespaceHTMLElements=False)
-
-    html_tree = parser.parseFragment(html)
-    walker_stream = html5lib.getTreeWalker("etree")(html_tree)
-
-    filtered_html_tree = LinkifyFilter(walker_stream, skip_tags)
-
-    serializer = HTMLSerializer(
-        quote_attr_values="always",
-        omit_optional_tags=False,
-        sanitize=False,
-        alphabetical_attributes=False,
-    )
-    return serializer.render(filtered_html_tree)
-
-
 class LinkifyFilter(Filter):
     """html5lib Filter to convert custom text patterns to links.
 
@@ -459,11 +428,22 @@ class LinkifyFilter(Filter):
         return [{"type": "Characters", "data": match[0]}]
 
 
-def sanitize_html(html: str) -> str:
-    """Sanitize HTML by escaping/stripping undesirable elements."""
-    return bleach.clean(
-        html,
+def linkify_and_sanitize_html(html: str) -> str:
+    """Use bleach and html5lib filters to linkify and sanitize HTML."""
+    # list of tag names to exclude from linkification
+    linkify_skipped_tags = ["code", "pre"]
+
+    bleach_linkifier = partial(
+        bleach.linkifier.LinkifyFilter,
+        callbacks=[linkify_protocol_whitelist],
+        skip_tags=linkify_skipped_tags,
+    )
+    tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags)
+
+    cleaner = bleach.Cleaner(
         tags=HTML_TAG_WHITELIST,
         attributes=HTML_ATTRIBUTE_WHITELIST,
         protocols=PROTOCOL_WHITELIST,
+        filters=[bleach_linkifier, tildes_linkifier],
     )
+    return cleaner.clean(html)