Browse Source

Replace bleach.linkify with cmark-gfm autolink

Yet another issue with Bleach 3.0's linkification: when used via the
filter as part of sanitization (which is necessary right now due to
*another* issue where it escapes valid HTML tags), it doesn't properly
linkify urls that contain an ampersand.

As a (temporary?) workaround, this stops using Bleach's linkification
entirely and switches to cmark-gfm's "autolink" extension. These aren't
perfectly equivalent, and the switch results in two other issues that I
consider more minor than links including ampersands not working:

- autolink will initially create links for ftp:// urls and email
  addresses. The final sanitization will remove these links due to the
  protocol whitelist, but it will leave behind a bare <a> tag. So the
  text will *appear* linked but not actually link to anything. If I
  decide to stick with autolink, it should be pretty straightforward to
  fix this by stripping all bare <a> tags from the final HTML.
- autolink doesn't create links for bare domains. For example, writing
  "example.com" won't result in a link, it's necessary to write
  "www.example.com" or include a protocol like "http://example.com".
merge-requests/53/head
Deimos 6 years ago
parent
commit
f1d2e0aee3
  1. 22
      tildes/tests/test_markdown.py
  2. 2
      tildes/tildes/lib/cmark.py
  3. 44
      tildes/tildes/lib/markdown.py

22
tildes/tests/test_markdown.py

@ -130,14 +130,6 @@ def test_https_link_linkified():
assert '<a href="https://example.com">' in processed
def test_bare_domain_linkified():
"""Ensure that a bare domain results in a link."""
markdown = "I can just write example.com too."
processed = convert_markdown_to_safe_html(markdown)
assert '<a href="http://example.com">' in processed
def test_link_with_path_linkified():
"""Ensure a link with a path results in a link."""
markdown = "So http://example.com/a/b_c_d/e too?"
@ -148,18 +140,10 @@ def test_link_with_path_linkified():
def test_link_with_query_string_linkified():
"""Ensure a link with a query string results in a link."""
markdown = "Also http://example.com?something=true works?"
processed = convert_markdown_to_safe_html(markdown)
assert '<a href="http://example.com?something=true">' in processed
def test_email_address_not_linkified():
"""Ensure that an email address does not get linkified."""
markdown = "Please contact somebody@example.com about that."
markdown = "Also http://example.com?something=true&test=yes works?"
processed = convert_markdown_to_safe_html(markdown)
assert "<a" not in processed
assert '<a href="http://example.com?something=true&amp;test=yes">' in processed
def test_other_protocol_urls_not_linkified():
@ -170,7 +154,7 @@ def test_other_protocol_urls_not_linkified():
markdown = f"Testing {protocol}://example.com for linking"
processed = convert_markdown_to_safe_html(markdown)
assert "<a" not in processed
assert "href" not in processed
def test_html_attr_whitelist_violation():

2
tildes/tildes/lib/cmark.py

@ -13,7 +13,7 @@ CMARK_EXT_DLL = CDLL("/usr/local/lib/libcmark-gfm-extensions.so")
# CMARK_OPT_UNSAFE (1 << 17)
CMARK_OPTS = (1 << 2) | (1 << 17)
CMARK_EXTENSIONS = (b"strikethrough", b"table")
CMARK_EXTENSIONS = (b"autolink", b"strikethrough", b"table")
cmark_parser_new = CMARK_DLL.cmark_parser_new
cmark_parser_new.restype = c_void_p

44
tildes/tildes/lib/markdown.py

@ -5,19 +5,7 @@
from functools import partial
import re
from typing import (
Any,
Callable,
Dict,
Iterator,
List,
Match,
Optional,
Pattern,
Tuple,
Union,
)
from urllib.parse import urlparse
from typing import Any, Callable, Iterator, List, Match, Optional, Pattern, Tuple
from bs4 import BeautifulSoup
import bleach
@ -117,29 +105,6 @@ BAD_ORDERED_LIST_REGEX = re.compile(
r"\.\s" # Followed by a period and a space
)
# Type alias for the "namespaced attr dict" used inside bleach.linkify callbacks. This
# looks pretty ridiculous, but it's a dict where the keys are namespaced attr names,
# like `(None, 'href')`, and there's also a `_text` key for getting the innerText of the
# <a> tag.
NamespacedAttrDict = Dict[Union[Tuple[Optional[str], str], str], str]
def linkify_protocol_whitelist(
attrs: NamespacedAttrDict, new: bool = False
) -> Optional[NamespacedAttrDict]:
"""bleach.linkify callback: prevent links to non-whitelisted protocols."""
# pylint: disable=unused-argument
href = attrs.get((None, "href"))
if not href:
return attrs
parsed = urlparse(href)
if parsed.scheme not in PROTOCOL_WHITELIST:
return None
return attrs
@histogram_timer("markdown_processing")
def convert_markdown_to_safe_html(markdown: str) -> str:
@ -444,17 +409,12 @@ def linkify_and_sanitize_html(html: str) -> str:
# list of tag names to exclude from linkification
linkify_skipped_tags = ["code", "pre"]
bleach_linkifier = partial(
bleach.linkifier.LinkifyFilter,
callbacks=[linkify_protocol_whitelist],
skip_tags=linkify_skipped_tags,
)
tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags)
cleaner = bleach.Cleaner(
tags=HTML_TAG_WHITELIST,
attributes=HTML_ATTRIBUTE_WHITELIST,
protocols=PROTOCOL_WHITELIST,
filters=[bleach_linkifier, tildes_linkifier],
filters=[tildes_linkifier],
)
return cleaner.clean(html)
Loading…
Cancel
Save