@ -3,6 +3,7 @@
""" Functions/constants related to markdown handling. """
""" Functions/constants related to markdown handling. """
from functools import partial
import re
import re
from typing import (
from typing import (
Any ,
Any ,
@ -20,10 +21,7 @@ from urllib.parse import urlparse
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup
import bleach
import bleach
import html5lib
from html5lib import HTMLParser
from html5lib.filters.base import Filter
from html5lib.filters.base import Filter
from html5lib.serializer import HTMLSerializer
from html5lib.treewalkers.base import NonRecursiveTreeWalker
from html5lib.treewalkers.base import NonRecursiveTreeWalker
from pygments import highlight
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.formatters import HtmlFormatter
@ -170,8 +168,8 @@ def convert_markdown_to_safe_html(markdown: str) -> str:
# apply custom post-processing to HTML
# apply custom post-processing to HTML
html = postprocess_markdown_html ( html )
html = postprocess_markdown_html ( html )
# sanitize the final HTML before returning it
return sanitize_html ( html )
# add linkification and sanitize the final HTML before returning it
return linkify_and_sanitize_html ( html )
def preprocess_markdown ( markdown : str ) - > str :
def preprocess_markdown ( markdown : str ) - > str :
@ -200,17 +198,6 @@ def escape_accidental_ordered_lists(markdown: str) -> str:
def postprocess_markdown_html ( html : str ) - > str :
def postprocess_markdown_html ( html : str ) - > str :
""" Apply post-processing to HTML generated by markdown parser. """
""" Apply post-processing to HTML generated by markdown parser. """
# list of tag names to exclude from linkification
linkify_skipped_tags = [ " code " , " pre " ]
# search for text that looks like urls and convert to actual links
html = bleach . linkify (
html , callbacks = [ linkify_protocol_whitelist ] , skip_tags = linkify_skipped_tags
)
# run the HTML through our custom linkification process as well
html = apply_linkification ( html , skip_tags = linkify_skipped_tags )
# apply syntax highlighting to code blocks
# apply syntax highlighting to code blocks
html = apply_syntax_highlighting ( html )
html = apply_syntax_highlighting ( html )
@ -255,24 +242,6 @@ def apply_syntax_highlighting(html: str) -> str:
return html
return html
def apply_linkification(html: str, skip_tags: Optional[List[str]] = None) -> str:
    """Apply custom linkification filter to convert text patterns to links.

    The HTML fragment is parsed with html5lib, exposed as an "etree" tree-walker
    token stream, passed through LinkifyFilter, and serialized back to a string.

    Args:
        html: The HTML fragment to process.
        skip_tags: Tag names handed to LinkifyFilter as its skip list; None is
            passed through unchanged.

    Returns:
        The HTML string rendered from the filtered token stream.
    """
    # namespaceHTMLElements=False keeps element names un-namespaced in the tree
    fragment_parser = HTMLParser(namespaceHTMLElements=False)
    fragment_tree = fragment_parser.parseFragment(html)

    # the tree-walker type must be exactly "etree" (no surrounding whitespace),
    # otherwise html5lib cannot resolve a walker class for it
    tree_walker_class = html5lib.getTreeWalker("etree")
    token_stream = tree_walker_class(fragment_tree)

    linkified_stream = LinkifyFilter(token_stream, skip_tags)

    # serialize without html5lib's own sanitizing or attribute reordering; the
    # quoting mode must be exactly "always" to be recognized by the serializer
    serializer = HTMLSerializer(
        quote_attr_values="always",
        omit_optional_tags=False,
        sanitize=False,
        alphabetical_attributes=False,
    )

    return serializer.render(linkified_stream)
class LinkifyFilter ( Filter ) :
class LinkifyFilter ( Filter ) :
""" html5lib Filter to convert custom text patterns to links.
""" html5lib Filter to convert custom text patterns to links.
@ -459,11 +428,22 @@ class LinkifyFilter(Filter):
return [ { " type " : " Characters " , " data " : match [ 0 ] } ]
return [ { " type " : " Characters " , " data " : match [ 0 ] } ]
def sanitize_html(html: str) -> str:
    """Sanitize HTML by escaping/stripping undesirable elements.

    Runs bleach.clean() with the module's tag, attribute and protocol
    whitelists. No linkification is performed here.
    """
    return bleach.clean(
        html,
        tags=HTML_TAG_WHITELIST,
        attributes=HTML_ATTRIBUTE_WHITELIST,
        protocols=PROTOCOL_WHITELIST,
    )


def linkify_and_sanitize_html(html: str) -> str:
    """Use bleach and html5lib filters to linkify and sanitize HTML.

    A single bleach Cleaner is built with the module's whitelists and two
    linkification filters: bleach's own LinkifyFilter and this module's
    LinkifyFilter. Both are configured to skip the same set of tags.

    Args:
        html: The HTML string to linkify and sanitize.

    Returns:
        The result of running the configured Cleaner over the input.
    """
    # list of tag names to exclude from linkification; these must be the bare
    # tag names (no padding) or they will never match an element
    linkify_skipped_tags = ["code", "pre"]

    # the Cleaner is given filter factories, so pre-bind each filter's keyword
    # arguments with partial() rather than instantiating the filters here
    bleach_linkifier = partial(
        bleach.linkifier.LinkifyFilter,
        callbacks=[linkify_protocol_whitelist],
        skip_tags=linkify_skipped_tags,
    )
    tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags)

    cleaner = bleach.Cleaner(
        tags=HTML_TAG_WHITELIST,
        attributes=HTML_ATTRIBUTE_WHITELIST,
        protocols=PROTOCOL_WHITELIST,
        filters=[bleach_linkifier, tildes_linkifier],
    )

    return cleaner.clean(html)