@@ -79,18 +79,18 @@ HTML_ATTRIBUTE_WHITELIST = {
    }
}

PROTOCOL_WHITELIST = ("http", "https")

# Regex that finds ordered list markdown that was probably accidental - ones being
# initiated by anything except "1."
BAD_ORDERED_LIST_REGEX = re.compile(
    r"((?:\A|\n\n)"  # Either the start of the entire text, or a new paragraph
    r"(?!1\.)\d+)"  # A number that isn't "1"
    r"\.\s"  # Followed by a period and a space
)

# Type alias for the "namespaced attr dict" used inside bleach.linkify callbacks. This
# looks pretty ridiculous, but it's a dict where the keys are namespaced attr names,
# like `(None, 'href')`, and there's also a `_text` key for getting the innerText of the
# <a> tag.
NamespacedAttrDict = Dict[Union[Tuple[Optional[str], str], str], str]  # noqa
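# For illustration, an attrs dict of that shape passed to a linkify callback might look
# roughly like {(None, "href"): "http://example.com", "_text": "example.com"} (the exact
# keys depend on the tag being processed).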
@@ -155,15 +155,14 @@ def preprocess_markdown(markdown: str) -> str:

def escape_accidental_ordered_lists(markdown: str) -> str:
    """Escape markdown that's probably an accidental ordered list.

    It's a common markdown mistake to accidentally start a numbered list, by beginning a
    post or paragraph with a number followed by a period. For example, someone might try
    to write "1975. It was a long time ago.", and the result will be a comment that says
    "1. It was a long time ago." since that gets parsed into a numbered list.

    This fixes that quirk of markdown by escaping anything that would start a numbered
    list except for "1.". This will cause a few other edge cases, but I believe they're
    less common/important than fixing this common error.
    """
    return BAD_ORDERED_LIST_REGEX.sub(r"\1\\. ", markdown)
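# A rough doctest-style sketch of the intended effect (assuming the substitution above
# re-inserts the space the regex consumed):
#
#     >>> escape_accidental_ordered_lists("1975. It was a long time ago.")
#     '1975\\. It was a long time ago.'
#     >>> escape_accidental_ordered_lists("1. buy milk\n2. walk the dog")
#     '1. buy milk\n2. walk the dog'
#
# Only a paragraph-initial number other than "1" gets escaped, so deliberate lists are
# left alone.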
@@ -205,24 +204,24 @@ def apply_linkification(html: str, skip_tags: Optional[List[str]] = None) -> str
class LinkifyFilter(Filter):
    """html5lib Filter to convert custom text patterns to links.

    This replaces references to group paths and usernames with links to the relevant
    pages.

    This implementation is based heavily on the linkify implementation from the Bleach
    library.
    """

    # Regex that finds probable references to groups. This isn't "perfect", just a first
    # pass to find likely candidates. The validity of the group path is checked more
    # carefully later.
    # Note: currently specifically excludes paths immediately followed by a tilde, but
    # this may be possible to remove once strikethrough is implemented (since that's
    # probably what they were trying to do)
    GROUP_REFERENCE_REGEX = re.compile(r"(?<!\w)~([\w.]+)\b(?!~)")

    # Regex that finds probable references to users. As above, this isn't "perfect"
    # either but works as an initial pass with the validity of the username checked more
    # carefully later.
    USERNAME_REFERENCE_REGEX = re.compile(r"(?<!\w)(?:/?u/|@)([\w-]+)\b")
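    # For illustration: GROUP_REFERENCE_REGEX would capture "music" from "see ~music",
    # but match neither "foo~bar" (preceded by a word character) nor "~struck~" (followed
    # by a tilde); USERNAME_REFERENCE_REGEX would capture "SomeUser" from "@SomeUser",
    # "/u/SomeUser", or "u/SomeUser".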

    def __init__(
@@ -230,8 +229,8 @@ class LinkifyFilter(Filter):
    ) -> None:
        """Initialize a linkification filter to apply to HTML.

        The skip_tags argument can be a list of tag names, and the contents of any of
        those tags will be excluded from linkification.
        """
        super().__init__(source)
        self.skip_tags = skip_tags or []
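        # For example, a caller could reach this via something like
        # apply_linkification(html, skip_tags=["code"]) so that text inside <code> tags
        # is never turned into links (illustrative value; any list of tag names works).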
@@ -248,28 +247,27 @@ class LinkifyFilter(Filter):
                token["type"] in ("StartTag", "EmptyTag")
                and token["name"] in self.skip_tags
            ):
                # if this is the start of a tag we want to skip, add it to the list of
                # skipped tags that we're currently inside
                inside_skipped_tags.append(token["name"])
            elif inside_skipped_tags:
                # if we're currently inside any skipped tags, the only thing we want to
                # do is look for all the end tags we need to be able to finish skipping
                if token["type"] == "EndTag":
                    try:
                        inside_skipped_tags.remove(token["name"])
                    except ValueError:
                        pass
            elif token["type"] == "Characters":
                # this is only reachable if inside_skipped_tags is empty, so this is a
                # text token not inside a skipped tag - do the actual linkification
                # replacements

                # Note: doing the two replacements "iteratively" like this only works
                # because they are "disjoint" and we know they're not competing to
                # replace the same text. If more replacements are added in the future
                # that might conflict with each other, this will need to be reworked
                # somehow.
                replaced_tokens = self._linkify_tokens(
                    [token],
                    filter_regex=self.GROUP_REFERENCE_REGEX,
@@ -281,13 +279,13 @@ class LinkifyFilter(Filter):
                    linkify_function=self._tokenize_username_match,
                )

                # yield all the tokens returned from the replacement process (will be
                # just the original token if nothing was replaced)
                for new_token in replaced_tokens:
                    yield new_token

                # we either yielded new tokens or the original one already, so we don't
                # want to fall through and yield the original again
                continue

            yield token
@@ -298,11 +296,11 @@ class LinkifyFilter(Filter):
    ) -> List[dict]:
        """Check tokens for text that matches a regex and linkify it.

        The `filter_regex` argument should be a compiled pattern that will be applied to
        the text in all of the supplied tokens. If any matches are found, they will each
        be used to call `linkify_function`, which will validate the match and convert it
        back into tokens (representing an <a> tag if it is valid for linkifying, or just
        text if not).
        """
        new_tokens = []
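        # Rough sketch of the data flow: a Characters token like
        #     {"type": "Characters", "data": "see ~music"}
        # would typically come back as a Characters token for "see " followed by whatever
        # linkify_function returns for the "~music" match (tokens for an <a> element when
        # the match is valid, or plain text tokens when it isn't).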
@@ -316,8 +314,8 @@ class LinkifyFilter(Filter):
            current_index = 0

            for match in filter_regex.finditer(original_text):
                # if there were some characters between the previous match and this one,
                # add a token containing those first
                if match.start() > current_index:
                    new_tokens.append(
                        {
@@ -333,8 +331,8 @@ class LinkifyFilter(Filter):
                # move the progress marker up to the end of this match
                current_index = match.end()

            # if there's still some text left over, add one more token for it (this will
            # be the entire thing if there weren't any matches)
            if current_index < len(original_text):
                new_tokens.append(
                    {"type": "Characters", "data": original_text[current_index:]}
@@ -345,14 +343,14 @@ class LinkifyFilter(Filter):
    @staticmethod
    def _tokenize_group_match(match: Match) -> List[dict]:
        """Convert a potential group reference into HTML tokens."""
        # convert the potential group path to lowercase to allow people to use incorrect
        # casing but still have it link properly
        group_path = match[1].lower()

        # Even though they're technically valid paths, we don't want to linkify things
        # like "~10" or "~4.5" since that's just going to be someone using it in the
        # "approximately" sense. So if the path consists of only numbers and/or periods,
        # we won't linkify it
        is_numeric = all(char in "0123456789." for char in group_path)

        # if it's a valid group path and not totally numeric, convert to <a>
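        # For example, a captured path like "music" passes this check and can be
        # linkified (subject to the path-validity check), while "10" or "4.5" consist
        # only of digits and periods, so is_numeric is True and they stay plain text.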