@@ -79,18 +79,18 @@ HTML_ATTRIBUTE_WHITELIST = {
 }
 
 PROTOCOL_WHITELIST = ("http", "https")
 
-# Regex that finds ordered list markdown that was probably accidental - ones
-# being initiated by anything except "1."
+# Regex that finds ordered list markdown that was probably accidental - ones being
+# initiated by anything except "1."
 BAD_ORDERED_LIST_REGEX = re.compile(
     r"((?:\A|\n\n)"  # Either the start of the entire text, or a new paragraph
     r"(?!1\.)\d+)"  # A number that isn't "1"
     r"\.\s"  # Followed by a period and a space
 )
 
-# Type alias for the "namespaced attr dict" used inside bleach.linkify
-# callbacks. This looks pretty ridiculous, but it's a dict where the keys are
-# namespaced attr names, like `(None, 'href')`, and there's also a `_text`
-# key for getting the innerText of the <a> tag.
+# Type alias for the "namespaced attr dict" used inside bleach.linkify callbacks. This
+# looks pretty ridiculous, but it's a dict where the keys are namespaced attr names,
+# like `(None, 'href')`, and there's also a `_text` key for getting the innerText of the
+# <a> tag.
 NamespacedAttrDict = Dict[Union[Tuple[Optional[str], str], str], str]  # noqa
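To illustrate the alias, here is a minimal sketch of the dict shape a bleach.linkify callback receives; the concrete values are hypothetical:

```python
from typing import Dict, Optional, Tuple, Union

NamespacedAttrDict = Dict[Union[Tuple[Optional[str], str], str], str]

# hypothetical attrs for a link like <a href="https://example.com">click</a>
attrs: NamespacedAttrDict = {
    (None, "href"): "https://example.com",  # namespaced attr name -> value
    "_text": "click",  # special key holding the innerText of the <a> tag
}
```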
@@ -155,15 +155,14 @@ def preprocess_markdown(markdown: str) -> str:
 def escape_accidental_ordered_lists(markdown: str) -> str:
     """Escape markdown that's probably an accidental ordered list.
 
-    It's a common markdown mistake to accidentally start a numbered list, by
-    beginning a post or paragraph with a number followed by a period. For
-    example, someone might try to write "1975. It was a long time ago.", and
-    the result will be a comment that says "1. It was a long time ago." since
-    that gets parsed into a numbered list.
+    It's a common markdown mistake to accidentally start a numbered list, by beginning a
+    post or paragraph with a number followed by a period. For example, someone might try
+    to write "1975. It was a long time ago.", and the result will be a comment that says
+    "1. It was a long time ago." since that gets parsed into a numbered list.
 
-    This fixes that quirk of markdown by escaping anything that would start a
-    numbered list except for "1.". This will cause a few other edge cases, but
-    I believe they're less common/important than fixing this common error.
+    This fixes that quirk of markdown by escaping anything that would start a numbered
+    list except for "1.". This will cause a few other edge cases, but I believe they're
+    less common/important than fixing this common error.
     """
     return BAD_ORDERED_LIST_REGEX.sub(r"\1\\. ", markdown)
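As a quick check of the behavior described in the docstring, here is a standalone sketch restating the regex and function from these hunks (the trailing space in the replacement string is an assumption, to restore the whitespace consumed by `\s`):

```python
import re

BAD_ORDERED_LIST_REGEX = re.compile(
    r"((?:\A|\n\n)"  # Either the start of the entire text, or a new paragraph
    r"(?!1\.)\d+)"  # A number that isn't "1"
    r"\.\s"  # Followed by a period and a space
)

def escape_accidental_ordered_lists(markdown: str) -> str:
    return BAD_ORDERED_LIST_REGEX.sub(r"\1\\. ", markdown)

print(escape_accidental_ordered_lists("1975. It was a long time ago."))
# 1975\. It was a long time ago.
print(escape_accidental_ordered_lists("1. first\n2. second"))
# unchanged: the list starts with "1.", and "2." isn't at a paragraph start
```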
@@ -205,24 +204,24 @@ def apply_linkification(html: str, skip_tags: Optional[List[str]] = None) -> str
 class LinkifyFilter(Filter):
     """html5lib Filter to convert custom text patterns to links.
 
-    This replaces references to group paths and usernames with links to the
-    relevant pages.
+    This replaces references to group paths and usernames with links to the relevant
+    pages.
 
-    This implementation is based heavily on the linkify implementation from
-    the Bleach library.
+    This implementation is based heavily on the linkify implementation from the Bleach
+    library.
     """
 
-    # Regex that finds probable references to groups. This isn't "perfect",
-    # just a first pass to find likely candidates. The validity of the group
-    # path is checked more carefully later.
-    # Note: currently specifically excludes paths immediately followed by a
-    # tilde, but this may be possible to remove once strikethrough is
-    # implemented (since that's probably what they were trying to do)
+    # Regex that finds probable references to groups. This isn't "perfect", just a first
+    # pass to find likely candidates. The validity of the group path is checked more
+    # carefully later.
+    # Note: currently specifically excludes paths immediately followed by a tilde, but
+    # this may be possible to remove once strikethrough is implemented (since that's
+    # probably what they were trying to do)
     GROUP_REFERENCE_REGEX = re.compile(r"(?<!\w)~([\w.]+)\b(?!~)")
 
-    # Regex that finds probable references to users. As above, this isn't
-    # "perfect" either but works as an initial pass with the validity of
-    # the username checked more carefully later.
+    # Regex that finds probable references to users. As above, this isn't "perfect"
+    # either but works as an initial pass with the validity of the username checked more
+    # carefully later.
     USERNAME_REFERENCE_REGEX = re.compile(r"(?<!\w)(?:/?u/|@)([\w-]+)\b")
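A few examples of what these two patterns do and don't match, written as assertions against the regexes exactly as defined above:

```python
import re

GROUP_REFERENCE_REGEX = re.compile(r"(?<!\w)~([\w.]+)\b(?!~)")
USERNAME_REFERENCE_REGEX = re.compile(r"(?<!\w)(?:/?u/|@)([\w-]+)\b")

assert GROUP_REFERENCE_REGEX.search("try ~music.metal")[1] == "music.metal"
assert GROUP_REFERENCE_REGEX.search("~~oops~~") is None  # followed by a tilde
assert GROUP_REFERENCE_REGEX.search("approx~5") is None  # preceded by a word char

assert USERNAME_REFERENCE_REGEX.search("ping @SomeUser")[1] == "SomeUser"
assert USERNAME_REFERENCE_REGEX.search("/u/SomeUser")[1] == "SomeUser"
assert USERNAME_REFERENCE_REGEX.search("u/SomeUser")[1] == "SomeUser"
assert USERNAME_REFERENCE_REGEX.search("name@example.com") is None  # mid-word @
```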
 
     def __init__(
@@ -230,8 +229,8 @@ class LinkifyFilter(Filter):
     ) -> None:
         """Initialize a linkification filter to apply to HTML.
 
-        The skip_tags argument can be a list of tag names, and the contents of
-        any of those tags will be excluded from linkification.
+        The skip_tags argument can be a list of tag names, and the contents of any of
+        those tags will be excluded from linkification.
         """
         super().__init__(source)
         self.skip_tags = skip_tags or []
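A hypothetical usage sketch, assuming the module's apply_linkification (whose signature appears in the hunk header above) wires this filter into bleach's linkifier:

```python
html = "<p>ask in ~music or ping @SomeUser</p><pre><code>~not.linkified</code></pre>"
result = apply_linkification(html, skip_tags=["code"])
# references inside the <p> become <a> tags, while the contents of the
# skipped <code> tag are left untouched
```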
@@ -248,28 +247,27 @@ class LinkifyFilter(Filter):
                 token["type"] in ("StartTag", "EmptyTag")
                 and token["name"] in self.skip_tags
             ):
-                # if this is the start of a tag we want to skip, add it to the
-                # list of skipped tags that we're currently inside
+                # if this is the start of a tag we want to skip, add it to the list of
+                # skipped tags that we're currently inside
                 inside_skipped_tags.append(token["name"])
             elif inside_skipped_tags:
-                # if we're currently inside any skipped tags, the only thing we
-                # want to do is look for all the end tags we need to be able to
-                # finish skipping
+                # if we're currently inside any skipped tags, the only thing we want to
+                # do is look for all the end tags we need to be able to finish skipping
                 if token["type"] == "EndTag":
                     try:
                         inside_skipped_tags.remove(token["name"])
                     except ValueError:
                         pass
             elif token["type"] == "Characters":
-                # this is only reachable if inside_skipped_tags is empty, so
-                # this is a text token not inside a skipped tag - do the actual
-                # linkification replacements
-                # Note: doing the two replacements "iteratively" like this only
-                # works because they are "disjoint" and we know they're not
-                # competing to replace the same text. If more replacements are
-                # added in the future that might conflict with each other, this
-                # will need to be reworked somehow.
+                # this is only reachable if inside_skipped_tags is empty, so this is a
+                # text token not inside a skipped tag - do the actual linkification
+                # replacements
+                # Note: doing the two replacements "iteratively" like this only works
+                # because they are "disjoint" and we know they're not competing to
+                # replace the same text. If more replacements are added in the future
+                # that might conflict with each other, this will need to be reworked
+                # somehow.
                 replaced_tokens = self._linkify_tokens(
                     [token],
                     filter_regex=self.GROUP_REFERENCE_REGEX,
@@ -281,13 +279,13 @@ class LinkifyFilter(Filter):
                     linkify_function=self._tokenize_username_match,
                 )
 
-                # yield all the tokens returned from the replacement process
-                # (will be just the original token if nothing was replaced)
+                # yield all the tokens returned from the replacement process (will be
+                # just the original token if nothing was replaced)
                 for new_token in replaced_tokens:
                     yield new_token
 
-                # we either yielded new tokens or the original one already, so
-                # we don't want to fall through and yield the original again
+                # we either yielded new tokens or the original one already, so we don't
+                # want to fall through and yield the original again
                 continue
 
             yield token
@@ -298,11 +296,11 @@ class LinkifyFilter(Filter):
     ) -> List[dict]:
         """Check tokens for text that matches a regex and linkify it.
 
-        The `filter_regex` argument should be a compiled pattern that will be
-        applied to the text in all of the supplied tokens. If any matches are
-        found, they will each be used to call `linkify_function`, which will
-        validate the match and convert it back into tokens (representing an <a>
-        tag if it is valid for linkifying, or just text if not).
+        The `filter_regex` argument should be a compiled pattern that will be applied to
+        the text in all of the supplied tokens. If any matches are found, they will each
+        be used to call `linkify_function`, which will validate the match and convert it
+        back into tokens (representing an <a> tag if it is valid for linkifying, or just
+        text if not).
         """
         new_tokens = []
@@ -316,8 +314,8 @@ class LinkifyFilter(Filter):
             current_index = 0
 
             for match in filter_regex.finditer(original_text):
-                # if there were some characters between the previous match and
-                # this one, add a token containing those first
+                # if there were some characters between the previous match and this one,
+                # add a token containing those first
                 if match.start() > current_index:
                     new_tokens.append(
                         {
@@ -333,8 +331,8 @@ class LinkifyFilter(Filter):
                 # move the progress marker up to the end of this match
                 current_index = match.end()
 
-            # if there's still some text left over, add one more token for it
-            # (this will be the entire thing if there weren't any matches)
+            # if there's still some text left over, add one more token for it (this will
+            # be the entire thing if there weren't any matches)
             if current_index < len(original_text):
                 new_tokens.append(
                     {"type": "Characters", "data": original_text[current_index:]}
@@ -345,14 +343,14 @@ class LinkifyFilter(Filter):
     @staticmethod
     def _tokenize_group_match(match: Match) -> List[dict]:
         """Convert a potential group reference into HTML tokens."""
-        # convert the potential group path to lowercase to allow people to use
-        # incorrect casing but still have it link properly
+        # convert the potential group path to lowercase to allow people to use incorrect
+        # casing but still have it link properly
         group_path = match[1].lower()
 
-        # Even though they're technically valid paths, we don't want to linkify
-        # things like "~10" or "~4.5" since that's just going to be someone
-        # using it in the "approximately" sense. So if the path consists of
-        # only numbers and/or periods, we won't linkify it
+        # Even though they're technically valid paths, we don't want to linkify things
+        # like "~10" or "~4.5" since that's just going to be someone using it in the
+        # "approximately" sense. So if the path consists of only numbers and/or periods,
+        # we won't linkify it
         is_numeric = all(char in "0123456789." for char in group_path)
 
         # if it's a valid group path and not totally numeric, convert to <a>
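And a small standalone check of the is_numeric guard described in that comment (looks_numeric is a hypothetical name for illustration):

```python
def looks_numeric(group_path: str) -> bool:
    # mirrors the check above: True if the path is only digits and/or periods
    return all(char in "0123456789." for char in group_path)

assert looks_numeric("10")  # so "~10" is left as plain text
assert looks_numeric("4.5")  # so "~4.5" is left as plain text
assert not looks_numeric("music.metal")  # a real-looking path, eligible to linkify
```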