|
@@ -295,6 +295,10 @@ class LinkifyFilter(Filter):
     # carefully later.
     USERNAME_REFERENCE_REGEX = re.compile(r"(?<![\w\\])(?:/?u/|@)([\w-]+)\b")
 
+    # Regex that finds probable references to subreddits. Matches with or without the
+    # preceding slash (e.g. either of "r/emacs" or "/r/emacs").
+    SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")
+
     def __init__(
         self, source: NonRecursiveTreeWalker, skip_tags: Optional[List[str]] = None
     ):
|
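A quick illustration (not part of the patch) of what the new regex does and
doesn't match - the (?<!\w) lookbehind is what keeps it from firing inside
path-like text:

import re

SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")

assert SUBREDDIT_REFERENCE_REGEX.search("try r/emacs")[1] == "emacs"
assert SUBREDDIT_REFERENCE_REGEX.search("see /r/emacs")[1] == "emacs"
# "r/" preceded by a word character is not treated as a reference
assert SUBREDDIT_REFERENCE_REGEX.search("water/ice") is None
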
@@ -334,7 +338,7 @@ class LinkifyFilter(Filter):
                 # text token not inside a skipped tag - do the actual linkification
                 # replacements
 
-                # Note: doing the two replacements "iteratively" like this only works
+                # Note: doing the replacements "iteratively" like this only works
                 # because they are "disjoint" and we know they're not competing to
                 # replace the same text. If more replacements are added in the future
                 # that might conflict with each other, this will need to be reworked
|
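The "disjoint" property this comment leans on is easy to spot-check: the
username pattern needs a "u/" or "@" prefix and the subreddit pattern an "r/"
prefix, so they can never claim overlapping spans of the same text. A quick
sanity check (illustrative only, using the two regexes from this patch):

import re

USERNAME_REFERENCE_REGEX = re.compile(r"(?<![\w\\])(?:/?u/|@)([\w-]+)\b")
SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")

text = "cc @alice, see r/emacs and /u/bob"
user_spans = [m.span() for m in USERNAME_REFERENCE_REGEX.finditer(text)]
sub_spans = [m.span() for m in SUBREDDIT_REFERENCE_REGEX.finditer(text)]

# no username match overlaps a subreddit match, so the passes below can
# run in either order and produce the same result
assert all(u[1] <= s[0] or s[1] <= u[0] for u in user_spans for s in sub_spans)
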
@@ -344,12 +348,19 @@ class LinkifyFilter(Filter):
                     filter_regex=self.GROUP_REFERENCE_REGEX,
                     linkify_function=self._tokenize_group_match,
                 )
 
                 replaced_tokens = self._linkify_tokens(
                     replaced_tokens,
                     filter_regex=self.USERNAME_REFERENCE_REGEX,
                     linkify_function=self._tokenize_username_match,
                 )
+
+                replaced_tokens = self._linkify_tokens(
+                    replaced_tokens,
+                    filter_regex=self.SUBREDDIT_REFERENCE_REGEX,
+                    linkify_function=self._tokenize_subreddit_match,
+                )
+
                 # yield all the tokens returned from the replacement process (will be
                 # just the original token if nothing was replaced)
                 for new_token in replaced_tokens:
|
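For context, _linkify_tokens itself is not shown in this diff. The sketch
below is a simplified stand-in for it, assuming html5lib-style token dicts:
each pass rewrites only "Characters" tokens and splices in whatever the
linkify function returns. Note that a later pass still sees the Characters
token inside an <a> produced by an earlier pass, which is exactly why the
patterns have to stay disjoint.

import re
from typing import Callable, List, Match

SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")

def tokenize_subreddit(match: Match) -> List[dict]:
    # same token shape as _tokenize_subreddit_match in the next hunk
    return [
        {"type": "StartTag", "name": "a",
         "data": {(None, "href"): f"https://www.reddit.com/r/{match[1]}/"}},
        {"type": "Characters", "data": match[0]},
        {"type": "EndTag", "name": "a"},
    ]

def linkify_pass(
    tokens: List[dict],
    regex: re.Pattern,
    tokenize: Callable[[Match], List[dict]],
) -> List[dict]:
    """Simplified stand-in for _linkify_tokens: rewrite Characters tokens only."""
    out: List[dict] = []
    for token in tokens:
        if token["type"] != "Characters":
            out.append(token)
            continue
        text, pos = token["data"], 0
        for m in regex.finditer(text):
            if m.start() > pos:  # keep plain text before the match
                out.append({"type": "Characters", "data": text[pos:m.start()]})
            out.extend(tokenize(m))  # splice in the link tokens
            pos = m.end()
        if pos < len(text):  # and plain text after the last match
            out.append({"type": "Characters", "data": text[pos:]})
    return out

tokens = [{"type": "Characters", "data": "see r/emacs for tips"}]
tokens = linkify_pass(tokens, SUBREDDIT_REFERENCE_REGEX, tokenize_subreddit)
# -> Characters "see ", StartTag a, Characters "r/emacs", EndTag a,
#    Characters " for tips"
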
@@ -463,6 +474,19 @@ class LinkifyFilter(Filter):
         # the username wasn't valid, so just keep it as the original text
         return [{"type": "Characters", "data": match[0]}]
 
+    @staticmethod
+    def _tokenize_subreddit_match(match: Match) -> List[dict]:
+        """Convert a subreddit reference into HTML tokens."""
+        return [
+            {
+                "type": "StartTag",
+                "name": "a",
+                "data": {(None, "href"): f"https://www.reddit.com/r/{match[1]}/"},
+            },
+            {"type": "Characters", "data": match[0]},
+            {"type": "EndTag", "name": "a"},
+        ]
+
 
 def linkify_and_sanitize_html(
     html: str, context: Optional[HTMLSanitizationContext] = None
|
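One note on the token format: the (None, "href") key follows html5lib's
treewalker convention of keying attributes by (namespace, name) tuples. For
a match on "/r/emacs", match[0] supplies the link text exactly as the user
typed it (slash included) and match[1] the name used in the href, so the
three tokens serialize to:

<a href="https://www.reddit.com/r/emacs/">/r/emacs</a>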
|