
Markdown: Automatically link subreddit references

merge-requests/88/head
yabai authored 5 years ago, committed by Deimos
Parent commit: eb7f2a75fd

Files changed:
  1. tildes/tests/test_markdown.py (71 lines changed)
  2. tildes/tildes/lib/markdown.py (26 lines changed)

tildes/tests/test_markdown.py (71 lines changed)

@@ -331,6 +331,77 @@ def test_group_ref_inside_other_tags_linkified():
     assert soup.find("a", href="/~group.reference")
 
 
+def test_subreddit_without_leading_forward_slash_linkified():
+    """Ensure a subreddit reference without a leading forward slash is linkified."""
+    markdown = "Check out: r/antarctica"
+    processed = convert_markdown_to_safe_html(markdown)
+
+    soup = BeautifulSoup(processed, features="html5lib")
+    assert soup.find("a", href="https://www.reddit.com/r/antarctica/")
+
+
+def test_subreddit_with_leading_forward_slash_linkified():
+    """Ensure a subreddit reference with a leading forward slash is linkified."""
+    markdown = "Check out: /r/antarctica"
+    processed = convert_markdown_to_safe_html(markdown)
+
+    soup = BeautifulSoup(processed, features="html5lib")
+    assert soup.find("a", href="https://www.reddit.com/r/antarctica/")
+
+
+def test_subreddit_linkified_without_punctuation():
+    """Ensure a subreddit reference is linkified without trailing punctuation."""
+    markdown = "Check out: /r/antarctica!"
+    processed = convert_markdown_to_safe_html(markdown)
+
+    soup = BeautifulSoup(processed, features="html5lib")
+    assert soup.find("a", href="https://www.reddit.com/r/antarctica/")
+
+
+def test_multiple_subreddits_linkify():
+    """Ensure multiple subreddit references all linkify."""
+    markdown = (
+        "Here are a couple of my favorite subreddits:\n\n"
+        "* r/antarctica\n"
+        "* /r/emacs\n"
+    )
+    processed = convert_markdown_to_safe_html(markdown)
+
+    soup = BeautifulSoup(processed, features="html5lib")
+    assert len(soup.find_all("a")) == 2
+
+
+def test_subreddit_inside_pre_ignored():
+    """Ensure a subreddit reference inside a <pre> tag doesn't get linked."""
+    markdown = (
+        "```\n"
+        "# This is a code block\n"
+        "# I found this code on r/python, hopefully it works\n"
+        "```\n"
+    )
+    processed = convert_markdown_to_safe_html(markdown)
+
+    assert "<a" not in processed
+
+
+def test_subreddit_lookalike_conjunction_not_linkified():
+    """Ensure text using a forward slash as a conjunction doesn't linkify."""
+    markdown = "water/ocean"
+    processed = convert_markdown_to_safe_html(markdown)
+
+    soup = BeautifulSoup(processed, features="html5lib")
+    assert len(soup.find_all("a")) == 0
+
+
+def test_subreddit_followed_by_apostrophe_not_linkified():
+    """Ensure an apostrophe after a subreddit reference isn't included in the link."""
+    markdown = "/r/funny's moderators"
+    processed = convert_markdown_to_safe_html(markdown)
+
+    soup = BeautifulSoup(processed, features="html5lib")
+    assert soup.find("a", href="https://www.reddit.com/r/funny/")
+
+
 def test_username_reference_linkified():
     """Ensure a basic username reference gets linkified."""
     markdown = "Hey @SomeUser, what do you think of this?"
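For a quick manual check of the behavior these tests pin down, the same entry point they import can be called directly. A minimal sketch, assuming a working Tildes development environment where tildes.lib.markdown is importable:

```
from tildes.lib.markdown import convert_markdown_to_safe_html

html = convert_markdown_to_safe_html("Check out r/antarctica and /r/emacs")
print(html)
# Per the tests above, both references should come back wrapped in links of
# the form <a href="https://www.reddit.com/r/...">...</a>.
```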

tildes/tildes/lib/markdown.py (26 lines changed)

@@ -295,6 +295,10 @@ class LinkifyFilter(Filter):
     # carefully later.
     USERNAME_REFERENCE_REGEX = re.compile(r"(?<![\w\\])(?:/?u/|@)([\w-]+)\b")
 
+    # Regex that finds probable references to subreddits. Matches with or without
+    # the preceding slash (e.g. either of "r/emacs" or "/r/emacs").
+    SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")
+
     def __init__(
         self, source: NonRecursiveTreeWalker, skip_tags: Optional[List[str]] = None
     ):
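The new pattern can be sanity-checked in isolation. A small sketch (the regex is copied verbatim from the diff above; the expected results mirror the test cases):

```
import re

SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")

for text in ["r/emacs", "/r/emacs", "water/ocean", "/r/funny's moderators"]:
    match = SUBREDDIT_REFERENCE_REGEX.search(text)
    print(text, "->", match[0] if match else None)

# r/emacs -> r/emacs
# /r/emacs -> /r/emacs
# water/ocean -> None            ((?<!\w) rejects the 'r' inside "water")
# /r/funny's moderators -> /r/funny    (\b stops the match before the apostrophe)
```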
@@ -334,7 +338,7 @@ class LinkifyFilter(Filter):
             # text token not inside a skipped tag - do the actual linkification
             # replacements
 
-            # Note: doing the two replacements "iteratively" like this only works
+            # Note: doing the replacements "iteratively" like this only works
             # because they are "disjoint" and we know they're not competing to
             # replace the same text. If more replacements are added in the future
             # that might conflict with each other, this will need to be reworked
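The "disjoint" property is easy to spot-check for the two patterns visible in this diff (GROUP_REFERENCE_REGEX isn't shown here, so this sketch leaves it out): username references must start with u/, /u/, or @, while subreddit references must start with r/ or /r/, so their matches can never claim the same text.

```
import re

# Both patterns copied from this file as shown above.
USERNAME_REFERENCE_REGEX = re.compile(r"(?<![\w\\])(?:/?u/|@)([\w-]+)\b")
SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")

text = "ask @someone on /r/emacs or u/somebody about r/python"
for regex in (USERNAME_REFERENCE_REGEX, SUBREDDIT_REFERENCE_REGEX):
    print([match.span() for match in regex.finditer(text)])
# The two lists of spans never overlap, which is why applying the
# replacement passes one after another is safe.
```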
@@ -344,12 +348,19 @@ class LinkifyFilter(Filter):
                 filter_regex=self.GROUP_REFERENCE_REGEX,
                 linkify_function=self._tokenize_group_match,
             )
+
             replaced_tokens = self._linkify_tokens(
                 replaced_tokens,
                 filter_regex=self.USERNAME_REFERENCE_REGEX,
                 linkify_function=self._tokenize_username_match,
             )
+
+            replaced_tokens = self._linkify_tokens(
+                replaced_tokens,
+                filter_regex=self.SUBREDDIT_REFERENCE_REGEX,
+                linkify_function=self._tokenize_subreddit_match,
+            )
 
             # yield all the tokens returned from the replacement process (will be
             # just the original token if nothing was replaced)
             for new_token in replaced_tokens:
@@ -463,6 +474,19 @@ class LinkifyFilter(Filter):
         # the username wasn't valid, so just keep it as the original text
         return [{"type": "Characters", "data": match[0]}]
 
+    @staticmethod
+    def _tokenize_subreddit_match(match: Match) -> List[dict]:
+        """Convert a subreddit reference into HTML tokens."""
+        return [
+            {
+                "type": "StartTag",
+                "name": "a",
+                "data": {(None, "href"): f"https://www.reddit.com/r/{match[1]}/"},
+            },
+            {"type": "Characters", "data": match[0]},
+            {"type": "EndTag", "name": "a"},
+        ]
+
 
 def linkify_and_sanitize_html(
     html: str, context: Optional[HTMLSanitizationContext] = None
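These dicts are html5lib treewalker tokens, where the (None, "href") key follows html5lib's (namespace, attribute-name) convention for tag attributes. A standalone sketch of the token stream the method builds for one match (reimplemented here just to show the output shape, not the actual Tildes code path):

```
import re

SUBREDDIT_REFERENCE_REGEX = re.compile(r"(?<!\w)/?r/(\w+)\b")
match = SUBREDDIT_REFERENCE_REGEX.search("see /r/emacs for editors")

tokens = [
    {
        "type": "StartTag",
        "name": "a",
        "data": {(None, "href"): f"https://www.reddit.com/r/{match[1]}/"},
    },
    {"type": "Characters", "data": match[0]},
    {"type": "EndTag", "name": "a"},
]
print(tokens)
# Serialized back to HTML, this stream corresponds to:
#   <a href="https://www.reddit.com/r/emacs/">/r/emacs</a>
```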
