From e3842e88c4393a2cbd3fa2524181a1b8da50ca6f Mon Sep 17 00:00:00 2001
From: Deimos <deimos@tildes.net>
Date: Fri, 10 Aug 2018 01:01:58 -0600
Subject: [PATCH] Enable cmark-gfm extensions (table, strikethrough)

An example was recently added to the github cmark repo to show how to
set up the extensions from Python, so this is heavily based on that
code:
https://github.com/github/cmark/blob/master/wrappers/wrapper_ext.py

This should also fix a memory leak, since I wasn't manually freeing the
returned buffer (as the library recommends that you do).
---
 tildes/tests/test_markdown.py | 34 ++++++++++++++++-----
 tildes/tildes/lib/cmark.py    | 57 +++++++++++++++++++++++++++++++++++
 tildes/tildes/lib/markdown.py | 40 ++++++++++++++++--------
 3 files changed, 111 insertions(+), 20 deletions(-)
 create mode 100644 tildes/tildes/lib/cmark.py
diff --git a/tildes/tests/test_markdown.py b/tildes/tests/test_markdown.py
index 1c5dbd9..f96c9ff 100644
--- a/tildes/tests/test_markdown.py
+++ b/tildes/tests/test_markdown.py
@@ -28,6 +28,32 @@ def test_basic_markdown_unescaped():
     assert '&lt;' not in sanitized
 
 
+def test_strikethrough():
+    """Check that ~text~ renders as strikethrough, not as a group link."""
+    html = convert_markdown_to_safe_html("This ~should not~ should work")
+
+    assert '<del>' in html
+    # a group link would show up as an anchor tag
+    assert '<a' not in html
+
+
+def test_table():
+    """Check that a table with aligned columns renders as an HTML table."""
+    html = convert_markdown_to_safe_html(
+        '|Header 1|Header 2|Header 3|\n'
+        '|--------|-------:|:------:|\n'
+        '|1 - 1   |1 - 2   |1 - 3   |\n'
+        '|2 - 1|2 - 2|2 - 3|\n'
+    )
+
+    assert '<table>' in html
+    assert html.count('<tr') == 3
+    assert html.count('<td') == 6
+    # the -: and :-: delimiter cells ask for right and center alignment
+    assert 'align="right"' in html
+    assert 'align="center"' in html
+
+
 def test_deliberate_ordered_list():
     """Ensure a "deliberate" ordered list works."""
     markdown = (
@@ -215,14 +241,6 @@ def test_approximately_tilde_not_linkified():
     assert '<a' not in processed
 
 
-def test_strikethrough_attempt_not_linkified():
-    """Ensure someone trying to do strikethrough doesn't get a link."""
-    markdown = "This ~should~ shouldn't work"
-    processed = convert_markdown_to_safe_html(markdown)
-
-    assert '<a' not in processed
-
-
 def test_uppercase_group_ref_links_correctly():
     """Ensure using uppercase in a group ref works but links correctly."""
     markdown = 'That was in ~Music.Metal.Progressive'
diff --git a/tildes/tildes/lib/cmark.py b/tildes/tildes/lib/cmark.py
new file mode 100644
index 0000000..4e6a8d8
--- /dev/null
+++ b/tildes/tildes/lib/cmark.py
@@ -0,0 +1,57 @@
+"""Set up the shared libcmark-gfm library and extensions.
+
+Exposes the cmark C functions via ctypes; cmark_render_html frees its buffer.
+"""
+# pylint: disable=invalid-name
+
+from ctypes import CDLL, c_char_p, c_int, c_size_t, c_void_p, string_at
+
+
+CMARK_DLL = CDLL('/usr/local/lib/libcmark-gfm.so')
+CMARK_EXT_DLL = CDLL('/usr/local/lib/libcmark-gfmextensions.so')
+
+# CMARK_OPT_HARDBREAKS from cmark.h (same as cmark's --hardbreaks option);
+# ctypes can't read C #defines, so the value is duplicated here
+CMARK_OPTS = 4
+
+CMARK_EXTENSIONS = (b'strikethrough', b'table')
+
+
+def _bind(name, restype, *argtypes, dll=CMARK_DLL):
+    """Return C function ``name`` from ``dll`` with its signature declared."""
+    func = getattr(dll, name)
+    func.restype = restype
+    func.argtypes = argtypes
+    return func
+
+
+cmark_parser_new = _bind('cmark_parser_new', c_void_p, c_int)
+cmark_parser_feed = _bind(
+    'cmark_parser_feed', None, c_void_p, c_char_p, c_size_t)
+cmark_parser_finish = _bind('cmark_parser_finish', c_void_p, c_void_p)
+cmark_parser_attach_syntax_extension = _bind(
+    'cmark_parser_attach_syntax_extension', c_int, c_void_p, c_void_p)
+cmark_parser_get_syntax_extensions = _bind(
+    'cmark_parser_get_syntax_extensions', c_void_p, c_void_p)
+cmark_parser_free = _bind('cmark_parser_free', None, c_void_p)
+cmark_node_free = _bind('cmark_node_free', None, c_void_p)
+cmark_find_syntax_extension = _bind(
+    'cmark_find_syntax_extension', c_void_p, c_char_p)
+
+# restype is c_void_p, not c_char_p: ctypes would convert a c_char_p result
+# to bytes and drop the pointer, so the malloc'd buffer could never be freed
+_render_html = _bind('cmark_render_html', c_void_p, c_void_p, c_int, c_void_p)
+_free = _bind('free', None, c_void_p, dll=CDLL(None))  # the process's free()
+
+
+def cmark_render_html(root, options, extensions):
+    """Render ``root`` to HTML bytes (None on NULL), freeing the C buffer."""
+    buf = _render_html(root, options, extensions)
+    try:
+        return string_at(buf) if buf else None
+    finally:
+        _free(buf)  # the caller owns the buffer; free(NULL) is a no-op
+
+
+register = _bind('core_extensions_ensure_registered', None, dll=CMARK_EXT_DLL)
+register()
diff --git a/tildes/tildes/lib/markdown.py b/tildes/tildes/lib/markdown.py
index 98c1c6b..e8f2006 100644
--- a/tildes/tildes/lib/markdown.py
+++ b/tildes/tildes/lib/markdown.py
@@ -1,6 +1,5 @@
 """Functions/constants related to markdown handling."""
 
-from ctypes import CDLL, c_char_p, c_long
 import re
 from typing import (
     Callable,
@@ -25,13 +24,19 @@ from html5lib.treewalkers.base import NonRecursiveTreeWalker
 from tildes.metrics import histogram_timer
 from tildes.schemas.group import is_valid_group_path
 from tildes.schemas.user import is_valid_username
-
-
-# set up the commonmark function to call into libcmark-gfm
-CMARK_DLL = CDLL('/usr/local/lib/libcmark-gfm.so')
-commonmark = CMARK_DLL.cmark_markdown_to_html  # pylint: disable=invalid-name
-commonmark.restype = c_char_p
-commonmark.argtypes = [c_char_p, c_long, c_long]
+from .cmark import (
+    CMARK_EXTENSIONS,
+    CMARK_OPTS,
+    cmark_find_syntax_extension,
+    cmark_node_free,
+    cmark_parser_attach_syntax_extension,
+    cmark_parser_feed,
+    cmark_parser_finish,
+    cmark_parser_free,
+    cmark_parser_get_syntax_extensions,
+    cmark_parser_new,
+    cmark_render_html,
+)
 
 
 HTML_TAG_WHITELIST = (
@@ -62,12 +67,15 @@ HTML_TAG_WHITELIST = (
     'tbody',
     'td',
     'th',
+    'thead',
     'tr',
     'ul',
 )
 HTML_ATTRIBUTE_WHITELIST = {
     'a': ['href', 'title'],
     'ol': ['start'],
+    'td': ['align'],
+    'th': ['align'],
 }
 PROTOCOL_WHITELIST = ('http', 'https')
 
@@ -112,11 +120,19 @@ def convert_markdown_to_safe_html(markdown: str) -> str:
 
     markdown_bytes = markdown.encode('utf8')
 
-    # enables the --hardbreaks option
-    # (can I import this? it's defined in cmark.h as CMARK_OPT_HARDBREAKS)
-    cmark_options = 4
+    parser = cmark_parser_new(CMARK_OPTS)
+    for name in CMARK_EXTENSIONS:
+        ext = cmark_find_syntax_extension(name)
+        cmark_parser_attach_syntax_extension(parser, ext)
+    exts = cmark_parser_get_syntax_extensions(parser)
+
+    cmark_parser_feed(parser, markdown_bytes, len(markdown_bytes))
+    doc = cmark_parser_finish(parser)
+
+    html_bytes = cmark_render_html(doc, CMARK_OPTS, exts)
 
-    html_bytes = commonmark(markdown_bytes, len(markdown_bytes), cmark_options)
+    cmark_parser_free(parser)
+    cmark_node_free(doc)
 
     html = html_bytes.decode('utf8')