diff --git a/tildes/tests/test_markdown.py b/tildes/tests/test_markdown.py index de807e3..e11d8b3 100644 --- a/tildes/tests/test_markdown.py +++ b/tildes/tests/test_markdown.py @@ -3,7 +3,7 @@ from bs4 import BeautifulSoup -from tildes.enums import BleachContext +from tildes.enums import HTMLSanitizationContext from tildes.lib.markdown import convert_markdown_to_safe_html @@ -403,6 +403,8 @@ def test_a_rel_removed_default_context(): def test_a_rel_kept_user_bio_context(): """Ensure a rel= attr is kept on an <a> tag in the user bio context.""" markdown = 'Link' - processed = convert_markdown_to_safe_html(markdown, BleachContext.USER_BIO) + processed = convert_markdown_to_safe_html( + markdown, HTMLSanitizationContext.USER_BIO + ) assert "rel=" in processed diff --git a/tildes/tildes/enums.py b/tildes/tildes/enums.py index 9f27ec8..c1150b1 100644 --- a/tildes/tildes/enums.py +++ b/tildes/tildes/enums.py @@ -143,8 +143,7 @@ class TopicType(enum.Enum): LINK = enum.auto() -class BleachContext(enum.Enum): - """Enum for the possible contexts of Bleach HTML sanitization.""" +class HTMLSanitizationContext(enum.Enum): + """Enum for the possible contexts for HTML sanitization.""" - DEFAULT = enum.auto() USER_BIO = enum.auto() diff --git a/tildes/tildes/lib/markdown.py b/tildes/tildes/lib/markdown.py index 425fe06..8d1d8ef 100644 --- a/tildes/tildes/lib/markdown.py +++ b/tildes/tildes/lib/markdown.py @@ -5,7 +5,7 @@ from functools import partial import re -from typing import Any, Callable, Dict, Iterator, List, Match, Optional, Pattern, Tuple +from typing import Any, Callable, Iterator, List, Match, Optional, Pattern, Tuple from bs4 import BeautifulSoup import bleach @@ -16,7 +16,7 @@ from pygments.formatters import HtmlFormatter from pygments.lexers import get_lexer_by_name, PhpLexer from pygments.util import ClassNotFound -from tildes.enums import BleachContext +from tildes.enums import HTMLSanitizationContext from tildes.metrics import histogram_timer from tildes.schemas.group 
import is_valid_group_path from tildes.schemas.user import is_valid_username @@ -88,6 +88,8 @@ HTML_TAG_WHITELIST = ( "tr", "ul", ) +PROTOCOL_WHITELIST = ("http", "https", "mailto") + HTML_ATTRIBUTE_WHITELIST_DEFAULT = { "a": ["href", "title"], "ol": ["start"], @@ -96,12 +98,10 @@ HTML_ATTRIBUTE_WHITELIST_DEFAULT = { "code": allow_syntax_highlighting_classes, "span": allow_syntax_highlighting_classes, } -PROTOCOL_WHITELIST = ("http", "https", "mailto") # per-context overrides for allowed attributes -HTML_ATTRIBUTE_WHITELIST_OVERRIDES: Dict[BleachContext, Dict[str, List[str]]] = { - BleachContext.DEFAULT: {}, - BleachContext.USER_BIO: {"a": ["href", "title", "rel"]}, +HTML_ATTRIBUTE_WHITELIST_OVERRIDES = { + HTMLSanitizationContext.USER_BIO: {"a": ["href", "title", "rel"]} } # Regex that finds ordered list markdown that was probably accidental - ones being @@ -119,7 +119,7 @@ SUBSEQUENT_BLOCKQUOTES_REGEX = re.compile("^>([^\n]*?)\n\n(?=>)", flags=re.MULTI @histogram_timer("markdown_processing") def convert_markdown_to_safe_html( - markdown: str, context: BleachContext = BleachContext.DEFAULT + markdown: str, context: Optional[HTMLSanitizationContext] = None ) -> str: """Convert markdown to sanitized HTML.""" # apply custom pre-processing to markdown @@ -459,18 +459,20 @@ class LinkifyFilter(Filter): return [{"type": "Characters", "data": match[0]}] -def linkify_and_sanitize_html(html: str, context: BleachContext) -> str: +def linkify_and_sanitize_html( + html: str, context: Optional[HTMLSanitizationContext] = None +) -> str: """Use bleach and html5lib filters to linkify and sanitize HTML.""" # list of tag names to exclude from linkification linkify_skipped_tags = ["code", "pre"] tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags) - # include overrides for the current context - attribute_whitelist = { - **HTML_ATTRIBUTE_WHITELIST_DEFAULT, - **HTML_ATTRIBUTE_WHITELIST_OVERRIDES[context], - } + attribute_whitelist = 
HTML_ATTRIBUTE_WHITELIST_DEFAULT + if context: + # include overrides for the current context + overrides = HTML_ATTRIBUTE_WHITELIST_OVERRIDES.get(context, {}) + attribute_whitelist = {**attribute_whitelist, **overrides} cleaner = bleach.Cleaner( tags=HTML_TAG_WHITELIST, diff --git a/tildes/tildes/models/user/user.py b/tildes/tildes/models/user/user.py index d0cd78c..9b77e2b 100644 --- a/tildes/tildes/models/user/user.py +++ b/tildes/tildes/models/user/user.py @@ -32,7 +32,7 @@ from sqlalchemy.orm import deferred from sqlalchemy.sql.expression import text from sqlalchemy_utils import Ltree -from tildes.enums import BleachContext, CommentLabelOption, TopicSortOption +from tildes.enums import CommentLabelOption, HTMLSanitizationContext, TopicSortOption from tildes.lib.database import ArrayOfLtree, CIText from tildes.lib.datetime import utc_now from tildes.lib.hash import hash_string, is_match_for_hash @@ -156,7 +156,7 @@ class User(DatabaseModel): if self._bio_markdown is not None: self.bio_rendered_html = convert_markdown_to_safe_html( - new_markdown, BleachContext.USER_BIO + new_markdown, HTMLSanitizationContext.USER_BIO ) else: self.bio_rendered_html = None