From 353891335573e4fdde770a998241ed25af8be8f2 Mon Sep 17 00:00:00 2001 From: deing Date: Sat, 4 May 2019 22:19:12 +0200 Subject: [PATCH] Add use-specific HTML sanitization overrides Includes an override for user bios allowing rel attributes in anchor tags --- tildes/tildes/enums.py | 7 +++++++ tildes/tildes/lib/markdown.py | 27 +++++++++++++++++++++------ tildes/tildes/models/user/user.py | 6 ++++-- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/tildes/tildes/enums.py b/tildes/tildes/enums.py index a46715e..9f27ec8 100644 --- a/tildes/tildes/enums.py +++ b/tildes/tildes/enums.py @@ -141,3 +141,10 @@ class TopicType(enum.Enum): TEXT = enum.auto() LINK = enum.auto() + + +class BleachContext(enum.Enum): + """Enum for the possible contexts of Bleach HTML sanitization.""" + + DEFAULT = enum.auto() + USER_BIO = enum.auto() diff --git a/tildes/tildes/lib/markdown.py b/tildes/tildes/lib/markdown.py index d9cce60..425fe06 100644 --- a/tildes/tildes/lib/markdown.py +++ b/tildes/tildes/lib/markdown.py @@ -5,7 +5,7 @@ from functools import partial import re -from typing import Any, Callable, Iterator, List, Match, Optional, Pattern, Tuple +from typing import Any, Callable, Dict, Iterator, List, Match, Optional, Pattern, Tuple from bs4 import BeautifulSoup import bleach @@ -16,6 +16,7 @@ from pygments.formatters import HtmlFormatter from pygments.lexers import get_lexer_by_name, PhpLexer from pygments.util import ClassNotFound +from tildes.enums import BleachContext from tildes.metrics import histogram_timer from tildes.schemas.group import is_valid_group_path from tildes.schemas.user import is_valid_username @@ -87,7 +88,7 @@ HTML_TAG_WHITELIST = ( "tr", "ul", ) -HTML_ATTRIBUTE_WHITELIST = { +HTML_ATTRIBUTE_WHITELIST_DEFAULT = { "a": ["href", "title"], "ol": ["start"], "td": ["align"], @@ -97,6 +98,12 @@ HTML_ATTRIBUTE_WHITELIST = { } PROTOCOL_WHITELIST = ("http", "https", "mailto") +# per-context overrides for allowed attributes +HTML_ATTRIBUTE_WHITELIST_OVERRIDES: Dict[BleachContext, Dict[str, List[str]]] = { + BleachContext.DEFAULT: {}, + BleachContext.USER_BIO: {"a": ["href", "title", "rel"]}, +} + # Regex that finds ordered list markdown that was probably accidental - ones being # initiated by anything except "1." at the start of a post BAD_ORDERED_LIST_REGEX = re.compile( @@ -111,7 +118,9 @@ SUBSEQUENT_BLOCKQUOTES_REGEX = re.compile("^>([^\n]*?)\n\n(?=>)", flags=re.MULTI @histogram_timer("markdown_processing") -def convert_markdown_to_safe_html(markdown: str) -> str: +def convert_markdown_to_safe_html( + markdown: str, context: BleachContext = BleachContext.DEFAULT +) -> str: """Convert markdown to sanitized HTML.""" # apply custom pre-processing to markdown markdown = preprocess_markdown(markdown) @@ -138,7 +147,7 @@ def convert_markdown_to_safe_html(markdown: str) -> str: html = postprocess_markdown_html(html) # add linkification and sanitize the final HTML before returning it - return linkify_and_sanitize_html(html) + return linkify_and_sanitize_html(html, context) def preprocess_markdown(markdown: str) -> str: @@ -450,16 +459,22 @@ class LinkifyFilter(Filter): return [{"type": "Characters", "data": match[0]}] -def linkify_and_sanitize_html(html: str) -> str: +def linkify_and_sanitize_html(html: str, context: BleachContext) -> str: """Use bleach and html5lib filters to linkify and sanitize HTML.""" # list of tag names to exclude from linkification linkify_skipped_tags = ["code", "pre"] tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags) + # include overrides for the current context + attribute_whitelist = { + **HTML_ATTRIBUTE_WHITELIST_DEFAULT, + **HTML_ATTRIBUTE_WHITELIST_OVERRIDES[context], + } + cleaner = bleach.Cleaner( tags=HTML_TAG_WHITELIST, - attributes=HTML_ATTRIBUTE_WHITELIST, + attributes=attribute_whitelist, protocols=PROTOCOL_WHITELIST, filters=[tildes_linkifier], ) diff --git a/tildes/tildes/models/user/user.py b/tildes/tildes/models/user/user.py index f3acfd3..d0cd78c 100644 --- a/tildes/tildes/models/user/user.py +++ b/tildes/tildes/models/user/user.py @@ -32,7 +32,7 @@ from sqlalchemy.orm import deferred from sqlalchemy.sql.expression import text from sqlalchemy_utils import Ltree -from tildes.enums import CommentLabelOption, TopicSortOption +from tildes.enums import BleachContext, CommentLabelOption, TopicSortOption from tildes.lib.database import ArrayOfLtree, CIText from tildes.lib.datetime import utc_now from tildes.lib.hash import hash_string, is_match_for_hash @@ -155,7 +155,9 @@ class User(DatabaseModel): self._bio_markdown = new_markdown if self._bio_markdown is not None: - self.bio_rendered_html = convert_markdown_to_safe_html(new_markdown) + self.bio_rendered_html = convert_markdown_to_safe_html( + new_markdown, BleachContext.USER_BIO + ) else: self.bio_rendered_html = None