Browse Source

Add use-specific HTML sanitization overrides

Includes an override for user bios that allows the rel attribute on anchor tags.
merge-requests/69/head
deing 6 years ago
committed by Deimos
parent
commit
3538913355
  1. 7
      tildes/tildes/enums.py
  2. 27
      tildes/tildes/lib/markdown.py
  3. 6
      tildes/tildes/models/user/user.py

7
tildes/tildes/enums.py

@ -141,3 +141,10 @@ class TopicType(enum.Enum):
TEXT = enum.auto()
LINK = enum.auto()
class BleachContext(enum.Enum):
"""Enum for the possible contexts of Bleach HTML sanitization."""
# Context used when the caller does not specify one; it has no attribute overrides.
DEFAULT = enum.auto()
# Context used when rendering a user's bio; additionally allows "rel" on anchor tags.
USER_BIO = enum.auto()

27
tildes/tildes/lib/markdown.py

@ -5,7 +5,7 @@
from functools import partial
import re
from typing import Any, Callable, Iterator, List, Match, Optional, Pattern, Tuple
from typing import Any, Callable, Dict, Iterator, List, Match, Optional, Pattern, Tuple
from bs4 import BeautifulSoup
import bleach
@ -16,6 +16,7 @@ from pygments.formatters import HtmlFormatter
from pygments.lexers import get_lexer_by_name, PhpLexer
from pygments.util import ClassNotFound
from tildes.enums import BleachContext
from tildes.metrics import histogram_timer
from tildes.schemas.group import is_valid_group_path
from tildes.schemas.user import is_valid_username
@ -87,7 +88,7 @@ HTML_TAG_WHITELIST = (
"tr",
"ul",
)
HTML_ATTRIBUTE_WHITELIST = {
HTML_ATTRIBUTE_WHITELIST_DEFAULT = {
"a": ["href", "title"],
"ol": ["start"],
"td": ["align"],
@ -97,6 +98,12 @@ HTML_ATTRIBUTE_WHITELIST = {
}
PROTOCOL_WHITELIST = ("http", "https", "mailto")
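# Per-context overrides for allowed attributes. An entry here replaces the default
# attribute list for that tag as a whole (the dicts are merged per tag, not per
# attribute), so an override must repeat any default attributes it wants to keep.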
HTML_ATTRIBUTE_WHITELIST_OVERRIDES: Dict[BleachContext, Dict[str, List[str]]] = {
BleachContext.DEFAULT: {},
BleachContext.USER_BIO: {"a": ["href", "title", "rel"]},
}
# Regex that finds ordered list markdown that was probably accidental - ones being
# initiated by anything except "1." at the start of a post
BAD_ORDERED_LIST_REGEX = re.compile(
@ -111,7 +118,9 @@ SUBSEQUENT_BLOCKQUOTES_REGEX = re.compile("^>([^\n]*?)\n\n(?=>)", flags=re.MULTI
@histogram_timer("markdown_processing")
def convert_markdown_to_safe_html(markdown: str) -> str:
def convert_markdown_to_safe_html(
markdown: str, context: BleachContext = BleachContext.DEFAULT
) -> str:
"""Convert markdown to sanitized HTML."""
# apply custom pre-processing to markdown
markdown = preprocess_markdown(markdown)
@ -138,7 +147,7 @@ def convert_markdown_to_safe_html(markdown: str) -> str:
html = postprocess_markdown_html(html)
# add linkification and sanitize the final HTML before returning it
return linkify_and_sanitize_html(html)
return linkify_and_sanitize_html(html, context)
def preprocess_markdown(markdown: str) -> str:
@ -450,16 +459,22 @@ class LinkifyFilter(Filter):
return [{"type": "Characters", "data": match[0]}]
def linkify_and_sanitize_html(html: str) -> str:
def linkify_and_sanitize_html(html: str, context: BleachContext) -> str:
"""Use bleach and html5lib filters to linkify and sanitize HTML."""
# list of tag names to exclude from linkification
linkify_skipped_tags = ["code", "pre"]
tildes_linkifier = partial(LinkifyFilter, skip_tags=linkify_skipped_tags)
# include overrides for the current context
attribute_whitelist = {
**HTML_ATTRIBUTE_WHITELIST_DEFAULT,
**HTML_ATTRIBUTE_WHITELIST_OVERRIDES[context],
}
cleaner = bleach.Cleaner(
tags=HTML_TAG_WHITELIST,
attributes=HTML_ATTRIBUTE_WHITELIST,
attributes=attribute_whitelist,
protocols=PROTOCOL_WHITELIST,
filters=[tildes_linkifier],
)

6
tildes/tildes/models/user/user.py

@ -32,7 +32,7 @@ from sqlalchemy.orm import deferred
from sqlalchemy.sql.expression import text
from sqlalchemy_utils import Ltree
from tildes.enums import CommentLabelOption, TopicSortOption
from tildes.enums import BleachContext, CommentLabelOption, TopicSortOption
from tildes.lib.database import ArrayOfLtree, CIText
from tildes.lib.datetime import utc_now
from tildes.lib.hash import hash_string, is_match_for_hash
@ -155,7 +155,9 @@ class User(DatabaseModel):
self._bio_markdown = new_markdown
if self._bio_markdown is not None:
self.bio_rendered_html = convert_markdown_to_safe_html(new_markdown)
self.bio_rendered_html = convert_markdown_to_safe_html(
new_markdown, BleachContext.USER_BIO
)
else:
self.bio_rendered_html = None

Loading…
Cancel
Save