add a new HtmlRenderer subclass with XSS protections

2017-01-13 13:02:44 +01:00 · 2017-01-13 13:02:44 +01:00 · 1aa4e1f9e9
parent 53c2b953db
commit 1aa4e1f9e9
3 changed files with 184 additions and 1 deletions
--- a/docs/index.rst
+++ b/docs/index.rst
@ -228,6 +228,10 @@ Classes
    :members:


+.. autoclass:: SaferHtmlRenderer
+    :members:
+
+
 .. autoclass:: HtmlTocRenderer
    :members:

--- a/misaka/api.py
+++ b/misaka/api.py
@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-

+import re
+
 from ._hoedown import lib, ffi
 from .callbacks import python_callbacks, to_string
 from .constants import *
@ -15,6 +17,7 @@ __all__ = [
    'BaseRenderer',
    'HtmlRenderer',
    'HtmlTocRenderer',
+    'SaferHtmlRenderer',

    'args_to_int',
    'extension_map',
@ -252,3 +255,91 @@ class HtmlTocRenderer(HtmlRenderer):

    def _new_renderer(self, flags, nesting_level):
        # Only `nesting_level` is forwarded to hoedown here; `flags` is not used.
        return lib.hoedown_html_toc_renderer_new(nesting_level)
+
+
+class SaferHtmlRenderer(HtmlRenderer):
+    """
+    A subclass of :class:`HtmlRenderer` which adds protections against
+    Cross-Site Scripting (XSS):
+
+    1. The ``'skip-html'`` flag is turned on by default, preventing injection of
+       HTML elements. If you want to escape HTML code instead of removing it
+       entirely, change ``sanitization_mode`` to ``'escape'``.
+    2. The URLs of links and images are filtered to prevent JavaScript injection.
+       See the :meth:`check_link` method below.
+    3. Optionally, the URLs can also be rewritten to counter other attacks such
+       as phishing. See the :meth:`rewrite_link` method below.
+
+    ``flags`` must be a tuple of flag names; ``sanitization_mode`` is appended
+    to it before both are handed to :class:`HtmlRenderer` together with
+    ``nesting_level``.
+    """
+    # Whitelist of URL schemes: only ``http:`` and ``https:`` (case-insensitive).
+    _allowed_url_re = re.compile(r'^https?:', re.I)
+
+    def __init__(self, flags=(), sanitization_mode='skip-html', nesting_level=0):
+        # A tuple is required because the sanitization flag is appended with `+`.
+        if not isinstance(flags, tuple):
+            raise TypeError("`flags` should be a tuple of strings")
+        HtmlRenderer.__init__(self, flags + (sanitization_mode,), nesting_level)
+
+    def autolink(self, raw_link, is_email):
+        """
+        Filters links generated by the ``autolink`` extension.
+
+        The raw link is passed to :meth:`check_link` as is, i.e. before the
+        ``mailto:`` prefix is added to email addresses. A rejected link is
+        rendered as escaped text instead of an ``<a>`` element.
+        """
+        if self.check_link(raw_link):
+            link = self.rewrite_link(('mailto:' if is_email else '') + raw_link)
+            link = escape_html(link)
+            return '<a href="%s">%s</a>' % (link, escape_html(raw_link))
+        else:
+            return escape_html('<%s>' % raw_link)
+
+    def image(self, raw_link, title='', alt=''):
+        """
+        Filters the ``src`` attribute of an image.
+
+        Note that filtering the source URL of an ``<img>`` tag is only a very
+        basic protection, and it's mostly useless in modern browsers (they block
+        JavaScript in there by default). An example of attack that filtering
+        does not thwart is phishing based on HTTP Auth, see `this issue
+        <https://github.com/liberapay/liberapay.com/issues/504>`_ for details.
+
+        To mitigate this issue you should only allow images from trusted services,
+        for example your own image store, or a proxy (see :meth:`rewrite_link`).
+        """
+        # Tell check_link() that this URL is an image source, exactly as is
+        # done for rewrite_link() below, so overrides can treat images apart.
+        if self.check_link(raw_link, is_image_src=True):
+            link = self.rewrite_link(raw_link, is_image_src=True)
+            maybe_alt = ' alt="%s"' % escape_html(alt) if alt else ''
+            maybe_title = ' title="%s"' % escape_html(title) if title else ''
+            link = escape_html(link)
+            return '<img src="%s"%s%s />' % (link, maybe_alt, maybe_title)
+        else:
+            return escape_html("![%s](%s)" % (alt, raw_link))
+
+    def link(self, content, raw_link, title=''):
+        """
+        Filters links.
+
+        An accepted link is rendered as an ``<a>`` element whose ``href`` and
+        ``title`` are escaped; a rejected one is rendered as escaped Markdown
+        source text.
+        """
+        if self.check_link(raw_link):
+            link = self.rewrite_link(raw_link)
+            maybe_title = ' title="%s"' % escape_html(title) if title else ''
+            link = escape_html(link)
+            # `content` is inserted unescaped (it appears to be already-rendered
+            # HTML). It is passed as a format argument, not concatenated into
+            # the format string, so a `%` in the link text cannot be
+            # interpreted as a format directive.
+            return '<a href="%s"%s>%s</a>' % (link, maybe_title, content)
+        else:
+            return escape_html("[%s](%s)" % (content, raw_link))
+
+    def check_link(self, link, is_image_src=False):
+        """
+        This method is used to check a URL.
+
+        Returns :obj:`True` if the URL is "safe", :obj:`False` otherwise.
+
+        ``is_image_src`` is :obj:`True` when the URL is the source of an image.
+        The default implementation ignores it.
+
+        The default implementation only allows HTTP and HTTPS links. Using a
+        blacklist approach is not recommended, see the
+        `OWASP XSS Filter Evasion Cheat Sheet
+        <https://www.owasp.org/index.php/XSS_Filter_Evasion_Cheat_Sheet>`_ for
+        an illustration of why.
+        """
+        return bool(self._allowed_url_re.match(link))
+
+    def rewrite_link(self, link, is_image_src=False):
+        """
+        This method is called to rewrite URLs.
+
+        ``is_image_src`` is :obj:`True` when the URL is the source of an image.
+
+        The default implementation simply returns the given link.
+        """
+        return link
--- a/tests/test_xss_protection.py
+++ b/tests/test_xss_protection.py
@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-

 from chibitest import TestCase, ok
-from misaka import escape_html
+from misaka import escape_html, Markdown, SaferHtmlRenderer


 class EscapeHtmlTest(TestCase):
@ -10,3 +10,91 @@ class EscapeHtmlTest(TestCase):

    def test_escape_html_slash(self):
        # With True as second argument the expected output also has '/'
        # escaped (as &#47;) -- presumably the escape-slash switch; confirm
        # against escape_html's signature.
        ok(escape_html('a&<>"\'/', True)) == 'a&amp;&lt;&gt;&quot;&#39;&#47;'
+
+
+# Shared Markdown callable using SaferHtmlRenderer with its default arguments.
+render = Markdown(SaferHtmlRenderer())
+# Same, but built with sanitization_mode='escape' instead of the default.
+render_escape = Markdown(SaferHtmlRenderer(sanitization_mode='escape'))
+
+
+class SaferHtmlRendererTest(TestCase):
+    """
+    Checks the output of :class:`SaferHtmlRenderer` through the module-level
+    ``render`` and ``render_escape`` callables.
+    """
+
+    def test_html_skip(self):
+        actual = render('Example <script>alert(1);</script>')
+        expected = '<p>Example alert(1);</p>\n'
+        ok(actual).diff(expected)
+
+        # Nested tags must not recombine into a new tag once the inner one
+        # has been removed.
+        html = render('<sc<script>ript>xss</sc</script>ript>')
+        ok(html).not_contains('<sc')
+        ok(html).not_contains('ript>')
+
+        actual = render('<span><a href="javascript:xss">foo</a></span>')
+        expected = '<p>foo</p>\n'
+        ok(actual).diff(expected)
+
+    def test_html_escape(self):
+        supplied = 'Example <script>alert(1);</script>'
+        expected = '<p>%s</p>\n' % escape_html(supplied)
+        ok(render_escape(supplied)).diff(expected)
+
+        html = render_escape('<sc<script>ript>xss</sc</script>ript>')
+        ok(html).not_contains('<sc')
+        ok(html).not_contains('ript>')
+
+        supplied = '<span><a href="javascript:xss">foo</a></span>'
+        expected = '<p>%s</p>\n' % escape_html(supplied)
+        ok(render_escape(supplied)).diff(expected)
+
+    def test_autolink_filtering_with_nice_data(self):
+        for url in ('http://a', "https://b?x&y"):
+            actual = render('<%s>' % url)
+            expected = '<p><a href="{0}">{0}</a></p>\n'.format(escape_html(url))
+            ok(actual).diff(expected)
+
+        # An email address is expected to come out as escaped text, not as
+        # a link.
+        supplied = "<alice@example.net>"
+        expected = '<p>%s</p>\n' % escape_html(supplied)
+        ok(render_escape(supplied)).diff(expected)
+
+    def test_autolink_filtering_with_naughty_data(self):
+        actual = render('<javascript:foo>')
+        expected = '<p>&lt;javascript:foo&gt;</p>\n'
+        ok(actual).diff(expected)
+
+        # Encode every character as a hexadecimal character reference
+        # (``&#x6a;`` etc.); the ``#`` is required for the reference to be valid.
+        link = 'javascript:0'
+        encoded_link = ''.join('&#x{0:x};'.format(ord(c)) for c in link)
+        html = render('<%s>' % encoded_link)
+        ok(html).not_contains(link)
+
+    def test_link_filtering_with_nice_data(self):
+        for url in ('http://a', 'https://b'):
+            actual = render("['foo](%s \"bar'\")" % url)
+            expected = '<p><a href="{0}" title="bar&#39;">&#39;foo</a></p>\n'.format(url)
+            ok(actual).diff(expected)
+
+    def test_link_filtering_with_naughty_data(self):
+        supplied = '[foo](javascript:xss)'
+        expected = '<p>%s</p>\n' % escape_html(supplied)
+        ok(render(supplied)).diff(expected)
+
+        # Rebind `supplied` so that the unknown scheme is what actually gets
+        # rendered and compared.
+        supplied = '[foo](unknown:bar)'
+        expected = '<p>%s</p>\n' % escape_html(supplied)
+        ok(render(supplied)).diff(expected)
+
+        html = render('[" xss><xss>]("><xss>)')
+        ok(html).not_contains('<xss>')
+        ok(html).not_contains('" xss')
+        html = render('[" xss><xss>](https:"><xss>)')
+        ok(html).not_contains('<xss>')
+        ok(html).not_contains('" xss')
+
+    def test_image_src_filtering_with_nice_data(self):
+        actual = render('![](http:"foo")')
+        expected = '<p><img src="http:&quot;foo&quot;" /></p>\n'
+        ok(actual).diff(expected)
+
+        actual = render('!["bar"](https://example.org/ "\'title\'")')
+        expected = '<p><img src="https://example.org/" alt="&quot;bar&quot;" title="&#39;title&#39;" /></p>\n'
+        ok(actual).diff(expected)
+
+    def test_image_src_filtering_with_naughty_data(self):
+        actual = render('![foo](javascript:foo)')
+        expected = '<p>![foo](javascript:foo)</p>\n'
+        ok(actual).diff(expected)