From aa779db780ebf0f8f463d59d8fbba40f3024a7fc Mon Sep 17 00:00:00 2001 From: Justin Li Date: Thu, 23 May 2019 16:12:16 -0400 Subject: [PATCH] Add escape_html_once and escape_html_once_as_safe_html --- CHANGELOG.md | 2 ++ README.md | 2 ++ benchmark/html_escape_once.rb | 25 ++++++++++++++++++++ ext/escape_utils/escape_utils.c | 30 ++++++++++++++++++++---- ext/escape_utils/houdini.h | 4 +++- ext/escape_utils/houdini_html_e.c | 38 ++++++++++++++++++++++++++++--- test/html/escape_test.rb | 24 +++++++++++++++++++ 7 files changed, 117 insertions(+), 8 deletions(-) create mode 100644 benchmark/html_escape_once.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bd339b..35d558d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # Unreleased +- Add `EscapeUtils.escape_html_once` and `EscapeUtils.rb_eu_escape_html_once_as_html_safe` as faster implementations of Rails `escape_once` helper. + # 1.2.2 - Update EscapeUtils.escape_javascript to match Rails `escape_javascript` diff --git a/README.md b/README.md index 4d70241..192ef63 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,8 @@ or per-call by passing `false` as the second parameter to `escape_html` like `Es For more information check out: http://www.owasp.org/index.php/XSS_(Cross_Site_Scripting)_Prevention_Cheat_Sheet#RULE_.231_-_HTML_Escape_Before_Inserting_Untrusted_Data_into_HTML_Element_Content +To avoid double-escaping HTML entities, use `EscapeUtils.escape_html_once`. + #### Unescaping ``` ruby diff --git a/benchmark/html_escape_once.rb b/benchmark/html_escape_once.rb new file mode 100644 index 0000000..92d566a --- /dev/null +++ b/benchmark/html_escape_once.rb @@ -0,0 +1,25 @@ +# encoding: utf-8 + +require 'rubygems' +require 'bundler/setup' +require 'benchmark/ips' + +require 'escape_utils' +require 'active_support/core_ext/string/output_safety' + +url = "https://en.wikipedia.org/wiki/Succession_to_the_British_throne" +html = `curl -s #{url}` +html = html.force_encoding('utf-8') if html.respond_to?(:force_encoding) +puts "Escaping #{html.bytesize} bytes of html from #{url}" + +Benchmark.ips do |x| + x.report "EscapeUtils.escape_html_once" do + EscapeUtils.escape_html_once(html) + end + + x.report "ActionView escape_once" do # Rails expose it as ERB::Util.html_escape_once + ERB::Util.html_escape_once(html) + end + + x.compare!(order: :baseline) +end diff --git a/ext/escape_utils/escape_utils.c b/ext/escape_utils/escape_utils.c index ea8f000..e88114f 100644 --- a/ext/escape_utils/escape_utils.c +++ b/ext/escape_utils/escape_utils.c @@ -101,7 +101,7 @@ static VALUE new_html_safe_string(const char *ptr, size_t len) return rb_str_new_with_class(rb_html_safe_string_template_object, ptr, len); } -static VALUE rb_eu_escape_html_as_html_safe(VALUE self, VALUE str) +static VALUE rb_eu_escape_html_as_html_safe0(VALUE self, VALUE str, int escape_once) { VALUE result; int secure = g_html_secure; @@ -110,7 +110,7 @@ static VALUE rb_eu_escape_html_as_html_safe(VALUE self, VALUE str) Check_Type(str, T_STRING); check_utf8_encoding(str); - if (houdini_escape_html0(&buf, (const uint8_t *)RSTRING_PTR(str), RSTRING_LEN(str), secure)) { + if (houdini_escape_html0(&buf, (const uint8_t *)RSTRING_PTR(str), RSTRING_LEN(str), secure, escape_once)) { result = new_html_safe_string(buf.ptr, buf.size); gh_buf_free(&buf); } else { @@ -123,7 +123,17 @@ static VALUE rb_eu_escape_html_as_html_safe(VALUE self, VALUE str) return result; } -static VALUE rb_eu_escape_html(int argc, VALUE *argv, VALUE self) +static VALUE rb_eu_escape_html_as_html_safe(VALUE self, VALUE str) +{ + return rb_eu_escape_html_as_html_safe0(self, str, 0); +} + +static VALUE rb_eu_escape_html_once_as_html_safe(VALUE self, VALUE str) +{ + return rb_eu_escape_html_as_html_safe0(self, str, 1); +} + +static VALUE rb_eu_escape_html0(int argc, VALUE *argv, VALUE self, int escape_once) { VALUE str, rb_secure; gh_buf buf = GH_BUF_INIT; @@ -138,7 +148,7 @@ static VALUE rb_eu_escape_html(int argc, VALUE *argv, VALUE self) Check_Type(str, T_STRING); check_utf8_encoding(str); - if (houdini_escape_html0(&buf, (const uint8_t *)RSTRING_PTR(str), RSTRING_LEN(str), secure)) { + if (houdini_escape_html0(&buf, (const uint8_t *)RSTRING_PTR(str), RSTRING_LEN(str), secure, escape_once)) { VALUE result = eu_new_str(buf.ptr, buf.size); gh_buf_free(&buf); return result; @@ -147,6 +157,16 @@ static VALUE rb_eu_escape_html(int argc, VALUE *argv, VALUE self) return str; } +static VALUE rb_eu_escape_html(int argc, VALUE *argv, VALUE self) +{ + return rb_eu_escape_html0(argc, argv, self, 0); +} + +static VALUE rb_eu_escape_html_once(int argc, VALUE *argv, VALUE self) +{ + return rb_eu_escape_html0(argc, argv, self, 1); +} + static VALUE rb_eu_unescape_html(VALUE self, VALUE str) { return rb_eu__generic(str, &houdini_unescape_html); @@ -232,7 +252,9 @@ void Init_escape_utils() rb_mEscapeUtils = rb_define_module("EscapeUtils"); rb_define_method(rb_mEscapeUtils, "escape_html_as_html_safe", rb_eu_escape_html_as_html_safe, 1); + rb_define_method(rb_mEscapeUtils, "escape_html_once_as_html_safe", rb_eu_escape_html_once_as_html_safe, 1); rb_define_method(rb_mEscapeUtils, "escape_html", rb_eu_escape_html, -1); + rb_define_method(rb_mEscapeUtils, "escape_html_once", rb_eu_escape_html_once, -1); rb_define_method(rb_mEscapeUtils, "unescape_html", rb_eu_unescape_html, 1); rb_define_method(rb_mEscapeUtils, "escape_xml", rb_eu_escape_xml, 1); rb_define_method(rb_mEscapeUtils, "escape_javascript", rb_eu_escape_js, 1); diff --git a/ext/escape_utils/houdini.h b/ext/escape_utils/houdini.h index 7db0034..dede7bb 100644 --- a/ext/escape_utils/houdini.h +++ b/ext/escape_utils/houdini.h @@ -22,11 +22,13 @@ extern "C" { # define _isdigit(c) ((c) >= '0' && (c) <= '9') #endif +#define _isasciialpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + #define HOUDINI_ESCAPED_SIZE(x) (((x) * 12) / 10) #define HOUDINI_UNESCAPED_SIZE(x) (x) extern int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size); -extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure); +extern int houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure, int escape_once); extern int houdini_unescape_html(gh_buf *ob, const uint8_t *src, size_t size); extern int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size); extern int houdini_escape_uri(gh_buf *ob, const uint8_t *src, size_t size); diff --git a/ext/escape_utils/houdini_html_e.c b/ext/escape_utils/houdini_html_e.c index 1593371..84ea013 100644 --- a/ext/escape_utils/houdini_html_e.c +++ b/ext/escape_utils/houdini_html_e.c @@ -44,15 +44,47 @@ static const char *HTML_ESCAPES[] = { ">" }; +static int +is_entity(const uint8_t *src, size_t size) +{ + size_t i = 0; + + if (size == 0 || src[0] != '&') + return false; + + if (size > 16) + size = 16; + + if (size >= 4 && src[1] == '#') { + if (_isdigit(src[2])) { + for (i = 3; i < size && _isdigit(src[i]); ++i); + } + else if ((src[2] == 'x' || src[2] == 'X') && _isxdigit(src[3])) { + for (i = 4; i < size && _isxdigit(src[i]); ++i); + } + else return false; + } + else { + for (i = 1; i < size && _isasciialpha(src[i]); ++i); + if (i == 1) return false; + } + + return i < size && src[i] == ';'; +} + int -houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) +houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure, int escape_once) { size_t i = 0, org, esc = 0; while (i < size) { org = i; - while (i < size && (esc = HTML_ESCAPE_TABLE[src[i]]) == 0) + while (i < size) { + esc = HTML_ESCAPE_TABLE[src[i]]; + if (unlikely(esc != 0) && (!escape_once || !is_entity(src + i, size - i))) + break; i++; + } if (i > org) { if (unlikely(org == 0)) { @@ -85,6 +117,6 @@ houdini_escape_html0(gh_buf *ob, const uint8_t *src, size_t size, int secure) int houdini_escape_html(gh_buf *ob, const uint8_t *src, size_t size) { - return houdini_escape_html0(ob, src, size, 1); + return houdini_escape_html0(ob, src, size, 1, 0); } diff --git a/test/html/escape_test.rb b/test/html/escape_test.rb index 6e5ea26..7a7cdce 100644 --- a/test/html/escape_test.rb +++ b/test/html/escape_test.rb @@ -63,6 +63,30 @@ def test_returns_original_if_not_escaped assert_equal str.object_id, EscapeUtils.escape_html(str).object_id end + def test_escape_html_once + { + '&<' => '&<', + '&<&x;' => '&<&x;', + '&' => '&amp', + '&!;' => '&!;', + '�' => '�', + ' ' => ' ', + ' ' => '&#10', + '�' => '�', + '�' => '�', + 'ð' => 'ð', + 'ð' => '&#xf0', + '&#x;' => '&#x;', + 'oo;' => '&#xfoo;', + '&#;' => '&#;', + '&#foo;' => '&#foo;', + 'foo&bar' => 'foo&bar', + }.each do |(input, output)| + assert_equal output, EscapeUtils.escape_html_once(input) + assert_equal output, EscapeUtils.escape_html_once_as_html_safe(input) + end + end + def test_html_safe_escape_default_works str = EscapeUtils.escape_html_as_html_safe('foobar') assert_equal 'foobar', str