>From e31d29fae766daa358ed6e6bf278e75b95a317d3 Mon Sep 17 00:00:00 2001
From: Manfred Stienstra
Date: Tue, 1 Sep 2009 20:16:11 +0200
Subject: [PATCH] Add methods for string verification and encoding cleanup code.

Signed-off-by: Michael Koziarski
---
 activesupport/lib/active_support/multibyte.rb      |   18 ++++
 .../multibyte/handlers/utf8_handler.rb             |   13 +--
 .../lib/active_support/multibyte/utils.rb          |   39 +++++++
 activesupport/test/multibyte_utils_test.rb         |  106 ++++++++++++++++++++
 4 files changed, 165 insertions(+), 11 deletions(-)
 create mode 100644 activesupport/lib/active_support/multibyte/utils.rb
 create mode 100644 activesupport/test/multibyte_utils_test.rb

diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb
index 27c0d18..f76cfba 100644
--- a/activesupport/lib/active_support/multibyte.rb
+++ b/activesupport/lib/active_support/multibyte.rb
@@ -3,7 +3,25 @@ module ActiveSupport
     DEFAULT_NORMALIZATION_FORM = :kc
     NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd]
     UNICODE_VERSION = '5.0.0'
+
+    # Regular expressions, keyed by encoding name, that each match exactly one validly encoded character
+    VALID_CHARACTER = {
+      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+      'UTF-8' => /\A(?:
+                  [\x00-\x7f] |
+                  [\xc2-\xdf] [\x80-\xbf] |
+                  \xe0 [\xa0-\xbf] [\x80-\xbf] |
+                  [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
+                  \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
+                  [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
+                  \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
+      # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
+      'Shift_JIS' => /\A(?:
+                  [\x00-\x7e \xa1-\xdf] |
+                  [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn
+    }
   end
 end
 
 require 'active_support/multibyte/chars'
+require 'active_support/multibyte/utils'
\ No newline at end of file
diff --git a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
index 
66fe47a..f95349e 100644
--- a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
+++ b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
@@ -100,16 +100,7 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
     # between little and big endian. This is not an issue in utf-8, so it must be ignored.
     UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM
 
-    # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
-    UTF8_PAT = /\A(?:
-      [\x00-\x7f] |
-      [\xc2-\xdf] [\x80-\xbf] |
-      \xe0 [\xa0-\xbf] [\x80-\xbf] |
-      [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] |
-      \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
-      [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
-      \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
-    )*\z/xn
+    UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] # matches a single character; the pattern it replaces ended in )*\z and matched whole strings
 
     # Returns a regular expression pattern that matches the passed Unicode codepoints
     def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
@@ -357,7 +348,7 @@ module ActiveSupport::Multibyte::Handlers #:nodoc:
       # Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string
       def tidy_bytes(str)
         str.split(//u).map do |c|
-          if !UTF8_PAT.match(c)
+          if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
             n = c.unpack('C')[0]
             n < 128 ? n.chr :
             n < 160 ? 
[UCD.cp1252[n] || n].pack('U') :
diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb
new file mode 100644
index 0000000..094e856
--- /dev/null
+++ b/activesupport/lib/active_support/multibyte/utils.rb
@@ -0,0 +1,39 @@
+module ActiveSupport #:nodoc:
+  module Multibyte #:nodoc:
+    # Returns the VALID_CHARACTER expression for the encoding in $KCODE, or nil when $KCODE is neither 'UTF8' nor 'SJIS'
+    def self.valid_character
+      case $KCODE
+      when 'UTF8'
+        VALID_CHARACTER['UTF-8']
+      when 'SJIS'
+        VALID_CHARACTER['Shift_JIS']
+      end
+    end
+
+    # Returns true when every character of the string matches valid_character, or when there is no expression for the current $KCODE
+    def self.verify(string)
+      if expression = valid_character
+        # Look the expression up once instead of once per character
+        string.split(//).all? { |c| expression.match(c) }
+      else
+        true
+      end
+    end
+
+    # Raises ActiveSupport::Multibyte::Handlers::EncodingError unless verify accepts the string
+    def self.verify!(string)
+      raise ActiveSupport::Multibyte::Handlers::EncodingError.new("Found characters with invalid encoding") unless verify(string)
+    end
+
+    # Returns the string without the characters that fail valid_character
+    def self.clean(string)
+      if expression = valid_character
+        # Look the expression up once instead of once per character
+        string.split(//).select { |c| expression.match(c) }.join
+      else
+        # No expression for the current $KCODE, hand the string back as is
+        string
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb
new file mode 100644
index 0000000..a4bcfc8
--- /dev/null
+++ b/activesupport/test/multibyte_utils_test.rb
@@ -0,0 +1,106 @@
+require 'abstract_unit'
+
+class MultibyteUtilsTest < Test::Unit::TestCase
+
+  def test_valid_character_returns_an_expression_for_the_current_encoding
+    with_kcode('None') do
+      assert_nil ActiveSupport::Multibyte.valid_character
+    end
+    with_kcode('UTF8') do
+      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
+    end
+    with_kcode('SJIS') do
+      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], 
ActiveSupport::Multibyte.valid_character
+    end
+  end
+
+  def test_verify_verifies_ASCII_strings_are_properly_encoded
+    with_kcode('None') do # valid_character is nil for this $KCODE, so verify accepts every example
+      examples.each do |example|
+        assert ActiveSupport::Multibyte.verify(example)
+      end
+    end
+  end
+
+  def test_verify_verifies_UTF_8_strings_are_properly_encoded
+    with_kcode('UTF8') do
+      assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
+      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
+    end
+  end
+
+  def test_verify_verifies_Shift_JIS_strings_are_properly_encoded
+    with_kcode('SJIS') do
+      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
+      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
+    end
+  end
+
+  def test_verify_bang_raises_an_exception_when_it_finds_an_invalid_character
+    with_kcode('UTF8') do
+      assert_raises(ActiveSupport::Multibyte::Handlers::EncodingError) do
+        ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
+      end
+    end
+  end
+
+  def test_verify_bang_doesnt_raise_an_exception_when_the_encoding_is_valid
+    with_kcode('UTF8') do
+      assert_nothing_raised do
+        ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
+      end
+    end
+  end
+
+  def test_clean_leaves_ASCII_strings_intact
+    with_kcode('None') do
+      [
+        'word', "\270\236\010\210\245" # octal for bytes 184, 158, 8, 136, 165: the same bytes as the 'invalid UTF-8' example
+      ].each do |string|
+        assert_equal string, ActiveSupport::Multibyte.clean(string)
+      end
+    end
+  end
+
+  def test_clean_cleans_invalid_characters_from_UTF_8_encoded_strings
+    with_kcode('UTF8') do
+      cleaned_utf8 = [8].pack('C*') # expected result: only byte 8 of the 'invalid UTF-8' example survives
+      assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
+      assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
+    end
+  end
+
+  def test_clean_cleans_invalid_characters_from_Shift_JIS_encoded_strings
+    with_kcode('SJIS') do
+      cleaned_sjis = [184, 0, 136, 165].pack('C*') # expected result: bytes 158, 8 and 255 of the 'invalid Shift-JIS' example are dropped
+      assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
+      assert_equal cleaned_sjis, 
ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+    end
+  end
+
+  private
+
+  STRINGS = {
+    'valid ASCII' => [65, 83, 67, 73, 73].pack('C*'),
+    'invalid ASCII' => [128].pack('C*'),
+    'valid UTF-8' => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
+    'invalid UTF-8' => [184, 158, 8, 136, 165].pack('C*'),
+    'valid Shift-JIS' => [131, 122, 129, 91, 131, 128].pack('C*'),
+    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
+  }
+
+  def example(key)
+    STRINGS[key]
+  end
+
+  def examples
+    STRINGS.values
+  end
+
+  def with_kcode(code) # runs the block with $KCODE set to code, then puts the previous value back
+    before, $KCODE = $KCODE, code
+    yield
+  ensure
+    $KCODE = before # restore even when the block raises, so later tests see the original $KCODE
+  end
+end
\ No newline at end of file
-- 
1.6.0.1

>From 5b8b41732f385131d4e1f1a8862d71f44dcc992d Mon Sep 17 00:00:00 2001
From: Michael Koziarski
Date: Mon, 31 Aug 2009 12:07:30 -0700
Subject: [PATCH] Clean tag attributes before passing through the escape_once logic.

Addresses CVE-2009-3009
---
 actionpack/lib/action_view/helpers/tag_helper.rb |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/actionpack/lib/action_view/helpers/tag_helper.rb b/actionpack/lib/action_view/helpers/tag_helper.rb
index 999cbfb..bde5581 100644
--- a/actionpack/lib/action_view/helpers/tag_helper.rb
+++ b/actionpack/lib/action_view/helpers/tag_helper.rb
@@ -99,7 +99,7 @@ module ActionView
       #   escape_once("&lt;&lt; Accept & Checkout")
       #   # => "&lt;&lt; Accept &amp; Checkout"
       def escape_once(html)
-        html.to_s.gsub(/[\"><]|&(?!([a-zA-Z]+|(#\d+));)/) { |special| ERB::Util::HTML_ESCAPE[special] }
+        ActiveSupport::Multibyte.clean(html.to_s).gsub(/[\"><]|&(?!([a-zA-Z]+|(#\d+));)/) { |special| ERB::Util::HTML_ESCAPE[special] }
       end
 
       private
-- 
1.6.0.1