>From 9c61eb32c534c706815f80eb375012cdcf854e71 Mon Sep 17 00:00:00 2001
From: Michael Koziarski <michael@koziarski.com>
Date: Mon, 31 Aug 2009 12:20:46 -0700
Subject: [PATCH] Add verify and clean methods to ActiveSupport::Multibyte.

When accepting character input from outside of your application you can't
blindly trust that all strings are properly encoded. With these methods
you can check incoming strings and clean them up if necessary.

Signed-off-by: Michael Koziarski <michael@koziarski.com>

Conflicts:

	activesupport/lib/active_support/multibyte/chars.rb
---
 activesupport/lib/active_support/multibyte.rb      |   36 ++++-
 .../lib/active_support/multibyte/chars.rb          |   25 ++---
 .../lib/active_support/multibyte/utils.rb          |   61 +++++++++
 activesupport/test/multibyte_utils_test.rb         |  141 ++++++++++++++++++++
 4 files changed, 241 insertions(+), 22 deletions(-)
 create mode 100644 activesupport/lib/active_support/multibyte/utils.rb
 create mode 100644 activesupport/test/multibyte_utils_test.rb

diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb
index 65a96af..b6354ee 100644
--- a/activesupport/lib/active_support/multibyte.rb
+++ b/activesupport/lib/active_support/multibyte.rb
@@ -1,9 +1,5 @@
 # encoding: utf-8
 
-require 'active_support/multibyte/chars'
-require 'active_support/multibyte/exceptions'
-require 'active_support/multibyte/unicode_database'
-
 module ActiveSupport #:nodoc:
   module Multibyte
     # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more
@@ -27,7 +23,35 @@ module ActiveSupport #:nodoc:
     #
     # Example:
     #   ActiveSupport::Multibyte.proxy_class = CharsForUTF32
-    mattr_accessor :proxy_class
-    self.proxy_class = ActiveSupport::Multibyte::Chars
+    def self.proxy_class=(klass)
+      @proxy_class = klass
+    end
+
+    # Returns the current proxy class
+    def self.proxy_class
+      @proxy_class ||= ActiveSupport::Multibyte::Chars
+    end
+
+    # Regular expressions that describe valid byte sequences for a character
+    VALID_CHARACTER = {
+      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
+      'UTF-8' => /\A(?:
+                  [\x00-\x7f]                                         |
+                  [\xc2-\xdf] [\x80-\xbf]                             |
+                  \xe0        [\xa0-\xbf] [\x80-\xbf]                 |
+                  [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]                 |
+                  \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf]     |
+                  [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf]     |
+                  \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn,
+      # Quick check for valid Shift-JIS characters, disregards the odd-even pairing
+      'Shift_JIS' => /\A(?:
+                  [\x00-\x7e \xa1-\xdf]                                     |
+                  [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn
+    }
   end
 end
+
+require 'active_support/multibyte/chars'
+require 'active_support/multibyte/exceptions'
+require 'active_support/multibyte/unicode_database'
+require 'active_support/multibyte/utils'
diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb
index a00b165..5199bf9 100644
--- a/activesupport/lib/active_support/multibyte/chars.rb
+++ b/activesupport/lib/active_support/multibyte/chars.rb
@@ -73,16 +73,7 @@ module ActiveSupport #:nodoc:
       UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
       UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
 
-      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site)
-      UTF8_PAT = /\A(?:
-                     [\x00-\x7f]                                     |
-                     [\xc2-\xdf] [\x80-\xbf]                         |
-                     \xe0        [\xa0-\xbf] [\x80-\xbf]             |
-                     [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
-                     \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
-                     [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
-                     \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
-                    )*\z/xn
+      UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']
 
       attr_reader :wrapped_string
       alias to_s wrapped_string
@@ -292,23 +283,23 @@ module ActiveSupport #:nodoc:
       def rstrip
         chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, ''))
       end
-      
+
       # Strips entire range of Unicode whitespace from the left of the string.
       def lstrip
         chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, ''))
       end
-      
+
       # Strips entire range of Unicode whitespace from the right and left of the string.
       def strip
         rstrip.lstrip
       end
-      
+
       # Returns the number of codepoints in the string
       def size
         self.class.u_unpack(@wrapped_string).size
       end
       alias_method :length, :size
-      
+
       # Reverses all characters in the string.
       #
       # Example:
@@ -316,7 +307,7 @@ module ActiveSupport #:nodoc:
       def reverse
         chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*'))
       end
-      
+
       # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
       # character.
       #
@@ -617,7 +608,9 @@ module ActiveSupport #:nodoc:
         # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
         def tidy_bytes(string)
           string.split(//u).map do |c|
-            if !UTF8_PAT.match(c)
+            c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)
+
+            if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
               n = c.unpack('C')[0]
               n < 128 ? n.chr :
               n < 160 ? [UCD.cp1252[n] || n].pack('U') :
diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb
new file mode 100644
index 0000000..acef84d
--- /dev/null
+++ b/activesupport/lib/active_support/multibyte/utils.rb
@@ -0,0 +1,61 @@
+# encoding: utf-8
+
+module ActiveSupport #:nodoc:
+  module Multibyte #:nodoc:
+    if Kernel.const_defined?(:Encoding)
+      # Returns a regular expression that matches valid characters in the current encoding
+      def self.valid_character
+        VALID_CHARACTER[Encoding.default_internal.to_s]
+      end
+    else
+      def self.valid_character
+        case $KCODE
+        when 'UTF8'
+          VALID_CHARACTER['UTF-8']
+        when 'SJIS'
+          VALID_CHARACTER['Shift_JIS']
+        end
+      end
+    end
+
+    if 'string'.respond_to?(:valid_encoding?)
+      # Verifies the encoding of a string
+      def self.verify(string)
+        string.valid_encoding?
+      end
+    else
+      # Fallback for Rubies without String#valid_encoding?: checks each character
+      # against valid_character. Returns true when no pattern is available.
+      def self.verify(string)
+        expression = valid_character
+        return true unless expression
+        # Reuse the looked-up pattern instead of resolving it per character.
+        string.split(//).all? { |c| expression.match(c) }
+      end
+    end
+
+    # Verifies the encoding of the string and raises an exception when it's not valid
+    def self.verify!(string)
+      raise EncodingError.new("Found characters with invalid encoding") unless verify(string)
+    end
+
+    if 'string'.respond_to?(:force_encoding)
+      # Removes all invalid characters from the string.
+      #
+      # Note: this method is a no-op in Ruby 1.9
+      def self.clean(string)
+        string
+      end
+    else
+      # Fallback for Rubies without String#force_encoding: keeps only the characters
+      # matching valid_character. Returns the string itself when no pattern exists.
+      def self.clean(string)
+        expression = valid_character
+        return string unless expression
+
+        # Reuse the looked-up pattern instead of resolving it per character.
+        string.split(//).select { |c| expression.match(c) }.join
+      end
+    end
+  end
+end
\ No newline at end of file
diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb
new file mode 100644
index 0000000..d8ac5ff
--- /dev/null
+++ b/activesupport/test/multibyte_utils_test.rb
@@ -0,0 +1,141 @@
+# encoding: utf-8
+
+require 'abstract_unit'
+require 'multibyte_test_helpers'
+
+class MultibyteUtilsTest < ActiveSupport::TestCase
+  include MultibyteTestHelpers
+
+  test "valid_character returns an expression for the current encoding" do
+    with_encoding('None') do
+      assert_nil ActiveSupport::Multibyte.valid_character
+    end
+    with_encoding('UTF8') do
+      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character
+    end
+    with_encoding('SJIS') do
+      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character
+    end
+  end
+
+  test "verify verifies ASCII strings are properly encoded" do
+    with_encoding('None') do
+      examples.each do |example|
+        assert ActiveSupport::Multibyte.verify(example)
+      end
+    end
+  end
+
+  test "verify verifies UTF-8 strings are properly encoded" do
+    with_encoding('UTF8') do
+      assert ActiveSupport::Multibyte.verify(example('valid UTF-8'))
+      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8'))
+    end
+  end
+
+  test "verify verifies Shift-JIS strings are properly encoded" do
+    with_encoding('SJIS') do
+      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS'))
+      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS'))
+    end
+  end
+
+  test "verify! raises an exception when it finds an invalid character" do
+    with_encoding('UTF8') do
+      assert_raises(ActiveSupport::Multibyte::EncodingError) do
+        ActiveSupport::Multibyte.verify!(example('invalid UTF-8'))
+      end
+    end
+  end
+
+  test "verify! doesn't raise an exception when the encoding is valid" do
+    with_encoding('UTF8') do
+      assert_nothing_raised do
+        ActiveSupport::Multibyte.verify!(example('valid UTF-8'))
+      end
+    end
+  end
+
+  if RUBY_VERSION < '1.9'
+    test "clean leaves ASCII strings intact" do
+      with_encoding('None') do
+        [
+          'word', "\270\236\010\210\245"
+        ].each do |string|
+          assert_equal string, ActiveSupport::Multibyte.clean(string)
+        end
+      end
+    end
+
+    test "clean cleans invalid characters from UTF-8 encoded strings" do
+      with_encoding('UTF8') do
+        cleaned_utf8 = [8].pack('C*')
+        assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8'))
+        assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8'))
+      end
+    end
+
+    test "clean cleans invalid characters from Shift-JIS encoded strings" do
+      with_encoding('SJIS') do
+        cleaned_sjis = [184, 0, 136, 165].pack('C*')
+        assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS'))
+        assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+      end
+    end
+  else
+    test "clean is a no-op" do
+      with_encoding('UTF8') do
+        assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS'))
+      end
+    end
+  end
+
+  private
+
+  STRINGS = {
+    'valid ASCII'       => [65, 83, 67, 73, 73].pack('C*'),
+    'invalid ASCII'     => [128].pack('C*'),
+    'valid UTF-8'       => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'),
+    'invalid UTF-8'     => [184, 158, 8, 136, 165].pack('C*'),
+    'valid Shift-JIS'   => [131, 122, 129, 91, 131, 128].pack('C*'),
+    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*')
+  }
+
+  if Kernel.const_defined?(:Encoding)
+    def example(key)
+      STRINGS[key].force_encoding(Encoding.default_internal)
+    end
+
+    def examples
+      STRINGS.values.map { |s| s.force_encoding(Encoding.default_internal) }
+    end
+  else
+    def example(key)
+      STRINGS[key]
+    end
+
+    def examples
+      STRINGS.values
+    end
+  end
+
+  if 'string'.respond_to?(:encoding)
+    # Runs the block with Encoding.default_internal switched to the requested
+    # encoding, restoring the previous value even when the block raises.
+    def with_encoding(enc)
+      before = Encoding.default_internal
+      Encoding.default_internal =
+        case enc
+        when 'UTF8' then Encoding::UTF_8
+        when 'SJIS' then Encoding::Shift_JIS
+        else Encoding::BINARY
+        end
+      yield
+    ensure
+      # A failing assertion must not leak the encoding into later tests.
+      Encoding.default_internal = before
+    end
+  else
+    alias with_encoding with_kcode
+  end
+end
\ No newline at end of file
-- 
1.6.0.1


>From 31678df21276f0a986a4e39a69a4c10a2236a2ce Mon Sep 17 00:00:00 2001
From: Michael Koziarski <michael@koziarski.com>
Date: Mon, 31 Aug 2009 12:07:30 -0700
Subject: [PATCH] Clean tag attributes before passing through the escape_once logic.

Addresses CVE-2009-3009
---
 actionpack/lib/action_view/helpers/tag_helper.rb |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/actionpack/lib/action_view/helpers/tag_helper.rb b/actionpack/lib/action_view/helpers/tag_helper.rb
index 1c8d2db..54a9df4 100644
--- a/actionpack/lib/action_view/helpers/tag_helper.rb
+++ b/actionpack/lib/action_view/helpers/tag_helper.rb
@@ -104,7 +104,7 @@ module ActionView
       #   escape_once("&lt;&lt; Accept & Checkout")
       #   # => "&lt;&lt; Accept &amp; Checkout"
       def escape_once(html)
-        html.to_s.gsub(/[\"><]|&(?!([a-zA-Z]+|(#\d+));)/) { |special| ERB::Util::HTML_ESCAPE[special] }
+        ActiveSupport::Multibyte.clean(html.to_s).gsub(/[\"><]|&(?!([a-zA-Z]+|(#\d+));)/) { |special| ERB::Util::HTML_ESCAPE[special] }
       end
 
       private
-- 
1.6.0.1