uri_string.rb in utf8-cleaner-0.2.0

- old
+ new

@@ -1,9 +1,14 @@
 module UTF8Cleaner
+  # Cleans invalid %-encodings from URI-encoded strings.
   class URIString
     attr_accessor :data
 
+    HEX_CHARS = '0-9a-fA-F'
+    HEX_CHARS_REGEX = /[#{HEX_CHARS}]/
+    INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/
+
     def initialize(data)
       self.data = data
     end
 
     def cleaned
@@ -14,15 +19,10 @@
       end
     end
 
     def valid?
       valid_uri_encoded_utf8(data)
-    rescue ArgumentError => e
-      if e.message =~ /invalid byte sequence/
-        return false
-      end
-      raise e
     end
 
     private
 
     # Returns an array of valid URI-encoded UTF-8 characters.
@@ -36,10 +36,23 @@
         if char == '%'
           # Skip the next two characters, which are the encoded byte
           # indicated by this %. (We'll change this later for multibyte characters.)
           skip_next = 2
 
+          # If the next character is not a hex char, drop the percent and that character
+          unless data[index + 1] =~ HEX_CHARS_REGEX
+            index += 2
+            next
+          end
+
+          # If the character after that is not a hex char, drop the percent and
+          # both of the following chars.
+          unless data[index + 2] =~ HEX_CHARS_REGEX
+            index += 3
+            next
+          end
+
           # How long is this character?
           first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
           bytes = utf8_char_length_in_bytes(first_byte)
 
           # Grab the specified number of encoded bytes
@@ -70,11 +83,17 @@
 
       char_array
     end
 
     def valid_uri_encoded_utf8(string)
-      URI.decode(string).force_encoding('UTF-8').valid_encoding?
+      URI.decode(string).force_encoding('UTF-8').valid_encoding? &&
+        string !~ INVALID_PERCENT_ENCODING_REGEX
+    rescue ArgumentError => e
+      if e.message =~ /invalid byte sequence/
+        return false
+      end
+      raise e
     end
 
     # Grab the next num_bytes URI-encoded bytes from the raw character array.
     # Returns an array like ['%E2', '%9C', '%93']
     def next_n_bytes_from(index, num_bytes)
@@ -108,6 +127,6 @@
         4
       end
     end
 
   end
-end
\ No newline at end of file
+end