lib/utf8-cleaner/uri_string.rb in utf8-cleaner-0.1.1 vs lib/utf8-cleaner/uri_string.rb in utf8-cleaner-0.2.0

- old
+ new

@@ -1,9 +1,14 @@ module UTF8Cleaner + # Cleans invalid %-encodings from URI-encoded strings. class URIString attr_accessor :data + HEX_CHARS = '0-9a-fA-F' + HEX_CHARS_REGEX = /[#{HEX_CHARS}]/ + INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/ + def initialize(data) self.data = data end def cleaned @@ -14,15 +19,10 @@ end end def valid? valid_uri_encoded_utf8(data) - rescue ArgumentError => e - if e.message =~ /invalid byte sequence/ - return false - end - raise e end private # Returns an array of valid URI-encoded UTF-8 characters. @@ -36,10 +36,23 @@ if char == '%' # Skip the next two characters, which are the encoded byte # indicates by this %. (We'll change this later for multibyte characters.) skip_next = 2 + # If the next character is not a hex char, drop the percent and it + unless data[index + 1] =~ HEX_CHARS_REGEX + index += 2 + next + end + + # If the character after that is not a hex char, drop the percent and + # both of the following chars. + unless data[index + 2] =~ HEX_CHARS_REGEX + index += 3 + next + end + # How long is this character? first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase bytes = utf8_char_length_in_bytes(first_byte) # Grab the specified number of encoded bytes @@ -70,11 +83,17 @@ char_array end def valid_uri_encoded_utf8(string) - URI.decode(string).force_encoding('UTF-8').valid_encoding? + URI.decode(string).force_encoding('UTF-8').valid_encoding? && + string !~ INVALID_PERCENT_ENCODING_REGEX + rescue ArgumentError => e + if e.message =~ /invalid byte sequence/ + return false + end + raise e end # Grab the next num_bytes URI-encoded bytes from the raw character array. # Returns an array like ['%E2', '%9C', '%93'] def next_n_bytes_from(index, num_bytes) @@ -108,6 +127,6 @@ 4 end end end -end \ No newline at end of file +end