lib/utf8-cleaner/uri_string.rb in utf8-cleaner-0.1.1 vs lib/utf8-cleaner/uri_string.rb in utf8-cleaner-0.2.0
- old
+ new
@@ -1,9 +1,14 @@
module UTF8Cleaner
+ # Cleans invalid %-encodings from URI-encoded strings.
class URIString
attr_accessor :data
+ HEX_CHARS = '0-9a-fA-F'
+ HEX_CHARS_REGEX = /[#{HEX_CHARS}]/
+ INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/
+
def initialize(data)
self.data = data
end
def cleaned
@@ -14,15 +19,10 @@
end
end
def valid?
valid_uri_encoded_utf8(data)
- rescue ArgumentError => e
- if e.message =~ /invalid byte sequence/
- return false
- end
- raise e
end
private
# Returns an array of valid URI-encoded UTF-8 characters.
@@ -36,10 +36,23 @@
if char == '%'
# Skip the next two characters, which are the encoded byte
# indicated by this %. (We'll change this later for multibyte characters.)
skip_next = 2
+ # If the next character is not a hex char, drop the percent and that character.
+ unless data[index + 1] =~ HEX_CHARS_REGEX
+ index += 2
+ next
+ end
+
+ # If the character after that is not a hex char, drop the percent and
+ # both of the following chars.
+ unless data[index + 2] =~ HEX_CHARS_REGEX
+ index += 3
+ next
+ end
+
# How long is this character?
first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
bytes = utf8_char_length_in_bytes(first_byte)
# Grab the specified number of encoded bytes
@@ -70,11 +83,17 @@
char_array
end
def valid_uri_encoded_utf8(string)
- URI.decode(string).force_encoding('UTF-8').valid_encoding?
+ URI.decode(string).force_encoding('UTF-8').valid_encoding? &&
+ string !~ INVALID_PERCENT_ENCODING_REGEX
+ rescue ArgumentError => e
+ if e.message =~ /invalid byte sequence/
+ return false
+ end
+ raise e
end
# Grab the next num_bytes URI-encoded bytes from the raw character array.
# Returns an array like ['%E2', '%9C', '%93']
def next_n_bytes_from(index, num_bytes)
@@ -108,6 +127,6 @@
4
end
end
end
-end
\ No newline at end of file
+end