module UTF8Cleaner
  class URIString
    attr_accessor :data

    def initialize(data)
      self.data = data
    end

    def cleaned
      if valid?
        data
      else
        encoded_char_array.join
      end
    end

    def encoded?
      data.include?('%')
    end

    def valid?
      valid_uri_encoded_utf8(data)
    end

    private

    # Returns an array of valid URI-encoded UTF-8 characters.
    def encoded_char_array
      char_array = []
      index = 0

      while (index < data.length) do
        char = data[index]

        if char == '%'
          # Skip the next two characters, which are the encoded byte
          # indicates by this %. (We'll change this later for multibyte characters.)
          skip_next = 2

          # How long is this character?
          first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
          bytes = utf8_char_length_in_bytes(first_byte)

          # Grab the specified number of encoded bytes
          utf8_char_encoded_bytes = next_n_bytes_from(index, bytes)

          # Did we get the right number of bytes?
          if utf8_char_encoded_bytes.length == bytes

            # We did. Is it a valid character?
            utf8_char_encoded = utf8_char_encoded_bytes.join

            if valid_uri_encoded_utf8(utf8_char_encoded)
              # It's valid!
              char_array << utf8_char_encoded

              # If we're dealing with a multibyte character, skip more than two
              # of the next characters, which have already been processed.
              skip_next = bytes * 3 - 1
            end
          end
          index += skip_next
        else
          # This was not an encoded character, so just add it and move to the next.
          char_array << char
        end
        index += 1
      end

      char_array
    end

    def valid_uri_encoded_utf8(string)
      URI.decode(string).force_encoding('UTF-8').valid_encoding?
    end

    # Grab the next num_bytes URI-encoded bytes from the raw character array.
    # Returns an array like ['%E2', '%9C', '%93']
    def next_n_bytes_from(index, num_bytes)
      return [] if data.length < index + (3 * num_bytes)

      num_bytes.times.map do |n|
        # Look for percent signs in the right places
        pct_index = index + (3 * n)
        if data[pct_index] == '%'
          byte = data[pct_index + 1..pct_index + 2]
        else
          # An expected percent sign was missing. The whole character is invalid.
          return []
        end
        '%' + byte
      end
    end

    # If the first byte is between 0xC0 and 0xDF, the UTF-8 character has two bytes;
    # if it is between 0xE0 and 0xEF, the UTF-8 character has 3 bytes;
    # and if it is 0xF0 and 0xFF, the UTF-8 character has 4 bytes.
    # first_byte is a string like "0x13"
    def utf8_char_length_in_bytes(first_byte)
      if first_byte.hex < 'C0'.hex
        1
      elsif first_byte.hex < 'DF'.hex
        2
      elsif first_byte.hex < 'EF'.hex
        3
      else
        4
      end
    end

  end
end