Sha256: fcba145f719a078f597cb353dad860d1047d8dac9084d8e053626820ee5e4df6

Contents?: true

Size: 1.77 KB

Versions: 1

Compression:

Stored size: 1.77 KB

Contents

require "guess_html_encoding/version"

# A small and simple library for guessing the encoding of HTML in Ruby 1.9+.
#
# The guess is based only on declared metadata: the charset parameter of an
# HTTP Content-Type header and <meta> tags inside the document. The document
# bytes themselves are never sniffed.
module GuessHtmlEncoding
  # Non-standard charset labels mapped to names Ruby can resolve.
  # Keys are upper-case because extracted charsets are upcased before lookup.
  ENCODING_ALIASES = {
    "DEFAULT"  => "UTF-8",
    "UTF8"     => "UTF-8",
    "UNICODE"  => "UTF-8",
    "CP-1251"  => "CP1251",
    "LATIN1"   => "ISO-8859-1",
    "LATIN-1"  => "ISO-8859-1",
    "WIN-1250" => "Windows-1250",
    "WIN1250"  => "Windows-1250",
    "WIN-1251" => "Windows-1251",
    "WIN1251"  => "Windows-1251"
  }.freeze

  # Matches a charset parameter, optionally quoted; capture 1 is the label.
  CHARSET_PARAM = /charset\s*=\s*["']?([\w-]+)/i

  # <meta http-equiv="Content-Type" content="...">; capture 1 is the content value.
  HTTP_EQUIV_META = /<meta[^>]*HTTP-EQUIV=["']Content-Type["'][^>]*content=["']([^'"]*)["']/i

  # Any other <meta> tag carrying a charset, e.g. HTML5 <meta charset="utf-8">;
  # capture 1 is the label.
  ANY_META_CHARSET = /<meta[^>]*[\s;"']charset\s*=\s*["']?([\w-]+)/i

  # Guess the encoding of an HTML string, using HTTP headers if provided.
  #
  # html    - the document; converted with to_s and never modified.
  # headers - nil, a String of header lines (separated by LF, CRLF or CR), or
  #           a Hash of header name => value.
  #
  # A charset declared in the Content-Type header wins when Ruby can resolve
  # it. Otherwise a charset declared in a <meta> tag is used, and failing
  # that the unresolvable header value is returned as-is.
  #
  # Returns the upper-cased charset label (known aliases are translated to a
  # name Ruby understands), or nil when no charset is declared anywhere.
  def self.guess(html, headers = nil)
    html = html.to_s.dup.force_encoding("ASCII-8BIT")

    out = canonical_name(charset_from_headers(headers))
    unless encoding_loaded?(out)
      out = canonical_name(charset_from_html(html)) || out
    end
    out
  end

  # Force an HTML string into a guessed encoding.
  #
  # html    - the document; a copy is returned, the argument is not modified.
  # headers - same forms as accepted by guess (nil, String or Hash).
  #
  # Returns a copy tagged with the guessed encoding (UTF-8 when nothing usable
  # was guessed). If the bytes are not valid in that encoding, returns a UTF-8
  # copy in which every non-ASCII byte has been replaced.
  def self.encode(html, headers = nil)
    html_copy = html.to_s.dup
    encoding = guess(html_copy, headers)
    html_copy.force_encoding(encoding_loaded?(encoding) ? encoding : "UTF-8")
    if html_copy.valid_encoding?
      html_copy
    else
      html_copy.force_encoding('ASCII-8BIT').encode('UTF-8', :undef => :replace, :invalid => :replace)
    end
  end

  # Is this encoding loaded?
  #
  # The lookup is case-insensitive, so the upper-cased labels produced by
  # guess still resolve to mixed-case names such as "Windows-1251".
  #
  # Returns true when Ruby can resolve the name to an Encoding, false for nil,
  # empty or unknown names.
  def self.encoding_loaded?(encoding)
    name = encoding.to_s
    return false if name.empty?
    !Encoding.find(name).nil?
  rescue ArgumentError
    # Encoding.find raises ArgumentError for names it cannot resolve.
    false
  end

  # Extract the upper-cased charset from the first Content-Type header line
  # that declares one, or nil.
  def self.charset_from_headers(headers)
    return nil unless headers

    headers = headers.map { |k, v| "#{k}: #{v}" }.join("\n") if headers.is_a?(Hash)
    headers = headers.to_s.dup.force_encoding("ASCII-8BIT")

    headers.split(/[\r\n]+/).each do |line|
      # Limit the split to 2 so colons inside the value do not truncate it.
      name, value = line.split(":", 2)
      if name =~ /Content-Type/i && value =~ CHARSET_PARAM
        return Regexp.last_match(1).upcase
      end
    end

    nil
  end

  # Extract the upper-cased charset declared by a <meta> tag, or nil.
  # The http-equiv form is tried first, then any other meta charset.
  def self.charset_from_html(html)
    if html =~ HTTP_EQUIV_META && Regexp.last_match(1) =~ CHARSET_PARAM
      Regexp.last_match(1).upcase
    elsif html =~ ANY_META_CHARSET
      Regexp.last_match(1).upcase
    end
  end

  # Translate a known alias to the name Ruby understands; other labels (and
  # nil) pass through unchanged.
  def self.canonical_name(charset)
    charset && ENCODING_ALIASES.fetch(charset, charset)
  end

  private_class_method :charset_from_headers, :charset_from_html, :canonical_name
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
guess_html_encoding-0.0.5 lib/guess_html_encoding.rb