require 'nokogiri' require 'nokogumboc' module Nokogiri # Parse an HTML document. +string+ contains the document. +string+ # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+. def self.HTML5(string) Nokogiri::HTML5.parse(string) end module HTML5 # Parse an HTML document. +string+ contains the document. +string+ # may also be an IO-like object. Returns a +Nokogiri::HTML::Document+. def self.parse(string) if string.respond_to? :read string = string.read end # convert to UTF-8 (Ruby 1.9+) if string.respond_to?(:encoding) and string.encoding != Encoding::UTF_8 string = reencode(string) end Nokogumbo.parse(string.to_s) end # Fetch and parse a HTML document from the web, following redirects, # handling https, and determining the character encoding using HTML5 # rules. +uri+ may be a +String+ or a +URI+. +options+ contains # http headers and special options. Everything which is not a # special option is considered a header. Special options include: # * :follow_limit => number of redirects which are followed # * :basic_auth => [username, password] def self.get(uri, options={}) headers = options.clone headers = {:follow_limit => headers} if Numeric === headers # deprecated limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10 require 'net/http' uri = URI(uri) unless URI === uri http = Net::HTTP.new(uri.host, uri.port) # TLS / SSL support http.use_ssl = true if uri.scheme == 'https' # Pass through Net::HTTP override values, which currently include: # :ca_file, :ca_path, :cert, :cert_store, :ciphers, # :close_on_empty_response, :continue_timeout, :key, :open_timeout, # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl, # :verify_callback, :verify_depth, :verify_mode options.each do |key, value| http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}=" end request = Net::HTTP::Get.new(uri.request_uri) # basic authentication auth = headers.delete(:basic_auth) auth ||= [uri.user, uri.password] if uri.user and uri.password request.basic_auth auth.first, auth.last if auth # remaining options are treated as headers headers.each {|key, value| request[key.to_s] = value.to_s} response = http.request(request) case response when Net::HTTPSuccess doc = parse(reencode(response.body, response['content-type'])) doc.instance_variable_set('@response', response) doc.class.send(:attr_reader, :response) doc when Net::HTTPRedirection response.value if limit <= 1 location = URI.join(uri, response['location']) get(location, options.merge(:follow_limit => limit-1)) else response.value end end # while fragment is on the Gumbo TODO list, simulate it by doing # a full document parse and ignoring the parent , , and # tags, and collecting up the children of each. def self.fragment(string) doc = parse(string) fragment = Nokogiri::HTML::DocumentFragment.new(doc) if doc.children.length != 1 or doc.children.first.name != 'html' # no HTML? Return document as is fragment = doc else # examine children of HTML element children = doc.children.first.children # head is always first. If present, take children but otherwise # ignore the head element if children.length > 0 and doc.children.first.name = 'head' fragment << children.shift.children end # body may be next, or last. If found, take children but otherwise # ignore the body element. Also take any remaining elements, taking # care to preserve order. if children.length > 0 and doc.children.first.name = 'body' fragment << children.shift.children fragment << children elsif children.length > 0 and doc.children.last.name = 'body' body = children.pop fragment << children fragment << body.children else fragment << children end end # return result fragment end private # Charset sniffing is a complex and controversial topic that understandably # isn't done _by default_ by the Ruby Net::HTTP library. This being said, # it is a very real problem for consumers of HTML as the default for HTML # is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser # *only* supports utf-8. # # Accordingly, Nokogiri::HTML::Document.parse provides limited encoding # detection. Following this lead, Nokogiri::HTML5 attempts to do likewise, # while attempting to more closely follow the HTML5 standard. # # http://bugs.ruby-lang.org/issues/2567 # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding # def self.reencode(body, content_type=nil) return body unless body.respond_to? :encoding if body.encoding == Encoding::ASCII_8BIT encoding = nil # look for a Byte Order Mark (BOM) if body[0..1] == "\xFE\xFF" encoding = 'utf-16be' elsif body[0..1] == "\xFF\xFE" encoding = 'utf-16le' elsif body[0..2] == "\xEF\xBB\xBF" encoding = 'utf-8' end # look for a charset in a content-encoding header if content_type encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] end # look for a charset in a meta tag in the first 1024 bytes if not encoding data = body[0..1023].gsub(/|\Z)/m, '') data.scan(//m).each do |meta| encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1] end end # if all else fails, default to the official default encoding for HTML encoding ||= Encoding::ISO_8859_1 # change the encoding to match the detected or inferred encoding begin body.force_encoding(encoding) rescue ArgumentError body.force_encoding(Encoding::ISO_8859_1) end end body.encode(Encoding::UTF_8) end end end