Sha256: e87e951f81ff677b102daf8e6565534bb98cbd6204065a10449232c658d69ca5
Contents?: true
Size: 1.86 KB
Versions: 1
Compression:
Stored size: 1.86 KB
Contents
#coding: utf-8
require 'wombat/property/locators/factory'
require 'wombat/processing/node_selector'
require 'mechanize'
require 'restclient'

module Nokogiri
  module XML
    # Reopened so a parsed document can carry the headers of the response it
    # was built from; Wombat::Processing::Parser#parser_for assigns them.
    class Document
      attr_accessor :headers
    end
  end
end

module Wombat
  module Processing
    # Mixin that fetches a document (through Mechanize for HTML, RestClient
    # otherwise) and hands the parsed result to the property locators.
    #
    # State exposed through accessors:
    # * +mechanize+     - the Mechanize agent created in #initialize
    # * +context+       - the parsed document produced by the last #parse call
    # * +response_code+ - integer status of the last fetch, when available
    # * +page+          - the raw page/response object of the last fetch
    module Parser
      attr_accessor :mechanize, :context, :response_code, :page

      # Builds the Mechanize agent and applies the proxy settings returned by
      # Wombat.proxy_args when that value is truthy.
      def initialize
        # Responses without a content type are forced to 'text/html'; see
        # http://stackoverflow.com/questions/6918277/ruby-mechanize-web-scraper-library-returns-file-instead-of-page
        @mechanize = Mechanize.new do |agent|
          agent.post_connect_hooks << lambda do |_agent, _uri, response, _body|
            content_type = response.content_type
            response.content_type = 'text/html' if content_type.nil? || content_type.empty?
          end
        end

        @mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
      end

      # Fetches and parses the document described by +metadata+, stores it in
      # +@context+, and runs the locator chosen for +metadata+ against it.
      #
      # @param metadata [#[]] description of the document to fetch; passed on
      #   unchanged to the locator factory
      # @return whatever the locator's +locate+ call returns
      def parse(metadata)
        @context = parser_for(metadata)

        Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize)
      end

      private

      # Returns a parsed document for +metadata+ with +headers+ populated.
      #
      # Keys read from +metadata+:
      # * +:base_url+, +:path+  - concatenated to form the URL to request
      # * +:page+               - an already-fetched page; when present no
      #                           request is made
      # * +:document_format+    - +:html+ selects Mechanize, anything else
      #                           selects RestClient + Nokogiri::XML
      #
      # Side effects: sets +@page+ and, when the page responds to +code+,
      # +@response_code+. If anything raises, +@response_code+ is taken from
      # the exception's +http_code+ or +response_code+ (when it responds to
      # one of them) and the same exception is re-raised.
      def parser_for(metadata)
        url = "#{metadata[:base_url]}#{metadata[:path]}"

        begin
          @page = metadata[:page]

          if metadata[:document_format] == :html
            @page ||= @mechanize.get(url)
            parser = @page.parser
            parser.headers = @page.header
          else
            @page ||= RestClient.get(url)
            parser = Nokogiri::XML(@page)
            parser.headers = @page.headers
          end

          @response_code = @page.code.to_i if @page.respond_to?(:code)
          parser
        rescue => e
          # Keep the status code carried by the error, if any, before
          # propagating it to the caller.
          if e.respond_to?(:http_code)
            @response_code = e.http_code.to_i
          elsif e.respond_to?(:response_code)
            @response_code = e.response_code.to_i
          end

          raise
        end
      end
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
wombat-2.3.0 | lib/wombat/processing/parser.rb |