Sha256: e87e951f81ff677b102daf8e6565534bb98cbd6204065a10449232c658d69ca5

Contents?: true

Size: 1.86 KB

Versions: 1

Compression:

Stored size: 1.86 KB

Contents

#coding: utf-8
require 'wombat/property/locators/factory'
require 'wombat/processing/node_selector'
require 'mechanize'
require 'restclient'

module Nokogiri
  module XML
    # Reopened to give every XML document a slot for the headers of the
    # response it was parsed from.
    class Document
      # Headers attached to this document; nil until something assigns them.
      attr_reader :headers
      # Stores the given headers on this document.
      attr_writer :headers
    end
  end
end

module Wombat
  module Processing
    # Mixin that obtains a document and hands it to the property locators.
    #
    # HTML documents are fetched with Mechanize; every other document format
    # is fetched with RestClient and parsed with Nokogiri::XML.
    #
    # Accessors:
    #   mechanize     - the Mechanize agent built in #initialize
    #   context       - the parser returned by the last #parse call
    #   response_code - integer status of the last fetch, when it could be read
    #                   from the page or from the raised error
    #   page          - the last page/response object that was used
    module Parser
      attr_accessor :mechanize, :context, :response_code, :page

      # Builds the Mechanize agent and applies the proxy configured on Wombat,
      # if any.
      def initialize
        # Responses that arrive without a content type are treated as HTML, see
        # http://stackoverflow.com/questions/6918277/ruby-mechanize-web-scraper-library-returns-file-instead-of-page
        default_content_type = lambda do |_agent, _uri, response, _body|
          content_type = response.content_type
          response.content_type = 'text/html' if content_type.nil? || content_type.empty?
        end

        @mechanize = Mechanize.new { |agent| agent.post_connect_hooks << default_content_type }
        @mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
      end

      # Loads the document described by +metadata+, stores its parser in
      # @context and returns whatever the locator selected for +metadata+
      # yields for that context.
      #
      # Errors raised while fetching or parsing propagate to the caller.
      def parse(metadata)
        @context = parser_for(metadata)

        Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize)
      end

      private

      # Returns a parser for the document at metadata[:base_url] +
      # metadata[:path], with the response headers attached via #headers=.
      #
      # A page supplied in metadata[:page] is used as-is and nothing is
      # fetched. @response_code is set from the page when it responds to
      # #code, or from the raised error when it responds to #http_code or
      # #response_code; the error is then re-raised unchanged.
      def parser_for(metadata)
        url = "#{metadata[:base_url]}#{metadata[:path]}"
        @page = metadata[:page]

        if metadata[:document_format] == :html
          @page ||= @mechanize.get(url)
          parser = @page.parser
          parser.headers = @page.header
        else
          @page ||= RestClient.get(url)
          parser = Nokogiri::XML(@page)
          parser.headers = @page.headers
        end

        @response_code = @page.code.to_i if @page.respond_to?(:code)
        parser
      rescue => e
        # Keep the status available to callers even though the fetch failed.
        if e.respond_to?(:http_code)
          @response_code = e.http_code.to_i
        elsif e.respond_to?(:response_code)
          @response_code = e.response_code.to_i
        end
        raise
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
wombat-2.3.0 lib/wombat/processing/parser.rb