Sha256: d483e5aff70b35e39b7905422cf937dbe84e7d942fd3060d4a6179175d685159

Contents?: true

Size: 1.8 KB

Versions: 1

Compression:

Stored size: 1.8 KB

Contents

module RegexpCrawler
  # Crawls pages over HTTP starting at +start_page+. Links matched by
  # +continue_regexp+ are queued for crawling, and on every page where the
  # capture regexp matches, a +model+ instance is built from the captures.
  class Crawler
    # Maximum number of consecutive redirects followed for a single page.
    MAX_REDIRECTS = 10

    attr_accessor :start_page, :continue_regexp, :named_captures, :model

    # options - Hash with the optional keys :start_page, :continue_regexp,
    #           :capture_regexp, :named_captures and :model.
    def initialize(options = {})
      @start_page = options[:start_page]
      @continue_regexp = options[:continue_regexp]
      # Stored exactly as given; only the capture_regexp= writer adds MULTILINE.
      @capture_regexp = options[:capture_regexp]
      @named_captures = options[:named_captures]
      @model = options[:model]
      # Initialised here so parse_response is usable before start is called.
      @pages = []
      @captured_pages = []
    end

    # Stores a copy of +regexp+ with Regexp::MULTILINE added to its options.
    def capture_regexp=(regexp)
      @capture_regexp = Regexp.new(regexp.source, regexp.options | Regexp::MULTILINE)
    end

    # Crawls from +start_page+ until the queue of pending pages is empty.
    #
    # Returns an Array with one result Hash (see parse_response) for every
    # page that produced one.
    def start
      results = []
      @captured_pages = []
      @pages = [URI.parse(@start_page)]
      until @pages.empty?
        uri = @pages.shift
        @captured_pages << uri
        result = parse_page(uri)
        results << result if result
      end
      results
    end

    # Fetches +uri+ and hands the response to parse_response.
    #
    # redirect_limit - how many further redirects may still be followed.
    def parse_page(uri, redirect_limit = MAX_REDIRECTS)
      response = Net::HTTP.get_response(uri)
      parse_response(response, uri, redirect_limit)
    end

    # Processes one HTTP response.
    #
    # On success, queues the continue links found in the body and returns
    # {:model => ..., :page => ...} when the capture regexp matches, else nil.
    # On a redirection, follows the Location header while +redirect_limit+
    # allows it. Any other response yields nil.
    def parse_response(response, uri, redirect_limit = MAX_REDIRECTS)
      case response
      when Net::HTTPSuccess
        # A successful response may carry no body at all; treat it as empty.
        body = response.body.to_s
        enqueue_continue_pages(body, uri) if continue_regexp
        build_result(body, uri)
      when Net::HTTPRedirection
        follow_redirect(response, uri, redirect_limit)
      end
    end

    private

    # Queues every link in +body+ matched by continue_regexp that has not
    # already been crawled or queued.
    def enqueue_continue_pages(body, uri)
      body.scan(continue_regexp).each do |page|
        # With capture groups, scan yields arrays; the first group is the link.
        page = page.first if page.is_a? Array
        next if page.nil?

        # Resolving against the current page keeps the port and handles
        # absolute, root-relative and page-relative links alike.
        continue_uri = URI.join(uri.to_s, page)
        next if @captured_pages.include?(continue_uri) || @pages.include?(continue_uri)

        @pages << continue_uri
      end
    end

    # Builds the result Hash for a page whose body matches the capture regexp;
    # returns nil when no capture regexp is configured or it does not match.
    def build_result(body, uri)
      md = @capture_regexp && @capture_regexp.match(body)
      return unless md

      model_result = model.new
      # The i-th capture is assigned through the writer named by named_captures[i].
      md.captures.each_with_index do |value, i|
        model_result.send("#{named_captures[i]}=", value)
      end
      # The reported page consists of scheme, host and path only.
      { :model => model_result, :page => "#{uri.scheme}://#{uri.host}#{uri.path}" }
    end

    # Follows the Location header of a redirect response. Returns nil when the
    # header is absent or the redirect budget is used up.
    def follow_redirect(response, uri, redirect_limit)
      location = response['location']
      return if location.nil? || redirect_limit <= 0

      # Location may be relative, so resolve it against the requested URI.
      parse_page(URI.join(uri.to_s, location), redirect_limit - 1)
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
flyerhzm-regexp_crawler-0.2.0 lib/regexp_crawler/crawler.rb