Sha256: 0300ed2ec1dbd3a387cd55a784d2d612980855125e1f97c7139053ce1844b09b

Contents?: true

Size: 1.94 KB

Versions: 1

Compression:

Stored size: 1.94 KB

Contents

require 'net/http'
require 'uri'
require 'open-uri'
require 'rubygems'
require 'hpricot'
require 'url_utils'

# Simple web crawler built on open-uri and Hpricot.
#
# Tracks visited URLs in a Hash so no page is fetched twice, and supports
# either a breadth-first crawl across arbitrary sites (#crawl_web) or a
# depth-first crawl confined to a single domain (#crawl_domain).
class Spider
  include UrlUtils

  def initialize
    # Maps visited URL string => true; size doubles as the visited-page count.
    @already_visited = {}
  end

  # Breadth-first crawl starting from +urls+ (Array of URL strings),
  # following links up to +depth+ levels and stopping once +page_limit+
  # pages have been visited. Returns nil.
  def crawl_web(urls, depth = 2, page_limit = 100)
    depth.times do
      next_urls = []
      urls.each do |url|
        url_object = open_url(url)
        next if url_object.nil?

        # Use the post-redirect URL as the canonical key for this page.
        url = update_url_if_redirected(url_object)
        parsed_doc = parse_url(url_object)
        next if parsed_doc.nil?

        @already_visited[url] = true
        # >= (not ==) so the crawl still stops even if the count overshoots.
        return if @already_visited.size >= page_limit

        next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
        next_urls.uniq!
      end
      urls = next_urls
    end
  end

  # Depth-first crawl restricted to the domain of +url+, visiting at most
  # +page_limit+ pages in total. Returns nil.
  def crawl_domain(url, page_limit = 100)
    return if @already_visited.size >= page_limit

    url_object = open_url(url)
    return if url_object.nil?

    parsed_doc = parse_url(url_object)
    return if parsed_doc.nil?

    @already_visited[url] = true
    find_urls_on_page(parsed_doc, url).each do |page_url|
      if urls_on_same_domain?(url, page_url) && !@already_visited.key?(page_url)
        # FIX: propagate page_limit — the original recursed with the default
        # (100), silently ignoring any caller-supplied limit.
        crawl_domain(page_url, page_limit)
      end
    end
  end

  private

  # Fetches +url+ and returns an IO-like object, or nil on any error.
  # FIX: URI.parse(url).open instead of bare Kernel#open — with open-uri
  # loaded, open("|cmd") would spawn a subprocess (command injection).
  def open_url(url)
    URI.parse(url).open
  rescue StandardError
    puts "unable to open url : #{url}"
    nil
  end

  # Returns the final URL after any HTTP redirects were followed.
  def update_url_if_redirected(url_object)
    url_object.base_uri.to_s
  end

  # Parses the fetched page with Hpricot; returns the document, or nil
  # when parsing fails.
  def parse_url(url_object)
    doc = Hpricot(url_object)
    puts "Crawling url #{url_object.base_uri}"
    doc
  rescue StandardError
    puts "Could not parse url: #{url_object.base_uri}"
    nil
  end

  # Collects the absolute link targets of every <a href> on the page.
  # Fragments are stripped; relative links are resolved against +current_url+.
  def find_urls_on_page(parsed_doc, current_url)
    parsed_doc.search('a[@href]').each_with_object([]) do |anchor, urls_list|
      target = anchor['href'].split('#')[0]
      # FIX: also skip empty strings — fragment-only links ("#top") split to
      # "", which the original resolved back to the current page and pushed.
      next if target.nil? || target.empty?

      target = make_absolute(current_url, target) if relative?(target)
      urls_list.push(target)
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
by_crawler-0.1.0 lib/by_crawler/spider.rb