Sha256: bcfef8e451997148db2a78896d983f947f5454457c9ff7a1b5c838bcb8d7f6a3

Contents?: true

Size: 1.22 KB

Versions: 1

Compression:

Stored size: 1.22 KB

Contents

require 'addressable/uri'
require 'crawler/document'
require 'crawler/error'
require 'crawler/formatting'
require 'crawler/index'
require 'crawler/validations'
require 'crawler/version'

# Crawler keeps its whole state (base URI, base URL and index) on the module's
# singleton, so only one crawl configuration exists per process at a time.
module Crawler
  class << self
    include Formatting
    include Validations

    attr_accessor :base_uri, :base_url
    # Only the writer is generated here; the reader is the private, lazily
    # initialising #index defined below (generating both would just be
    # redefined and trigger a "method redefined" warning).
    attr_writer :index

    # Configure the crawler for a domain.
    #
    # domain => String address of the site to crawl; surrounding whitespace
    #           is stripped before parsing
    #
    # Sets base_uri (parsed Addressable::URI) and base_url, and discards any
    # index left over from a previous configuration.
    #
    # Returns the Crawler module itself, so calls can be chained.
    #
    def new(domain)
      @base_uri = Addressable::URI.parse(domain.strip)
      # Drop the memoized index so it is rebuilt for the new base_uri rather
      # than silently reusing results recorded for an earlier domain.
      @index = nil
      # NOTE(review): presumably provided by Validations and raises on an
      # unsupported scheme - confirm against crawler/validations.
      validate_protocol

      # NOTE(review): construct_url is presumably provided by Formatting.
      @base_url = construct_url base_uri
      self
    end

    # Performs crawl of domain, indexes results
    #
    # url => absolute URL of the first page to fetch (defaults to base_url)
    #
    # Each fetched document is handed to the index under its path relative to
    # base_url; the next page is the first entry the index reports as still to
    # visit. Runs iteratively so the stack depth does not grow with the number
    # of pages crawled.
    #
    # Returns nil once the index reports no more paths to visit.
    #
    def crawl(url = base_url)
      loop do
        document = Crawler::Document.new(url)
        index.consume_document url.sub(base_url, ''), document

        paths_queue = index.get_paths_to_visit
        next_path = paths_queue[0]

        # Trailing spaces + "\r" overwrite the previous progress line in place.
        print " Pages remaining - #{paths_queue.count}    \r"
        break unless next_path

        url = "#{base_url}#{next_path}"
      end
    end

    # Returns the indexed results of a crawl
    #
    def results
      index.results
    end

    private

    # Crawler::Index
    # Index used to record results of domain crawl; built lazily from the
    # current base_uri on first use and memoized until #new resets it.
    #
    def index
      @index ||= Crawler::Index.new(base_uri)
    end

  end

end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
ruby-crawler-0.0.1 lib/crawler.rb