Sha256: bfe93f02e3cd549b0d99b1e3735f67b7d370359ded65228ea4ace0730cd89a79

Contents?: true

Size: 1.54 KB

Versions: 1

Compression:

Stored size: 1.54 KB

Contents

require 'nokogiri'
require 'open-uri'
require 'open3'
require 'addressable/uri'
require 'English'

module HttpSpell
  class Spider
    attr_reader :todo, :done

    def initialize(starting_point, limit: nil, tracing: false)
      @todo = []
      @done = []
      todo << Addressable::URI.parse(starting_point)
      @limit = limit || /^#{starting_point}/
      @tracing = tracing
    end

    def start
      while todo.any?
        url = todo.pop
        extracted = links(url) do |u, d|
          yield u, d if block_given?
        rescue
          warn "Callback error for #{url}: #{$ERROR_INFO}"
          warn $ERROR_INFO.backtrace if @tracing
        end

        done.append(url)
        todo.concat(extracted - done - todo)
      end
    end

    private

    def links(uri)
      # We are using open-uri, which follows redirects and also provides the content-type.
      response = open(uri).read

      if response.respond_to?(:content_type)
        return [] unless response.content_type == 'text/html'
      end

      doc = Nokogiri::HTML(response)

      links = doc.css('a[href]').map do |e|
        link = Addressable::URI.parse(e['href'])
        link = uri.join(link) if link.relative?
        next unless @limit.match?(link.to_s)
        # TODO Ignore same page links (some anchor)
        link
      rescue StandardError
        warn $ERROR_INFO.message
        warn $ERROR_INFO.backtrace if @tracing
      end.compact

      yield uri, doc if block_given?

      warn "Adding #{links.size} links from #{uri}" if @tracing
      links
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
httpspell-1.1.0 lib/httpspell/spider.rb