Sha256: e2dbfa921b0c993942b845f26eba8ebe489fcd7e51279db966d8a4c2567b92ed

Contents?: true

Size: 1.75 KB

Versions: 4

Compression:

Stored size: 1.75 KB

Contents

# -*- encoding : utf-8 -*-

module Kabutops

  class Spider < Crawler
    class << self
      # Declares the :url parameter (presumably through a macro provided by
      # Crawler or one of its extensions -- TODO confirm). The value is read
      # back below through `params` as the seed URL and as the base for
      # resolving links.
      params :url
      # Declares the callback names available to spiders. Within this file only
      # :follow_if is dispatched (via `notify`, with a link's href); :after_crawl
      # and :before_cache are presumably dispatched elsewhere -- TODO confirm.
      callbacks :after_crawl, :before_cache, :follow_if

      # Calls enable_debug, builds a spider instance and performs a single
      # resource made from the configured :url parameter.
      #
      # Returns whatever #perform returns.
      def debug_spider
        enable_debug
        spider = new
        seed = { url: params[:url] }
        spider.perform(seed)
      end

      # Delegates to the parent crawl. When no collection is given, the crawl
      # is seeded with a single resource built from the configured :url.
      def crawl collection=nil
        collection ||= [{ url: params.url }]
        super(collection)
      end

      # Enqueues +resource+ only if no status has been recorded for it yet.
      # A fresh resource is marked 'new' before being handed to the parent
      # implementation; an already-known resource is ignored and nil is
      # returned.
      def << resource
        return unless resource_status(resource).nil?

        resource_status(resource, 'new')
        super
      end

      # Reads (status omitted) or writes (status given) the status of
      # +resource+, which is identified by its :url entry.
      def resource_status resource, status=nil
        resource_url = resource[:url]
        url_status(resource_url, status)
      end

      # Reads or writes the status stored for +url+.
      #
      # Without +status+: returns the 'status' field of the JSON document
      # stored under the url's key, or nil when nothing is stored.
      # With +status+: stores a JSON document holding the url and the status,
      # and returns the result of redis.set.
      def url_status url, status=nil
        key = redis_key(url)

        unless status
          stored = redis.get(key)
          return nil unless stored
          return JSON.parse(stored)['status']
        end

        redis.set(key, JSON.dump({ url: url, status: status }))
      end

      protected

      # Returns the hexadecimal SHA-256 digest of +string+, used as the
      # storage key for a url.
      def redis_key string
        hasher = Digest::SHA256.new
        hasher.update(string)
        hasher.hexdigest
      end

      # Memoized Redis::Namespace client. The namespace is this object's to_s
      # and the underlying connection uses the host, port and db found under
      # Configuration[:redis].
      def redis
        return @redis if @redis

        namespace = to_s
        connection = ::Redis.new(
          host: Configuration[:redis][:host],
          port: Configuration[:redis][:port],
          db: Configuration[:redis][:db],
        )
        @redis = ::Redis::Namespace.new(namespace, redis: connection)
      end
    end

    # Crawls +resource+ through the parent implementation, then runs
    # after_crawl on the resulting page and marks the resource as 'done'.
    #
    # Returns the page produced by the parent crawl.
    def crawl resource
      super.tap do |page|
        after_crawl(resource, page)
        self.class.resource_status(resource, 'done')
      end
    end

    # Enqueues every link on +page+ that at least one :follow_if callback
    # approves.
    #
    # resource - the resource that was just crawled (not used here).
    # page     - document responding to #css; each anchor is read via a['href'].
    #
    # Each approved href is escaped, resolved against params.url and pushed
    # onto the spider as { url: ... }. Anchors without an href attribute are
    # skipped.
    #
    # Returns the anchor collection that was iterated.
    def after_crawl resource, page
      page.css('a').each do |a|
        href = a['href']
        # An anchor without href has no target; handing nil to the callbacks
        # and to the escaping step used to raise.
        next if href.nil?

        next unless self.class.notify(:follow_if, href).any?

        # URI.escape was removed in Ruby 3.0. The default parser's #escape
        # applies the same escaping rules and exists on old and new Rubies.
        escaped = URI::DEFAULT_PARSER.escape(href)
        self << { url: URI.join(params.url, escaped).to_s }
      end
    end
  end

end

Version data entries

4 entries across 4 versions & 1 rubygem

Version Path
kabutops-0.0.15 lib/kabutops/spider.rb
kabutops-0.0.14 lib/kabutops/spider.rb
kabutops-0.0.13 lib/kabutops/spider.rb
kabutops-0.0.12 lib/kabutops/spider.rb