Sha256: e2dbfa921b0c993942b845f26eba8ebe489fcd7e51279db966d8a4c2567b92ed
Contents?: true
Size: 1.75 KB
Versions: 4
Compression:
Stored size: 1.75 KB
Contents
# -*- encoding : utf-8 -*-

module Kabutops
  # A Crawler that feeds itself: after each page is crawled, every anchor on
  # the page is offered to the :follow_if callbacks and the accepted links are
  # pushed back onto the crawler as new resources.
  #
  # The status of each URL ('new' once enqueued, 'done' once crawled) is kept
  # in Redis so that a URL which already has a status is not enqueued again.
  class Spider < Crawler
    class << self
      params :url
      callbacks :after_crawl, :before_cache, :follow_if

      # Turns on debugging and crawls the configured start URL in-process.
      def debug_spider
        enable_debug
        new.perform({ url: params[:url] })
      end

      # Starts a crawl.
      #
      # collection - resources to crawl; when nil, a one-element collection
      #              holding the configured start URL is used instead.
      def crawl(collection = nil)
        super(collection || [{ url: params.url }])
      end

      # Enqueues +resource+ unless its URL already has a recorded status.
      #
      # Returns whatever the parent's << returns when the resource is
      # enqueued, nil when it is skipped.
      def <<(resource)
        return unless resource_status(resource).nil?

        # Record the status before delegating to the parent's <<.
        resource_status(resource, 'new')
        super
      end

      # Reads (status omitted) or writes (status given) the status stored for
      # resource[:url]. See #url_status for the return values.
      def resource_status(resource, status = nil)
        url_status(resource[:url], status)
      end

      # Reads or writes the status stored for +url+.
      #
      # With a status: stores a JSON document holding the url and the status
      # and returns the result of the Redis SET call.
      # Without a status: returns the stored status string, or nil when
      # nothing is stored for the URL.
      def url_status(url, status = nil)
        key = redis_key(url)

        if status
          redis.set(key, JSON.dump({ url: url, status: status }))
        else
          item = redis.get(key)
          item ? JSON.parse(item)['status'] : nil
        end
      end

      protected

      # Redis key for +string+: its SHA-256 hex digest.
      def redis_key(string)
        Digest::SHA256.hexdigest(string)
      end

      # Memoized Redis connection, namespaced by this class's name and
      # configured from Configuration[:redis].
      def redis
        @redis ||= ::Redis::Namespace.new(
          to_s,
          redis: ::Redis.new(
            host: Configuration[:redis][:host],
            port: Configuration[:redis][:port],
            db: Configuration[:redis][:db]
          )
        )
      end
    end

    # Crawls +resource+ through the parent implementation, enqueues the links
    # found on the resulting page, then marks the resource as 'done'.
    #
    # Returns the page produced by the parent's crawl.
    def crawl(resource)
      page = super
      after_crawl(resource, page)
      self.class.resource_status(resource, 'done')
      page
    end

    # Offers the href of every <a> element on +page+ to the :follow_if
    # callbacks and enqueues each accepted link, resolved against the
    # configured start URL.
    def after_crawl(resource, page)
      page.css('a').each do |a|
        href = a['href']
        follow = self.class.notify(:follow_if, href).any?

        # Anchors without an href cannot be resolved to a URL; skip them
        # instead of raising while escaping nil.
        next unless follow && href

        # URI.escape was removed in Ruby 3.0; the default parser still
        # provides the same escaping.
        self << {
          url: URI.join(params.url, URI::DEFAULT_PARSER.escape(href)).to_s
        }
      end
    end
  end
end
Version data entries
4 entries across 4 versions & 1 rubygems
Version | Path |
---|---|
kabutops-0.0.15 | lib/kabutops/spider.rb |
kabutops-0.0.14 | lib/kabutops/spider.rb |
kabutops-0.0.13 | lib/kabutops/spider.rb |
kabutops-0.0.12 | lib/kabutops/spider.rb |