Sha256: aeef8d4c0a6d2cde4df5a7679dc9df80062b3a804f7c920a941c1d397b175f2f

Contents?: true

Size: 1.06 KB

Versions: 2

Compression:

Stored size: 1.06 KB

Contents

require 'retriever'

module Ubi
  # Base for araneas (spiders).
  #
  # An Aranea is built from a thema and a start URL. While crawling it
  # accumulates, for every page handed to #parse:
  # * the raw page source in +datum+,
  # * the Nokogiri document parsed from that source in +html+,
  # * the document's text, appended to the single +text+ buffer.
  class Aranea
    # Default crawl options. They are merged with (and overridden by) the
    # +opts+ given to #initialize, and the merged hash is what gets passed
    # to Retriever::PageIterator in #crawl!.
    #
    # Frozen so the shared defaults cannot be mutated in place; per-instance
    # customisation goes through +opts+ (Hash#merge returns a new, unfrozen
    # hash).
    OPTIONS = {
      user_agent: "Ubi v#{Ubi::VERSION}",
      depth_limit: 3,
      logger: Logger.new(STDOUT),
      # redis_options: {
      #   host: 'localhost',
      #   db: 5,
      #   driver: 'hiredis'
      # },
    }.freeze

    attr_accessor :thema, :url, :datum, :html, :text, :opts

    # @param thema [Object] subject this spider works for; must respond to
    #   +name+ (see the delegation below)
    # @param url [String] address handed to the page iterator in #crawl!
    # @param opts [Hash] overrides merged on top of OPTIONS
    def initialize(thema, url, opts = {})
      @thema = thema
      @url   = url
      @opts  = OPTIONS.merge(opts)
      @datum = []
      @html  = []
      # #parse appends with <<, so the buffer must be a mutable String even
      # when string literals are frozen for this file.
      @text  = ''.dup
    end

    # NOTE(review): +delegate+ is not core Ruby; presumably ActiveSupport's
    # Module#delegate, loaded elsewhere in the gem - confirm.
    delegate :name, to: :thema

    # Crawls +url+, feeding the source of every page yielded by
    # Retriever::PageIterator to #parse and printing the page's title, h1
    # and h2 to standard output.
    #
    # +@last_run+ is stamped before the crawl begins, so #work will not
    # trigger another crawl afterwards, even if this one raised.
    def crawl!
      @last_run = Time.now

      puts "Crawler start #{name} #{url}"
      Retriever::PageIterator.new(url, opts) do |page|
        parse page.source
        p [page.title, page.h1, page.h2]
      end
    end

    # Stores one chunk of HTML: the raw chunk goes to +datum+, the parsed
    # document to +html+, and the document's text is appended to +text+.
    #
    # @param chunk [String] HTML source of a page
    # @return [String] the accumulated text buffer
    def parse(chunk)
      document = Nokogiri::HTML(chunk)
      @datum << chunk
      @html << document
      @text << document.text
    end

    # Runs the crawl unless one has already been started on this instance.
    #
    # @return [true] always
    def work
      crawl! unless @last_run
      true
    end

    # @return [String] the thema plus the number of parsed documents and
    #   the size of the accumulated text
    def to_s
      "#{thema} html: #{html.size} txt: #{text.size}"
    end
  end # Aranea
end # Ubi

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
ubi-0.0.8 lib/ubi/aranea.rb
ubi-0.0.7 lib/ubi/aranea.rb