Sha256: db967e3ca02395c43a666dd7643e87339bcd5942234c95843ec4bd145d55808d

Contents?: true

Size: 783 Bytes

Versions: 3

Compression:

Stored size: 783 Bytes

Contents

require_relative "proxy"

module EmailCrawler
  # Runs a query through a Google search page and collects the result links.
  #
  # All HTTP access goes through the object returned by the private #agent
  # method, which is built with +new_agent+. +new_agent+ is not defined in
  # this class; presumably it comes from MechanizeHelper -- TODO confirm.
  class Scraper
    # Upper bound on the number of result URLs returned per query.
    MAX_URLS = 10

    include MechanizeHelper

    # @param google_website [String] host name without the "www." prefix
    #   (e.g. "google.com"); it is interpolated into
    #   "https://www.<google_website>/".
    def initialize(google_website)
      @google_website = "https://www.#{google_website}/"
    end

    # Fetches the search home page, fills the "q" field of the form whose
    # action is "/search", submits it, and extracts result links from the
    # response.
    #
    # NOTE(review): if the page has no form with action "/search" (or no "q"
    # field), the lookup presumably yields nil and the following call raises
    # NoMethodError -- verify against Mechanize's form_with/field_with docs.
    #
    # @param q [String] the search query text
    # @return [Array<String>] at most MAX_URLS lower-cased href values, with
    #   links back to "/search?q=..." excluded
    def top_ten_urls_for(q)
      search_page = agent.get(@google_website)
      search_form = search_page.form_with(action: "/search")
      search_form.field_with(name: "q").value = q
      search_results_page = agent.submit(search_form)

      hrefs = search_results_page.search("#search ol li h3.r a").map { |a| a["href"] }

      # An anchor without an href attribute yields nil; drop those instead of
      # failing on nil.downcase.
      hrefs.compact.
        map(&:downcase).
        reject { |url| url =~ %r(\A/search[?]q=) }.
        first(MAX_URLS)
    end

  private

    # Lazily builds and memoizes the HTTP agent, routing it through a proxy
    # chosen by Proxy.random on port "8888". Calling +new_agent+ without the
    # block would skip the proxy configuration.
    def agent
      @agent ||= new_agent { |mechanize| mechanize.set_proxy(Proxy.random, "8888") }
    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygems

Version Path
email_crawler-0.0.4 lib/email_crawler/scraper.rb
email_crawler-0.0.3 lib/email_crawler/scraper.rb
email_crawler-0.0.2 lib/email_crawler/scraper.rb