Sha256: db967e3ca02395c43a666dd7643e87339bcd5942234c95843ec4bd145d55808d
Contents?: true
Size: 783 Bytes
Versions: 3
Compression:
Stored size: 783 Bytes
Contents
require_relative "proxy"

module EmailCrawler
  # Submits a query through a Google site's "/search" form and collects the
  # links found in the result listing.
  class Scraper
    # Upper bound on the number of URLs returned by #top_ten_urls_for.
    MAX_URLS = 10

    include MechanizeHelper

    # google_website - value interpolated into "https://www.<value>/" to form
    #                  the URL of the page that carries the search form.
    def initialize(google_website)
      @google_website = "https://www.#{google_website}/"
    end

    # Runs the query +q+ and returns at most MAX_URLS downcased href strings
    # taken from the result anchors, leaving out hrefs that begin with
    # "/search?q=".
    def top_ten_urls_for(q)
      landing_page = agent.get(@google_website)
      form = landing_page.form_with(action: "/search")
      form.field_with(name: "q").value = q
      results_page = agent.submit(form)

      hrefs = results_page.search("#search ol li h3.r a").map do |link|
        link["href"].downcase
      end
      kept = hrefs.reject { |href| href =~ %r(\A/search[?]q=) }
      kept.first(MAX_URLS)
    end

    private

    # Memoized agent, configured to go through a randomly chosen proxy on
    # port "8888". new_agent is not defined in this file (presumably supplied
    # by MechanizeHelper -- confirm). The unproxied alternative is simply:
    #   @agent ||= new_agent
    def agent
      @agent ||= new_agent do |mech|
        mech.set_proxy(Proxy.random, "8888")
      end
    end
  end
end
Version data entries
3 entries across 3 versions & 1 rubygem
Version | Path |
---|---|
email_crawler-0.0.4 | lib/email_crawler/scraper.rb |
email_crawler-0.0.3 | lib/email_crawler/scraper.rb |
email_crawler-0.0.2 | lib/email_crawler/scraper.rb |