Sha256: 368d18679e8058bf7a10924ff9ab7512bc0e38548d16da584ef1be382d4b76f6

Size: 1.46 KB

Versions: 4

Stored size: 1.46 KB

Contents

require_relative "url_helper"
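# MechanizeHelper (assumed to define #new_agent, used below) is not required
# here; it is presumably loaded by the gem's entry point.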

module EmailCrawler
  class Scraper
    MAX_RESULTS = 100

    include MechanizeHelper
    include URLHelper

    # google_website is a Google hostname such as "google.com" or "google.de";
    # blacklisted domains are compiled into end-of-string regexps.
    def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
      @google_website = "https://www.#{google_website}/"
      @max_results = max_results
      # Regexp.escape keeps dots in domains from acting as regexp wildcards
      @blacklisted_domains = blacklisted_domains.map { |domain| /#{Regexp.escape(domain)}\z/ }
    end

    # Submits the query to Google's search form, then follows the numbered
    # pagination links until @max_results URLs have been collected or no
    # further page link is found.
    def search_result_urls_for(q)
      search_page = agent.get(@google_website)
      search_form = search_page.form_with(action: "/search")
      search_form.field_with(name: "q").value = q
      search_results_page = agent.submit(search_form)
      urls = search_results_on(search_results_page)

      page = 1
      while urls.size < @max_results
        next_page_link = search_results_page.link_with(href: /start=#{page*10}/)
        return urls unless next_page_link

        # Refresh the current page so later iterations can find pagination
        # links that were not present on the first results page.
        search_results_page = next_page_link.click
        urls.concat(search_results_on(search_results_page)).uniq!
        page += 1
      end

      urls.first(@max_results)
    end

  private

    # Extracts result URLs from a results page, dropping Google's own
    # /search?q= links and any URL whose domain matches the blacklist.
    def search_results_on(page)
      page.search("#search ol li h3.r a").
        map { |a| a["href"].downcase }.
        reject { |url| url =~ %r(\A/search[?]q=) }.
        reject do |url|
          domain = extract_domain_from(url)
          @blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
        end
    end

    # Memoized Mechanize agent, presumably built by MechanizeHelper#new_agent.
    def agent
      @agent ||= new_agent
    end
  end
end
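
A minimal usage sketch, assuming the gem's entry point is required as
"email_crawler" and loads Scraper together with its helper mixins (the query
and blacklist below are purely illustrative):

require "email_crawler"

scraper = EmailCrawler::Scraper.new(
  "google.com",                          # expanded to https://www.google.com/
  max_results: 50,                       # stop once 50 result URLs are collected
  blacklisted_domains: %w(facebook.com)  # skip results whose domain ends with this
)

urls = scraper.search_result_urls_for("ruby email crawler")
puts urls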

Version data entries

4 entries across 4 versions & 1 rubygem

Version               Path
email_crawler-0.0.13  lib/email_crawler/scraper.rb
email_crawler-0.0.12  lib/email_crawler/scraper.rb
email_crawler-0.0.11  lib/email_crawler/scraper.rb
email_crawler-0.0.10  lib/email_crawler/scraper.rb