Sha256: 4fc543e3b4d8a502cb315060bfecd2246639ef953f09491c529075dda1b1ba60

Contents?: true

Size: 1.56 KB

Versions: 1

Compression:

Stored size: 1.56 KB

Contents

require "cgi"
require "set"
require_relative "mechanize_helper"
require_relative "url_helper"

module EmailCrawler
  # Collects result URLs from a Google web search by walking the paginated
  # result pages until enough unique URLs have been gathered.
  #
  # NOTE(review): `agent` and `extract_domain_from` are not defined in this
  # file; presumably they come from MechanizeHelper and URLHelper — confirm.
  class Scraper
    MAX_RESULTS = 100

    # Step of the `start=` offset between consecutive result pages.
    RESULTS_PER_PAGE = 10

    # Captures the still-escaped target URL embedded in a result link of the
    # form "/url?q=<target>&sa=...". Links lacking either marker do not match.
    RESULT_HREF = %r{/url\?q=(.*?)&sa=}m

    private_constant :RESULTS_PER_PAGE, :RESULT_HREF

    include MechanizeHelper
    include URLHelper

    # @param google_website [String] host suffix placed after "https://www."
    #   when building the search URL
    # @param max_results [Integer] upper bound on the number of URLs returned
    # @param blacklisted_domains [Array<String>] domain suffixes; URLs whose
    #   extracted domain ends with one of them are discarded
    def initialize(google_website, max_results: MAX_RESULTS, blacklisted_domains: [])
      @search_url = "https://www.#{google_website}/search?q="
      @max_results = max_results
      # Escape each entry so that "." (and any other metacharacter) is matched
      # literally instead of being interpreted as a regexp operator.
      @blacklisted_domains = blacklisted_domains.map do |domain|
        /#{Regexp.escape(domain)}\z/
      end
    end

    # Runs the search for +q+ and returns the unique result URLs.
    #
    # @param q [String] the search query (escaped before being sent)
    # @return [Array<String>] at most +max_results+ unique URLs
    def search_result_urls_for(q)
      current_page = agent.get(@search_url + CGI.escape(q))
      urls = Set.new(search_results_on(current_page))

      page = 1
      while urls.size < @max_results
        # Look for the link on the page fetched most recently, so pagination is
        # not limited to the page links shown on the first page. The (?!\d)
        # guard keeps "start=10" from matching "start=100".
        next_page_link = current_page.link_with(href: /start=#{page * RESULTS_PER_PAGE}(?!\d)/)
        break unless next_page_link

        current_page = next_page_link.click
        urls.merge(search_results_on(current_page))

        page += 1
      end

      urls.first(@max_results)
    end

    private

    # Extracts the unescaped target URLs from the result links on +page+,
    # dropping links that are not in the "/url?q=...&sa=" form and URLs whose
    # domain is blacklisted.
    #
    # @param page [#search] a fetched results page
    # @return [Array<String>]
    def search_results_on(page)
      urls = page.search("#search ol li.g h3.r a").map do |a|
        # A capture group is used instead of $POSTMATCH / $PREMATCH, which are
        # only defined after `require "English"`.
        encoded_url = a[:href].to_s[RESULT_HREF, 1]
        CGI.unescape(encoded_url) if encoded_url
      end.compact

      return urls if @blacklisted_domains.empty?

      urls.reject do |url|
        domain = extract_domain_from(url)
        @blacklisted_domains.any? { |blacklisted_domain| domain =~ blacklisted_domain }
      end
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
email_crawler-0.1.1 lib/email_crawler/scraper.rb