require 'rubygems'
require 'mechanize'
require 'csv'
require 'fileutils'
require 'pathname'
require 'openssl'
require 'i18n'
require_relative 'proxy'
require_relative 'proxy_handler'
require_relative 'csv_handlers'

# TODO: tuck this away into a core_extensions module
class String
  # Replace everything that is not alphanumeric or whitespace with a space.
  def alnum
    gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
  end
end

class DuckScraper
  include CSVHandlers

  def initialize(working_dir, input_file, output_file, options)
    @working_dir = working_dir
    @input_file = input_file
    @output_file = output_file
    @noproxy = options[:noproxy]

    @headers = get_headers(@input_file)
    @headers << "Linkedin Import Status" unless @headers.include?("Linkedin Import Status")
    @headers << "Urls" unless @headers.include?("Urls")
    @input_length = CSV.read(@input_file).length - 1

    # Resume from where a previous run left off, if an output file exists.
    if File.exist?(@output_file)
      @start = CSV.read(@output_file, headers: true).length
      puts "resuming from row #{@start + 1}"
    else
      create_file(@output_file)
    end

    @cooldown = 5
    @proxies = ProxyHandler.new(@cooldown) unless @noproxy
  end

  def find_profiles
    count = 0
    CSV.foreach(@input_file, headers: true) do |input_row|
      count += 1
      next if @start && @start >= count

      # Allow one retry per proxy, or three retries when running without proxies.
      tries = @proxies ? @proxies.length : 3
      puts "ddg #{count}/#{@input_length}"

      begin
        unless sufficient_data?(input_row)
          puts "Insufficient data, skipping"
          append_ddg_row(input_row, "Insufficient Data", nil)
          next
        end

        cert_file = Pathname.new(File.dirname(__dir__)).realdirpath + '../data/cacert.pem'
        cert_store = OpenSSL::X509::Store.new
        cert_store.add_file(cert_file.to_s)

        agent = Mechanize.new
        agent.cert_store = cert_store

        unless @noproxy
          proxy = @proxies.get_proxy
          agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
          agent.user_agent = proxy.user_agent
          puts "proxy: #{proxy.ip}"
        end
        sleep(@cooldown) if @noproxy

        query_string = create_query(input_row)
        puts "query string: #{query_string}"

        ddg_page = agent.get('https://www.duckduckgo.com/html')
        search_form = ddg_page.form_with(id: 'search_form_homepage')
        search_form.q = query_string
        results_page = agent.submit(search_form)

        urls = find_results(results_page, input_row)
        if urls.length > 0
          puts "Success! #{urls.length} possible urls found"
          append_ddg_row(input_row, "DDG results found", urls.join(', '))
        else
          puts "no results found"
          append_ddg_row(input_row, "No DDG results found", nil)
        end
        proxy.good if proxy
      rescue StandardError => e
        tries -= 1
        if tries > 0
          puts "\n\n"
          puts e.message
          puts 'RETRYING'
          puts "\n\n"
          proxy.used if proxy
          retry
        else
          append_ddg_row(input_row, e.message, nil)
          puts e.message
        end
      end
    end
  end

  def append_ddg_row(row, status, urls)
    row << ["Linkedin Import Status", status]
    row << ["Urls", urls]
    output_row = create_row(row, @headers)
    append_to_csv(@output_file, output_row)
  end

  # A row is usable only when all four fields needed to build a query are present.
  def sufficient_data?(row)
    fields = ["First Name", "Last Name",
              "Employer Organization Name 1", "Employer 1 Title"]
    fields.all? { |field| row[field] && row[field].alnum.strip != "" }
  end
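  # Example: a row with a first name, last name, and employer but a blank
  # "Employer 1 Title" fails sufficient_data?, so find_profiles records it as
  # "Insufficient Data" rather than querying DuckDuckGo.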
  def find_results(page, row)
    matches = []
    full_name = "#{row['First Name']} #{row['Last Name']}"
    # Some exports leak the email address into the name fields; strip it if present.
    full_name = full_name.gsub(row["Email"], ' ') if row["Email"]
    full_name = full_name.alnum.strip

    results = page.css("#links .results_links_deep")
    return matches if results.empty?

    results.each do |result|
      next unless result.at_css("a.result__a")

      url_text = result.css("a.result__a").text.alnum
      url = result.at_css('a.result__a')['href']
      bio = result.css("a.result__snippet").text.alnum
      short_title = row["Employer 1 Title"].alnum.split.first(2)
      short_employer = row["Employer Organization Name 1"].alnum.split.first

      valid_url = true
      # Skip LinkedIn directory pages that list multiple profiles.
      valid_url = false if result.css("a.large").text.include?("profiles | LinkedIn")
      # Keep only individual profile urls (/in/ or /pub/).
      unless url.include?("linkedin") && (url.include?("/in/") || url.include?("/pub/"))
        valid_url = false
      end

      if valid_url && name_check(url_text, full_name)
        # Rank results whose snippet mentions both the title and the employer first.
        if bio.downcase.include?(short_title.join(' ').downcase) &&
           bio.downcase.include?(short_employer.to_s.downcase)
          matches.unshift(url)
        else
          matches.push(url)
        end
      end
    end
    matches
  end

  # True only when every word of the CSV name appears in the result title.
  def name_check(lin_name, csv_name)
    lin_words = lin_name.downcase.split
    csv_name.downcase.split.all? { |word| lin_words.include?(word) }
  end

  def create_query(row)
    query_parts = [row["First Name"], row["Last Name"],
                   row["Employer 1 Title"], row["Employer Organization Name 1"]]
    query_parts.collect! do |part|
      part = part.gsub(row["Email"], ' ') if row["Email"]
      part.downcase.alnum.strip
    end
    "linkedin #{query_parts.join(' ')}"
  end
end
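
# Minimal usage sketch. The paths below are hypothetical; :noproxy is the only
# option key the class reads. With :noproxy set, the scraper sleeps @cooldown
# seconds between queries instead of rotating through ProxyHandler proxies.
#
#   scraper = DuckScraper.new(Dir.pwd, 'data/input.csv', 'data/output.csv', noproxy: true)
#   scraper.find_profiles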