require 'uri'

require 'capybara'
require 'capybara/selenium/driver'
require 'selenium/webdriver'

module Spieker
  # Visits a single URL in a Selenium-driven Firefox session and collects the
  # `href` of every anchor on the rendered page.
  #
  # The page is requested with a `#!lang=<lang>` fragment appended. After the
  # links are gathered, the scraper waits for the `tolq-content-updated` class
  # to appear on the `<html>` element before returning.
  class LinkScraper
    include Capybara::DSL

    # Allows the discovered links to be injected up front, which bypasses the
    # browser entirely (see #found_links).
    attr_writer :links

    # @param url [String] absolute URL of the page to scrape
    # @param lang [String] language code placed in the `#!lang=` fragment
    #
    # NOTE: this mutates global Capybara state (app_host, the :tolq driver
    # registration and current_driver).
    def initialize(url, lang: 'en')
      @url = URI.parse(url)
      @lang = lang

      Capybara.app_host = app_host
      Capybara.register_driver :tolq do |app|
        profile = Selenium::WebDriver::Firefox::Profile.new
        # Sorry internet, but we need the whitelisting
        profile['general.useragent.override'] = "Mozilla/5.0 (compatible; Googlebot TolqSpieker/#{Spieker::VERSION}; +http://www.tolq.com)"
        Capybara::Selenium::Driver.new(app, profile: profile)
      end
      Capybara.current_driver = :tolq
    end

    # @return [Array<String>] links accepted by LinkValidator, with their
    #   fragment removed, without nils or duplicates
    def result
      cleaned_up_links(found_links)
    end

    # Base URL (scheme and host) that Capybara resolves visited paths against.
    # An explicit non-default port is kept so that targets such as
    # `http://localhost:3000` are not silently redirected to the default port.
    #
    # @return [String]
    def app_host
      base = "#{@url.scheme}://#{@url.hostname}"
      port = @url.port
      return base if port.nil? || port == @url.default_port

      "#{base}:#{port}"
    end

    private

    # Memoised raw links; only drives the browser when no links were assigned.
    def found_links
      @links ||= drive_page_for_links
    end

    # Loads the page, harvests the anchor hrefs and waits for the content
    # submission marker. Any StandardError is reported on stdout and turned
    # into an empty result so one broken page does not abort the caller.
    #
    # @return [Array<String, nil>] raw href values as reported by the browser
    def drive_page_for_links
      visit page_path
      hrefs = collect_hrefs
      await_content_submission
      hrefs
    rescue => e
      puts "Error parsing #{@url.to_s}, #{e.message}"
      []
    end

    # Path, optional query string and language fragment to visit, relative to
    # #app_host.
    def page_path
      query = @url.query ? "?#{@url.query}" : ''
      @url.path + query + "#!lang=#{@lang}"
    end

    # Capybara + selenium causes some links not to be found. There doesn't seem
    # to be any method to that, and asking the browser through JS is a lot
    # faster as well. Capybara's own finder is only used when the script call
    # times out.
    def collect_hrefs
      page.evaluate_script('document.getElementsByTagName(\'a\')').map { |el| el['href'] }
    rescue Net::ReadTimeout
      page.all('a').map { |el| el['href'] }
    end

    # Our javascript adds a class if the content has been successfully
    # submitted. A missing marker is only reported, never raised, because the
    # links collected so far are still usable.
    def await_content_submission
      page.find(:css, 'html.tolq-content-updated')
    rescue Capybara::Ambiguous, Capybara::ElementNotFound, Net::ReadTimeout => e
      puts "Something went wrong with submitting the content: #{e.message}"
    end

    # Keeps the links LinkValidator accepts for this URL, strips fragments and
    # removes nils and duplicates.
    def cleaned_up_links(links)
      links
        .select { |link| LinkValidator.new(link, @url.to_s).valid? }
        .map { |link| filter_hash(link) }
        .compact
        .uniq
    end

    # Removes the fragment from a link. The fragment starts at the FIRST `#`,
    # so everything from there to the end of the string is dropped.
    #
    # @param link [String]
    # @return [String] the link without its fragment
    def filter_hash(link)
      link.sub(/#.*/m, '')
    end
  end
end

# Silent stand-in for an output stream: accepts and discards whatever is
# passed to #puts.
class NullStream
  def puts(*); end
end