require 'uri'
require 'capybara'
require 'capybara/selenium/driver'
require 'selenium/webdriver'

module Spieker
  # Drives a Firefox browser (through Capybara + Selenium) to a single page and
  # collects the +href+ of every anchor found on it.
  #
  # Note that constructing an instance mutates global Capybara configuration:
  # it sets +Capybara.app_host+, registers the +:tolq+ driver and makes it the
  # current driver.
  class LinkScraper
    include Capybara::DSL

    # Allows the link list to be preset; #result then uses it instead of
    # driving the browser (see #found_links, which memoizes with ||=).
    attr_writer :links

    # @param url [String] absolute URL of the page to scrape
    # @param lang [String] language code appended to the visited URL as the
    #   +#!lang=<code>+ fragment
    def initialize(url, lang: 'en')
      @url = URI.parse(url)
      @lang = lang

      Capybara.app_host = app_host
      Capybara.register_driver :tolq do |app|
        profile = Selenium::WebDriver::Firefox::Profile.new
        profile['general.useragent.override'] = "Mozilla/5.0 (compatible; Tolq Spieker/#{Spieker::VERSION}; +http://www.tolq.com)"
        Capybara::Selenium::Driver.new(app, profile: profile)
      end
      Capybara.current_driver = :tolq
    end

    # @return [Array<String>] the validated, fragment-stripped, de-duplicated
    #   links found on the page
    def result
      cleaned_up_links(found_links)
    end

    # Base URL (scheme, host and, when it is not the scheme's default, port)
    # that Capybara resolves visited paths against.
    #
    # The port must be kept: without it a URL such as
    # +http://localhost:3000/page+ would be driven against port 80. +host+ is
    # used rather than +hostname+ so IPv6 literals keep their brackets and the
    # result stays a valid URL.
    #
    # @return [String]
    def app_host
      base = "#{@url.scheme}://#{@url.host}"
      port = @url.port
      return base if port.nil? || port == @url.default_port

      "#{base}:#{port}"
    end

    private

    # Memoized list of raw links; only drives the browser when no links have
    # been set or found yet.
    def found_links
      @links ||= drive_page_for_links
    end

    # Visits the page and returns the +href+ of every anchor on it.
    #
    # Any error raised while visiting or reading the page is reported on
    # standard output and results in an empty list (deliberate best effort).
    #
    # @return [Array<String, nil>] raw href values, or [] on failure
    def drive_page_for_links
      visit page_path + "#!lang=#{@lang}"
      links = page.all('a').map { |el| el[:href] }

      begin
        # Our javascript adds a class if the content has been successfully submitted
        page.find(:css, 'html.tolq-content-updated')
      rescue Capybara::Ambiguous, Capybara::ElementNotFound => e
        # A missing marker is only reported; the links gathered are still returned.
        puts "Something went wrong with submitting the content #{e.inspect}"
      end

      links
    rescue => e
      puts "Error parsing #{@url}, #{e.inspect}"
      []
    end

    # Path of the target page including its query string, so that a URL like
    # +/page?id=3+ is not silently reduced to +/page+.
    def page_path
      query = @url.query
      query ? "#{@url.path}?#{query}" : @url.path
    end

    # Keeps the links for which LinkValidator#valid? is truthy, strips their
    # fragments and removes nils and duplicates.
    def cleaned_up_links(links)
      valid_links = links.select { |link| LinkValidator.new(link, @url.to_s).valid? }
      valid_links.map(&method(:filter_hash)).compact.uniq
    end

    # Returns +link+ without its trailing fragment. The leading group is
    # greedy, so everything from the last '#' on the line onwards is dropped;
    # links without a '#' are returned unchanged.
    def filter_hash(link)
      match = link.match(/(.*)#(.*)$/)
      match ? match[1] : link
    end
  end
end

# Output sink that discards everything written to it with #puts.
class NullStream
  # Accepts (and ignores) any arguments, mirroring IO#puts' variadic
  # signature so it can be called exactly like a real stream.
  #
  # @return [nil]
  def puts(*); end
end