# frozen_string_literal: true

require 'uri'
require 'capybara'
require 'capybara/poltergeist'

module Spieker
  # Visits a single page through Capybara and collects the links found on it
  # that point back to the same site.
  #
  # Note that constructing an instance writes to Capybara's global
  # configuration (app_host, registered drivers, current driver).
  class LinkScraper
    # Matches links that do not start with an http(s) scheme, a
    # protocol-relative `//` prefix or a `data:` scheme. Anchored with \A (not
    # ^) so only the very start of the string is inspected, and
    # case-insensitive because URI schemes are case-insensitive.
    LOCAL_LINK_REGEX = %r{\A(?!https?:|//|data:).*}i

    # Matches links whose scheme is `mailto:`. Anchored so that ordinary paths
    # which merely contain the word "mailto" are not treated as e-mail links.
    EMAIL_LINK_REGEX = /\Amailto:/i

    include Capybara::DSL

    # Allows the raw link list to be preset, in which case the page is never
    # visited (see #found_links).
    attr_writer :links

    # @param url [String] absolute URL of the page to scrape
    def initialize(url)
      @url = URI.parse(url)
      Capybara.app_host = app_host
      Capybara.register_driver :poltergeist do |app|
        Capybara::Poltergeist::Driver.new(app, phantomjs_logger: NullStream.new)
      end
      # NOTE(review): the :poltergeist driver registered above is not the one
      # selected here; confirm whether :selenium is intentional.
      Capybara.current_driver = :selenium
    end

    # @return [Array<String>] unique same-site links with fragments removed
    #   and mailto links excluded
    def result
      cleaned_up_links(found_links)
    end

    # Base URL (scheme, host and, when it is not the scheme's default, port)
    # used as Capybara's app_host.
    #
    # @return [String]
    def app_host
      # `host` (unlike `hostname`) keeps the brackets around IPv6 literals,
      # which are required when the value is used inside a URL.
      base = "#{@url.scheme}://#{@url.host}"
      port = @url.port
      return base if port.nil? || port == @url.default_port

      "#{base}:#{port}"
    end

    private

    # Raw hrefs for the page, fetched at most once and memoized in @links.
    def found_links
      @links ||= drive_page_for_links
    end

    # Loads the page and returns the href of every anchor element.
    # Best-effort: any StandardError is reported on stdout and yields [].
    def drive_page_for_links
      visit page_path
      page.all('a').map { |el| el[:href] }
    rescue StandardError
      puts "Error parsing #{@url}"
      []
    end

    # Path to visit relative to app_host, including the query string when the
    # URL has one so that query-driven pages are not silently replaced by
    # their bare path.
    def page_path
      query = @url.query
      query ? "#{@url.path}?#{query}" : @url.path
    end

    # Keeps local, non-mailto links, strips their fragments and de-duplicates.
    def cleaned_up_links(links)
      links
        .select { |link| is_local?(link) && !is_email?(link) }
        .map { |link| filter_hash(link) }
        .compact
        .uniq
    end

    # True when the link is relative (per LOCAL_LINK_REGEX) or its hostname
    # equals the scraped URL's hostname. nil and unparsable links are not
    # local.
    def is_local?(link)
      return true if LOCAL_LINK_REGEX =~ link

      URI.parse(link).hostname == @url.hostname
    rescue StandardError
      false
    end

    # Removes the fragment, i.e. everything from the FIRST `#` onwards.
    # Links without a `#` are returned unchanged.
    def filter_hash(link)
      link.partition('#').first
    end

    # Truthy when the link uses the mailto scheme.
    def is_email?(link)
      link =~ EMAIL_LINK_REGEX
    end
  end
end

# Output sink that discards everything written to it.
class NullStream
  # Accepts and ignores any arguments, mirroring IO#puts' signature.
  def puts(*); end

  # Discards the given data and, like IO#write, returns the number of bytes
  # that were "written".
  def write(*chunks)
    chunks.inject(0) { |total, chunk| total + chunk.to_s.bytesize }
  end
end