Sha256: 08caf2d425a7a447f5873a3006028c8c72f2f8813afd3f1e00d6e9e85591a304

Contents?: true

Size: 1.73 KB

Versions: 1

Compression:

Stored size: 1.73 KB

Contents

require 'capybara'
require 'capybara/selenium/driver'
require 'selenium/webdriver'

module Spieker
  # Visits a page in a Selenium-driven Firefox (through Capybara) and collects
  # the links found on it.
  #
  # NOTE: creating an instance changes global Capybara state: it sets
  # Capybara.app_host, (re-)registers the :tolq driver and makes it the
  # current driver.
  class LinkScraper

    include Capybara::DSL
    # Lets the link list be injected up front; when set, #result uses it
    # instead of driving the browser.
    attr_writer :links

    # @param url [String] URL of the page to scrape; expected to be absolute,
    #   since its scheme and host are used to build the Capybara app host
    # @param lang [String] language code appended to the visited URL as a
    #   "#!lang=<lang>" fragment
    # @raise [URI::InvalidURIError] if +url+ cannot be parsed
    def initialize(url, lang: 'en')
      @url = URI.parse(url)
      @lang = lang
      Capybara.app_host = app_host

      Capybara.register_driver :tolq do |app|
        profile = Selenium::WebDriver::Firefox::Profile.new
        # Announce ourselves with a Spieker-specific user agent.
        profile['general.useragent.override'] = "Mozilla/5.0 (compatible; Tolq Spieker/#{Spieker::VERSION}; +http://www.tolq.com)"

        Capybara::Selenium::Driver.new(app, profile: profile)
      end

      Capybara.current_driver = :tolq
    end

    # @return [Array<String>] the unique links of the page that pass
    #   LinkValidator, with their fragment removed
    def result
      cleaned_up_links(found_links)
    end

    # Base URL (scheme, host and, when it is not the scheme's default, port)
    # that Capybara resolves the visited path against.
    #
    # @return [String]
    def app_host
      # URI#host (unlike URI#hostname) keeps the brackets around IPv6
      # literals, which are required when the host is embedded in a URL.
      base = "#{@url.scheme}://#{@url.host}"
      # Keep an explicit non-default port, e.g. http://localhost:3000.
      return base if @url.port.nil? || @url.port == @url.default_port

      "#{base}:#{@url.port}"
    end

    private

    # Memoized raw link list; only drives the browser when no links have been
    # set or collected yet.
    def found_links
      @links ||= drive_page_for_links
    end

    # Opens the page in the browser and returns the href of every anchor.
    # Any error raised while doing so is reported on stdout and results in
    # an empty list.
    #
    # @return [Array<String, nil>] raw href values (nil for anchors without
    #   an href attribute)
    def drive_page_for_links
      visit page_path
      links = page.all('a').map { |el| el[:href] }
      await_content_submission
      links
    rescue => e
      puts "Error parsing #{@url}, #{e.inspect}"
      []
    end

    # Path to visit, relative to #app_host: the URL's path and query string
    # followed by the "#!lang=<lang>" fragment.
    def page_path
      query = @url.query ? "?#{@url.query}" : ''
      "#{@url.path}#{query}#!lang=#{@lang}"
    end

    # Looks for the marker class on <html> and reports on stdout when it
    # cannot be found; a missing marker does not abort the scrape.
    def await_content_submission
      # Our javascript adds a class if the content has been successfully submitted
      page.find(:css, 'html.tolq-content-updated')
    rescue Capybara::Ambiguous, Capybara::ElementNotFound => e
      puts "Something went wrong with submitting the content #{e.inspect}"
    end

    # Drops missing hrefs, keeps only the links LinkValidator accepts for
    # this URL, strips their fragments and removes duplicates.
    def cleaned_up_links(links)
      links.compact.select { |link|
        LinkValidator.new(link, @url.to_s).valid?
      }.map(&method(:filter_hash)).uniq
    end

    # Removes the fragment, i.e. everything from the first '#' onwards.
    #
    # @param link [String]
    # @return [String]
    def filter_hash(link)
      link.sub(/#.*\z/m, '')
    end
  end
end

# Output sink whose #puts discards everything it is given.
# NOTE(review): presumably meant as a stand-in for an IO such as $stdout when
# output should be silenced; no usage is visible in this file, so confirm.
class NullStream
  # Accepts any number of arguments, like IO#puts, and ignores them all.
  #
  # @return [nil]
  def puts(*); end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
spieker-0.0.7 lib/spieker/link_scraper.rb