Sha256: 9f53dcac0cb271386e55217729006737e402dc789c35f31e92961322a04cca18

Contents?: true

Size: 1.07 KB

Versions: 1

Compression:

Stored size: 1.07 KB

Contents

module Rawler

  # Fetches a single page and extracts the http/https links found in its
  # anchor tags.
  class Crawler

    attr_accessor :url

    # Only the writer is generated here: the reader is the explicit #links
    # method below. The writer is kept so existing callers of `links=` keep
    # working; the value it stores is not read by #links.
    attr_writer :links

    # url - String address of the page to crawl.
    def initialize(url)
      @url = url
    end

    # Returns an Array of absolute http/https URL Strings, one per usable
    # anchor on the page.
    #
    # Returns [] without fetching the body when the page's host differs from
    # the host of Rawler.url, or when a HEAD request reports a content type
    # other than 'text/html'. Refused and timed-out connections are reported
    # through Rawler.output and also yield [].
    #
    # Anchors whose href cannot be parsed as a URI are reported through
    # Rawler.output and skipped, so one malformed link does not abort the
    # extraction of the remaining links.
    def links
      return [] if different_domain?(url, Rawler.url) || not_html?(url)

      response = Rawler::Request.get(url)
      doc = Nokogiri::HTML(response.body)

      hrefs = doc.css('a').map { |anchor| anchor['href'] }
      # absolute_url yields nil for unparseable hrefs; drop those here.
      resolved = hrefs.map { |href| absolute_url(href) }.compact
      resolved.select { |link| valid_url?(link) }
    rescue Errno::ECONNREFUSED
      write("Couldn't connect to #{url}")
      []
    rescue Errno::ETIMEDOUT
      write("Connection to #{url} timed out")
      []
    end

    private

    # Resolves +path+ against the crawled page's URL. A nil path is treated
    # as an empty string. Returns the absolute URL String, or nil (after
    # logging) when +path+ is not a valid URI reference.
    def absolute_url(path)
      URI.parse(url).merge(path.to_s).to_s
    rescue URI::InvalidURIError
      write("Invalid url: #{path} - Called from: #{url}")
      nil
    end

    # Sends +message+ to the configured Rawler output stream.
    def write(message)
      Rawler.output.puts(message)
    end

    # True when the two URLs have different hosts.
    def different_domain?(url_1, url_2)
      URI.parse(url_1).host != URI.parse(url_2).host
    end

    # True when a HEAD request for +url+ reports a content type other than
    # 'text/html'.
    def not_html?(url)
      Rawler::Request.head(url).content_type != 'text/html'
    end

    # True when +url+ parses and uses the http or https scheme. An
    # unparseable URL is considered invalid rather than raising.
    def valid_url?(url)
      scheme = URI.parse(url).scheme

      ['http', 'https'].include?(scheme)
    rescue URI::InvalidURIError
      false
    end

  end

end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
rawler-0.0.5 lib/rawler/crawler.rb