Sha256: ccfab00a2551ad54d87999602049e9dfb4da2215e4ec400e6b19c1f5bfe90e00

Contents?: true

Size: 1.05 KB

Versions: 1

Compression:

Stored size: 1.05 KB

Contents

module Rawler

  # Fetches a single page and collects the HTTP(S) links found in its
  # <a> elements, resolved against the page's own URL.
  class Crawler

    # The URL of the page this crawler inspects.
    attr_accessor :url

    # Retained only so existing callers of +links=+ keep working. The
    # assigned value is never read: #links always computes its result.
    attr_writer :links

    # @param url [String] address of the page to crawl
    def initialize(url)
      @url = url
    end

    # Returns the absolute http/https URLs referenced by the page's anchors.
    #
    # An empty array is returned without fetching the page body when the
    # page's host differs from the host of Rawler.url, or when the HEAD
    # response's content type is not exactly 'text/html'.
    #
    # A refused connection is reported through Rawler.output and also
    # yields an empty array.
    #
    # @return [Array<String>]
    def links
      return [] if different_domain?(url, Rawler.url) || not_html?(url)

      response = Rawler::Request.get(url)
      doc = Nokogiri::HTML(response.body)

      # Anchors without an href attribute (e.g. <a name="top">) are not
      # links; without this filter they would resolve to the page itself.
      hrefs = doc.css('a').map { |anchor| anchor['href'] }.compact

      hrefs.map { |href| absolute_url(href) }.select { |link| valid_url?(link) }
    rescue Errno::ECONNREFUSED
      write("Couldn't connect to #{url}")
      []
    end

    private

    # Resolves +path+ (absolute or relative reference) against the page URL.
    #
    # @param path [#to_s] raw href value
    # @return [String] the resolved URL
    def absolute_url(path)
      URI.parse(escape(url)).merge(escape(path)).to_s
    end

    # Prints +message+ on the configured Rawler output stream.
    def write(message)
      Rawler.output.puts(message)
    end

    # @return [Boolean] true when the two URLs have different hosts
    def different_domain?(url_1, url_2)
      URI.parse(escape(url_1)).host != URI.parse(escape(url_2)).host
    end

    # @return [Boolean] true unless a HEAD request for +url+ reports a
    #   content type of exactly 'text/html'
    def not_html?(url)
      Rawler::Request.head(url).content_type != 'text/html'
    end

    # @return [Boolean] true when +url+ uses the http or https scheme
    def valid_url?(url)
      %w[http https].include?(URI.parse(escape(url)).scheme)
    end

    # Percent-encodes characters that may not appear in a URI.
    #
    # Stands in for URI.encode, which was removed in Ruby 3.0; the
    # RFC 2396 parser provides the same escaping rules.
    #
    # @param text [#to_s]
    # @return [String]
    def escape(text)
      uri_escaper.escape(text.to_s)
    end

    # Built lazily so that 'uri' only has to be loaded by the time a URL
    # is actually processed, and reused for every link on the page.
    def uri_escaper
      @uri_escaper ||= URI::RFC2396_Parser.new
    end

  end

end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
rawler-0.0.4 lib/rawler/crawler.rb