Sha256: ccfab00a2551ad54d87999602049e9dfb4da2215e4ec400e6b19c1f5bfe90e00
Contents?: true
Size: 1.05 KB
Versions: 1
Compression:
Stored size: 1.05 KB
Contents
module Rawler
  # Fetches one page and extracts the HTTP(S) links found in its <a> elements.
  #
  # Collaborators referenced here but defined elsewhere: Rawler.url (compared
  # against the page URL to decide whether the page is on the crawled host),
  # Rawler.output (receives diagnostic messages via #puts), Rawler::Request
  # (performs the HEAD/GET requests) and Nokogiri (parses the HTML).
  class Crawler
    attr_accessor :url
    # Only the writer is generated: the reader is the computed #links below.
    # (Declaring both via attr_accessor triggers a "method redefined" warning.)
    attr_writer :links

    # @param url [String] address of the page to inspect
    def initialize(url)
      @url = url
    end

    # Returns the absolute http/https URLs linked from the page.
    #
    # The page is not fetched at all when it lives on a different host than
    # Rawler.url or when its content type is not exactly 'text/html'.
    # Anchors whose href cannot be parsed as a URI are skipped instead of
    # aborting the whole page.
    #
    # @return [Array<String>] absolute URLs; empty when the page is skipped
    #   or the connection is refused
    def links
      return [] if different_domain?(url, Rawler.url) || not_html?(url)

      response = Rawler::Request.get(url)
      document = Nokogiri::HTML(response.body)

      document.css('a')
              .map { |anchor| absolute_url(anchor['href']) }
              .select { |link| valid_url?(link) }
    rescue Errno::ECONNREFUSED
      write("Couldn't connect to #{url}")
      []
    end

    private

    # Escapes characters that may not appear in a URI, using the same
    # RFC 2396 rules as URI.encode, which was removed in Ruby 3.0.
    # URI::RFC2396_PARSER is preferred when the constant exists because newer
    # uri releases switched DEFAULT_PARSER to a different parser.
    def escape(string)
      parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
      parser.escape(string)
    end

    # Resolves +path+ (an href, possibly nil or relative) against the page URL.
    #
    # @return [String, nil] the absolute URL, or nil when it is not a valid URI
    def absolute_url(path)
      URI.parse(escape(url)).merge(escape(path.to_s)).to_s
    rescue URI::InvalidURIError
      nil
    end

    # Sends +message+ to the configured Rawler output.
    def write(message)
      Rawler.output.puts(message)
    end

    # True when the two URLs have different hosts.
    def different_domain?(url_1, url_2)
      URI.parse(escape(url_1)).host != URI.parse(escape(url_2)).host
    end

    # True when the HEAD response's content type is anything but 'text/html'.
    def not_html?(url)
      Rawler::Request.head(url).content_type != 'text/html'
    end

    # True only for parseable URLs whose scheme is http or https.
    def valid_url?(url)
      return false if url.nil?

      %w[http https].include?(URI.parse(escape(url)).scheme)
    rescue URI::InvalidURIError
      false
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
rawler-0.0.4 | lib/rawler/crawler.rb |