Sha256: 14741f87f4c9af12c711d3c3f04b9e46b67442031c093fac74268376420be9a1
Contents?: true
Size: 1.83 KB
Versions: 1
Compression:
Stored size: 1.83 KB
Contents
module Rawler
  class Crawler

    attr_accessor :url, :links

    SKIP_FORMATS = /^(javascript|mailto)/

    def initialize(url)
      @url = url.strip
    end

    def links
      if different_domain?(url, Rawler.url) || not_html?(url)
        return []
      end

      response = Rawler::Request.get(url)

      doc = Nokogiri::HTML(response.body)
      doc.css('a').map { |a| a['href'] }.
        select { |url| !url.nil? }.
        map { |url| absolute_url(url) }.
        select { |url| valid_url?(url) }
    rescue Errno::ECONNREFUSED
      # TODO: add called from
      write("Couldn't connect to #{url}")
      []
    rescue Errno::ETIMEDOUT
      # TODO: add called from
      write("Connection to #{url} timed out")
      []
    end

    private

    def absolute_url(path)
      path = URI.encode(path.strip)
      if path[0].chr == '/'
        URI.parse(url).merge(path.to_s).to_s
      elsif URI.parse(path).scheme.nil?
        URI.parse(url).merge("/#{path.to_s}").to_s
      else
        path
      end
    rescue URI::InvalidURIError
      write("Invalid url: #{path} - Called from: #{url}")
      nil
    end

    # TODO: add 'called from' in a more pragmatic way as an optional parameter
    def write(message)
      Rawler.output.error(message)
    end

    def different_domain?(url_1, url_2)
      URI.parse(url_1).host != URI.parse(url_2).host
    end

    def not_html?(url)
      Rawler::Request.head(url).content_type != 'text/html'
    end

    def valid_url?(url)
      return false unless url
      url.strip!
      scheme = URI.parse(url).scheme

      if ['http', 'https'].include?(scheme)
        true
      else
        write("Invalid url - #{url}") unless url =~ SKIP_FORMATS
        false
      end
    rescue URI::InvalidURIError
      # Log the bad URL, then return false so it is filtered out of the results
      write("Invalid url - #{url}")
      false
    end

  end
end
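For context, a minimal usage sketch of this class. It assumes Rawler.url and Rawler.output are module-level accessors configured elsewhere in the gem (they are referenced by Crawler but not defined in this file), so those setters and the require path are assumptions, not confirmed API:

    require 'logger'
    require 'rawler'

    # Assumed configuration: Crawler reads Rawler.url (the root domain) and
    # Rawler.output (an error logger); these setters are an assumption.
    Rawler.url    = 'http://example.com/'
    Rawler.output = Logger.new($stderr)

    # Collect the crawlable links found on a page of the configured domain.
    crawler = Rawler::Crawler.new('http://example.com/')
    crawler.links.each { |link| puts link }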
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path
--- | ---
rawler-0.0.9 | lib/rawler/crawler.rb