lib/rawler/crawler.rb in rawler-0.1.0 vs lib/rawler/crawler.rb in rawler-0.1.1

- old
+ new

@@ -1,48 +1,72 @@ +# `Rawler::Crawler` is responsible for parsing links inside a page + module Rawler class Crawler - attr_accessor :url, :links + # An instance of Rawler::Crawler has a url which represents the url for which we want to parse links. + attr_accessor :url + + # We want to skip some kind of formats + SKIP_FORMATS = /^(javascript|mailto)/ + + # To use this class, just pass it a url def initialize(url) @url = url.strip end + + # And then call `links` to get its links. def links + # If the url is different than the main Rawler.url, or if the page is not html, we return an empty array if different_domain?(url, Rawler.url) || not_html?(url) return [] end + # Otherwise we fetch the page + response = Rawler::Request.get(url) + + # And kindly ask nokogiri to convert it for us doc = Nokogiri::HTML(response.body) + + # We then do some magic, search all the links in the document that contain a valid link, and return them. doc.css('a').map { |a| a['href'] }.select { |url| !url.nil? }.map { |url| absolute_url(url) }.select { |url| valid_url?(url) } - rescue Errno::ECONNREFUSED # TODO: add called from + rescue Errno::ECONNREFUSED write("Couldn't connect to #{url}") [] - rescue Errno::ETIMEDOUT # TODO: add called from + rescue Errno::ETIMEDOUT write("Connection to #{url} timed out") [] end private + # Here's how we transform a relative url to an absolute url + def absolute_url(path) + # First, encode the url path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]")) + + # if the url contains a scheme that means it's already absolute if URI.parse(path).scheme path else + # Otherwise we merge `url` to get the absolute url URI.parse(url).merge(path).to_s end rescue URI::InvalidURIError write("Invalid url: #{path} - Called from: #{url}") nil end - # TODO: add 'called from in a more pragmatic way as an optional parameter + # Some helper methods + def write(message) Rawler.output.error(message) end def different_domain?(url_1, url_2)