Sha256: fca5eb63488f72afccf4e5dfd9f63eb673f0724c42a5b8c80e52a80eabe343f1

Contents?: true

Size: 1.05 KB

Versions: 1

Compression:

Stored size: 1.05 KB

Contents

require 'nokogiri'
require 'crawler/http'
require 'crawler/formatting'

module Crawler
  # Mixin with helpers for fetching an HTML page and pulling links, static
  # assets and same-domain paths out of it.
  #
  # The including object is expected to supply +content+ (the parsed
  # document), +links+ (an array of href strings) and +@uri+ (the URI of the
  # page being crawled); none of them are defined in this module.
  # +request+ and +normalize_path+ are presumably provided by the Http and
  # Formatting mixins -- confirm against those files.
  module DocumentParser
    include Formatting
    include Http

    # Schemes a link may carry explicitly and still be treated as crawlable.
    CRAWLABLE_SCHEMES = %w[http https].freeze

    private

    # Fetches +uri+ via +request+ and parses the result as HTML.
    #
    # Returns whatever Nokogiri::HTML builds from the response.
    def parse_content(uri)
      Nokogiri::HTML(request(uri))
    end

    # Returns the unique href values of every <a> element in +content+.
    #
    # Anchors without an href and bare '#' placeholders are dropped.
    def extract_links
      hrefs = content.css('a').map { |anchor| anchor['href'] }
      hrefs.compact.reject { |href| href == '#' }.uniq
    end

    # Returns the unique static asset references found in +content+:
    # +src+ of <img>/<script>, +poster+ of <video> and +href+ of <link>.
    #
    # NOTE(review): every <link> href is collected regardless of its +rel+
    # (canonical, alternate, ...) -- confirm that this is intended.
    def extract_assets
      sources = content.css('img', 'script').map { |node| node['src'] }
      posters = content.css('video').map { |node| node['poster'] }
      hrefs   = content.css('link').map { |node| node['href'] }

      # Elements lacking the attribute yield nil; drop those, then dedupe
      # while keeping first-seen order.
      (sources + posters + hrefs).compact.uniq
    end

    # Returns the normalized paths of all +links+ that point at the crawled
    # domain (relative links, or absolute links whose host equals
    # +@uri.hostname+). Links that cannot be followed are omitted.
    def extract_domain_specific_paths
      links.map { |link| domain_specific_path(link) }.compact
    end

    # Returns the normalized path for +link+ when it belongs to the crawled
    # domain, or nil when it should be skipped.
    def domain_specific_path(link)
      uri = Addressable::URI.parse(link.strip)

      # mailto:, javascript:, tel: and similar links have no hostname, so a
      # host check alone would mistake them for relative paths.
      return if uri.scheme && !CRAWLABLE_SCHEMES.include?(uri.scheme.downcase)
      return unless uri.hostname.nil? || uri.hostname == @uri.hostname

      normalize_path uri.path
    rescue Addressable::URI::InvalidURIError
      # A single malformed href must not abort extraction for the whole page.
      nil
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
ruby-crawler-0.0.1 lib/crawler/document_parser.rb