Sha256: a4ea5e3347c4964a1b528e6e7a7fa2f209f78d3b08a7e972dcdba1131a176eb7

Contents?: true

Size: 1.88 KB

Versions: 3

Compression:

Stored size: 1.88 KB

Contents

module Downer
  module DownloadStrategy

    # Download strategy which scrapes a single HTML page and collects the urls
    # of the links (<a href>) and images (<img src>) found in it.
    class WebsiteStrategy < GenericStrategy

      # Matches links that already carry an absolute http(s)/ftp scheme.
      # Anchored at the start so that paths which merely contain "http" or
      # "ftp" (e.g. "/docs/ftp-howto.html") are not mistaken for absolute urls.
      ABSOLUTE_URL = %r{\A(?:https?|ftp)://}i

      # Create the downloading strategy, set any behavior flags in the options hash
      #
      # url_source     - url of the HTML page to scrape; it must parse to a URI
      #                  with both a scheme and a host, otherwise building the
      #                  host prefix below raises.
      # search_options - hash of behavior flags; this class reads :images_only.
      def initialize(url_source, search_options = {})
        super(url_source, search_options)
        uri = URI.parse(url_source)
        @host_prefix = uri.scheme + "://" + uri.host

        # Keep an explicit, non-default port (e.g. http://localhost:3000),
        # otherwise every absolutified link would point at the default port.
        if uri.port && uri.port != uri.default_port
          @host_prefix += ":#{uri.port}"
        end
      end

      # Retrieve urls from an HTML page. Behavior is dependent upon options passed
      # to constructor: with :images_only set only image urls are returned,
      # otherwise document links followed by image urls.
      #
      # Returns an Array of url strings without duplicates.
      def get_urls
        page = download_page
        # download_page memoizes an IO-like object which is consumed by the
        # parser; rewind it so that repeated calls parse the full page again.
        page.rewind if page.respond_to?(:rewind)
        @noko = Nokogiri::HTML(page)

        urls = @search_options[:images_only] ? image_urls : document_links + image_urls
        urls.uniq
      end

      # Fetch the html page and memoize the result.
      #
      # The url is opened through URI#open rather than a bare Kernel#open, so a
      # source string can never be interpreted as a local file or a "|command"
      # pipe. This relies on open-uri being loaded, exactly like the previous
      # open(url) call did (presumably required elsewhere in the gem).
      #
      # Returns the IO-like object produced by open-uri.
      def download_page
        @downloaded_page ||= URI.parse(@url_source).open
      end

      # Return all image urls from document (requires get_urls to have parsed
      # the page into @noko). Images without a usable src are skipped.
      def image_urls
        @noko.css('img').map { |img| absolutify_link(img['src']) }.compact
      end

      # Return all links stored within the document (requires get_urls to have
      # parsed the page into @noko). Anchors without a usable href are skipped.
      def document_links
        @noko.css('a').map { |alink| absolutify_link(alink['href']) }.compact
      end

      # Converts non absolute urls to absolute ones
      #
      # link - raw attribute value, may be nil when the attribute is missing.
      #
      # Returns the link untouched when it already is an absolute http(s)/ftp
      # url, the link prefixed with the scheme/host of the source page
      # otherwise, or nil when the link is nil or blank.
      #
      # NOTE(review): relative links (including './' and '../' style ones) are
      # resolved against the host root, not against the directory of the source
      # page. This keeps the existing behavior for dot-prefixed links.
      def absolutify_link(link)
        link = link.to_s.strip
        return nil if link.empty?
        return link if link =~ ABSOLUTE_URL

        # Everything else is treated as a path on the source host; make sure
        # it starts with exactly the slash needed to append it to the prefix.
        link = "/" + link unless link.start_with?('/')
        @host_prefix + link
      end

      # Check that the source url can be parsed.
      #
      # Returns the parsed URI (truthy); URI.parse raises URI::InvalidURIError
      # for a malformed url.
      def source_valid?
        URI.parse(@url_source)
      end

    end
  end
end

Version data entries

3 entries across 3 versions & 1 rubygem

Version Path
downer-0.3.2 lib/downer/strategies/website_strategy.rb
downer-0.3.1 lib/downer/strategies/website_strategy.rb
downer-0.3.0 lib/downer/strategies/website_strategy.rb