lib/image_scraper.rb in image_scraper-0.0.2 vs lib/image_scraper.rb in image_scraper-0.1.0

- old
+ new

require 'pp'
require 'rails'
require 'open-uri'
require 'nokogiri'

# Collects the image URLs referenced by a web page and, optionally, by the
# stylesheets that page links to.
module ImageScraper
  # Fetches one page and exposes the image references found in it.
  #
  # Recognised options (all optional):
  #   :convert_to_absolute_url  - resolve references to absolute URLs (default true)
  #   :include_css_images       - also scan linked stylesheets (default true)
  #   :include_css_data_images  - keep inline "data:" URIs found in CSS (default false)
  class Client
    DEFAULT_OPTIONS = {
      :convert_to_absolute_url => true,
      :include_css_images      => true,
      :include_css_data_images => false
    }.freeze

    # Captures whatever sits between "url(" and the next ")" in a stylesheet.
    CSS_URL_PATTERN = /url\((.*?)\)/

    attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc

    # Downloads and parses the page at +url+.
    #
    # @param url [String] address (or local path) of the HTML document
    # @param options [Hash] see the class comment; the hash is not modified
    def initialize(url, options = {})
      # Plain Hash#merge: no ActiveSupport dependency and the caller's hash
      # is left untouched.
      settings = DEFAULT_OPTIONS.merge(options)
      @url = url
      @convert_to_absolute_url = settings[:convert_to_absolute_url]
      @include_css_images = settings[:include_css_images]
      @include_css_data_images = settings[:include_css_data_images]
      @doc = open_resource(url) { |io| Nokogiri::HTML(io) }
    end

    # @return [Array<String>] images from <img> tags, followed by images
    #   referenced from stylesheets when +include_css_images+ is set
    def image_urls
      include_css_images ? page_images + stylesheet_images : page_images
    end

    # @return [Array<String>] the src of every <img> that has one, made
    #   absolute when +convert_to_absolute_url+ is set
    def page_images
      sources = doc.xpath("//img").map { |img| img["src"] }.compact
      return sources unless convert_to_absolute_url

      sources.map { |src| ImageScraper::Util.absolute_url(url, src) }
    end

    # Downloads every linked stylesheet and collects its url(...) references.
    #
    # @return [Array<String>]
    # @raise [OpenURI::HTTPError] when a stylesheet cannot be fetched
    def stylesheet_images
      stylesheets.flat_map do |stylesheet|
        css = open_resource(stylesheet) { |io| io.read }
        # String#scan yields one Array per match because the pattern has a
        # capture group; destructure it to get at the String.
        css.scan(CSS_URL_PATTERN).map { |(raw)| css_image_url(raw, stylesheet) }.compact
      end
    end

    # @return [Array<String>] absolute URLs of the page's stylesheet links
    def stylesheets
      hrefs = doc.xpath('//link[@rel="stylesheet"]').map { |link| link['href'] }.compact
      hrefs.map { |href| ImageScraper::Util.absolute_url(url, href) }
    end

    private

    # Opens +location+ and yields the IO, closing it afterwards. Remote
    # addresses go through URI#open rather than Kernel#open, which would
    # execute a command for input starting with "|" and no longer accepts
    # URLs on Ruby 3. Anything else is treated as a local path.
    def open_resource(location, &block)
      uri = begin
        URI.parse(location)
      rescue URI::InvalidURIError
        nil
      end
      return uri.open(&block) if uri.respond_to?(:open)

      File.open(location, &block)
    end

    # Normalises one raw url(...) capture from +stylesheet+.
    # Returns nil when the reference should be left out of the result.
    def css_image_url(raw, stylesheet)
      reference = raw.strip.gsub(/\A["']|["']\z/, '')
      return nil if reference.empty?

      if reference.start_with?('data:')
        # Inline payload: nothing to resolve, only kept on request.
        include_css_data_images ? reference : nil
      elsif convert_to_absolute_url
        # CSS references are relative to the stylesheet, not to the page.
        ImageScraper::Util.absolute_url(stylesheet, reference)
      else
        reference
      end
    end
  end

  # Stateless URL helpers.
  module Util
    # Resolves +asset+ against the document at +url+.
    #
    # @param url [String] absolute URL of the referencing document
    # @param asset [String, nil] reference found in that document
    # @return [String] +asset+ unchanged when it already carries a scheme,
    #   the document's own URL (without query) when +asset+ is nil,
    #   otherwise the resolved absolute URL
    def self.absolute_url(url, asset = nil)
      return domain(url) + path(url) if asset.nil?
      return asset if asset.include?("://") || asset.start_with?('data:')

      begin
        # Handles root-relative, document-relative, "../" and
        # protocol-relative ("//host/path") references.
        URI.join(url, asset).to_s
      rescue URI::Error
        # Reference is not a parsable URI (e.g. contains spaces): fall back
        # to plain concatenation onto the site root.
        separator = asset.start_with?('/') ? '' : '/'
        "#{domain(url)}#{separator}#{asset}"
      end
    end

    # @return [String] scheme and host of +url+, plus the port when it is
    #   not the scheme's default
    def self.domain(url)
      uri = URI.parse(url)
      port = uri.port && uri.port != uri.default_port ? ":#{uri.port}" : ''
      "#{uri.scheme}://#{uri.host}#{port}"
    end

    # @return [String] path component of +url+
    def self.path(url)
      URI.parse(url).path
    end
  end

  class Railtie < Rails::Railtie
  end
end