lib/image_scraper.rb in image_scraper-0.0.2 vs lib/image_scraper.rb in image_scraper-0.1.0
- old
+ new
@@ -1,20 +1,80 @@
+require 'pp'
+require 'rails'
require 'open-uri'
require 'nokogiri'
module ImageScraper
- class Railtie < Rails::Railtie
# Collects the image URLs referenced by a web page: the +src+ of every <img>
# tag and, optionally, the url(...) references found in its stylesheets.
class Client
  attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc

  # Option values used when the caller does not supply them.
  DEFAULT_OPTIONS = {
    :convert_to_absolute_url => true,
    :include_css_images => true,
    :include_css_data_images => false
  }.freeze

  # Matches locations that begin with a URI scheme (http://, https://, ...).
  REMOTE_LOCATION = %r{\A[a-z][a-z0-9+.-]*://}i

  # Fetches and parses the page at +url+.
  #
  # url     - String location of the page (a URL, or a local file path).
  # options - Hash of settings:
  #           :convert_to_absolute_url - resolve references via Util.absolute_url (default true)
  #           :include_css_images      - also scan linked stylesheets (default true)
  #           :include_css_data_images - keep inline data:image URIs from CSS (default false)
  def initialize(url, options = {})
    # Hash#merge leaves the caller's hash untouched and, unlike
    # reverse_merge!, does not require ActiveSupport.
    settings = DEFAULT_OPTIONS.merge(options)
    @url = url
    @convert_to_absolute_url = settings[:convert_to_absolute_url]
    @include_css_images = settings[:include_css_images]
    @include_css_data_images = settings[:include_css_data_images]
    @doc = load_document(url)
  end

  # Returns an Array of image references from the page itself, followed by
  # those from its stylesheets when include_css_images is set.
  def image_urls
    images = page_images
    images += stylesheet_images if include_css_images
    images
  end

  # Returns the +src+ of every <img> element in the document. Elements
  # without a +src+ attribute are skipped.
  def page_images
    sources = doc.xpath("//img").map { |img| img["src"] }.compact
    return sources unless convert_to_absolute_url
    sources.map { |src| ImageScraper::Util.absolute_url(url, src) }
  end

  # Returns the url(...) references found in every linked stylesheet.
  #
  # Inline data:image URIs are returned verbatim when include_css_data_images
  # is set and dropped otherwise. Other references are resolved against the
  # stylesheet they came from (CSS resolves url() relative to the stylesheet,
  # not the page) when convert_to_absolute_url is set.
  def stylesheet_images
    images = []
    stylesheets.each do |stylesheet|
      css_references(read_remote(stylesheet)).each do |reference|
        if reference.start_with?("data:image")
          images << reference if include_css_data_images
        elsif convert_to_absolute_url
          images << ImageScraper::Util.absolute_url(stylesheet, reference)
        else
          images << reference
        end
      end
    end
    images
  end

  # Returns the absolute URL of every <link rel="stylesheet"> that has an href.
  def stylesheets
    hrefs = doc.xpath('//link[@rel="stylesheet"]').map { |link| link['href'] }
    hrefs.compact.map { |href| ImageScraper::Util.absolute_url(url, href) }
  end

  private

  # Parses the HTML found at +location+ and closes the underlying handle.
  # URLs are fetched through open-uri; anything else is read as a local file.
  def load_document(location)
    parse = lambda { |io| Nokogiri::HTML(io) }
    if location.to_s =~ REMOTE_LOCATION
      URI.parse(location).open(&parse)
    else
      File.open(location, &parse)
    end
  end

  # Returns the body served at +location+ as a String.
  #
  # The location is derived from markup in the scraped page, so it is opened
  # through URI#open only: Kernel#open would run a shell command for a value
  # beginning with "|".
  def read_remote(location)
    URI.parse(location).open { |io| io.read }
  end

  # Extracts the targets of all url(...) expressions in +css+, without the
  # optional surrounding whitespace and quotes.
  def css_references(css)
    targets = css.scan(/url\((.*?)\)/).map do |(raw)|
      raw.strip.gsub(/\A['"]|['"]\z/, "")
    end
    targets.reject(&:empty?)
  end
end
-
- def self.image_urls(url, convert_to_absolute_url=true)
- uri = URI.parse(url)
- domain = "#{uri.scheme}://#{uri.host}"
- doc = Nokogiri::HTML(open url)
- urls = []
- doc.xpath("//img").each do |img|
- image = img["src"]
- image = domain + image if convert_to_absolute_url and !image.include?("://")
- urls << image
+
# Helpers for turning references found in a page into absolute URLs.
module Util
  # Resolves +asset+ against the site that +url+ belongs to.
  #
  # url   - String address of the page the reference was found on.
  # asset - String reference to resolve, or nil.
  #
  # Returns a String:
  # * asset is nil                  -> scheme, host and path of +url+
  # * asset contains "://"          -> asset unchanged
  # * asset starts with "//"        -> asset prefixed with the scheme of +url+
  # * asset starts with "/"         -> site root followed by asset
  # * anything else                 -> site root, "/", asset (relative
  #   references are placed at the site root, not in the page's directory)
  def self.absolute_url(url, asset = nil)
    site = domain(url)
    return site + path(url) if asset.nil?
    return asset if asset.include?("://")
    # Protocol-relative reference: only the scheme is borrowed from the page.
    return "#{URI.parse(url).scheme}:#{asset}" if asset.start_with?("//")
    # start_with? instead of asset[0], which is an Integer on Ruby 1.8.
    return site + asset if asset.start_with?("/")
    "#{site}/#{asset}"
  end

  # Returns "scheme://host" for +url+, followed by ":port" when the URL
  # names a port other than the scheme's default.
  def self.domain(url)
    uri = URI.parse(url)
    origin = "#{uri.scheme}://#{uri.host}"
    origin += ":#{uri.port}" if uri.port && uri.port != uri.default_port
    origin
  end

  # Returns the path component of +url+ (no query string or fragment).
  def self.path(url)
    URI.parse(url).path
  end
end
+
# Empty subclass of Rails::Railtie: the body adds no methods or configuration of its own.
+ class Railtie < Rails::Railtie
end
end
\ No newline at end of file