lib/image_scraper/client.rb in image_scraper-0.1.4 vs lib/image_scraper/client.rb in image_scraper-0.1.5
- old
+ new
@@ -2,28 +2,29 @@
class Client
# Public state of a scrape:
#   url                     - the page address as stored by #initialize (percent-escaped there).
#   convert_to_absolute_url - when true, scraped paths are passed through
#                             ImageScraper::Util.absolute_url against +url+.
#   include_css_images      - when true, #image_urls also appends the stylesheet images.
#   include_css_data_images - when true, CSS url(...) values containing "data:image"
#                             are returned without quote-stripping or absolutizing.
#   doc                     - the Nokogiri::HTML document built by #initialize, or nil
#                             when the page could not be read.
attr_accessor :url, :convert_to_absolute_url, :include_css_images, :include_css_data_images, :doc
# Fetches the page at +url+ and parses it so image URLs can be extracted.
#
# @param url [String] address of the page; it is percent-escaped before use.
# @param options [Hash] optional flags (the caller's hash is left untouched):
#   :convert_to_absolute_url (default true),
#   :include_css_images (default true),
#   :include_css_data_images (default false).
#
# When the page cannot be read, +doc+ is set to nil instead of raising, and
# the scraping methods then return empty results.
def initialize(url, options = {})
  # Non-destructive merge: reverse_merge! would write the defaults into the caller's hash.
  options = options.reverse_merge(:convert_to_absolute_url => true,
                                  :include_css_images => true,
                                  :include_css_data_images => false)
  # URI.escape was removed in Ruby 3.0; the default parser provides the same escaping.
  @url = URI::DEFAULT_PARSER.escape(url)
  @convert_to_absolute_url = options[:convert_to_absolute_url]
  @include_css_images = options[:include_css_images]
  @include_css_data_images = options[:include_css_data_images]
  html = begin
    # Kernel#open stopped handling URLs in Ruby 3.0. Prefer open-uri's URI.open
    # when it exists and fall back to Kernel.open on older rubies.
    opener = URI.respond_to?(:open) ? URI : Kernel
    # Block form closes the handle as soon as the body has been read.
    opener.open(@url) { |page| page.read }
  rescue StandardError
    # Best effort by design: an unreachable page yields a nil document.
    nil
  end
  @doc = html ? Nokogiri::HTML(html) : nil
end
# Returns every image URL found for the page: the <img> sources, followed by
# the images referenced from stylesheets when +include_css_images+ is set.
def image_urls
  found = page_images
  return found unless include_css_images
  found + stylesheet_images
end
# Collects the +src+ of every <img> element in the parsed document.
#
# Each source is percent-escaped and, when +convert_to_absolute_url+ is set,
# passed through ImageScraper::Util.absolute_url against +url+.
#
# @return [Array<String>] image URLs in document order; empty when +doc+ is blank.
def page_images
  urls = []
  return urls if doc.blank?
  doc.xpath("//img").each do |img|
    src = img["src"]
    # An <img> without a src attribute has nothing to report, and escaping nil would raise.
    next if src.nil?
    # URI.escape was removed in Ruby 3.0; the default parser provides the same escaping.
    image = URI::DEFAULT_PARSER.escape(src)
    image = ImageScraper::Util.absolute_url(url, image) if convert_to_absolute_url
    urls << image
  end
  urls
end
@@ -33,11 +34,11 @@
stylesheets.each do |stylesheet|
file = open(stylesheet)
css = file.string rescue IO.read(file)
images += css.scan(/url\((.*?)\)/).collect do |image_url|
- image_url = image_url[0]
+ image_url = URI.escape image_url[0]
if image_url.include?("data:image") and @include_css_data_images
image_url
else
image_url = ImageScraper::Util.strip_quotes(image_url)
@convert_to_absolute_url ? ImageScraper::Util.absolute_url(url,image_url) : image_url
@@ -46,11 +47,12 @@
end
images
end
# Lists the stylesheets linked from the parsed document.
#
# Every <link rel="stylesheet"> href is resolved through
# ImageScraper::Util.absolute_url against +url+ and then percent-escaped.
#
# @return [Array<String>] stylesheet URLs in document order; empty when +doc+ is blank.
def stylesheets
  return [] if doc.blank?
  doc.xpath('//link[@rel="stylesheet"]').map do |stylesheet|
    absolute = ImageScraper::Util.absolute_url(url, stylesheet['href'])
    # URI.escape was removed in Ruby 3.0; the default parser provides the same escaping.
    URI::DEFAULT_PARSER.escape(absolute)
  end
end
end
-end
\ No newline at end of file
+end