require 'thread'
require 'fileutils'
require 'securerandom'
require 'mechanize'               # Mechanize is used below but was not required
require 'eventmachine'            # EM reactor driving the download loop
require 'em-http-request'         # EventMachine::HttpRequest
require 'active_support/core_ext' # provides #present?
require_relative 'helper'

module ImgDl
  class Parser
    include Helper

    Default_Options = {
      url_limit_count:   nil,   # stop enqueuing pages after this many URLs
      url_reg:           nil,   # only follow links matching this regexp
      image_limit_count: nil,   # stop enqueuing images after this many
      image_reg:         nil,   # only download images matching this regexp
      recursive:         false, # follow links found on parsed pages
      prefix:            nil,   # filename prefix for saved images
      interval:          0      # seconds to sleep between page fetches
    }

    attr_reader :agent, :origin_url, :options, :image_count, :url_count,
                :running, :error_urls, :downloaded_image_count,
                :success_download, :status, :dl_status
    alias running? running

    def initialize url, save_path, options = {}
      @agent = Mechanize.new
      @agent.user_agent_alias = 'Linux Mozilla'
      @origin_url  = URI url
      @current_url = URI url
      @_urls = Hash.new 0 # URLs already seen, for de-duplication
      @_imgs = Hash.new 0 # image sources already seen
      @save_path = save_path
      FileUtils.mkdir_p save_path
      @image_count = 0
      @url_count   = 0
      @urls       = Queue.new
      @error_urls = Queue.new
      enq_urls url
      @images = Queue.new
      @options = Default_Options.merge options
      define_options_helper @options # Helper: defines readers like #interval, #recursive?, #url_reg?
      @downloaded_image_count = 0
      @running     = true
      @downloading = true
      @success_download = 0
      @status    = "start"
      @dl_status = "ready"
    end

    # Runs the page parser in a background thread while the calling thread
    # blocks in the EventMachine download loop.
    def start
      Thread.start { parse }
      download
    rescue StandardError => e
      p e
    end

    # Crawls pages from the URL queue, collecting image sources and
    # (optionally) further links, until a limit is hit or the queue drains.
    def parse
      loop do
        break unless next_parse?
        sleep interval
        @status = "get url"
        url = @urls.shift
        # NOTE: URI.escape is deprecated/removed in newer Rubies; this code
        # targets the older Ruby/Mechanize versions it was written for.
        url = URI.escape url if url.respond_to? :gsub # URI objects don't respond to :gsub
        @current_url = URI url
        begin
          page = @agent.get url
        rescue StandardError => e
          @error_urls << [url, e]
          puts e
          next
        end
        # Skip non-HTML responses (plain files have no #images).
        unless page.respond_to? :images
          redo
        end
        parse_images page
        parse_links page if continue?
      end
      @running = false
      @status = "parser complete"
    end

    def default_head
      @_default_head ||= {
        "USER-AGENT"      => "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17",
        "ACCEPT-ENCODING" => "gzip,deflate,sdch",
        "ACCEPT"          => '*/*',
        "ACCEPT-CHARSET"  => "UTF-8,*;q=0.5",
        "ACCEPT-LANGUAGE" => "zh-CN,zh;q=0.8",
        "connection"      => "close"
      }
    end

    # Pulls image URIs off the queue and fetches them asynchronously with
    # em-http-request, stopping the reactor once everything is accounted for.
    def download
      @dl_status = "start"
      @_download_image = 0 # number of image requests issued
      EM.run do
        loop do
          if !running? && (@images.empty? || (image_limit_count? && @_download_image >= image_limit_count))
            @dl_status = "all done"
            download_complete? and EM.stop
            break
          end
          if @images.empty?
            if running?
              @dl_status = "wait parser"
              sleep 3
              redo
            else
              next
            end
          end
          @_download_image += 1
          @dl_status = "shift image url"
          image_uri = @images.shift
          @dl_status = "download image #{image_uri}"
          http = EventMachine::HttpRequest.new(image_uri).get head: default_head
          http.callback do |res|
            # Derive the file extension from the Content-Type header.
            res.response_header["CONTENT_TYPE"] =~ /^image\/(\w+)/
            type = $1
            if type
              @success_download += 1
              save_image type, res.response
            else
              @error_urls << [image_uri, "image download error"]
            end
            @downloaded_image_count += 1
            @dl_status = "success: download image #{image_uri}"
            download_complete? and EM.stop
          end
          http.errback do |res|
            @error_urls << [image_uri, "image download error"]
            @downloaded_image_count += 1
            @dl_status = "failed: download image #{image_uri}"
            download_complete? and EM.stop
          end
        end
      end
      @dl_status = "download complete"
      @downloading = false
    end

    protected

    # True once the parser has stopped and every issued request has finished
    # (or the image limit has been reached).
    def download_complete?
      !running? && (@downloaded_image_count >= @_download_image ||
        (image_limit_count? and @downloaded_image_count >= image_limit_count))
    end

    def random_file_name
      SecureRandom.uuid
    end

    def save_image name = random_file_name, type, content
      file_name = File.join @save_path, "#{prefix}#{name}.#{type}"
      File.open(file_name, "w+") do |io|
        io.binmode
        io.write content
      end
    end

    # Returns a truthy value if the URL parses; records the error otherwise.
    def valid_url? url
      URI url
    rescue StandardError => e
      @error_urls << [url, e]
      false
    end

    def enq_urls link
      if !link_dup?(link) && valid_url?(link)
        @_urls[link] += 1
        @urls << link
        @url_count += 1
      end
    end

    def enq_images src
      if !image_dup?(src) && valid_url?(src)
        @_imgs[src] += 1
        @images << src
        @image_count += 1
      end
    end

    def link_dup? link
      @_urls.has_key? link
    end

    def image_dup? src
      @_imgs.has_key? src
    end

    def valid_link? link
      if url_reg?
        link.to_s =~ url_reg && !link_dup?(link)
      else
        !link_dup?(link)
      end
    end

    def parse_links page
      @status = "parse urls"
      links = page.links.map { |link| link.href.present? and URI.join @current_url, URI.escape(link.href) rescue nil }
      links.select! { |link| link.present? and valid_link?(link) }
      links.each { |link| enq_urls link }
    end

    def parse_images page
      @status = "parse images"
      images = page.images.map { |img| img.src.present? && URI.join(@current_url, URI.escape(img.src)) }
      images.select! { |img| img.to_s =~ image_reg } if image_reg?
      images.each { |img| enq_images img }
    end

    # Keep following links only in recursive mode and while under the limits.
    def continue?
      recursive? &&
        (image_limit_count? ? @image_count < image_limit_count : true) &&
        (url_limit_count? ? @url_count < url_limit_count : true)
    end

    # Keep parsing while under the limits and the URL queue is non-empty.
    def next_parse?
      (image_limit_count? ? @image_count < image_limit_count : true) &&
        (url_limit_count? ? @url_count < url_limit_count : true) &&
        !@urls.empty?
    end
  end
end
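
# Usage sketch (not part of the original file). It assumes the option readers
# such as #recursive? and #image_limit_count? are generated by
# Helper#define_options_helper, as the code above implies; the URL, save path,
# and option values are illustrative placeholders.
#
#   require_relative 'parser'
#
#   parser = ImgDl::Parser.new "http://example.com", "/tmp/imgs",
#                              recursive: true,
#                              image_limit_count: 100,
#                              image_reg: /\.(jpe?g|png|gif)/i,
#                              interval: 1
#
#   parser.start # spawns the parse thread, then blocks in the EM download loop
#   puts "saved #{parser.success_download} images, #{parser.error_urls.size} errors"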