module Feedzirra
  # Handles HTTP requests for Feedzirra, including registration of on success
  # and on failure callbacks.
  #
  # Recognised options (all read from the trailing options hash given to #new):
  # [:backend]             Hash whose :class entry is instantiated and used to
  #                        get/set the last retrieval for a URL.
  # [:on_success]          Callable invoked with the retrievable after a
  #                        successful (or 304) response.
  # [:on_failure]          Callable invoked with the retrievable, response code,
  #                        raw header string and raw body string.
  # [:user_agent]          Overrides Feedzirra::USER_AGENT.
  # [:if_modified_since]   Object responding to #httpdate.
  # [:compress]            When present, gzip/deflate encodings are requested.
  # [:http_authentication] Array joined with ':' and used as user:password.
  class HttpMulti
    attr_reader :options, :retrievables, :multi, :responses

    DEFAULTS = {
      :backend => {
        :class => Feedzirra::Backend::Memory
      }
    }

    # === Parameters
    # [args] Any number of retrievables (feeds or String URLs, possibly nested
    #        in arrays), optionally followed by an options hash.
    def initialize(*args)
      @options      = DEFAULTS.merge(args.extract_options!)
      @retrievables = args.flatten
      @multi        = Curl::Multi.new
      @responses    = {}
      @backend      = @options[:backend][:class].new
    end

    # Prepares the curl object and calls #perform
    def run
      prepare
      @multi.perform
    end

    # Breaks the urls into chunks of 30 because of weird errors encountered on
    # entering more items. As one finishes it pops another off the queue.
    def prepare
      # Work on a copy so that @retrievables itself is never consumed.
      retrievable_queue = @retrievables.dup
      retrievable_queue.slice!(0, 30).each do |retrievable|
        add_to_multi(retrievable, retrievable_queue)
      end
    end

    # Generic method for building Curl::Multi object. Retrievable may be a Feed
    # or a String URL.
    #
    # === Parameters
    # [retrievable]       An object responding to #feed_url, or a URL.
    # [retrievable_queue] Remaining retrievables; handed on to the handlers so
    #                     they can schedule the next request.
    def add_to_multi(retrievable, retrievable_queue)
      if retrievable.respond_to?(:feed_url)
        url = retrievable.feed_url
      else
        url = retrievable
        retrievable = @backend.get(url) # Try to fetch the last retrieval from backend
      end
      @multi.add(build_curl_easy(url, retrievable, retrievable_queue))
    end

    # Builds a Curl::Easy object that can be added to Curl::Multi.
    #
    # === Returns
    # The configured Curl::Easy with success and failure callbacks attached.
    def build_curl_easy(url, retrievable, retrievable_queue)
      Curl::Easy.new(url) do |curl|
        set_curl_configuration(curl, retrievable)

        curl.on_success do |c|
          on_success_handler(c, url, retrievable, retrievable_queue)
        end

        curl.on_failure do |c|
          # A 304 delivered to the failure callback is handled as a success.
          if c.response_code == 304
            on_success_handler(c, url, retrievable, retrievable_queue)
          else
            on_failure_handler(c, url, retrievable, retrievable_queue)
          end
        end
      end
    end

    # Records which entries of +updated_feed+ are new relative to +retrievable+.
    def set_updated_feed_entries!(retrievable, updated_feed)
      if retrievable.respond_to?(:update_from_feed)
        retrievable.update_from_feed(updated_feed)
      else
        # all elements are "new", since we weren't dealing with a Feed element.
        updated_feed.new_entries = updated_feed.entries
      end
    end

    # Handles successful Curl responses.
    #
    # Schedules the next queued retrievable, then either reuses +retrievable+
    # (on 304) or parses the decoded body into a new feed. The result is stored
    # in the backend and in #responses before the :on_success callback runs.
    # Any StandardError raised along the way is printed and reported through
    # the :on_failure callback instead of propagating.
    #
    # NOTE(review): :on_success receives +retrievable+, which is nil when a bare
    # URL had nothing stored in the backend -- confirm whether +updated_feed+
    # was intended.
    def on_success_handler(curl, url, retrievable, retrievable_queue)
      add_to_multi(retrievable_queue.shift, retrievable_queue) unless retrievable_queue.empty?

      begin
        if curl.response_code == 304
          updated_feed = retrievable
          updated_feed.new_entries = []
        else
          # Decode first: with :compress the body may arrive gzip/deflate encoded.
          updated_feed = parser_for_xml(decode_content(curl)).run
          updated_feed.feed_url      = curl.last_effective_url
          updated_feed.etag          = etag_from_header(curl.header_str)
          updated_feed.last_modified = last_modified_from_header(curl.header_str)
          set_updated_feed_entries!(retrievable, updated_feed)
        end

        @backend.set(url, updated_feed)
        responses[url] = updated_feed
        @options[:on_success].call(retrievable) if @options.key?(:on_success)
      rescue StandardError => e
        # StandardError only: signals and exit requests must not be swallowed.
        puts "Caught exception, but we're throwing it away: #{e}"
        if @options.key?(:on_failure)
          @options[:on_failure].call(retrievable, curl.response_code, curl.header_str, curl.body_str)
        end
      end
    end

    # Handles failed Curl responses.
    #
    # Schedules the next queued retrievable, stores the response code in
    # #responses and invokes the :on_failure callback when one was given.
    def on_failure_handler(curl, url, retrievable, retrievable_queue)
      add_to_multi(retrievable_queue.shift, retrievable_queue) unless retrievable_queue.empty?

      responses[url] = curl.response_code
      if @options.key?(:on_failure)
        @options[:on_failure].call(retrievable, curl.response_code, curl.header_str, curl.body_str)
      end
    end

    # Determines the etag from the response headers.
    #
    # === Parameters
    # [header] Raw header string returned from the request
    # === Returns
    # A string of the etag or nil if it cannot be found in the headers.
    def etag_from_header(header)
      # Header field names are case-insensitive, hence the /i flag.
      match = /.*ETag:\s(.*)\r/i.match(header)
      match && match[1]
    end

    # Determines the last modified date from the response headers.
    #
    # === Parameters
    # [header] Raw header string returned from the request
    # === Returns
    # A Time object of the last modified date or nil if it cannot be found in
    # the headers.
    def last_modified_from_header(header)
      match = /.*Last-Modified:\s(.*)\r/i.match(header)
      Time.parse(match[1]) if match
    end

    # Builds the parser used to turn a raw XML string into a feed.
    def parser_for_xml(xml)
      Feedzirra::FeedParser.new(xml)
    end

    # Applies the instance options to a Curl::Easy object.
    #
    # === Parameters
    # [curl]        The Curl::Easy object to configure.
    # [retrievable] Optional; when it responds to #etag with a value, that value
    #               is sent as If-None-Match.
    # === Returns
    # The same Curl::Easy object.
    def set_curl_configuration(curl, retrievable = nil)
      curl.headers["User-Agent"]        = @options[:user_agent] || Feedzirra::USER_AGENT
      curl.headers["If-Modified-Since"] = @options[:if_modified_since].httpdate if @options.key?(:if_modified_since)
      curl.headers["If-None-Match"]     = retrievable.etag if retrievable.respond_to?(:etag) && retrievable.etag
      curl.headers["Accept-Encoding"]   = 'gzip, deflate' if @options.key?(:compress)

      curl.follow_location = true
      curl.userpwd = @options[:http_authentication].join(':') if @options.key?(:http_authentication)

      curl
    end

    # Decodes the XML document if it was compressed.
    #
    # === Parameters
    # [curl] The Curl::Easy response object from the request.
    # === Returns
    # A decoded string of XML.
    def decode_content(curl)
      headers = curl.header_str

      if headers =~ /Content-Encoding: gzip/i
        begin
          gz = Zlib::GzipReader.new(StringIO.new(curl.body_str))
          xml = gz.read
          gz.close
          xml
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          curl.body_str
        end
      elsif headers =~ /Content-Encoding: deflate/i
        Zlib::Inflate.inflate(curl.body_str)
      else
        curl.body_str
      end
    end
  end
end