lib/feedzirra/feed.rb in kete-feedzirra-0.0.8.1 vs lib/feedzirra/feed.rb in kete-feedzirra-0.0.16.1

- old
+ new

@@ -25,11 +25,11 @@
   # === Parameters
   # [xml<String>] The XML that you would like determine the parser for.
   # === Returns
   # The class name of the parser that can handle the XML.
   def self.determine_feed_parser_for_xml(xml)
-    start_of_doc = xml.slice(0, 1000)
+    start_of_doc = xml.slice(0, 2000)
     feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
   end
 
   # Adds a new feed parsing class that will be used for parsing.
   #
@@ -44,11 +44,11 @@
   # Provides a list of registered feed parsing classes.
   #
   # === Returns
   # A array of class names.
   def self.feed_classes
-    @feed_classes ||= [ITunesRSS, RSS, AtomFeedBurner, Atom]
+    @feed_classes ||= [Feedzirra::Parser::RSS, Feedzirra::Parser::AtomFeedBurner, Feedzirra::Parser::Atom]
   end
 
   # Makes all entry types look for the passed in element to parse. This is actually just a call to
   # element (a SAXMachine call) in the class
   #
@@ -56,29 +56,15 @@
   # [element_tag<String>]
   # [options<Hash>] Valid keys are same as with SAXMachine
   def self.add_common_feed_entry_element(element_tag, options = {})
     # need to think of a better way to do this. will break for people who want this behavior
     # across their added classes
-    [RSSEntry, AtomFeedBurnerEntry, AtomEntry].each do |klass|
+    feed_classes.map{|k| eval("#{k}Entry") }.each do |klass|
      klass.send(:element, element_tag, options)
     end
   end
 
-  # Makes all entry types look for the passed in elements to parse. This is actually just a call to
-  # elements (a SAXMachine call) in the class
-  #
-  # === Parameters
-  # [element_tag<String>]
-  # [options<Hash>] Valid keys are same as with SAXMachine
-  def self.add_common_feed_entry_elements(element_tag, options = {})
-    # need to think of a better way to do this. will break for people who want this behavior
-    # across their added classes
-    [RSSEntry, AtomFeedBurnerEntry, AtomEntry].each do |klass|
-      klass.send(:elements, element_tag, options)
-    end
-  end
-
   # Fetches and returns the raw XML for each URL provided.
   #
   # === Parameters
   # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
   # [options<Hash>] Valid keys for this argument as as followed:
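Note on the first hunk above: the sniffing window handed to each parser's able_to_parse? doubles from 1,000 to 2,000 bytes, so a feed whose root element sits behind a long XML prolog, doctype, or comment block can still be matched. A minimal sketch of the call, with a hypothetical file path:

    require 'feedzirra'

    xml = File.read('some_feed.xml')  # hypothetical local copy of a feed
    # Returns the parser class (e.g. Feedzirra::Parser::RSS), or nil if
    # nothing recognizes the first 2000 bytes.
    parser = Feedzirra::Feed.determine_feed_parser_for_xml(xml)
    puts parser ? parser.name : 'no parser found'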
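The rewritten add_common_feed_entry_element derives each entry class by eval'ing the parser class name plus an "Entry" suffix, so it now also covers parsers registered after the fact, but only those that follow the naming convention. A sketch of a conforming custom parser; every MyFlavor name is hypothetical, and registration is assumed to go through Feedzirra::Feed.add_feed_class (the method whose doc comment appears in the first hunk):

    require 'feedzirra'
    require 'sax-machine'

    class MyFlavorEntry
      include SAXMachine
      element :title
    end

    class MyFlavor
      include SAXMachine
      elements :entry, :as => :entries, :class => MyFlavorEntry

      # Contract used by determine_feed_parser_for_xml above.
      def self.able_to_parse?(xml)
        xml.include?('<myflavor')
      end
    end

    Feedzirra::Feed.add_feed_class(MyFlavor)

    # eval("#{MyFlavor}Entry") resolves to MyFlavorEntry, so the common
    # element is declared on it alongside the built-in entry classes.
    Feedzirra::Feed.add_common_feed_entry_element(:updated)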
@@ -98,13 +84,16 @@
     url_queue.each do |url|
       easy = Curl::Easy.new(url) do |curl|
         curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
         curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
         curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
-        curl.headers["Accept-encoding"] = 'gzip, deflate'
+        curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
         curl.follow_location = true
         curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
+
+        curl.max_redirects = options[:max_redirects] if options[:max_redirects]
+        curl.timeout = options[:timeout] if options[:timeout]
 
         curl.on_success do |c|
           responses[url] = decode_content(c)
         end
         curl.on_failure do |c|
@@ -113,38 +102,38 @@
       end
       multi.add(easy)
     end
 
     multi.perform
-    return urls.is_a?(String) ? responses.values.first : responses
+    urls.is_a?(String) ? responses.values.first : responses
   end
 
   # Fetches and returns the parsed XML for each URL provided.
   #
   # === Parameters
   # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
   # [options<Hash>] Valid keys for this argument as as followed:
-  # * :user_agent - String that overrides the default user agent.
-  # * :if_modified_since - Time object representing when the feed was last updated. 
-  # * :if_none_match - String, an etag for the request that was stored previously.
-  # * :on_success - Block that gets executed after a successful request.
-  # * :on_failure - Block that gets executed after a failed request.
+  # * :user_agent - String that overrides the default user agent.
+  # * :if_modified_since - Time object representing when the feed was last updated.
+  # * :if_none_match - String, an etag for the request that was stored previously.
+  # * :on_success - Block that gets executed after a successful request.
+  # * :on_failure - Block that gets executed after a failed request.
   # === Returns
   # A Feed object if a single URL is passed.
   #
   # A Hash if multiple URL's are passed. The key will be the URL, and the value the Feed object.
   def self.fetch_and_parse(urls, options = {})
     url_queue = [*urls]
     multi = Curl::Multi.new
     responses = {}
 
-    # I broke these down so I would only try to do 30 simultaneously because 
+    # I broke these down so I would only try to do 30 simultaneously because
     # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
     url_queue.slice!(0, 30).each do |url|
       add_url_to_multi(multi, url, url_queue, responses, options)
     end
-    
+
     multi.perform
     return urls.is_a?(String) ? responses.values.first : responses
   end
 
   # Decodes the XML document if it was compressed.
@@ -192,11 +181,11 @@
     feed_queue.slice!(0, 30).each do |feed|
       add_feed_to_multi(multi, feed, feed_queue, responses, options)
     end
 
     multi.perform
-    return responses.size == 1 ? responses.values.first : responses.values
+    responses.size == 1 ? responses.values.first : responses.values
   end
 
   # An abstraction for adding a feed by URL to the passed Curb::multi stack.
   #
   # === Parameters
@@ -214,26 +203,33 @@
   def self.add_url_to_multi(multi, url, url_queue, responses, options)
     easy = Curl::Easy.new(url) do |curl|
       curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
       curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
       curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
-      curl.headers["Accept-encoding"] = 'gzip, deflate'
+      curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
      curl.follow_location = true
      curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
+
+      curl.max_redirects = options[:max_redirects] if options[:max_redirects]
+      curl.timeout = options[:timeout] if options[:timeout]
 
      curl.on_success do |c|
        add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
        xml = decode_content(c)
        klass = determine_feed_parser_for_xml(xml)
 
        if klass
-          feed = klass.parse(xml)
-          feed.feed_url = c.last_effective_url
-          feed.etag = etag_from_header(c.header_str)
-          feed.last_modified = last_modified_from_header(c.header_str)
-          responses[url] = feed
-          options[:on_success].call(url, feed) if options.has_key?(:on_success)
+          begin
+            feed = klass.parse(xml)
+            feed.feed_url = c.last_effective_url
+            feed.etag = etag_from_header(c.header_str)
+            feed.last_modified = last_modified_from_header(c.header_str)
+            responses[url] = feed
+            options[:on_success].call(url, feed) if options.has_key?(:on_success)
+          rescue Exception => e
+            options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
+          end
        else
          # puts "Error determining parser for #{url} - #{c.last_effective_url}"
          # raise NoParserAvailable.new("no valid parser for content.") (this would unfirtunately fail the whole 'multi', so it's not really useable)
          options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
        end
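Both fetch paths gain the same three request knobs: the Accept-encoding header is now only sent when a :compress key is present, and :max_redirects / :timeout are handed straight to the underlying Curl::Easy handle. A usage sketch with hypothetical URLs:

    require 'feedzirra'

    feeds = Feedzirra::Feed.fetch_and_parse(
      ['http://example.com/a.xml', 'http://example.com/b.xml'],  # hypothetical
      :compress      => true,  # presence of the key turns on gzip/deflate negotiation
      :max_redirects => 3,     # passed through to curl.max_redirects
      :timeout       => 10,    # passed through to curl.timeout (seconds)
      :on_success    => lambda { |url, feed| puts "#{url}: #{feed.title}" },
      :on_failure    => lambda { |url, code, header, body| warn "#{url} failed: #{code}" }
    )
    # With an Array argument the return value is a Hash keyed by URL;
    # with a single String it is the Feed object itself.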
@@ -268,18 +264,25 @@
       curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
       curl.headers["If-None-Match"] = feed.etag if feed.etag
       curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
       curl.follow_location = true
+      curl.max_redirects = options[:max_redirects] if options[:max_redirects]
+      curl.timeout = options[:timeout] if options[:timeout]
+
       curl.on_success do |c|
-        add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
-        updated_feed = Feed.parse(c.body_str)
-        updated_feed.feed_url = c.last_effective_url
-        updated_feed.etag = etag_from_header(c.header_str)
-        updated_feed.last_modified = last_modified_from_header(c.header_str)
-        feed.update_from_feed(updated_feed)
-        responses[feed.feed_url] = feed
-        options[:on_success].call(feed) if options.has_key?(:on_success)
+        begin
+          add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
+          updated_feed = Feed.parse(c.body_str)
+          updated_feed.feed_url = c.last_effective_url
+          updated_feed.etag = etag_from_header(c.header_str)
+          updated_feed.last_modified = last_modified_from_header(c.header_str)
+          feed.update_from_feed(updated_feed)
+          responses[feed.feed_url] = feed
+          options[:on_success].call(feed) if options.has_key?(:on_success)
+        rescue Exception => e
+          options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
+        end
       end
       curl.on_failure do |c|
         add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
         response_code = c.response_code
\ No newline at end of file
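The new begin/rescue wrappers change how failures surface: an exception raised while parsing, or inside a caller-supplied on_success block, is no longer allowed to escape Curl::Multi#perform and kill the whole batch; it is swallowed and reported through on_failure with the response that triggered it. A sketch of code relying on that routing; store_feed and the URL are hypothetical:

    require 'feedzirra'

    Feedzirra::Feed.fetch_and_parse(
      'http://example.com/flaky.xml',  # hypothetical
      :on_success => lambda { |url, feed|
        store_feed(feed)  # hypothetical persistence call; if it raises...
      },
      :on_failure => lambda { |url, code, header, body|
        # ...the rescue in add_url_to_multi fires this callback instead.
        warn "could not process #{url} (HTTP #{code})"
      }
    )

Note the asymmetry between the two wrapped hunks: the URL-based path hands on_failure the url, while the Feed-update path hands it the feed object itself.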