lib/feedzirra/feed.rb in pauldix-feedzirra-0.0.3 vs lib/feedzirra/feed.rb in pauldix-feedzirra-0.0.5

- old (pauldix-feedzirra-0.0.3)
+ new (pauldix-feedzirra-0.0.5)

@@ -2,34 +2,97 @@ class NoParserAvailable < StandardError; end
   class Feed
     USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"

+    # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedzirra::NoParserAvailable exception is raised.
+    #
+    # === Parameters
+    # [xml<String>] The XML that you would like parsed.
+    # === Returns
+    # An instance of the determined feed type. By default a Feedzirra::Atom, Feedzirra::AtomFeedBurner, Feedzirra::RDF, or Feedzirra::RSS object.
+    # === Raises
+    # Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
     def self.parse(xml)
       if parser = determine_feed_parser_for_xml(xml)
         parser.parse(xml)
       else
-        raise NoParserAvailable.new("no valid parser for content.")
+        raise NoParserAvailable.new("No valid parser for XML.")
       end
     end

+    # Determines the correct parser class to use for parsing the feed.
+    #
+    # === Parameters
+    # [xml<String>] The XML that you would like to determine the parser for.
+    # === Returns
+    # The class name of the parser that can handle the XML.
     def self.determine_feed_parser_for_xml(xml)
       start_of_doc = xml.slice(0, 1000)
       feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
     end

-    def self.add_feed_class(klass)
+    # Adds a new feed parsing class that will be used for parsing.
+    #
+    # === Parameters
+    # [klass<Constant>] The class/constant that you want to register.
+    # === Returns
+    # An updated array of feed parser class names.
+    def self.add_feed_class(klass)
       feed_classes.unshift klass
     end
-
+
+    # Provides a list of registered feed parsing classes.
+    #
+    # === Returns
+    # An array of class names.
     def self.feed_classes
-      @feed_classes ||= [RSS, AtomFeedBurner, Atom]
+      @feed_classes ||= [ITunesRSS, RSS, AtomFeedBurner, Atom]
     end
+
+    # Makes all entry types look for the passed-in element when parsing. This is actually just a call to
+    # element (a SAXMachine call) in the class.
+    #
+    # === Parameters
+    # [element_tag<String>]
+    # [options<Hash>] Valid keys are the same as with SAXMachine.
+    def self.add_common_feed_entry_element(element_tag, options = {})
+      # need to think of a better way to do this. will break for people who want this behavior
+      # across their added classes
+      [RSSEntry, AtomFeedBurnerEntry, AtomEntry].each do |klass|
+        klass.send(:element, element_tag, options)
+      end
+    end
+
+    # Makes all entry types look for the passed-in elements when parsing. This is actually just a call to
+    # elements (a SAXMachine call) in the class.
+    #
+    # === Parameters
+    # [element_tag<String>]
+    # [options<Hash>] Valid keys are the same as with SAXMachine.
+    def self.add_common_feed_entry_elements(element_tag, options = {})
+      # need to think of a better way to do this. will break for people who want this behavior
+      # across their added classes
+      [RSSEntry, AtomFeedBurnerEntry, AtomEntry].each do |klass|
+        klass.send(:elements, element_tag, options)
+      end
+    end

-    # can take a single url or an array of urls
-    # when passed a single url it returns the body of the response
-    # when passed an array of urls it returns a hash with the urls as keys and body of responses as values
+    # Fetches and returns the raw XML for each URL provided.
+    #
+    # === Parameters
+    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
+    # [options<Hash>] Valid keys for this argument are as follows:
+    #   :user_agent - String that overrides the default user agent.
+    #   :if_modified_since - Time object representing when the feed was last updated.
+    #   :if_none_match - String that's normally an etag for the request that was stored previously.
+    #   :on_success - Block that gets executed after a successful request.
+    #   :on_failure - Block that gets executed after a failed request.
+    # === Returns
+    # A String of XML if a single URL is passed.
+    #
+    # A Hash if multiple URLs are passed. The key will be the URL, and the value the XML.
     def self.fetch_raw(urls, options = {})
       url_queue = [*urls]
       multi = Curl::Multi.new
       responses = {}
       url_queue.each do |url|
@@ -37,10 +100,12 @@
           curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
           curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
           curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
           curl.headers["Accept-encoding"] = 'gzip, deflate'
           curl.follow_location = true
+          curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
+
           curl.on_success do |c|
             responses[url] = decode_content(c)
           end
           curl.on_failure do |c|
             responses[url] = c.response_code
@@ -50,103 +115,167 @@
       end
       multi.perform
       return urls.is_a?(String) ? responses.values.first : responses
     end
-
+
+    # Fetches and returns the parsed XML for each URL provided.
+    #
+    # === Parameters
+    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
+    # [options<Hash>] Valid keys for this argument are as follows:
+    # * :user_agent - String that overrides the default user agent.
+    # * :if_modified_since - Time object representing when the feed was last updated.
+    # * :if_none_match - String, an etag for the request that was stored previously.
+    # * :on_success - Block that gets executed after a successful request.
+    # * :on_failure - Block that gets executed after a failed request.
+    # === Returns
+    # A Feed object if a single URL is passed.
+    #
+    # A Hash if multiple URLs are passed. The key will be the URL, and the value the Feed object.
     def self.fetch_and_parse(urls, options = {})
       url_queue = [*urls]
       multi = Curl::Multi.new
-
+      responses = {}
+
       # I broke these down so I would only try to do 30 simultaneously because
       # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
-      responses = {}
       url_queue.slice!(0, 30).each do |url|
         add_url_to_multi(multi, url, url_queue, responses, options)
       end
       multi.perform
       return urls.is_a?(String) ? responses.values.first : responses
     end
-
-    def self.decode_content(c)
-      if c.header_str.match(/Content-Encoding: gzip/)
-        gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
+
+    # Decodes the XML document if it was compressed.
+    #
+    # === Parameters
+    # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
+    # === Returns
+    # A decoded string of XML.
+    def self.decode_content(curl_request)
+      if curl_request.header_str.match(/Content-Encoding: gzip/)
+        gz = Zlib::GzipReader.new(StringIO.new(curl_request.body_str))
         xml = gz.read
         gz.close
-      elsif c.header_str.match(/Content-Encoding: deflate/)
-        xml = Zlib::Deflate.inflate(c.body_str)
+      elsif curl_request.header_str.match(/Content-Encoding: deflate/)
+        xml = Zlib::Deflate.inflate(curl_request.body_str)
       else
-        xml = c.body_str
+        xml = curl_request.body_str
       end
-
+
       xml
     end
-
+
+    # Updates each feed for each Feed object provided.
+    #
+    # === Parameters
+    # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
+    # [options<Hash>] Valid keys for this argument are as follows:
+    # * :user_agent - String that overrides the default user agent.
+    # * :on_success - Block that gets executed after a successful request.
+    # * :on_failure - Block that gets executed after a failed request.
+    # === Returns
+    # An updated Feed object if a single URL is passed.
+    #
+    # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
     def self.update(feeds, options = {})
       feed_queue = [*feeds]
       multi = Curl::Multi.new
       responses = {}
+
       feed_queue.slice!(0, 30).each do |feed|
         add_feed_to_multi(multi, feed, feed_queue, responses, options)
       end
       multi.perform
       return responses.size == 1 ? responses.values.first : responses.values
     end

+    # An abstraction for adding a feed by URL to the passed Curb::multi stack.
+    #
+    # === Parameters
+    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
+    # [url<String>] The URL of the feed that you would like to be fetched.
+    # [url_queue<Array>] An array of URLs that are queued for request.
+    # [responses<Hash>] Existing responses that you want the response from the request added to.
+    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
+    # [options<Hash>] Valid keys for this argument are as follows:
+    # * :user_agent - String that overrides the default user agent.
+    # * :on_success - Block that gets executed after a successful request.
+    # * :on_failure - Block that gets executed after a failed request.
+    # === Returns
+    # The updated Curl::Multi object with the request details added to its stack.
     def self.add_url_to_multi(multi, url, url_queue, responses, options)
       easy = Curl::Easy.new(url) do |curl|
         curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
         curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
         curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
         curl.headers["Accept-encoding"] = 'gzip, deflate'
         curl.follow_location = true
+        curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
+
         curl.on_success do |c|
           add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
           xml = decode_content(c)
           klass = determine_feed_parser_for_xml(xml)
+
           if klass
             feed = klass.parse(xml)
             feed.feed_url = c.last_effective_url
             feed.etag = etag_from_header(c.header_str)
             feed.last_modified = last_modified_from_header(c.header_str)
             responses[url] = feed
             options[:on_success].call(url, feed) if options.has_key?(:on_success)
           else
-            puts "Error determining parser for #{url} - #{c.last_effective_url}"
+            raise NoParserAvailable.new("Error determining parser for #{url} - #{c.last_effective_url}.")
           end
         end
+
         curl.on_failure do |c|
           add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
           responses[url] = c.response_code
           options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
         end
       end
       multi.add(easy)
     end

-    def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
-      # on_success = options[:on_success]
-      # on_failure = options[:on_failure]
-      # options[:on_success] = lambda do ||
-
+    # An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
+    #
+    # === Parameters
+    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
+    # [feed<Feed>] A feed object that you would like to be fetched.
+    # [url_queue<Array>] An array of feed objects that are queued for request.
+    # [responses<Hash>] Existing responses that you want the response from the request added to.
+    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
+    # [options<Hash>] Valid keys for this argument are as follows:
+    # * :user_agent - String that overrides the default user agent.
+    # * :on_success - Block that gets executed after a successful request.
+    # * :on_failure - Block that gets executed after a failed request.
+    # === Returns
+    # The updated Curl::Multi object with the request details added to its stack.
+    def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
       easy = Curl::Easy.new(feed.feed_url) do |curl|
         curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
         curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
         curl.headers["If-None-Match"] = feed.etag if feed.etag
+        curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
         curl.follow_location = true
+
         curl.on_success do |c|
           add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
           updated_feed = Feed.parse(c.body_str)
           updated_feed.feed_url = c.last_effective_url
           updated_feed.etag = etag_from_header(c.header_str)
           updated_feed.last_modified = last_modified_from_header(c.header_str)
           feed.update_from_feed(updated_feed)
           responses[feed.feed_url] = feed
           options[:on_success].call(feed) if options.has_key?(:on_success)
         end
+
         curl.on_failure do |c|
           add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
           response_code = c.response_code
           if response_code == 304 # it's not modified. this isn't an error condition
             responses[feed.feed_url] = feed
@@ -157,15 +286,27 @@
           end
         end
       end
       multi.add(easy)
     end
-
+
+    # Determines the etag from the request headers.
+    #
+    # === Parameters
+    # [header<String>] Raw request header returned from the request.
+    # === Returns
+    # A string of the etag, or nil if it cannot be found in the headers.
     def self.etag_from_header(header)
       header =~ /.*ETag:\s(.*)\r/
       $1
     end
-
+
+    # Determines the last modified date from the request headers.
+    #
+    # === Parameters
+    # [header<String>] Raw request header returned from the request.
+    # === Returns
+    # A Time object of the last modified date, or nil if it cannot be found in the headers.
    def self.last_modified_from_header(header)
       header =~ /.*Last-Modified:\s(.*)\r/
       Time.parse($1) if $1
     end
  end
\ No newline at end of file
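
For context, the additions above can be exercised roughly like this. This is a sketch, not code from the gem: the URLs, user agent, and credentials are placeholders, while the option keys and block arguments come straight from the diff (:http_authentication is a two-element array joined with ':' for curl.userpwd; on_success receives the URL and the parsed feed, on_failure the URL, response code, headers, and body).

  require 'feedzirra'

  feeds = Feedzirra::Feed.fetch_and_parse(
    ["http://example.com/a.xml", "http://example.com/b.xml"],    # placeholder URLs
    :user_agent          => "MyAggregator/1.0",
    :http_authentication => ["username", "password"],            # joined with ':' for curl.userpwd
    :on_success          => lambda { |url, feed| puts "#{url}: #{feed.entries.size} entries" },
    :on_failure          => lambda { |url, code, header, body| puts "#{url} failed with #{code}" }
  )
  # With an Array of URLs the result is a Hash keyed by URL; a single String URL
  # returns the Feed object itself.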
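
update works the same way but takes Feed objects that have already been fetched, reusing their stored etag and last-modified values as If-None-Match/If-Modified-Since headers; a 304 response is treated as "not modified" rather than a failure. A minimal sketch (the URL is a placeholder, and the on_failure arguments are not visible in this hunk, so they are left out):

  feed = Feedzirra::Feed.fetch_and_parse("http://example.com/a.xml")   # placeholder URL

  # Some time later, re-request the feed; unchanged feeds come back via the 304 branch.
  feed = Feedzirra::Feed.update(feed,
    :on_success => lambda { |f| puts "#{f.feed_url} now has #{f.entries.size} entries" }
  )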
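
add_common_feed_entry_element and add_common_feed_entry_elements simply forward to SAXMachine's element/elements calls on the three bundled entry classes, so any SAXMachine options (such as :as) apply. The element name and accessor below are illustrative, not part of the gem:

  # Teach RSSEntry, AtomFeedBurnerEntry, and AtomEntry about an extra element.
  Feedzirra::Feed.add_common_feed_entry_element("wfw:commentRss", :as => :comment_rss)

  feed = Feedzirra::Feed.fetch_and_parse("http://example.com/a.xml")   # placeholder URL
  feed.entries.first.comment_rss   # => contents of <wfw:commentRss>, when present

As the inline comment in the diff notes, this only touches the built-in entry classes, so entry classes added by third-party parsers are not affected.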
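
Registering a custom parser with add_feed_class only requires a class that answers able_to_parse? and parse, because determine_feed_parser_for_xml probes the first 1000 characters of the document with able_to_parse? and Feed.parse then delegates to whichever class matched. MyFeedFormat below is hypothetical:

  class MyFeedFormat
    # Receives roughly the first 1000 characters of the document.
    def self.able_to_parse?(xml)
      xml.include?("<my-feed-format")
    end

    # Receives the full XML; return your own parsed feed object.
    def self.parse(xml)
      # ... build and return the feed ...
    end
  end

  # unshift puts the new class at the front of the list, so it is tried before
  # ITunesRSS, RSS, AtomFeedBurner, and Atom.
  Feedzirra::Feed.add_feed_class(MyFeedFormat)
  Feedzirra::Feed.feed_classes.first   # => MyFeedFormat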