lib/feedjira/feed.rb in feedjira-1.6.0 vs lib/feedjira/feed.rb in feedjira-2.0.0

- old (removed from feedjira-2.0.0)
+ new (added in feedjira-2.0.0)

@@ -1,67 +1,28 @@
module Feedjira
  class Feed
-    USER_AGENT = 'feedjira http://feedjira.com'
-
-    # Passes raw XML and callbacks to a parser.
-    # === Parameters
-    # [parser<Object>] The parser to pass arguments to - must respond to
-    # `parse` and should return a Feed object.
-    # [xml<String>] The XML that you would like parsed.
-    # === Returns
-    # An instance of the parser feed type.
    def self.parse_with(parser, xml, &block)
      parser.parse xml, &block
    end

-    # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedjira::NoParserAvailable exception is raised.
-    # You can pass a block to be called when there's an error during the parsing.
-    # === Parameters
-    # [xml<String>] The XML that you would like parsed.
-    # === Returns
-    # An instance of the determined feed type. By default, one of these:
-    # * Feedjira::Parser::RSSFeedBurner
-    # * Feedjira::Parser::GoogleDocsAtom
-    # * Feedjira::Parser::AtomFeedBurner
-    # * Feedjira::Parser::Atom
-    # * Feedjira::Parser::ITunesRSS
-    # * Feedjira::Parser::RSS
-    # === Raises
-    # Feedjira::NoParserAvailable : If no valid parser classes could be found for the feed.
    def self.parse(xml, &block)
      if parser = determine_feed_parser_for_xml(xml)
        parse_with parser, xml, &block
      else
        raise NoParserAvailable.new("No valid parser for XML.")
      end
    end

-    # Determines the correct parser class to use for parsing the feed.
-    #
-    # === Parameters
-    # [xml<String>] The XML that you would like determine the parser for.
-    # === Returns
-    # The class name of the parser that can handle the XML.
    def self.determine_feed_parser_for_xml(xml)
      start_of_doc = xml.slice(0, 2000)
      feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
    end

-    # Adds a new feed parsing class that will be used for parsing.
-    #
-    # === Parameters
-    # [klass<Constant>] The class/constant that you want to register.
-    # === Returns
-    # A updated array of feed parser class names.
    def self.add_feed_class(klass)
      feed_classes.unshift klass
    end

-    # Provides a list of registered feed parsing classes.
-    #
-    # === Returns
-    # A array of class names.
    def self.feed_classes
      @feed_classes ||= [
        Feedjira::Parser::RSSFeedBurner,
        Feedjira::Parser::GoogleDocsAtom,
        Feedjira::Parser::AtomFeedBurner,
@@ -69,390 +30,57 @@
        Feedjira::Parser::ITunesRSS,
        Feedjira::Parser::RSS
      ]
    end

-    # Makes all registered feeds types look for the passed in element to parse.
-    # This is actually just a call to element (a SAXMachine call) in the class.
-    #
-    # === Parameters
-    # [element_tag<String>] The element tag
-    # [options<Hash>] Valid keys are same as with SAXMachine
    def self.add_common_feed_element(element_tag, options = {})
      feed_classes.each do |k|
        k.element element_tag, options
      end
    end

-    # Makes all registered feeds types look for the passed in elements to parse.
-    # This is actually just a call to elements (a SAXMachine call) in the class.
-    #
-    # === Parameters
-    # [element_tag<String>] The element tag
-    # [options<Hash>] Valid keys are same as with SAXMachine
    def self.add_common_feed_elements(element_tag, options = {})
      feed_classes.each do |k|
        k.elements element_tag, options
      end
    end

-    # Makes all registered entry types look for the passed in element to parse.
-    # This is actually just a call to element (a SAXMachine call) in the class.
-    #
-    # === Parameters
-    # [element_tag<String>]
-    # [options<Hash>] Valid keys are same as with SAXMachine
    def self.add_common_feed_entry_element(element_tag, options = {})
      call_on_each_feed_entry :element, element_tag, options
    end

-    # Makes all registered entry types look for the passed in elements to parse.
-    # This is actually just a call to element (a SAXMachine call) in the class.
-    #
-    # === Parameters
-    # [element_tag<String>]
-    # [options<Hash>] Valid keys are same as with SAXMachine
    def self.add_common_feed_entry_elements(element_tag, options = {})
      call_on_each_feed_entry :elements, element_tag, options
    end

-    # Call a method on all feed entries classes.
-    #
-    # === Parameters
-    # [method<Symbol>] The method name
-    # [parameters<Array>] The method parameters
    def self.call_on_each_feed_entry(method, *parameters)
      feed_classes.each do |k|
-        # iterate on the collections defined in the sax collection
        k.sax_config.collection_elements.each_value do |vl|
-          # vl is a list of CollectionConfig mapped to an attribute name
-          # we'll look for the one set as 'entries' and add the new element
          vl.find_all{|v| (v.accessor == 'entries') && (v.data_class.class == Class)}.each do |v|
            v.data_class.send(method, *parameters)
          end
        end
      end
    end

-    # Setup curl from options.
-    # Possible parameters:
-    # * :user_agent - overrides the default user agent.
-    # * :language - accept language value.
-    # * :compress - any value to enable compression
-    # * :enable_cookies - boolean
-    # * :cookiefile - file to read cookies
-    # * :cookies - contents of cookies header
-    # * :http_authentication - array containing username, then password
-    # * :proxy_url - proxy url
-    # * :proxy_port - proxy port
-    # * :max_redirects - max number of redirections
-    # * :timeout - timeout
-    # * :ssl_verify_host - boolean
-    # * :ssl_verify_peer - boolean
-    # * :ssl_version - the ssl version to use, see OpenSSL::SSL::SSLContext::METHODS for options
-    def self.setup_easy(curl, options={})
-      curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
-      curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
-      curl.headers["Accept-Language"] = options[:language] if options.has_key?(:language)
-      curl.enable_cookies = options[:enable_cookies] if options.has_key?(:enable_cookies)
-      curl.cookiefile = options[:cookiefile] if options.has_key?(:cookiefile)
-      curl.cookies = options[:cookies] if options.has_key?(:cookies)
+    def self.fetch_and_parse(url)
+      response = connection(url).get
+      raise FetchFailure.new("Fetch failed - #{response.status}") unless response.success?
+      xml = response.body
+      parser_klass = determine_feed_parser_for_xml xml
+      raise NoParserAvailable.new("No valid parser for XML.") unless parser_klass
-      curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
-      curl.proxy_url = options[:proxy_url] if options.has_key?(:proxy_url)
-      curl.proxy_port = options[:proxy_port] if options.has_key?(:proxy_port)
-      curl.max_redirects = options[:max_redirects] if options[:max_redirects]
-      curl.timeout = options[:timeout] if options[:timeout]
-      curl.ssl_verify_host = options[:ssl_verify_host] if options.has_key?(:ssl_verify_host)
-      curl.ssl_verify_peer = options[:ssl_verify_peer] if options.has_key?(:ssl_verify_peer)
-      curl.ssl_version = options[:ssl_version] if options.has_key?(:ssl_version)
-
-      curl.follow_location = true
+      feed = parse_with parser_klass, xml
+      feed.feed_url = url
+      feed.etag = response.headers['etag'].to_s.gsub(/"/, '')
+      feed.last_modified = response.headers['last-modified']
+      feed
    end

-    # Fetches and returns the raw XML for each URL provided.
-    #
-    # === Parameters
-    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
-    # [options<Hash>] Valid keys for this argument as as followed:
-    # :if_modified_since - Time object representing when the feed was last updated.
-    # :if_none_match - String that's normally an etag for the request that was stored previously.
-    # :on_success - Block that gets executed after a successful request.
-    # :on_failure - Block that gets executed after a failed request.
-    # * all parameters defined in setup_easy
-    # === Returns
-    # A String of XML if a single URL is passed.
-    #
-    # A Hash if multiple URL's are passed. The key will be the URL, and the value the XML.
-    def self.fetch_raw(urls, options = {})
-      url_queue = [*urls]
-      multi = Curl::Multi.new
-      responses = {}
-      url_queue.each do |url|
-        easy = Curl::Easy.new(url) do |curl|
-          setup_easy curl, options
-
-          curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
-          curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
-
-          curl.on_success do |c|
-            responses[url] = decode_content(c)
-          end
-
-          curl.on_complete do |c, err|
-            responses[url] = c.response_code unless responses.has_key?(url)
-          end
-        end
-        multi.add(easy)
-      end
-
-      multi.perform
-      urls.is_a?(String) ? responses.values.first : responses
-    end
-
-    # Fetches and returns the parsed XML for each URL provided.
-    #
-    # === Parameters
-    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
-    # [options<Hash>] Valid keys for this argument as as followed:
-    # * :user_agent - String that overrides the default user agent.
-    # * :if_modified_since - Time object representing when the feed was last updated.
-    # * :if_none_match - String, an etag for the request that was stored previously.
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # === Returns
-    # A Feed object if a single URL is passed.
-    #
-    # A Hash if multiple URL's are passed. The key will be the URL, and the value the Feed object.
-    def self.fetch_and_parse(urls, options = {})
-      url_queue = [*urls]
-      multi = Curl::Multi.new
-      responses = {}
-
-      # I broke these down so I would only try to do 30 simultaneously because
-      # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
-      url_queue.slice!(0, 30).each do |url|
-        add_url_to_multi(multi, url, url_queue, responses, options)
-      end
-
-      multi.perform
-      return urls.is_a?(String) ? responses.values.first : responses
-    end
-
-    # Decodes the XML document if it was compressed.
-    #
-    # === Parameters
-    # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
-    # === Returns
-    # A decoded string of XML.
-    def self.decode_content(c)
-      if c.header_str.match(/Content-Encoding: gzip/i)
-        begin
-          gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
-          xml = gz.read
-          gz.close
-        rescue Zlib::GzipFile::Error
-          # Maybe this is not gzipped?
-          xml = c.body_str
-        end
-      elsif c.header_str.match(/Content-Encoding: deflate/i)
-        xml = Zlib::Inflate.inflate(c.body_str)
-      else
-        xml = c.body_str
-      end
-
-      xml
-    end
-
-    # Updates each feed for each Feed object provided.
-    #
-    # === Parameters
-    # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
-    # [options<Hash>] Valid keys for this argument as as followed:
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # * all parameters defined in setup_easy
-    # === Returns
-    # A updated Feed object if a single URL is passed.
-    #
-    # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
-    def self.update(feeds, options = {})
-      feed_queue = [*feeds]
-      multi = Curl::Multi.new
-      responses = {}
-
-      feed_queue.slice!(0, 30).each do |feed|
-        add_feed_to_multi(multi, feed, feed_queue, responses, options)
-      end
-
-      multi.perform
-      feeds.is_a?(Array) ? responses : responses.values.first
-    end
-
-    # An abstraction for adding a feed by URL to the passed Curb::multi stack.
-    #
-    # === Parameters
-    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
-    # [url<String>] The URL of the feed that you would like to be fetched.
-    # [url_queue<Array>] An array of URLs that are queued for request.
-    # [responses<Hash>] Existing responses that you want the response from the request added to.
-    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
-    # [options<Hash>] Valid keys for this argument as as followed:
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # * all parameters defined in setup_easy
-    # === Returns
-    # The updated Curl::Multi object with the request details added to it's stack.
-    def self.add_url_to_multi(multi, url, url_queue, responses, options)
-      easy = Curl::Easy.new(url) do |curl|
-        setup_easy curl, options
-        curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
-        curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
-
-        curl.on_success do |c|
-          xml = decode_content(c)
-          klass = determine_feed_parser_for_xml(xml)
-
-          if klass
-            begin
-              feed = parse_with klass, xml, &on_parser_failure(url)
-
-              feed.feed_url = c.last_effective_url
-              feed.etag = etag_from_header(c.header_str)
-              feed.last_modified = last_modified_from_header(c.header_str)
-              responses[url] = feed
-              options[:on_success].call(url, feed) if options.has_key?(:on_success)
-            rescue Exception => e
-              call_on_failure(c, e, options[:on_failure])
-            end
-          else
-            call_on_failure(c, "Can't determine a parser", options[:on_failure])
-          end
-        end
-
-        #
-        # trigger on_failure for 404s
-        #
-        curl.on_complete do |c|
-          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
-          responses[url] = c.response_code unless responses.has_key?(url)
-        end
-
-        curl.on_redirect do |c|
-          if c.response_code == 304 # it's not modified. this isn't an error condition
-            options[:on_success].call(url, nil) if options.has_key?(:on_success)
-          end
-        end
-
-        curl.on_missing do |c|
-          if c.response_code == 404 && options.has_key?(:on_failure)
-            call_on_failure(c, 'Server returned a 404', options[:on_failure])
-          end
-        end
-
-        curl.on_failure do |c, err|
-          responses[url] = c.response_code
-          call_on_failure(c, err, options[:on_failure])
-        end
-      end
-      multi.add(easy)
-    end
-
-    # An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
-    #
-    # === Parameters
-    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
-    # [feed<Feed>] A feed object that you would like to be fetched.
-    # [url_queue<Array>] An array of feed objects that are queued for request.
-    # [responses<Hash>] Existing responses that you want the response from the request added to.
-    # [feeds<String>] or <Array> A single feed object, or an array of feed objects.
-    # [options<Hash>] Valid keys for this argument as as followed:
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # * all parameters defined in setup_easy
-    # === Returns
-    # The updated Curl::Multi object with the request details added to it's stack.
-    def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
-      easy = Curl::Easy.new(feed.feed_url) do |curl|
-        setup_easy curl, options
-        curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
-        curl.headers["If-Modified-Since"] = options[:if_modified_since] if options[:if_modified_since] && (!feed.last_modified || (Time.parse(options[:if_modified_since].to_s) > feed.last_modified))
-        curl.headers["If-None-Match"] = feed.etag if feed.etag
-
-        curl.on_success do |c|
-          begin
-            updated_feed = Feed.parse c.body_str, &on_parser_failure(feed.feed_url)
-
-            updated_feed.feed_url = c.last_effective_url
-            updated_feed.etag = etag_from_header(c.header_str)
-            updated_feed.last_modified = last_modified_from_header(c.header_str)
-            feed.update_from_feed(updated_feed)
-            responses[feed.feed_url] = feed
-            options[:on_success].call(feed) if options.has_key?(:on_success)
-          rescue Exception => e
-            call_on_failure(c, e, options[:on_failure])
-          end
-        end
-
-        curl.on_failure do |c, err| # response code 50X
-          responses[feed.feed_url] = c.response_code
-          call_on_failure(c, 'Server returned a 404', options[:on_failure])
-        end
-
-        curl.on_redirect do |c, err| # response code 30X
-          if c.response_code == 304
-            options[:on_success].call(feed) if options.has_key?(:on_success)
-          else
-            responses[feed.feed_url] = c.response_code
-            call_on_failure(c, err, options[:on_failure])
-          end
-        end
-
-        curl.on_complete do |c|
-          add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
-          responses[feed.feed_url] = feed unless responses.has_key?(feed.feed_url)
-        end
-      end
-      multi.add(easy)
-    end
-
-    # Determines the etag from the request headers.
-    #
-    # === Parameters
-    # [header<String>] Raw request header returned from the request
-    # === Returns
-    # A string of the etag or nil if it cannot be found in the headers.
-    def self.etag_from_header(header)
-      header =~ /.*ETag:\s(.*)\r/
-      $1
-    end
-
-    # Determines the last modified date from the request headers.
-    #
-    # === Parameters
-    # [header<String>] Raw request header returned from the request
-    # === Returns
-    # A Time object of the last modified date or nil if it cannot be found in the headers.
-    def self.last_modified_from_header(header)
-      header =~ /.*Last-Modified:\s(.*)\r/
-      Time.parse_safely($1) if $1
-    end
-
-    class << self
-      private
-
-      def on_parser_failure(url)
-        Proc.new { |message| raise "Error while parsing [#{url}] #{message}" }
-      end
-
-      def call_on_failure(c, error, on_failure)
-        if on_failure
-          if on_failure.arity == 2
-            on_failure.call(c, error)
-          else
-            warn "on_failure proc with invalid parameters number #{on_failure.arity} instead of 2, ignoring it"
-          end
-        end
+    def self.connection(url)
+      Faraday.new(url: url) do |conn|
+        conn.use FaradayMiddleware::FollowRedirects, limit: 3
+        conn.adapter :net_http
      end
    end
  end
end
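
For callers upgrading across these two versions, the sketch below contrasts typical use of Feedjira::Feed.fetch_and_parse before and after the change. It is an illustrative sketch based only on the signatures, comments, and error classes visible in this diff, not on official Feedjira documentation; the feed URLs are placeholders, and the entries accessor on the parsed feed object is assumed to be available.

require 'feedjira'

# feedjira 1.6.0: fetch_and_parse accepts one URL or an array, drives the
# requests through Curl::Multi (30 at a time), and reports results through
# optional callbacks; with an array it returns a Hash keyed by URL.
Feedjira::Feed.fetch_and_parse(
  ['http://example.com/feed.xml', 'http://example.org/atom.xml'],      # placeholder URLs
  on_success: ->(url, feed) { puts "#{url}: #{feed.entries.size} entries" },
  on_failure: ->(curl, error) { warn "fetch failed: #{error}" }         # must accept exactly 2 arguments
)

# feedjira 2.0.0: fetch_and_parse takes a single URL, fetches it through
# Faraday, and raises instead of invoking callbacks.
begin
  feed = Feedjira::Feed.fetch_and_parse('http://example.com/feed.xml')  # placeholder URL
  puts "#{feed.feed_url}: #{feed.entries.size} entries"
rescue Feedjira::FetchFailure, Feedjira::NoParserAvailable => e
  warn "fetch failed: #{e.message}"
end

The design shift visible in the diff: the Curl::Multi batching, the setup_easy request options, and the on_success/on_failure callbacks are gone; a non-success response raises FetchFailure, an unrecognized document raises NoParserAvailable, and redirects are followed by FaradayMiddleware::FollowRedirects with a limit of 3 rather than by curl's follow_location/max_redirects.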