Sha256: e61bc336c75db0eaa648de033558328974b733cc6ffe1f675f0cb60daccb5bfe

Contents?: true

Size: 1.71 KB

Versions: 12

Compression:

Stored size: 1.71 KB

Contents

require 'nokogiri'
require 'hashie'
require 'net/http'

module Caboodle
  ##
  # Download +url+ through Weary and hand the response body to
  # Nokogiri::HTML; returns whatever Nokogiri::HTML produces for that body.
  def self.scrape(url)
    ::Nokogiri::HTML(Weary.get(url).perform_sleepily.body)
  end

  ##
  # Run the request object +req+ (anything responding to +perform_sleepily+),
  # call +parse+ on the response and wrap the result in a Hashie::Mash.
  def self.mash(req)
    ::Hashie::Mash.new(req.perform_sleepily.parse)
  end

  ##
  # Convenience wrapper around FeedDetector.fetch_feed_url: returns the feed
  # URL advertised by the page at +url+, or nil when none is advertised.
  def self.extract_feed(url)
    Caboodle::FeedDetector.fetch_feed_url(url)
  end

  class FeedDetector

    # Patterns used to find feed <link> tags, in priority order (atom first,
    # then rss). Each kind has two patterns because the href attribute may
    # come before or after the type attribute. [^>]* keeps every match
    # inside a single tag (and lets the tag span several lines); the capture
    # group is the href value.
    FEED_LINK_PATTERNS = [
      [:atom, 'application/atom+xml'],
      [:rss,  'application/rss+xml']
    ].map do |kind, mime|
      type = Regexp.escape(mime)
      [kind, [
        /<link[^>]*href=['"]*([^\s'">]+)['"]*[^>]*#{type}[^>]*>/,
        /<link[^>]*#{type}[^>]*href=['"]*([^\s'">]+)['"]*[^>]*>/
      ]]
    end.freeze

    ##
    # return the feed url for a url
    # for example: http://blog.dominiek.com/ => http://blog.dominiek.com/feed/atom.xml
    # only_detect can force detection of :rss or :atom
    #
    # The page is fetched with Weary. The result is:
    # * nil when the page advertises no feed,
    # * the href unchanged when it is already an absolute http(s) URL,
    # * the href prefixed with the page's scheme when it is protocol-relative
    #   (starts with "//"),
    # * otherwise the href anchored at the root of the page's host, using the
    #   page's own scheme and keeping the port only when it is not the
    #   default for that scheme.
    #
    # Raises ArgumentError when +page_url+ has no host component.
    def self.fetch_feed_url(page_url, only_detect=nil)
      url = URI.parse(page_url)
      # Fail before touching the network: without a host there is nothing to
      # anchor a relative feed path to.
      raise ArgumentError, "page_url must be an absolute URL: #{page_url.inspect}" unless url.host

      res = Weary.get(page_url).perform_sleepily

      feed_url = get_feed_path(res.body, only_detect)
      return nil unless feed_url
      return feed_url if feed_url =~ %r{\Ahttps?://}i

      scheme = url.scheme || 'http'
      return "#{scheme}:#{feed_url}" if feed_url[0, 2] == '//'

      # Build a fresh string instead of appending to url.host, which would
      # modify the URI object's own host string.
      host_with_port = url.host
      host_with_port = "#{url.host}:#{url.port}" unless url.port.nil? || url.port == url.default_port

      "#{scheme}://#{host_with_port}/#{feed_url.sub(%r{\A/}, '')}"
    end

    ##
    # get the feed href from an HTML document
    # for example:
    # ...
    # <link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" />
    # ...
    # => /feed/atom.xml
    # only_detect can force detection of :rss or :atom
    #
    # Atom links win over RSS links when both are present and only_detect is
    # not given. Returns nil when no matching <link> tag is found.
    def self.get_feed_path(html, only_detect=nil)
      FEED_LINK_PATTERNS.each do |kind, patterns|
        next if only_detect && only_detect != kind

        patterns.each do |pattern|
          md = pattern.match(html)
          return md[1] if md
        end
      end
      nil
    end

  end

end

Version data entries

12 entries across 12 versions & 1 rubygems

Version Path
caboodle-0.2.25 lib/caboodle/scrape.rb
caboodle-0.2.24 lib/caboodle/scrape.rb
caboodle-0.2.23 lib/caboodle/scrape.rb
caboodle-0.2.22 lib/caboodle/scrape.rb
caboodle-0.2.21 lib/caboodle/scrape.rb
caboodle-0.2.20 lib/caboodle/scrape.rb
caboodle-0.2.19 lib/caboodle/scrape.rb
caboodle-0.2.18 lib/caboodle/scrape.rb
caboodle-0.2.16 lib/caboodle/scrape.rb
caboodle-0.2.15 lib/caboodle/scrape.rb
caboodle-0.2.14 lib/caboodle/scrape.rb
caboodle-0.2.13 lib/caboodle/scrape.rb