require 'nokogiri'
require 'hashie'
require 'net/http'
require 'open-uri'
require 'weary' # Weary is used by mash and FeedDetector below
module Caboodle
  # Prefer memcached when the gem is available; fall back gracefully when not.
  begin
    require 'memcached'
    CACHE = Memcached.new
    puts "Running with memcache"
  rescue LoadError, StandardError
    # a bare rescue would miss LoadError, which is what a missing gem raises
    puts "Running without memcache"
  end
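  # Floor an integer down to the nearest multiple of factor; scrape uses this
  # to bucket timestamps into cache-key windows. For example:
  #   Caboodle.round_time(125, 60) #=> 120
  #   Caboodle.round_time(120, 60) #=> 120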
  def self.round_time(integer, factor)
    return integer if (integer % factor).zero?
    integer - (integer % factor)
  end
  # Fetch a URL and return a parsed Nokogiri document. With memcached
  # available, responses are cached under a time-bucketed key, plus a
  # "0:#{url}" key kept as a last-known-good copy for failed fetches.
  def self.scrape url
    if defined?(CACHE)
      puts "Scraping with cache optimisation"
      window = 60*60*1000 # cache-key window in seconds; keys rotate once per window
      key = "#{round_time(Time.new.to_i, window)}:#{url}"
      response = CACHE.get(key) rescue nil
      response ||= open(url).read
      CACHE.set(key, response)
      CACHE.set("0:#{url}", response)
    else
      puts "Scraping without cache optimisation"
      response = open(url).read
    end
    ::Nokogiri::HTML(response)
  rescue StandardError => e
    puts e.inspect
    # serve the last-known-good copy, if any, when the live fetch fails
    response = (CACHE.get("0:#{url}") rescue nil) if defined?(CACHE)
    response ||= ""
    ::Nokogiri::HTML(response)
  end
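  # A minimal usage sketch (the URL is a stand-in):
  #   doc = Caboodle.scrape("http://example.org/")
  #   doc.at("title").text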
  # Perform a Weary request and wrap the parsed response in a Hashie::Mash
  # for dot-style access. perform_sleepily is assumed to be a rate-limited
  # variant of Weary's perform, defined elsewhere in Caboodle.
  def self.mash req
    ::Hashie::Mash.new(req.perform_sleepily.parse)
  end
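  # For example, given some Weary request object req built elsewhere:
  #   data = Caboodle.mash(req)
  #   data.title # instead of data["title"]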
  # Convenience wrapper around FeedDetector.fetch_feed_url.
  def self.extract_feed url
    Caboodle::FeedDetector.fetch_feed_url url
  end
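  # For example, echoing the FeedDetector docs below:
  #   Caboodle.extract_feed("http://blog.dominiek.com/")
  #   #=> "http://blog.dominiek.com/feed/atom.xml"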
  class FeedDetector
    ##
    # Return the feed URL for a page URL,
    # for example: http://blog.dominiek.com/ => http://blog.dominiek.com/feed/atom.xml
    # only_detect can force detection of :rss or :atom
    def self.fetch_feed_url(page_url, only_detect=nil)
      url = URI.parse(page_url)
      host_with_port = url.host.dup # dup so << below does not mutate the URI's own string
      host_with_port << ":#{url.port}" unless url.port == 80
      res = Weary.get(page_url).perform_sleepily
      feed_url = self.get_feed_path(res.body, only_detect)
      # absolutise a relative feed path; an already-absolute URL passes through
      feed_url = "http://#{host_with_port}/#{feed_url.gsub(/^\//, '')}" unless !feed_url || feed_url =~ /^http:\/\//
      feed_url
    end
    ##
    # Get the feed href from an HTML document,
    # for example:
    # ...
    # <link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" />
    # ...
    # => /feed/atom.xml
    # only_detect can force detection of :rss or :atom
    def self.get_feed_path(html, only_detect=nil)
      md = nil
      # try both attribute orders: type-then-href, then href-then-type
      unless only_detect && only_detect != :atom
        md ||= %r{<link[^>]+type=["']application/atom\+xml["'][^>]+href=["']([^"']+)["']}.match(html)
        md ||= %r{<link[^>]+href=["']([^"']+)["'][^>]+type=["']application/atom\+xml["']}.match(html)
      end
      unless only_detect && only_detect != :rss
        md ||= %r{<link[^>]+type=["']application/rss\+xml["'][^>]+href=["']([^"']+)["']}.match(html)
        md ||= %r{<link[^>]+href=["']([^"']+)["'][^>]+type=["']application/rss\+xml["']}.match(html)
      end
      md && md[1]
    end
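    # A quick illustration with an inline snippet:
    #   html = '<head><link href="/feed/atom.xml" rel="alternate" type="application/atom+xml" /></head>'
    #   FeedDetector.get_feed_path(html) #=> "/feed/atom.xml"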
  end
end