lib/scrapers/xkcd.rb in scrapers-0.4.0 vs lib/scrapers/xkcd.rb in scrapers-0.4.1
- old
+ new
@@ -2,22 +2,55 @@
require 'nokogiri'
module Scrapers
module Xkcd
# Base URL of the xkcd site; a numbered comic lives at "<base>/<number>/".
XKCD_URL = "http://xkcd.com".freeze

# strftime format applied to the :pubdate value ("%F" renders as YYYY-MM-DD).
PUBDATE_FORMAT = "%F".freeze
+
# Get the current or numbered xkcd comic.
#
# +comic+ = (string) the number of the xkcd comic to
# retrieve. Gets the current comic if nil.
#
# Returns a hash containing the comic info:
#
#   {:title      => "comic's title",
#    :url        => "url to comic",
#    :img_src    => "source url to comic image",
#    :hover_text => "the hover (mouse-over) text",
#    :pubdate    => "publication date",
#   }
#
# Raises NoMethodError when the page contains no "#comic img" element;
# errors from fetching the page are not rescued here.
def self.scrape(comic=nil)
  url = URI.parse XKCD_URL
  url.path = "/#{comic}/" unless comic.nil?

  # Fetch through URI#open (provided by open-uri) rather than Kernel#open:
  # Kernel#open stopped accepting URLs in Ruby 3.0, and it would also treat
  # a leading "|" as a command to run.
  # NOTE(review): assumes open-uri is required above the visible part of
  # this file, exactly as the previous Kernel#open call did -- confirm.
  doc = Nokogiri::HTML(url.open)

  # Separate local for the <img> node so the +comic+ argument is not
  # silently replaced by a Nokogiri element.
  image = doc.at_css("#comic img")
  img_src = image.attr("src")

  {
    :url        => url.to_s,
    :img_src    => img_src,
    :hover_text => image.attr("title"),
    :title      => image.attr("alt"),
    :pubdate    => get_pubdate(img_src),
  }
end
+
# Get the HTTP headers of the image file. The Last-Modified header is used
# as the comic's publication date.
#
# +url+ = (string) URL of the comic image. Scheme-relative ("//host/path")
# and relative URLs are resolved against XKCD_URL.
#
# Returns the date as a string formatted with PUBDATE_FORMAT. Falls back to
# the current date when the header is missing or cannot be parsed.
# Network errors are not rescued and propagate to the caller.
#
# NOTE(review): relies on net/http and time being loaded by the time this
# runs; no require for them is visible in this part of the file -- confirm.
def self.get_pubdate(url)
  # URI.join leaves absolute URLs untouched and borrows the scheme/host from
  # XKCD_URL otherwise, so the result is always an HTTP(S) URI with a host.
  url = URI.join(XKCD_URL, url)

  # Enable TLS for https images; without it the request would be sent as
  # plain HTTP to port 443.
  head = Net::HTTP.start(url.host, url.port, :use_ssl => url.scheme == "https") do |http|
    http.head(url.request_uri)
  end

  header = head["Last-Modified"]
  last_modified =
    begin
      header && Time.parse(header)
    rescue ArgumentError
      # Time.parse raises ArgumentError for text it cannot interpret.
      nil
    end

  (last_modified || Time.now).strftime(PUBDATE_FORMAT)
end
end
end