lib/scrapers/xkcd.rb in scrapers-0.4.0 vs lib/scrapers/xkcd.rb in scrapers-0.4.1

- old
+ new

@@ -2,22 +2,55 @@ require 'nokogiri' module Scrapers module Xkcd XKCD_URL = "http://xkcd.com" + PUBDATE_FORMAT = "%F" + + # Get the current or numbered xkcd comic + # + # +comic+ = (string) the number of the xkcd comic to + #retreive. Gets current comic if nil. + # + # returns hash containing comic info: + # + # {:title => "comic' title", + # :url => "url to comic", + # :img_src => "source url to comic image", + # :hover_text => "the hover (mouse-over) text", + # :pubdate => "publication date", + # } + # def self.scrape(comic=nil) results = Hash.new url = URI.parse XKCD_URL url.path = "/#{comic}/" unless comic.nil? results[:url] = url.to_s + doc = Nokogiri::HTML(open(url.to_s)) comic = doc.at_css("#comic img") results[:img_src] = comic.attr("src") results[:hover_text] = comic.attr("title") results[:title] = comic.attr("alt") + results[:pubdate] = get_pubdate(results[:img_src]) results + end + + # Get the http header of the image file which reveals the last_modified date. + # We'll use this as the publication date. + def self.get_pubdate(url) + url = URI.parse(url.dup) + head_req = Net::HTTP::Head.new url + + head = Net::HTTP.start(url.host, url.port) do |http| + http.request head_req + end + return Time.now.strftime(PUBDATE_FORMAT) if head["Last-Modified"].nil? + last_modified = Time.parse(head["Last-Modified"]) rescue nil + return Time.now.strftime(PUBDATE_FORMAT) if last_modified.nil? + last_modified.strftime(PUBDATE_FORMAT) end end end