require 'rubygems' gem 'mechanize', '>= 0.7.0' require 'mechanize' require 'generator' # for SyncEnumerator # Patch Mechanize's broken html unescaping Mechanize 0.6.11 class WWW::Mechanize def to_absolute_uri(url, cur_page=current_page()) unless url.is_a? URI url = url.to_s.strip url = URI.parse( Util.html_unescape( SyncEnumerator.new( url.split(/%[0-9A-Fa-f]{2}/), url.scan(/%[0-9A-Fa-f]{2}/) ).map { |x,y| "#{URI.escape(x||'')}#{y}" }.join('').gsub(/%23/, '#') ) ) # Mechanize here uses #zip to combine the two arrays, which will ignore # excessive elements of the second array (the one which is passed as an # argument). That means if the URL ends with more than one already escaped # character, then only the first one will be restored into the resulting # URL. end # construct an absolute uri if url.relative? raise 'no history. please specify an absolute URL' unless cur_page.uri url = cur_page.uri + url # Strip initial "/.." bits from the path url.path.sub!(/^(\/\.\.)+(?=\/)/, '') end return url end end ## # A base class for importers that scrape their contacts from web services class Graybook::Importer::PageScraper < Graybook::Importer::Base attr_accessor :agent ## # creates the Mechanize agent used to do the scraping and sets a nice # user agent header for good net educate def create_agent self.agent = WWW::Mechanize.new agent.user_agent = "Mozilla/4.0 (compatible; Graybook #{Graybook::VERSION})" agent.keep_alive = false agent end ## # Page scrapers will follow a fairly simple pattern of instantiating the # agent, prepping for the scrape and then the actual scrape process def fetch_contacts! create_agent prep = prepare return prep if prep.nil? cont = scrape_contacts return cont if cont.nil? end ## # Providers will often require you to login or otherwise prepare to actual # scrape the contacts def prepare; end # stub ## # Some providers have a single page you can scrape from (like Gmail's HTML # Contacts page) while others might require you to navigate several pages, # scraping as you go. def scrape_contacts; end # stub ## # helper to strip html from text def strip_html( html ) html.gsub(/<\/?[^>]*>/, '') end end