require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'algorithm/diff'
require 'pathname'
require 'fileutils'
require 'pp'
#require "test/code/all_the_gems"
require "test/code/tarball"

# Fetches base+path over the network, keeping copies under cachedir so an
# unchanged page is not downloaded again.
#
# Every fetched copy is stored as "<cachedir><path>.<etag>". The ETag of the
# most recently modified copy is sent back as If-None-Match; when the server
# answers "304 Not Modified" that cached copy is used as the result.
#
# base::     URL prefix, e.g. "http://raa.ruby-lang.org/"
# path::     path below base; a trailing "/" is treated as "<path>index.html"
# cachedir:: cache directory prefix (concatenated directly, so it must end
#            in "/")
#
# Returns an open read-only File on the cached copy; the caller closes it.
# Raises OpenURI::HTTPError for any HTTP failure other than 304.
def fetch_with_cache(base, path, cachedir)
  path += 'index.html' if path[-1, 1] == '/'
  cached = cachedir + path

  # Make sure the directory structure for this path exists in cachedir.
  # (A hand-rolled dirname walk never terminates for a path starting with
  # "/", because the dirname of "/" is "/" again.)
  FileUtils.mkdir_p File.dirname(cached)

  # Most recently modified cached version of this file, or nil if none.
  latest_fname = Dir[cached + ".*"].max_by { |fname| File.mtime(fname) }

  options = { "User-Agent" => "all_the_raas.rb" }

  # The ETag is whatever follows "<cached>." in the file name. "/" is stored
  # escaped as "%2F" because it cannot appear inside a file name.
  if latest_fname
    latest_etag = latest_fname[%r{\A#{Regexp.quote cached}\.(.*)\Z}, 1]
    latest_etag = latest_etag.gsub('%2F', '/') if latest_etag
    unless latest_etag.nil? || latest_etag.empty?
      options['If-None-Match'] = latest_etag
    end
  end

  # Refetch the file if it has changed, otherwise use the cached copy.
  begin
    URI.parse(base + path).open(options) { |net|
      etag = (net.meta['etag'] || '').gsub('/', '%2F')
      latest_fname = cached + "." + etag
      File.open(latest_fname, 'w') { |f| f.write net.read }
    }
  rescue OpenURI::HTTPError => e
    status = e.io.status
    # 304 means latest_fname is still current; anything else is a real error.
    raise unless status.first == '304' && /Not Modified/i === status.last
  end

  File.open latest_fname
end

# Returns the total size of the differences between s1 and s2 divided by
# their combined length, as a Float. Two empty strings yield 0.0.
#
# Relies on String#diff from algorithm/diff; each diff entry is destructured
# as (action, position, data) and only data.size is used.
def changeratio(s1, s2)
  total = s1.size + s2.size
  return 0.0 if total.zero?

  diffslen = s1.diff(s2).inject(0) { |sum, (_action, _pos, data)|
    sum + data.size
  }
  diffslen.to_f / total
end

# True when s1 and s2 plausibly name the same project: equal ignoring case,
# one is a prefix of the other, or changeratio(s1,s2) is below 0.5.
# A nil name never matches anything.
def sameproject?(s1, s2)
  return false if s1.nil? || s2.nil?

  s1.casecmp(s2).zero? ||
    s1.index(s2) == 0 ||
    s2.index(s1) == 0 ||
    changeratio(s1, s2) < 0.5
end

# Script: crawl the RAA project list, find a download url for each project,
# resolve it to a 'tarball' url and hand every tarball found to
# Tarball.dl_and_unpack.
if __FILE__ == $0
  base = "http://raa.ruby-lang.org/"
  # offset and limit are only used by the disabled slice further down
  offset = (ENV['OFFSET'] || 0).to_i
  limit = (ENV['LIMIT'] || 20).to_i
  cachedir = "jewels/"
  FileUtils.mkdir_p cachedir

  # Fetches (through the cache) and parses a page below base, closing the
  # cache file once Hpricot has read it.
  parse_cached = lambda { |path|
    page = fetch_with_cache(base, path, cachedir)
    begin
      Hpricot(page)
    ensure
      page.close
    end
  }

  # fetch list of all projects from raa's all.html
  tree = parse_cached.call("all.html")
  tree /= :table
  tree.search(:thead).remove
  tree /= :tr
  #tree=tree[offset,limit]
  urls = tree.map { |row|
    begin
      row.search(:td).first.search(:a).first[:href]
    rescue StandardError => e
      puts "failure #{e} in row #{row}" # report it and carry on
      nil
    end
  }

  # "-" is last in the character class so that it is a literal, not a range.
  RUBYFORGE = %r{\Ahttp://(?:[a-z0-9_+:-]+\.)*rubyforge\.org/}i
  PROTOCOLS = %w[http https ftp]
  ABSOLUTE_URL = %r{\A(?:#{PROTOCOLS.join('|')})://}i
  EXTENSIONS = %w[tar zip rb tgz tbz2 tbz gem]
  EXTRA_EXTENSIONS = %w[gz bz2 Z]
  EXTRA_EXTENSIONS_REX = "\\.(?:#{EXTRA_EXTENSIONS.join'|'})"
  VERSIONTOO = '' #was: "(?:[_-](.*))"
  ENDINGS = "\\.(?:#{EXTENSIONS.join('|')})(?:#{EXTRA_EXTENSIONS_REX})?"
  # Capture 1 is the archive's base name (last path segment, extensions
  # stripped).
  TARBALL = %r<
    \A(?:#{PROTOCOLS.join('|')})://
    (?:[^/]+/)+
    (.*)
    #{VERSIONTOO}
    #{ENDINGS}
    \Z
  >ixo

  $rubyforge_urls = 0

  #pp urls

  # crawl raa's individual page for each project, looking for dl link;
  # each entry becomes [download_url_or_nil, project_name]
  urls.map! { |url|
    next [nil, nil] unless url # row extraction already reported this one

    begin
      tree = parse_cached.call(url)
      project = tree.search(:title).inner_html[/\ARAA - (.*)\Z/, 1]
      tree /= :table
      trs = tree / :tr
      dl = trs.find { |tr| !tr.search("th[text()^='Download']").empty? }
      newurl = dl.search('td/a').first
      if newurl
        url = newurl[:href]
        url = "http://" + url unless ABSOLUTE_URL === url
        if RUBYFORGE === url
          # rubyforge links are only kept when they point straight at a
          # non-gem tarball; the rest are dropped and counted
          direct_tarball = TARBALL === url && !(/\.gem\Z/ === url)
          unless direct_tarball
            url = nil
            $rubyforge_urls += 1
          end
        end
        url.slice!(/\#.*\Z/) if url # trim off url section
        [url, project]
      else
        puts "couldn't find td/a in #{base + url}"
        [nil, nil]
      end
    rescue Interrupt => e
      # Only a genuine ^C aborts the crawl. Interrupt subclasses (on Ruby
      # 1.8 that includes Timeout::Error) are treated like any other error.
      raise if e.class == Interrupt
      puts "error: #{e} during url #{url}"
      [nil, nil]
    rescue Exception => e
      # deliberately broad: one bad project page must not stop the crawl
      puts "error: #{e} during url #{url}"
      [nil, nil]
    end
  }

  #pp urls

  # Given a tarball url and the base name captured from it, strips a
  # trailing "-version"/"_version" from the name and returns the tarball url
  # if the name matches the project, else reports the mismatch and returns
  # nil. origin is the url shown in that report.
  confirm_project = lambda { |tarball, urlproject, project, origin|
    versionstart = urlproject.rindex(/[_-]/)
    urlproject = urlproject[0...versionstart] if versionstart
    if sameproject?(urlproject, project)
      tarball
    else
      puts "uh-oh, project #{project} not found in url #{origin}. urlproject was #{urlproject}"
      nil
    end
  }

  # resolve list of dl urls into urls to 'tarballs' (which is meant to
  # include zip, gem, etc). dl urls found on raa might be a direct link to a
  # tarball, or point to a page that points to the tarball
  urls.map! { |url, project|
    next nil unless url

    if (found = TARBALL.match(url))
      confirm_project.call(url, found[1], project, url)
    else
      begin
        href = nil
        found = nil
        URI.parse(url).open { |page|
          page_uri = page.base_uri.to_s # where the page really came from
          (Hpricot(page) / :a).each { |a|
            candidate = a[:href] or next
            unless ABSOLUTE_URL === candidate # relative url?
              begin
                candidate = URI.join(page_uri, candidate).to_s
              rescue URI::Error
                next # unparsable link; skip it, keep scanning the page
              end
            end
            if (found = TARBALL.match(candidate))
              href = candidate
              break
            end
          }
        }
        confirm_project.call(href, found[1], project, url) if href
      rescue Interrupt => e
        raise if e.class == Interrupt # genuine ^C only
        puts "error: #{e} during page scan of url #{url}"
        nil
      rescue Exception => e
        # deliberately broad: one bad download page must not stop the crawl
        puts "error: #{e} during page scan of url #{url}"
        nil
      end
    end
  }

  pp urls

  # for each tarball url actually found, dl the tarball
  urls.compact.each { |url|
    begin
      Tarball.dl_and_unpack(cachedir, url)
    rescue Interrupt => e
      raise if e.class == Interrupt # genuine ^C only
      puts "error: #{e} during download of #{url}"
    rescue Exception => e
      # deliberately broad: one failed download must not stop the others
      puts "error: #{e} during download of #{url}"
    end
  }

  p [:$rubyforge_urls, $rubyforge_urls]
end