Sha256: 5036ffbc4386fd20e69ad015ed8e18a9d7e5c1857b3ab8bb0712866f0d11f3f0
Contents?: true
Size: 1.98 KB
Versions: 1
Compression:
Stored size: 1.98 KB
Contents
require 'archive_lister/version' require 'addressable/uri' require 'net/http' require 'nokogiri' require 'archive_lister/wayback_file' module ArchiveLister class HttpError < RuntimeError attr_reader :uri, :response def initialize(uri, response) @uri = uri @response = response end def to_s "#{uri}\t#{response}" end end WAYBACK_FORMAT = 'http://wayback.archive.org/web/*/#SITE#/*' def self.list(url) query_uri = url.is_a?(URI) ? url : URI.parse(url) query_uri.query = nil wayback_uri = URI(WAYBACK_FORMAT.sub('#SITE#', query_uri.to_s)) # Poor man's one-level redirect response = Net::HTTP.get_response(wayback_uri) if response.is_a?(Net::HTTPRedirection) response = Net::HTTP.get_response(URI(response.header['location'])) end unless response.is_a?(Net::HTTPSuccess) raise HttpError.new(wayback_uri, response) end WaybackFile.parse(response.body).urls end def self.batch(url_filename, output_dir, options = {}) successes, skipped, failures = 0, 0, [] File.read(url_filename).each_line do |url| normalised_url = url.sub(/(\n$)|(_$)|(\/\n$)/, '') url = Addressable::URI.parse(normalised_url) output_filename = File.join(output_dir, "#{url.host}#{url.path.gsub('/', '_')}").chomp File.delete(output_filename) if File.exist?(output_filename) && File.zero?(output_filename) skipping = options[:skip_existing] && File.exist?(output_filename) puts "#{url}#{skipping ? ' -- Skipping' : ''}" skipped += 1 and next if skipping File.open(output_filename, 'w') do |file| begin urls = ArchiveLister.list(url) urls.each { |url| file.puts url.to_s } successes += 1 rescue HttpError => e failures << e File.delete(output_filename) end end end puts "#{successes} successes, #{failures.length} failures, #{skipped} skipped" failures.each { |e| puts e } if options[:verbose] end end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
archive_lister-0.0.1 | lib/archive_lister.rb |