#!/usr/bin/env ruby
#
# Scans a file of short-URL requests (one URL per line), decodes the part of
# each URL after BASE_URL with ShorturlSequence, and prints three histograms:
# magnitude of the decoded integer (in millions), length of the encoded tail,
# and "length-first character" of the encoded tail.

$LOAD_PATH << File.dirname(__FILE__) + '/../../lib'
$LOAD_PATH << File.dirname(__FILE__)

# require 'rubygems'
# require 'wukong'
require 'monkeyshines'
# require 'monkeyshines/utils/uri'
# require 'monkeyshines/utils/filename_pattern'
# require 'monkeyshines/store/conditional_store'
# require 'monkeyshines/fetcher/http_head_fetcher'
# require 'trollop' # gem install trollop
# require 'shorturl_request'
require 'shorturl_sequence'

# Zero-initialised counter per base-36 digit. Only the disabled experiment
# below reads it; kept so that experiment can be re-enabled as-is.
digits = {}
(('0'..'9').to_a + ('a'..'z').to_a).each { |ch| digits[ch] = 0 }
# (1..10000).each do |idx|
#   s = ShorturlSequence.encode_integer idx, 36
#   digits[s[0..0]] += 1
# end
# p digits
# puts digits.sort.map{|ch,ct| "%-7s\t%10d"%[ch,ct]}

# Minimal counting histogram: maps each observed value to the number of times
# it has been pushed with <<.
class Histo
  # Hash of value => occurrence count.
  attr_accessor :buckets

  def initialize
    self.buckets = {}
  end

  # Record one occurrence of +val+.
  # Returns the updated count for +val+.
  def <<(val)
    buckets[val] = buckets.fetch(val, 0) + 1
  end

  # Print one "count<TAB>value" line per bucket, ordered by value.
  # Keys within one histogram must be mutually comparable for the sort.
  def dump
    buckets.sort.each do |val, count|
      puts "%10d\t%s" % [count, val]
    end
  end
end

len_histo = Histo.new
num_histo = Histo.new
ltr_histo = Histo.new

iter = 0

# Other shorteners this script has been pointed at:
#           123456789-123456789-
#           http://bit.ly/
#           http://tinyurl.com/
BASE_URL     = "http://is.gd/"
RADIX        = 62
# Scheme and non-word characters removed; used to pick the input file name.
# NOTE(review): the /\.com$/ strip cannot match while BASE_URL ends in '/'
# (e.g. "http://tinyurl.com/" yields "tinyurlcom") -- confirm that is the
# intended file naming before changing it.
HANDLE       = BASE_URL.gsub(%r{^http://}, '').gsub(/\.com$/, '').gsub(/\W+/, '')
BASE_URL_LEN = BASE_URL.length
# Upper bound on the raw line length (line terminator included):
# base URL + 2 characters of slack + 6 tail characters.
MAX_TAIL_LEN = BASE_URL_LEN + 2 + 6
# Decoded values must be below RADIX**6 to be counted.
SIX_CHARS    = RADIX**6

File.open("rawd/req/shorturl_requests-20090710-#{HANDLE}.tsv") do |reqfile|
  reqfile.each do |url|
    # decode
    next unless url.length <= MAX_TAIL_LEN

    # Everything after the base URL; '' when the line is shorter than BASE_URL.
    tail = url.strip[BASE_URL_LEN..-1] || ''
    # tail.downcase!

    # Best-effort decode: a line whose tail cannot be decoded is skipped
    # rather than aborting the scan.
    asnum =
      begin
        ShorturlSequence.decode_str(tail, RADIX)
      rescue StandardError
        nil # tail.to_i(36) rescue -1
      end
    next unless asnum && asnum < SIX_CHARS

    size = asnum / 1_000_000
    len  = tail.length

    # track stats
    len_histo << len
    num_histo << size
    ltr_histo << "%s-%s" % [len, tail[0..0]] # + (len > 1 ? '.'* (len-1) : '')

    # Progress marker: only lines that passed the filters above are counted.
    iter += 1
    puts iter if (iter % 1_000_000).zero?
  end
end

puts "Integer magnitude of decoded (M):"
num_histo.dump
puts "Length of encoded:"
len_histo.dump
puts "First Letter:"
ltr_histo.dump

# Debugging output, disabled:
# puts tail.length # [tail.length, tail, tail[-1].to_i].join("\t")
# puts [asnum, tail, url].inspect