# = Basic utility class
#
# Stateless helpers used by the indexer: case expansion of n-grams and
# repair of strings whose bytes are not valid in their declared encoding.
class Util
  # Compute all the possible case-mixed variants of +trigram+.
  #
  # Works for any string length, including the empty string (which yields
  # <tt>[""]</tt>). The result always has <tt>2**trigram.length</tt> entries,
  # ordered like a binary counter in which a set bit means "upper case" and
  # the most significant bit controls the first character:
  #
  #   mixCase("ab")  # => ["ab", "aB", "Ab", "AB"]
  #
  # Characters without a case distinction (digits, punctuation...) produce
  # duplicate entries; they are deliberately kept so the size and ordering
  # of the result stay predictable for callers.
  #
  # @param trigram [String] the n-gram to expand
  # @return [Array<String>] every lower/upper case combination
  def mixCase(trigram)
    lowered = trigram.downcase
    width = trigram.length

    Array.new(2**width) do |mask|
      Array.new(width) do |idx|
        char = lowered[idx]
        # Integer#[] reads a single bit (0 = least significant), so the bit
        # for character idx is mirrored to keep the MSB on the first char.
        mask[width - 1 - idx] == 1 ? char.upcase : char.downcase
      end.join
    end
  end

  # = Ensure data are correctly imported
  # http://blog.grayproductions.net/articles/ruby_19s_string
  #
  # Tries to "guess" the right encoding: when the string is not valid in its
  # declared encoding, its bytes are reinterpreted as ISO-8859-1 and
  # transcoded to UTF-8. Typical use case: Italian source code saved as
  # ISO-8859-1 on Windows but wrongly read as UTF-8.
  #
  # The argument is never modified: the reinterpretation is done on a copy,
  # so frozen strings are accepted and the caller's string keeps its
  # original encoding tag.
  #
  # @param untrusted_string [String] text read from an external source
  # @return [String] +untrusted_string+ itself when it is already valid,
  #   otherwise a new UTF-8 string
  def ensureUTF8(untrusted_string)
    return untrusted_string if untrusted_string.valid_encoding?

    # Options are passed as keywords: on Ruby 3 a positional Hash is no
    # longer treated as options by String#encode.
    untrusted_string
      .dup
      .force_encoding("ISO-8859-1")
      .encode("UTF-8", undef: :replace, invalid: :replace)
  end
end
# then it can provide handy method to find file scontaining the trigram... class FileScanner def initialize(redisConnection=nil) if redisConnection==nil @redis=Redis.new else @redis=redisConnection end end def disconnect() @redis.quit end def pushTrigramsSet(s, fid, filename) case_insensitive_trigram_failed=false showlog=false if s.length > (TRIGRAM_DEFAULT_PUSH_SIZE/2) puts " >Pushing...#{s.length} for id #{fid}=#{filename}" showlog=true end # Ask for a protected transaction # Sometimes can fail... welldone=false tryCounter=0 while welldone == false do begin tryCounter +=1 case_insensitive_trigram_failed=pushTrigramsSetRecoverable(s,fid,filename) welldone=true rescue Errno::EAGAIN =>ea if tryCounter >=MAX_PUSH_TRIGRAM_RETRIES puts "FATAL: Too many Errno::EAGAIN Errors" raise ea else puts "Trouble storing #{s.length} data. Retrying..." welldone=false end end end if showlog puts " adaptiveSize pushTrigramsSet(s,fid,filename) s=Set.new() end trigramScanned += 1 #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}" end end } if s.length > 0 pushTrigramsSet(s,fid,filename) s=nil #puts "Final push of #{s.length}" end trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}" @redis.sadd "fscan:processedFiles", "#{filename}" trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0 if trigramRatio < 10 or trigramRatio >75 puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70 end return nil end def split_in_trigrams(term, prefix) trigramInAnd=Set.new() # Search=> Sea AND ear AND arc AND rch for j in 0...term.length currentTrigram=term[j,GRAM_SIZE] if currentTrigram.length