lib/code_zauker.rb in code_zauker-0.0.2 vs lib/code_zauker.rb in code_zauker-0.0.3

- old
+ new

# = Basic utility class
# Stateless helpers used while indexing and searching: case-mix expansion
# of short strings and best-effort normalisation of text to UTF-8.
class Util
  # Compute every upper/lower-case variant of +trigram+.
  # It works for every string size, but the result grows as 2**length,
  # so it is only meant for short strings such as trigrams.
  #
  # Variants are produced in binary-counting order over the downcased
  # input: the first entry is all lowercase, the last is all uppercase, and
  # the most significant bit of the counter drives the first character.
  #
  # @param trigram [String] the text to expand (any case, any length)
  # @return [Array<String>] the distinct case variants; [""] for an empty
  #   string. Characters without a case distinction (digits, punctuation)
  #   do not generate repeated entries.
  def mixCase(trigram)
    letters = trigram.downcase.chars
    width = letters.length
    variants = (0...(2**width)).map do |mask|
      letters.each_with_index.map do |letter, index|
        # Bit (width - 1 - index) of the mask decides the case of the
        # character at +index+, so the mask reads left to right like the string.
        mask[width - 1 - index] == 1 ? letter.upcase : letter
      end.join
    end
    # Two masks that differ only on a caseless character yield the same
    # string; keep the first occurrence of each.
    variants.uniq
  end

  # = Ensure Data are correctly imported
  # http://blog.grayproductions.net/articles/ruby_19s_string
  # This code tries to "guess" the right encoding: when the string is not
  # valid in its current encoding, its bytes are reinterpreted as ISO-8859-1
  # and transcoded to UTF-8.
  # Typical use case: an Italian source file wrongly read as UTF-8 whereas
  # it was saved as ISO-8859-1 on Windows.
  #
  # The argument is never modified, so frozen strings are accepted.
  #
  # @param untrusted_string [String] text of uncertain encoding
  # @return [String] the very same object when it is already valid in its
  #   own encoding, otherwise a new UTF-8 string
  def ensureUTF8(untrusted_string)
    return untrusted_string if untrusted_string.valid_encoding?

    # Naming the source encoding explicitly transcodes into a fresh string
    # instead of re-tagging the caller's object with force_encoding.
    untrusted_string.encode("UTF-8", "ISO-8859-1", invalid: :replace, undef: :replace)
  end
end
class FileScanner def initialize(redisConnection=nil) if redisConnection==nil @redis=Redis.new else @redis=redisConnection end end - def disconnect() + + + def disconnect() @redis.quit end + + def pushTrigramsSet(s, fid, filename) - error=false - if s.length > 5000 + case_insensitive_trigram_failed=false + showlog=false + if s.length > (TRIGRAM_DEFAULT_PUSH_SIZE/2) puts " >Pushing...#{s.length} for id #{fid}=#{filename}" + showlog=true end - s.each do | trigram | - @redis.sadd "trigram:#{trigram}",fid - @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram - # Add the case-insensitive-trigram + # Ask for a protected transaction + # Sometimes can fail... + welldone=false + tryCounter=0 + while welldone == false do begin - @redis.sadd "trigram:ci:#{trigram.downcase}",fid - rescue ArgumentError - error=true + tryCounter +=1 + case_insensitive_trigram_failed=pushTrigramsSetRecoverable(s,fid,filename) + welldone=true + rescue Errno::EAGAIN =>ea + if tryCounter >=MAX_PUSH_TRIGRAM_RETRIES + puts "FATAL: Too many Errno::EAGAIN Errors" + raise ea + else + puts "Trouble storing #{s.length} data. Retrying..." + welldone=false + end end end - if s.length > 5000 + if showlog puts " <Pushed #{s.length}..." - puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if error - end + end + puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if case_insensitive_trigram_failed end - private :pushTrigramsSet + def pushTrigramsSetRecoverable(s, fid, filename) + error=false + @redis.multi do + s.each do | trigram | + @redis.sadd "trigram:#{trigram}",fid + @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram + # Add the case-insensitive-trigram + begin + @redis.sadd "trigram:ci:#{trigram.downcase}",fid + rescue ArgumentError + error=true + end + end + end # multi + return error + end + private :pushTrigramsSetRecoverable + def load(filename, noReload=false) # Define my redis id... # Already exists?... 
fid=@redis.get "fscan:id:#{filename}" if fid==nil @@ -59,11 +155,11 @@ # BUG: Consider storing it at the END of the processing @redis.set "fscan:id:#{filename}", fid @redis.set "fscan:id2filename:#{fid}",filename else if noReload - puts "Already found #{filename} as id:#{fid} and NOT RELOADED" + #puts "Already found #{filename} as id:#{fid} and NOT RELOADED" return nil end end # fid is the set key!... trigramScanned=0 @@ -71,14 +167,16 @@ # The ratio is below 13% of total trigrams are unique for very big files # So we avoid a huge roundtrip to redis, and store the trigram on a memory-based set # before sending it to redis. This avoid # a lot of spourios work s=Set.new - File.open(filename,"r") do |f| + File.open(filename,"r") { |f| lines=f.readlines() - adaptiveSize= 6000 - lines.each do |l| + adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE + util=Util.new() + lines.each do |lineNotUTF8| + l= util.ensureUTF8(lineNotUTF8) # Split each line into 3-char chunks, and store in a redis set i=0 for istart in 0...(l.length-GRAM_SIZE) trigram = l[istart, GRAM_SIZE] # Avoid storing the 3space guy enterely @@ -93,11 +191,11 @@ end trigramScanned += 1 #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}" end end - end + } if s.length > 0 pushTrigramsSet(s,fid,filename) s=nil #puts "Final push of #{s.length}" @@ -105,60 +203,96 @@ trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}" @redis.sadd "fscan:processedFiles", "#{filename}" trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0 - if trigramRatio < 10 or trigramRatio >75 - puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} " + if trigramRatio < 10 or trigramRatio >75 + puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70 end return nil end - # = search - # Find a list of file candidates to a search string - # The search string is 
padded into trigrams - def search(term) - if term.length < GRAM_SIZE - raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character" - end - #puts " ** Searching: #{term}" - # split the term in a padded trigram - trigramInAnd=[] + def split_in_trigrams(term, prefix) + trigramInAnd=Set.new() # Search=> Sea AND ear AND arc AND rch for j in 0...term.length currentTrigram=term[j,GRAM_SIZE] if currentTrigram.length <GRAM_SIZE # We are at the end... break end - trigramInAnd.push("trigram:#{currentTrigram}") + trigramInAnd.add("#{prefix}:#{currentTrigram}") end - #puts "Trigam conversion /#{term}/ into #{trigramInAnd}" - if trigramInAnd.length==0 - return [] - end - fileIds= @redis.sinter(*trigramInAnd) + return trigramInAnd + end + + def map_ids_to_files(fileIds) filenames=[] # fscan:id2filename:#{fid}.... fileIds.each do | id | - filenames.push(@redis.get("fscan:id2filename:#{id}")) + file_name=@redis.get("fscan:id2filename:#{id}") + filenames.push(file_name) if !file_name.nil? 
end #puts " ** Files found:#{filenames} from ids #{fileIds}" return filenames end + + + + # = Do a case-insenitive search + # using the special set of trigrams + # "trigram:ci:*" + # all downcase + def isearch(term) + termLowercase=term.downcase() + trigramInAnd=split_in_trigrams(termLowercase,"trigram:ci") + if trigramInAnd.length==0 + return [] + end + fileIds= @redis.sinter(*trigramInAnd) + return map_ids_to_files(fileIds) + end + + + # = search + # Find a list of file candidates to a search string + # The search string is padded into trigrams + def search(term) + if term.length < GRAM_SIZE + raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character" + end + #puts " ** Searching: #{term}" + trigramInAnd=split_in_trigrams(term,"trigram") + #puts "Trigam conversion /#{term}/ into #{trigramInAnd}" + if trigramInAnd.length==0 + return [] + end + fileIds= @redis.sinter(*trigramInAnd) + fileNames=map_ids_to_files(fileIds) + #puts "DEBUG #{fileIds} #{fileNames}" + return fileNames + end + def reindex(fileList) #puts "Reindexing... #{fileList.length} files..." fileList.each do |current_file | self.remove([current_file]) self.load(current_file,noReload=false) end end # Remove all the keys def removeAll() - self.remove(nil) + tokill=[] + tokill=@redis.keys("fscan:*") + tokill.push(*(@redis.keys("trigram*"))) + tokill.each do | x | + @redis.del x + #puts "Deleted #x" + end + @redis.del "fscan:processedFiles" end # Remove the files from the index, updating trigrams def remove(filePaths=nil) if filePaths==nil @@ -176,24 +310,31 @@ fid=@redis.get "fscan:id:#{filename}" trigramsToExpurge=@redis.smembers "fscan:trigramsOnFile:#{fid}" if trigramsToExpurge.length==0 puts "?Nothing to do on #{filename}" end - puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..." + puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..." 
trigramsToExpurge.each do | ts | @redis.srem "trigram:#{ts}", fid begin @redis.srem "trigram:ci:#{ts.downcase}",fid + #putc "." rescue ArgumentError # Ignore "ArgumentError: invalid byte sequence in UTF-8" # and proceed... end end + #putc "\n" - @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}" + @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}" @redis.srem "fscan:processedFiles", filename end return nil end + + private :pushTrigramsSet + private :split_in_trigrams + #private :map_ids_to_files + end end