lib/code_zauker.rb in code_zauker-0.0.9 vs lib/code_zauker.rb in code_zauker-0.1.0

- old
+ new

@@ -1,15 +1,18 @@ # -*- mode:ruby ; -*- -* require "code_zauker/version" require "code_zauker/constants" require 'code_zauker/grep' -require 'redis/connection/hiredis' +# require 'redis/connection/hiredis' require 'redis' require 'set' require 'pdf/reader' require 'date' +#require 'digest' +require 'digest/md5' + # This module implements a simple reverse indexer # based on Redis # The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html module CodeZauker GRAM_SIZE=3 @@ -197,13 +200,13 @@ puts "Trouble storing #{s.length} data. Retrying..." welldone=false end end end - if showlog - puts " <Pushed #{s.length}..." - end + # if showlog + # puts " <Pushed #{s.length}..." + # end puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if case_insensitive_trigram_failed end def pushTrigramsSetRecoverable(s, fid, filename) error=false @@ -224,25 +227,33 @@ return error end private :pushTrigramsSetRecoverable - def load(filename, noReload=false) + def load(filename) # Define my redis id... # Already exists?... fid=@redis.get "fscan:id:#{filename}" if fid==nil @redis.setnx "fscan:nextId",0 fid=@redis.incr "fscan:nextId" # BUG: Consider storing it at the END of the processing @redis.set "fscan:id:#{filename}", fid @redis.set "fscan:id2filename:#{fid}",filename else - if noReload - #puts "Already found #{filename} as id:#{fid} and NOT RELOADED" + # ADD MD5 Checksum + #Digest::MD5.hexdigest("aaa") + fileDigest = Digest::MD5.hexdigest(File.read(filename)) + storedDigest=@redis.get("cz:md5:#{filename}") + if(fileDigest!=storedDigest) + puts "#{filename} CHANGED...MD5: #{fileDigest} REINDEXING..." + self.remove([filename]) + else + ## puts "#{filename} id:#{fid} MD% UP TO DATE and NOT RELOADED" return nil end + end # fid is the set key!... 
trigramScanned=0 # TEST_LICENSE.txt: 3290 Total Scanned: 24628 # The ratio is below 13% of total trigrams are unique for very big files @@ -254,11 +265,11 @@ lines=util.get_lines(filename) adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE lines.each do |lineNotUTF8| l= util.ensureUTF8(lineNotUTF8) - # Split each line into 3-char chunks, and store in a redis set + # Split each line into GRAM_SIZE-char chunks, and store in a redis set i=0 for istart in 0...(l.length-GRAM_SIZE) trigram = l[istart, GRAM_SIZE] # Avoid storing the 3space guy entirely if trigram==SPACE_GUY @@ -269,11 +280,11 @@ if s.length > adaptiveSize pushTrigramsSet(s,fid,filename) s=Set.new() end trigramScanned += 1 - #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}" + #puts "#{istart} Gram fscan:#{trigram}/ FileId: #{fid}" end end if s.length > 0 @@ -285,12 +296,17 @@ trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}" @redis.sadd "fscan:processedFiles", "#{filename}" trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0 if trigramRatio < 10 or trigramRatio >75 - puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70 + puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique #{GRAM_SIZE}-grams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70 end + + # Register digest...do at last for better security + fileDigest = Digest::MD5.hexdigest(File.read(filename)) + @redis.set("cz:md5:#{filename}",fileDigest) + return nil end def split_in_trigrams(term, prefix) trigramInAnd=Set.new() @@ -342,15 +358,15 @@ # public*class*Apple # will match java declaration of MyApple but not # YourAppManager def wsearch(term) # Split stuff - puts "Wild Search request:#{term}" + #puts "Wild Search request:#{term}" m=term.split("*") if m.length>0 trigramInAnd=Set.new() - puts "*= Found:#{m.length}" + #puts "*= Found:#{m.length}" m.each do | wtc | 
wt=wtc.downcase() #puts "Splitting #{wt}" trigSet=split_in_trigrams(wt,"trigram:ci") trigramInAnd=trigramInAnd.merge(trigSet) @@ -384,10 +400,10 @@ def reindex(fileList) #puts "Reindexing... #{fileList.length} files..." fileList.each do |current_file | self.remove([current_file]) - self.load(current_file,noReload=false) + self.load(current_file) end end # Remove all the keys def removeAll()