lib/code_zauker.rb in code_zauker-0.0.9 vs lib/code_zauker.rb in code_zauker-0.1.0
- old
+ new
@@ -1,15 +1,18 @@
# -*- mode:ruby ; -*- -*
require "code_zauker/version"
require "code_zauker/constants"
require 'code_zauker/grep'
-require 'redis/connection/hiredis'
+# require 'redis/connection/hiredis'
require 'redis'
require 'set'
require 'pdf/reader'
require 'date'
+#require 'digest'
+require 'digest/md5'
+
# This module implements a simple reverse indexer
# based on Redis
# The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
module CodeZauker
GRAM_SIZE=3
@@ -197,13 +200,13 @@
puts "Trouble storing #{s.length} data. Retrying..."
welldone=false
end
end
end
- if showlog
- puts " <Pushed #{s.length}..."
- end
+ # if showlog
+ # puts " <Pushed #{s.length}..."
+ # end
puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if case_insensitive_trigram_failed
end
def pushTrigramsSetRecoverable(s, fid, filename)
error=false
@@ -224,25 +227,33 @@
return error
end
private :pushTrigramsSetRecoverable
- def load(filename, noReload=false)
+ def load(filename)
# Define my redis id...
# Already exists?...
fid=@redis.get "fscan:id:#{filename}"
if fid==nil
@redis.setnx "fscan:nextId",0
fid=@redis.incr "fscan:nextId"
# BUG: Consider storing it at the END of the processing
@redis.set "fscan:id:#{filename}", fid
@redis.set "fscan:id2filename:#{fid}",filename
else
- if noReload
- #puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
+ # ADD MD5 Checksum
+ #Digest::MD5.hexdigest("aaa")
+ fileDigest = Digest::MD5.hexdigest(File.read(filename))
+ storedDigest=@redis.get("cz:md5:#{filename}")
+ if(fileDigest!=storedDigest)
+ puts "#{filename} CHANGED...MD5: #{fileDigest} REINDEXING..."
+ self.remove([filename])
+ else
+ ## puts "#{filename} id:#{fid} MD5 UP TO DATE and NOT RELOADED"
return nil
end
+
end
# fid is the set key!...
trigramScanned=0
# TEST_LICENSE.txt: 3290 Total Scanned: 24628
# For very big files the ratio is below 13%, i.e. fewer than 13% of the total trigrams are unique
@@ -254,11 +265,11 @@
lines=util.get_lines(filename)
adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
lines.each do |lineNotUTF8|
l= util.ensureUTF8(lineNotUTF8)
- # Split each line into 3-char chunks, and store in a redis set
+ # Split each line into GRAM_SIZE-char chunks, and store in a redis set
i=0
for istart in 0...(l.length-GRAM_SIZE)
trigram = l[istart, GRAM_SIZE]
# Avoid storing the 3space guy entirely
if trigram==SPACE_GUY
@@ -269,11 +280,11 @@
if s.length > adaptiveSize
pushTrigramsSet(s,fid,filename)
s=Set.new()
end
trigramScanned += 1
- #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
+ #puts "#{istart} Gram fscan:#{trigram}/ FileId: #{fid}"
end
end
if s.length > 0
@@ -285,12 +296,17 @@
trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}"
@redis.sadd "fscan:processedFiles", "#{filename}"
trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
if trigramRatio < 10 or trigramRatio >75
- puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70
+ puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique #{GRAM_SIZE}-grams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70
end
+
+ # Register the digest last, so an interrupted load is not recorded as up to date
+ fileDigest = Digest::MD5.hexdigest(File.read(filename))
+ @redis.set("cz:md5:#{filename}",fileDigest)
+
return nil
end
def split_in_trigrams(term, prefix)
trigramInAnd=Set.new()
@@ -342,15 +358,15 @@
# public*class*Apple
# will match java declaration of MyApple but not
# YourAppManager
def wsearch(term)
# Split stuff
- puts "Wild Search request:#{term}"
+ #puts "Wild Search request:#{term}"
m=term.split("*")
if m.length>0
trigramInAnd=Set.new()
- puts "*= Found:#{m.length}"
+ #puts "*= Found:#{m.length}"
m.each do | wtc |
wt=wtc.downcase()
#puts "Splitting #{wt}"
trigSet=split_in_trigrams(wt,"trigram:ci")
trigramInAnd=trigramInAnd.merge(trigSet)
@@ -384,10 +400,10 @@
def reindex(fileList)
#puts "Reindexing... #{fileList.length} files..."
fileList.each do |current_file |
self.remove([current_file])
- self.load(current_file,noReload=false)
+ self.load(current_file)
end
end
# Remove all the keys
def removeAll()