lib/code_zauker.rb in code_zauker-0.0.2 vs lib/code_zauker.rb in code_zauker-0.0.3
- old
+ new
@@ -8,49 +8,145 @@
# based on Redis
# The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
module CodeZauker
GRAM_SIZE=3
SPACE_GUY=" "*GRAM_SIZE
+
+ # = Basic utility class
class Util
  # Compute all the possible case-mixed variants of a string.
  #
  # Works for every string size, not only trigrams. The result holds
  # 2**trigram.length entries, ordered from the all-lowercase form to the
  # all-uppercase form, with the last character toggling fastest.
  # Characters without a case (digits, punctuation) yield duplicated
  # entries; they are kept so the size and order of the result stay stable.
  #
  # trigram:: the String to expand
  #
  # Returns an Array of Strings; [""] when +trigram+ is empty.
  def mixCase(trigram)
    variants = [""]
    trigram.each_char do |char|
      lower = char.downcase
      # Upcase the lowered form so both variants derive from the same
      # normalized character.
      upper = lower.upcase
      variants = variants.flat_map { |head| [head + lower, head + upper] }
    end
    variants
  end

  # = Ensure data are correctly imported
  # http://blog.grayproductions.net/articles/ruby_19s_string
  #
  # Tries to "guess" the right encoding: when the string is not valid in
  # its current encoding its bytes are read as ISO-8859-1 and transcoded
  # to UTF-8. Typical use case: Italian source code wrongly interpreted as
  # UTF-8 whereas it is ISO-8859-1 Windows code.
  #
  # untrusted_string:: the String to check; it is never modified, so
  #                    frozen strings are accepted
  #
  # Returns +untrusted_string+ itself when it is already valid, otherwise
  # a new UTF-8 String.
  def ensureUTF8(untrusted_string)
    return untrusted_string if untrusted_string.valid_encoding?

    # The source encoding is passed explicitly instead of calling
    # force_encoding, so the caller's string is left untouched. Options are
    # given as trailing keywords, not as a positional hash.
    untrusted_string.encode("UTF-8", "ISO-8859-1", :invalid => :replace, :undef => :replace)
  end
end
+
# Scans a file and pushes its trigrams into Redis;
# it then provides handy methods to find the files containing a trigram.
class FileScanner
# Build a scanner bound to a Redis connection.
#
# redisConnection:: an existing connection to reuse; when nil a new one
#                   is created with Redis.new
def initialize(redisConnection=nil)
  @redis = redisConnection.nil? ? Redis.new : redisConnection
end
- def disconnect()
+
+
# Close the Redis connection held by this scanner.
# Returns whatever the connection's +quit+ call returns.
def disconnect
  @redis.quit
end
+
+
# Store the trigrams of one file, retrying when the push fails with
# Errno::EAGAIN.
#
# s::        the trigrams to store (must respond to +length+)
# fid::      id assigned to the file
# filename:: path of the file, used only in the log messages
#
# The actual write is delegated to pushTrigramsSetRecoverable. After
# MAX_PUSH_TRIGRAM_RETRIES failed attempts the last Errno::EAGAIN is
# re-raised. Progress is logged only for sets larger than half of
# TRIGRAM_DEFAULT_PUSH_SIZE.
def pushTrigramsSet(s, fid, filename)
  showlog = s.length > (TRIGRAM_DEFAULT_PUSH_SIZE/2)
  puts " >Pushing...#{s.length} for id #{fid}=#{filename}" if showlog

  case_insensitive_trigram_failed = false
  attempts = 0
  begin
    attempts += 1
    case_insensitive_trigram_failed = pushTrigramsSetRecoverable(s, fid, filename)
  rescue Errno::EAGAIN => ea
    if attempts >= MAX_PUSH_TRIGRAM_RETRIES
      puts "FATAL: Too many Errno::EAGAIN Errors"
      raise ea
    end
    puts "Trouble storing #{s.length} data. Retrying..."
    # Bounded by the attempts counter checked above.
    retry
  end

  puts " <Pushed #{s.length}..." if showlog
  puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if case_insensitive_trigram_failed
end
- private :pushTrigramsSet
# Write all the trigrams of a file inside a single MULTI transaction.
#
# s::        the trigrams to store (must respond to +each+)
# fid::      id assigned to the file
# filename:: path of the file (not used here, kept for symmetry with
#            pushTrigramsSet)
#
# For each trigram three set additions are queued: the file id under
# "trigram:<t>", the trigram under "fscan:trigramsOnFile:<fid>" and the
# file id under the lower-cased key "trigram:ci:<t>".
#
# Returns true when at least one trigram could not be lower-cased
# (ArgumentError), false otherwise. Other exceptions propagate to the caller.
def pushTrigramsSetRecoverable(s, fid, filename)
  error = false
  @redis.multi do |transaction|
    # Queue the commands on the object yielded by +multi+: recent redis-rb
    # releases require it. Older releases are presumed to yield the client
    # itself; when nothing is yielded fall back to @redis.
    target = transaction || @redis
    s.each do |trigram|
      target.sadd "trigram:#{trigram}", fid
      target.sadd "fscan:trigramsOnFile:#{fid}", trigram
      # Add the case-insensitive trigram; downcase can fail on invalid
      # byte sequences, which is reported through the return value.
      begin
        target.sadd "trigram:ci:#{trigram.downcase}", fid
      rescue ArgumentError
        error = true
      end
    end
  end
  error
end
+ private :pushTrigramsSetRecoverable
+
def load(filename, noReload=false)
# Define my redis id...
# Already exists?...
fid=@redis.get "fscan:id:#{filename}"
if fid==nil
@@ -59,11 +155,11 @@
# BUG: Consider storing it at the END of the processing
@redis.set "fscan:id:#{filename}", fid
@redis.set "fscan:id2filename:#{fid}",filename
else
if noReload
- puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
+ #puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
return nil
end
end
# fid is the set key!...
trigramScanned=0
@@ -71,14 +167,16 @@
# The ratio is below 13% of total trigrams are unique for very big files
# So we avoid a huge roundtrip to redis, and store the trigram on a memory-based set
# before sending it to redis. This avoid
# a lot of spourios work
s=Set.new
- File.open(filename,"r") do |f|
+ File.open(filename,"r") { |f|
lines=f.readlines()
- adaptiveSize= 6000
- lines.each do |l|
+ adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
+ util=Util.new()
+ lines.each do |lineNotUTF8|
+ l= util.ensureUTF8(lineNotUTF8)
# Split each line into 3-char chunks, and store in a redis set
i=0
for istart in 0...(l.length-GRAM_SIZE)
trigram = l[istart, GRAM_SIZE]
# Avoid storing the 3space guy enterely
@@ -93,11 +191,11 @@
end
trigramScanned += 1
#puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
end
end
- end
+ }
if s.length > 0
pushTrigramsSet(s,fid,filename)
s=nil
#puts "Final push of #{s.length}"
@@ -105,60 +203,96 @@
trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}"
@redis.sadd "fscan:processedFiles", "#{filename}"
trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
- if trigramRatio < 10 or trigramRatio >75
- puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} "
+ if trigramRatio < 10 or trigramRatio >75
+ puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70
end
return nil
end
- # = search
- # Find a list of file candidates to a search string
- # The search string is padded into trigrams
- def search(term)
- if term.length < GRAM_SIZE
- raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
- end
- #puts " ** Searching: #{term}"
- # split the term in a padded trigram
- trigramInAnd=[]
# Split a search term into the set of Redis keys of its trigrams.
#
# term::   the String to split
# prefix:: key namespace prepended to every trigram, followed by ":"
#
# Example: "Search" => Sea AND ear AND arc AND rch
#
# Returns a Set of "<prefix>:<trigram>" Strings; the set is empty when
# +term+ is shorter than GRAM_SIZE.
def split_in_trigrams(term, prefix)
  keys = Set.new
  # Last offset from which a full GRAM_SIZE slice can still be taken;
  # negative for short terms, in which case nothing is visited.
  last_start = term.length - GRAM_SIZE
  0.upto(last_start) do |offset|
    keys.add("#{prefix}:#{term[offset, GRAM_SIZE]}")
  end
  keys
end
+
# Translate file ids into file names.
#
# fileIds:: collection of ids (must respond to +each+)
#
# Every id is looked up under the key "fscan:id2filename:<id>"; ids
# without a stored name are skipped.
#
# Returns an Array with the names found, in the order of +fileIds+.
def map_ids_to_files(fileIds)
  found = []
  fileIds.each do |id|
    name = @redis.get("fscan:id2filename:#{id}")
    found << name unless name.nil?
  end
  found
end
+
+
+
+ # = Do a case-insensitive search
+ # using the special set of trigrams
+ # "trigram:ci:*",
+ # which are all stored in lower case
# term:: the String to look for; it is lower-cased before being split
#
# Returns an Array of file names whose "trigram:ci" sets all contain the
# file; [] when the term yields no trigram.
def isearch(term)
  keys = split_in_trigrams(term.downcase, "trigram:ci")
  return [] if keys.length == 0

  map_ids_to_files(@redis.sinter(*keys))
end
+
+
+ # = search
+ # Find a list of file candidates to a search string
+ # The search string is split into trigrams
# term:: the String to look for (case-sensitive)
#
# Raises a RuntimeError when +term+ is shorter than GRAM_SIZE.
# Returns an Array with the names of the files present in the set of
# every trigram of the term.
def search(term)
  raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character" if term.length < GRAM_SIZE

  keys = split_in_trigrams(term, "trigram")
  return [] if keys.length == 0

  map_ids_to_files(@redis.sinter(*keys))
end
+
# Drop and rebuild the index entries of the given files.
#
# fileList:: collection of file paths (must respond to +each+)
#
# Each file is first removed from the index and then loaded again, with
# the noReload flag set to false.
# Returns +fileList+.
def reindex(fileList)
  fileList.each do |path|
    self.remove([path])
    self.load(path, false)
  end
end
# Remove all the keys
# Delete every key written by the scanner: all the keys matching
# "fscan:*" followed by all the keys matching "trigram*".
#
# Returns the result of the final delete of "fscan:processedFiles".
def removeAll()
  # Array#+ builds a new list: the arrays returned by +keys+ are not
  # modified, and the key list is never splatted into method arguments
  # (a splat of a very large array may exhaust the interpreter stack).
  doomed = @redis.keys("fscan:*") + @redis.keys("trigram*")
  doomed.each { |key| @redis.del key }
  # Already covered by the "fscan:*" pattern; kept so the return value
  # stays the same for callers.
  @redis.del "fscan:processedFiles"
end
# Remove the files from the index, updating trigrams
def remove(filePaths=nil)
if filePaths==nil
@@ -176,24 +310,31 @@
fid=@redis.get "fscan:id:#{filename}"
trigramsToExpurge=@redis.smembers "fscan:trigramsOnFile:#{fid}"
if trigramsToExpurge.length==0
puts "?Nothing to do on #{filename}"
end
- puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
+ puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
trigramsToExpurge.each do | ts |
@redis.srem "trigram:#{ts}", fid
begin
@redis.srem "trigram:ci:#{ts.downcase}",fid
+ #putc "."
rescue ArgumentError
# Ignore "ArgumentError: invalid byte sequence in UTF-8"
# and proceed...
end
end
+ #putc "\n"
- @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
+ @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
@redis.srem "fscan:processedFiles", filename
end
return nil
end
+
+ private :pushTrigramsSet
+ private :split_in_trigrams
+ #private :map_ids_to_files
+
end
end