lib/code_zauker.rb in code_zauker-0.0.1 vs lib/code_zauker.rb in code_zauker-0.0.2
- old
+ new
@@ -1,32 +1,66 @@
# -*- mode:ruby ; -*- -*
require "code_zauker/version"
+require "code_zauker/constants"
require 'redis/connection/hiredis'
require 'redis'
require 'set'
-# This module try to implement a simple reverse indexer
-# based on redis
+# This module implements a simple reverse indexer
+# based on Redis
# The idea is ispired by http://swtch.com/~rsc/regexp/regexp4.html
module CodeZauker
GRAM_SIZE=3
SPACE_GUY=" "*GRAM_SIZE
# Scan a file and push it inside redis...
# then it can provide handy method to find file scontaining the trigram...
class FileScanner
- def initialize()
+ def initialize(redisConnection=nil)
+ if redisConnection==nil
+ @redis=Redis.new
+ else
+ @redis=redisConnection
+ end
end
+ def disconnect()
+ @redis.quit
+ end
+
+
+
+ def pushTrigramsSet(s, fid, filename)
+ error=false
+ if s.length > 5000
+ puts " >Pushing...#{s.length} for id #{fid}=#{filename}"
+ end
+ s.each do | trigram |
+ @redis.sadd "trigram:#{trigram}",fid
+ @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
+ # Add the case-insensitive-trigram
+ begin
+ @redis.sadd "trigram:ci:#{trigram.downcase}",fid
+ rescue ArgumentError
+ error=true
+ end
+ end
+ if s.length > 5000
+ puts " <Pushed #{s.length}..."
+ puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if error
+ end
+ end
+
+ private :pushTrigramsSet
+
def load(filename, noReload=false)
- # Define my redis id...
- r=Redis.new
+ # Define my redis id...
# Already exists?...
- fid=r.get "fscan:id:#{filename}"
+ fid=@redis.get "fscan:id:#{filename}"
if fid==nil
- r.setnx "fscan:nextId",0
- fid=r.incr "fscan:nextId"
+ @redis.setnx "fscan:nextId",0
+ fid=@redis.incr "fscan:nextId"
# BUG: Consider storing it at the END of the processing
- r.set "fscan:id:#{filename}", fid
- r.set "fscan:id2filename:#{fid}",filename
+ @redis.set "fscan:id:#{filename}", fid
+ @redis.set "fscan:id2filename:#{fid}",filename
else
if noReload
puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
return nil
end
@@ -52,45 +86,42 @@
next
end
# push the trigram to redis (highly optimized)
s.add(trigram)
if s.length > adaptiveSize
- puts " >Pushing...#{s.length}"
- s.each do | trigram |
- r.sadd "trigram:#{trigram}",fid
- r.sadd "fscan:trigramsOnFile:#{fid}", trigram
- end
- puts " <Pushed #{s.length}..."
+ pushTrigramsSet(s,fid,filename)
s=Set.new()
end
trigramScanned += 1
#puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
end
end
end
if s.length > 0
- s.each do | trigram |
- r.sadd "trigram:#{trigram}",fid
- r.sadd "fscan:trigramsOnFile:#{fid}", trigram
- end
+ pushTrigramsSet(s,fid,filename)
+ s=nil
#puts "Final push of #{s.length}"
end
- trigramsOnFile=r.scard "fscan:trigramsOnFile:#{fid}"
- r.sadd "fscan:processedFiles", "fscan:id:#{filename}"
+ trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}"
+ @redis.sadd "fscan:processedFiles", "#{filename}"
trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0
- puts "File processed. Unique Trigrams for #{filename}: #{trigramsOnFile} Total Scanned: #{trigramScanned} Ratio:#{trigramRatio}"
- r.quit
+ if trigramRatio < 10 or trigramRatio >75
+ puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} "
+ end
return nil
end
# = search
# Find a list of file candidates to a search string
# The search string is padded into trigrams
def search(term)
+ if term.length < GRAM_SIZE
+ raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
+ end
#puts " ** Searching: #{term}"
# split the term in a padded trigram
trigramInAnd=[]
# Search=> Sea AND ear AND arc AND rch
for j in 0...term.length
@@ -102,27 +133,67 @@
trigramInAnd.push("trigram:#{currentTrigram}")
end
#puts "Trigam conversion /#{term}/ into #{trigramInAnd}"
if trigramInAnd.length==0
return []
- end
- r=Redis.new
- fileIds= r.sinter(*trigramInAnd)
+ end
+ fileIds= @redis.sinter(*trigramInAnd)
filenames=[]
# fscan:id2filename:#{fid}....
fileIds.each do | id |
- filenames.push(r.get("fscan:id2filename:#{id}"))
- end
- r.quit
+ filenames.push(@redis.get("fscan:id2filename:#{id}"))
+ end
#puts " ** Files found:#{filenames} from ids #{fileIds}"
return filenames
end
-
- # This function accepts a very simple search query like
- # Gio*
- # will match Giovanni, Giovedi, Giorno...
- # Giova*ni
- # will match Giovanni, Giovani, Giovannini
- def searchSimpleRegexp(termWithStar)
+
+ def reindex(fileList)
+ #puts "Reindexing... #{fileList.length} files..."
+ fileList.each do |current_file |
+ self.remove([current_file])
+ self.load(current_file,noReload=false)
+ end
end
+
+ # Remove all the keys
+ def removeAll()
+ self.remove(nil)
+ end
+
+ # Remove the files from the index, updating trigrams
+ def remove(filePaths=nil)
+ if filePaths==nil
+ fileList=[]
+ storedFiles=@redis.keys "fscan:id:*"
+ storedFiles.each do |fileKey|
+ filename=fileKey.split("fscan:id:")[1]
+ fileList.push(filename)
+ end
+ else
+ fileList=filePaths
+ end
+ # puts "Files to remove from index...#{fileList.length}"
+ fileList.each do |filename|
+ fid=@redis.get "fscan:id:#{filename}"
+ trigramsToExpurge=@redis.smembers "fscan:trigramsOnFile:#{fid}"
+ if trigramsToExpurge.length==0
+ puts "?Nothing to do on #{filename}"
+ end
+ puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
+ trigramsToExpurge.each do | ts |
+ @redis.srem "trigram:#{ts}", fid
+ begin
+ @redis.srem "trigram:ci:#{ts.downcase}",fid
+ rescue ArgumentError
+ # Ignore "ArgumentError: invalid byte sequence in UTF-8"
+ # and proceed...
+ end
+ end
+
+ @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
+ @redis.srem "fscan:processedFiles", filename
+ end
+ return nil
+ end
+
end
end