Class: CodeZauker::FileScanner
- Inherits:
-
Object
- Object
- CodeZauker::FileScanner
- Defined in:
- lib/code_zauker.rb
Overview
Scans a file and pushes its trigrams into Redis; it then provides handy methods to find the files containing a given trigram.
Instance Method Summary (collapse)
- - (Object) disconnect
-
- (FileScanner) initialize(redisConnection = nil)
constructor
A new instance of FileScanner.
-
- (Object) isearch(term)
Do a case-insensitive search
using the special set of trigrams "trigram:ci:*", which are all lowercase.
- - (Object) load(filename, noReload = false)
- - (Object) map_ids_to_files(fileIds)
- - (Object) reindex(fileList)
-
- (Object) remove(filePaths = nil)
Remove the files from the index, updating trigrams.
-
- (Object) removeAll
Remove all the keys.
-
- (Object) search(term)
search
Find a list of file candidates for a search string. The search string is split into trigrams.
Constructor Details
- (FileScanner) initialize(redisConnection = nil)
A new instance of FileScanner
112 113 114 115 116 117 118 |
# File 'lib/code_zauker.rb', line 112
#
# Builds a scanner bound to a Redis connection.
#
# redisConnection - connection object to reuse; when it is nil a new one
#                   is created with Redis.new.
def initialize(redisConnection = nil)
  @redis = redisConnection.nil? ? Redis.new : redisConnection
end
Instance Method Details
- (Object) disconnect
121 122 123 |
# File 'lib/code_zauker.rb', line 121
#
# Calls quit on the Redis connection held by this scanner and returns
# whatever that call returns.
def disconnect
  @redis.quit
end
- (Object) isearch(term)
Do a case-insensitive search
using the special set of trigrams "trigram:ci:*", which are all lowercase.
282 283 284 285 286 287 288 289 290 |
# File 'lib/code_zauker.rb', line 282
#
# Case-insensitive search.
#
# The term is lowercased, turned into keys under the "trigram:ci" prefix by
# split_in_trigrams, and the sets stored at those keys are intersected.
#
# term - the string to look for.
#
# Returns the file names mapped from the intersected ids, or [] when
# split_in_trigrams yields no keys.
def isearch(term)
  ci_keys = split_in_trigrams(term.downcase, "trigram:ci")
  return [] if ci_keys.length == 0

  map_ids_to_files(@redis.sinter(*ci_keys))
end
- (Object) load(filename, noReload = false)
184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 |
# File 'lib/code_zauker.rb', line 184
#
# Index one file: assign it a numeric id (if it has none yet), collect the
# GRAM_SIZE-character windows of every line and hand them in batches to
# pushTrigramsSet, then record the file in "fscan:processedFiles".
#
# filename - path of the file to index; also used as part of the Redis keys.
# noReload - when true and the file already has an id, return immediately
#            without scanning it again.
#
# Always returns nil. May print a "?Binary" hint on stdout when more than 90%
# of the scanned trigrams are unique and more than 70 are stored for the file.
def load(filename, noReload = false)
  fid = @redis.get "fscan:id:#{filename}"
  if fid.nil?
    @redis.setnx "fscan:nextId", 0
    fid = @redis.incr "fscan:nextId"
    # BUG: Consider storing it at the END of the processing
    @redis.set "fscan:id:#{filename}", fid
    @redis.set "fscan:id2filename:#{fid}", filename
  elsif noReload
    return nil
  end

  trigramScanned = 0
  # Only a small share of the trigrams of a big file are unique (the original
  # author measured below 13%), so they are de-duplicated in an in-memory Set
  # and pushed in batches instead of paying one Redis round trip per trigram.
  pending = Set.new
  util = Util.new
  adaptiveSize = TRIGRAM_DEFAULT_PUSH_SIZE

  util.get_lines(filename).each do |lineNotUTF8|
    line = util.ensureUTF8(lineNotUTF8)
    # NOTE(review): the exclusive upper bound skips the window that ends on
    # the last character of the line. That is harmless if get_lines keeps the
    # trailing newline, but drops the final trigram of a line without one --
    # confirm against Util#get_lines before changing it.
    (0...(line.length - GRAM_SIZE)).each do |istart|
      trigram = line[istart, GRAM_SIZE]
      # The all-spaces trigram is never stored (and never counted).
      next if trigram == SPACE_GUY

      pending.add(trigram)
      if pending.length > adaptiveSize
        pushTrigramsSet(pending, fid, filename)
        pending = Set.new
      end
      trigramScanned += 1
    end
  end
  pushTrigramsSet(pending, fid, filename) if pending.length > 0

  trigramsOnFile = @redis.scard "fscan:trigramsOnFile:#{fid}"
  @redis.sadd "fscan:processedFiles", "#{filename}"

  # Skip the report when nothing was scanned: dividing by zero would give
  # Infinity (or NaN) and Float#round raises FloatDomainError on those.
  if trigramScanned > 0
    trigramRatio = ((trigramsOnFile * 1.0) / trigramScanned) * 100.0
    if trigramRatio > 90 && trigramsOnFile > 70
      puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary"
    end
  end
  nil
end
- (Object) map_ids_to_files(fileIds)
264 265 266 267 268 269 270 271 272 273 |
# File 'lib/code_zauker.rb', line 264
#
# Translate file ids into file names using the "fscan:id2filename:<id>" keys.
#
# All names are fetched with a single MGET instead of one GET per id, so the
# cost is one Redis round trip regardless of how many ids are passed.
#
# fileIds - collection of file ids (for example the result of SINTER).
#
# Returns an Array with the names found, in the order of the ids; ids that
# have no stored name are skipped.
def map_ids_to_files(fileIds)
  ids = fileIds.to_a
  # MGET needs at least one key, so answer the empty case locally.
  return [] if ids.empty?

  name_keys = ids.map { |id| "fscan:id2filename:#{id}" }
  @redis.mget(*name_keys).compact
end
- (Object) reindex(fileList)
312 313 314 315 316 317 318 |
# File 'lib/code_zauker.rb', line 312
#
# Rebuild the index entries of the given files: each one is first removed
# from the index and then loaded again with noReload set to false.
#
# fileList - collection of file paths.
#
# Returns whatever fileList.each returns.
def reindex(fileList)
  fileList.each do |path|
    remove([path])
    load(path, false)
  end
end
- (Object) remove(filePaths = nil)
Remove the files from the index, updating trigrams
333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 |
# File 'lib/code_zauker.rb', line 333
#
# Remove files from the index, updating the trigram sets.
#
# filePaths - list of file names to remove; when nil, every file that has a
#             "fscan:id:<name>" key is removed.
#
# For each file the stored trigrams are read from
# "fscan:trigramsOnFile:<id>", the file id is taken out of both the
# case-sensitive and the case-insensitive trigram sets, and the bookkeeping
# keys of the file are deleted. Progress is printed on stdout.
#
# Always returns nil.
def remove(filePaths = nil)
  id_prefix = "fscan:id:"
  fileList =
    if filePaths.nil?
      # Strip only the leading prefix: splitting on the prefix text would
      # truncate a file name that itself contains "fscan:id:".
      @redis.keys("#{id_prefix}*").map { |fileKey| fileKey[id_prefix.length..-1] }
    else
      filePaths
    end

  fileList.each do |filename|
    fid = @redis.get "fscan:id:#{filename}"
    trigramsToExpurge = @redis.smembers "fscan:trigramsOnFile:#{fid}"
    puts "?Nothing to do on #{filename}" if trigramsToExpurge.length == 0
    puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
    trigramsToExpurge.each do |ts|
      @redis.srem "trigram:#{ts}", fid
      begin
        @redis.srem "trigram:ci:#{ts.downcase}", fid
      rescue ArgumentError
        # Deliberately ignored: downcase can raise
        # "ArgumentError: invalid byte sequence in UTF-8"; carry on with the
        # remaining trigrams.
      end
    end
    @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
    @redis.srem "fscan:processedFiles", filename
  end
  nil
end
- (Object) removeAll
Remove all the keys
321 322 323 324 325 326 327 328 329 330 |
# File 'lib/code_zauker.rb', line 321
#
# Remove every key of the index: all keys matching "fscan:*" (which includes
# "fscan:processedFiles") and all keys matching "trigram*".
#
# Keys are deleted in batches with multi-key DEL calls rather than one call
# per key; nothing is sent to Redis when no key matches.
#
# Returns the sum of the counts reported by the DEL calls (0 when there was
# nothing to delete).
def removeAll
  doomed = @redis.keys("fscan:*") + @redis.keys("trigram*")
  deleted = 0
  # Bounded batches keep each command a reasonable size on large indexes.
  doomed.each_slice(1000) { |batch| deleted += @redis.del(*batch) }
  deleted
end
- (Object) search(term)
search
Find a list of file candidates for a search string. The search string is split into trigrams.
296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
# File 'lib/code_zauker.rb', line 296
#
# Case-sensitive search.
#
# The term is turned into keys under the "trigram" prefix by
# split_in_trigrams and the sets stored at those keys are intersected.
#
# term - the string to look for; must be at least GRAM_SIZE characters long.
#
# Returns the file names mapped from the intersected ids, or [] when
# split_in_trigrams yields no keys.
# Raises RuntimeError when term is shorter than GRAM_SIZE.
def search(term)
  if term.length < GRAM_SIZE
    raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
  end

  and_keys = split_in_trigrams(term, "trigram")
  return [] if and_keys.length == 0

  matching_ids = @redis.sinter(*and_keys)
  map_ids_to_files(matching_ids)
end