Class: CodeZauker::FileScanner
- Inherits:
-
Object
- Object
- CodeZauker::FileScanner
- Defined in:
- lib/code_zauker.rb
Overview
Scan a file and push it inside Redis... then it can provide handy methods to find the files containing the trigram...
Instance Method Summary (collapse)
- - (Object) disconnect
-
- (FileScanner) initialize(redisConnection = nil)
constructor
A new instance of FileScanner.
-
- (Object) isearch(term)
Do a case-insensitive search
using the special set of trigrams "trigram:ci:*" all downcase.
- - (Object) load(filename, noReload = false)
- - (Object) map_ids_to_files(fileIds)
- - (Object) reindex(fileList)
-
- (Object) remove(filePaths = nil)
Remove the files from the index, updating trigrams.
-
- (Object) removeAll
Remove all the keys.
-
- (Object) search(term)
search
Find a list of file candidates for a search string. The search string is padded into trigrams.
-
- (Object) wsearch(term)
wild cards search
You can search trigrams in the form public*class*Apple, which will match a Java declaration of MyApple but not YourAppManager.
Constructor Details
- (FileScanner) initialize(redisConnection = nil)
A new instance of FileScanner
154 155 156 157 158 159 160 |
# File 'lib/code_zauker.rb', line 154
# Binds the scanner to a Redis connection.
#
# redisConnection - connection object to reuse; when it is nil a new
#                   connection is created with Redis.new.
def initialize(redisConnection=nil)
  @redis = redisConnection == nil ? Redis.new : redisConnection
end
Instance Method Details
- (Object) disconnect
163 164 165 166 167 168 169 170 |
# File 'lib/code_zauker.rb', line 163
# Asks the Redis connection to quit.
#
# An Errno::EAGAIN raised by the quit call is reported on stdout and
# otherwise ignored; any other exception propagates to the caller.
def disconnect()
  @redis.quit
rescue Errno::EAGAIN
  # Nothing to do...
  puts "Ignored EAGAIN ERROR during disconnect..."
end
- (Object) isearch(term)
Do a case-insensitive search
using the special set of trigrams "trigram:ci:*" all downcase
329 330 331 332 333 334 335 336 337 |
# File 'lib/code_zauker.rb', line 329
# Case-insensitive search.
#
# The term is downcased, turned into "trigram:ci" keys by
# split_in_trigrams, and the Redis sets under those keys are intersected.
#
# term - the string to look for.
#
# Returns the file names mapped from the intersected ids, or [] when the
# term produces no trigram keys.
def isearch(term)
  ciKeys = split_in_trigrams(term.downcase(), "trigram:ci")
  return [] if ciKeys.length == 0

  map_ids_to_files(@redis.sinter(*ciKeys))
end
- (Object) load(filename, noReload = false)
231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 |
# File 'lib/code_zauker.rb', line 231 def load(filename, noReload=false) # Define my redis id... # Already exists?... fid=@redis.get "fscan:id:#{filename}" if fid==nil @redis.setnx "fscan:nextId",0 fid=@redis.incr "fscan:nextId" # BUG: Consider storing it at the END of the processing @redis.set "fscan:id:#{filename}", fid @redis.set "fscan:id2filename:#{fid}",filename else if noReload #puts "Already found #{filename} as id:#{fid} and NOT RELOADED" return nil end end # fid is the set key!... trigramScanned=0 # TEST_LICENSE.txt: 3290 Total Scanned: 24628 # The ratio is below 13% of total trigrams are unique for very big files # So we avoid a huge roundtrip to redis, and store the trigram on a memory-based set # before sending it to redis. This avoid # a lot of spourios work s=Set.new util=Util.new() lines=util.get_lines(filename) adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE lines.each do |lineNotUTF8| l= util.ensureUTF8(lineNotUTF8) # Split each line into 3-char chunks, and store in a redis set i=0 for istart in 0...(l.length-GRAM_SIZE) trigram = l[istart, GRAM_SIZE] # Avoid storing the 3space guy enterely if trigram==SPACE_GUY next end # push the trigram to redis (highly optimized) s.add(trigram) if s.length > adaptiveSize pushTrigramsSet(s,fid,filename) s=Set.new() end trigramScanned += 1 #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}" end end if s.length > 0 pushTrigramsSet(s,fid,filename) s=nil #puts "Final push of #{s.length}" end trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}" @redis.sadd "fscan:processedFiles", "#{filename}" trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0 if trigramRatio < 10 or trigramRatio >75 puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio >90 and trigramsOnFile>70 end return nil end |
- (Object) map_ids_to_files(fileIds)
311 312 313 314 315 316 317 318 319 320 |
# File 'lib/code_zauker.rb', line 311
# Translates file ids into file names.
#
# Each id is looked up under the key "fscan:id2filename:<id>"; ids for which
# Redis returns nil are left out of the result.
#
# fileIds - collection of ids to resolve.
#
# Returns an Array with the names found, in the order of the ids.
def map_ids_to_files(fileIds)
  # fscan:id2filename:#{fid}....
  resolved = fileIds.map { |fid| @redis.get("fscan:id2filename:#{fid}") }
  #puts " ** Files found:#{resolved.compact} from ids #{fileIds}"
  return resolved.compact
end
- (Object) reindex(fileList)
394 395 396 397 398 399 400 |
# File 'lib/code_zauker.rb', line 394
# Re-indexes every file of the list: each one is first removed from the
# index and then loaded again with reloading enabled (noReload = false).
#
# fileList - collection of file paths.
#
# Returns whatever fileList.each returns.
def reindex(fileList)
  #puts "Reindexing... #{fileList.length} files..."
  fileList.each do |path|
    remove([path])
    load(path, false)
  end
end
- (Object) remove(filePaths = nil)
Remove the files from the index, updating trigrams
415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 |
# File 'lib/code_zauker.rb', line 415
# Removes files from the index, updating the trigram sets.
#
# For every file, its id is removed from the "trigram:<t>" and
# "trigram:ci:<t downcased>" sets of each trigram listed in
# "fscan:trigramsOnFile:<id>". The file's bookkeeping keys are then deleted
# and the file is dropped from "fscan:processedFiles".
#
# filePaths - list of file paths to remove; when nil, every file that has a
#             "fscan:id:<path>" key is removed.
#
# Returns nil.
def remove(filePaths=nil)
  if filePaths==nil
    idPrefix="fscan:id:"
    # Strip only the leading prefix. Splitting the key on the prefix would
    # truncate any path that itself contains "fscan:id:".
    fileList=@redis.keys("#{idPrefix}*").map { |fileKey| fileKey[idPrefix.length..-1] }
  else
    fileList=filePaths
  end
  # puts "Files to remove from index...#{fileList.length}"
  fileList.each do |filename|
    fid=@redis.get "fscan:id:#{filename}"
    trigramsToExpurge=@redis.smembers "fscan:trigramsOnFile:#{fid}"
    if trigramsToExpurge.length==0
      puts "?Nothing to do on #{filename}"
    end
    puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
    trigramsToExpurge.each do | ts |
      @redis.srem "trigram:#{ts}", fid
      begin
        @redis.srem "trigram:ci:#{ts.downcase}",fid
        #putc "."
      rescue ArgumentError
        # Ignore "ArgumentError: invalid byte sequence in UTF-8"
        # and proceed...
      end
    end
    #putc "\n"
    @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
    @redis.srem "fscan:processedFiles", filename
  end
  return nil
end
- (Object) removeAll
Remove all the keys
403 404 405 406 407 408 409 410 411 412 |
# File 'lib/code_zauker.rb', line 403
# Removes every key of the index.
#
# All keys matching "fscan:*" and "trigram*" are deleted one by one, then
# "fscan:processedFiles" is deleted explicitly.
#
# Returns the result of that last delete call.
def removeAll()
  doomed = @redis.keys("fscan:*") + @redis.keys("trigram*")
  doomed.each do |key|
    @redis.del key
    #puts "Deleted #x"
  end
  @redis.del "fscan:processedFiles"
end
- (Object) search(term)
search
Find a list of file candidates for a search string. The search string is padded into trigrams.
378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 |
# File 'lib/code_zauker.rb', line 378
# Case-sensitive search for file candidates.
#
# The term is turned into "trigram" keys by split_in_trigrams and the Redis
# sets under those keys are intersected.
#
# term - the string to look for; must be at least GRAM_SIZE characters long.
#
# Returns the file names mapped from the intersected ids, or [] when the
# term produces no trigram keys.
# Raises RuntimeError when the term is shorter than GRAM_SIZE.
def search(term)
  raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character" if term.length < GRAM_SIZE

  #puts " ** Searching: #{term}"
  keys = split_in_trigrams(term, "trigram")
  #puts "Trigam conversion /#{term}/ into #{keys}"
  return [] if keys.length == 0

  map_ids_to_files(@redis.sinter(*keys))
end
- (Object) wsearch(term)
wild cards search
You can search trigrams in the form public*class*Apple, which will match a Java declaration of MyApple but not YourAppManager
344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 |
# File 'lib/code_zauker.rb', line 344
# Wildcard search.
#
# The term is split on "*". Every piece is downcased and turned into
# "trigram:ci" keys by split_in_trigrams; the union of those keys is then
# intersected in Redis. Progress information is printed on stdout.
#
# term - the search expression, e.g. "public*class*Apple".
#
# Returns the file names mapped from the intersected ids, or [] when no
# trigram key was produced. When splitting yields no piece at all, the call
# is delegated to search(term).
def wsearch(term)
  puts "Wild Search request:#{term}"
  pieces = term.split("*")
  if pieces.empty?
    puts "Warn no Wild!"
    return search(term)
  end

  puts "*= Found:#{pieces.length}"
  ciKeys = pieces.each_with_object(Set.new()) do |piece, collected|
    #puts "Splitting #{piece.downcase}"
    collected.merge(split_in_trigrams(piece.downcase(), "trigram:ci"))
  end
  puts "Trigrams: #{ciKeys.length}"
  ciKeys.each { |key| puts "#{key}" }
  return [] if ciKeys.empty?

  #puts "DEBUG ids and names"
  map_ids_to_files(@redis.sinter(*ciKeys))
end