Class: CodeZauker::FileScanner
- Inherits:
-
Object
- Object
- CodeZauker::FileScanner
- Defined in:
- lib/code_zauker.rb
Overview
Scan a file and push it into Redis; the scanner then provides handy methods to find the files containing a given trigram.
Instance Method Summary (collapse)
- - (Object) disconnect
-
- (FileScanner) initialize(redisConnection = nil)
constructor
A new instance of FileScanner.
-
- (Object) isearch(term)
Do a case-insensitive search
using the special set of trigrams "trigram:ci:*", which are all downcased.
- - (Object) load(filename, noReload = false)
- - (Object) map_ids_to_files(fileIds)
- - (Object) reindex(fileList)
-
- (Object) remove(filePaths = nil)
Remove the files from the index, updating trigrams.
-
- (Object) removeAll
Remove all the keys.
-
- (Object) search(term)
search
Find a list of candidate files for a search string. The search string is split into trigrams.
Constructor Details
- (FileScanner) initialize(redisConnection = nil)
A new instance of FileScanner
80 81 82 83 84 85 86 |
# File 'lib/code_zauker.rb', line 80
#
# Builds a scanner bound to a Redis connection.
#
# @param redisConnection [Object, nil] connection to use; when nil, a new
#   client is created with Redis.new.
def initialize(redisConnection = nil)
  @redis = redisConnection.nil? ? Redis.new : redisConnection
end
Instance Method Details
- (Object) disconnect
89 90 91 |
# File 'lib/code_zauker.rb', line 89
#
# Calls quit on the underlying Redis connection.
#
# @return [Object] whatever @redis.quit returns
def disconnect
  @redis.quit
end
- (Object) isearch(term)
Do a case-insensitive search
using the special set of trigrams "trigram:ci:*", which are all downcased.
246 247 248 249 250 251 252 253 254 |
# File 'lib/code_zauker.rb', line 246
#
# Case-insensitive search: the term is downcased, converted to keys with
# split_in_trigrams under the "trigram:ci" prefix, and the file ids common
# to all of those sets are mapped back to file names.
#
# @param term [String] text to look for
# @return [Array] matching file names; [] when the term yields no trigram key
def isearch(term)
  lowered = term.downcase
  keys = split_in_trigrams(lowered, "trigram:ci")
  return [] if keys.length == 0

  matching_ids = @redis.sinter(*keys)
  map_ids_to_files(matching_ids)
end
- (Object) load(filename, noReload = false)
148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
# File 'lib/code_zauker.rb', line 148
#
# Index one file: give it a redis id (allocating one on first sight), split
# every line into GRAM_SIZE-character chunks and hand them, in batches, to
# pushTrigramsSet.
#
# @param filename [String] path of the file to read
# @param noReload [Boolean] when true and the file already has an id, return
#   immediately without reading the file
# @return [nil]
def load(filename, noReload = false)
  # Look up the redis id of this file; allocate a new one if it is unknown.
  fid = @redis.get "fscan:id:#{filename}"
  if fid.nil?
    @redis.setnx "fscan:nextId", 0
    fid = @redis.incr "fscan:nextId"
    # BUG: Consider storing it at the END of the processing
    @redis.set "fscan:id:#{filename}", fid
    @redis.set "fscan:id2filename:#{fid}", filename
  elsif noReload
    # puts "Already found #{filename} as id:#{fid} and NOT RELOADED"
    return nil
  end

  # fid is the set key!...
  trigramScanned = 0
  # TEST_LICENSE.txt: 3290 Total Scanned: 24628
  # The ratio is below 13% of total trigrams are unique for very big files.
  # So we avoid a huge roundtrip to redis, and store the trigrams in a
  # memory-based set before sending them to redis. This avoids a lot of
  # spurious work.
  s = Set.new
  File.open(filename, "r") do |f|
    adaptiveSize = TRIGRAM_DEFAULT_PUSH_SIZE
    util = Util.new
    f.readlines.each do |lineNotUTF8|
      l = util.ensureUTF8(lineNotUTF8)
      # Index of the last chunk start. The previous exclusive range stopped
      # one short, so a final line without a trailing newline lost its last
      # trigram. For newline-terminated lines the chunk that would end in the
      # terminator is still left out, exactly as before.
      last_start = l.length - GRAM_SIZE
      last_start -= 1 if l.end_with?("\n")
      (0..last_start).each do |istart|
        trigram = l[istart, GRAM_SIZE]
        # Avoid storing the all-spaces chunk entirely
        next if trigram == SPACE_GUY

        s.add(trigram)
        # Flush the in-memory batch once it grows past the push size.
        if s.length > adaptiveSize
          pushTrigramsSet(s, fid, filename)
          s = Set.new
        end
        trigramScanned += 1
      end
    end
  end
  # Final push of whatever is left in the batch.
  pushTrigramsSet(s, fid, filename) if s.length > 0

  # NOTE(review): this set is presumably filled by pushTrigramsSet - confirm.
  trigramsOnFile = @redis.scard "fscan:trigramsOnFile:#{fid}"
  @redis.sadd "fscan:processedFiles", "#{filename}"
  # Percentage of unique trigrams over scanned ones; a very high value on a
  # file with more than 70 unique trigrams is reported as a possible binary.
  trigramRatio = ((trigramsOnFile * 1.0) / trigramScanned) * 100.0
  if trigramRatio < 10 || trigramRatio > 75
    puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} ?Binary" if trigramRatio > 90 && trigramsOnFile > 70
  end
  nil
end
- (Object) map_ids_to_files(fileIds)
228 229 230 231 232 233 234 235 236 237 |
# File 'lib/code_zauker.rb', line 228
#
# Translate file ids into file names using the "fscan:id2filename:<id>" keys.
# Ids without a stored name are dropped from the result.
#
# @param fileIds [Enumerable] ids to resolve
# @return [Array] file names, in the order of the ids that resolved
def map_ids_to_files(fileIds)
  keys = fileIds.map { |id| "fscan:id2filename:#{id}" }
  # MGET needs at least one key, so answer directly for an empty id list.
  return [] if keys.empty?

  # One MGET instead of one GET per id; missing keys come back as nil.
  @redis.mget(*keys).compact
end
- (Object) reindex(fileList)
276 277 278 279 280 281 282 |
# File 'lib/code_zauker.rb', line 276
#
# Re-index every file in the list: each one is first removed from the index
# and then loaded again.
#
# @param fileList [Array] paths to re-index
# @return [Array] the list that was passed in
def reindex(fileList)
  # puts "Reindexing... #{fileList.length} files..."
  fileList.each do |current_file|
    remove([current_file])
    # This is the scanner's own load method; noReload is passed positionally
    # as false so the file is always read again.
    load(current_file, false)
  end
end
- (Object) remove(filePaths = nil)
Remove the files from the index, updating trigrams
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 |
# File 'lib/code_zauker.rb', line 297
#
# Remove files from the index, updating the trigram sets.
#
# @param filePaths [Array, nil] paths to remove; when nil, every file that
#   has a "fscan:id:<path>" key is removed
# @return [nil]
def remove(filePaths = nil)
  id_prefix = "fscan:id:"
  fileList =
    if filePaths.nil?
      # Strip only the leading prefix: splitting on it would truncate a path
      # that happens to contain "fscan:id:" itself.
      @redis.keys("#{id_prefix}*").map { |fileKey| fileKey[id_prefix.length..-1] }
    else
      filePaths
    end
  # puts "Files to remove from index...#{fileList.length}"
  fileList.each do |filename|
    fid = @redis.get "fscan:id:#{filename}"
    trigramsToExpurge = @redis.smembers "fscan:trigramsOnFile:#{fid}"
    puts "?Nothing to do on #{filename}" if trigramsToExpurge.length == 0
    puts "#{filename} id=#{fid} Trigrams: #{trigramsToExpurge.length} Expurging..."
    trigramsToExpurge.each do |ts|
      @redis.srem "trigram:#{ts}", fid
      begin
        @redis.srem "trigram:ci:#{ts.downcase}", fid
      rescue ArgumentError
        # Ignore "ArgumentError: invalid byte sequence in UTF-8"
        # and proceed...
      end
    end
    # Drop the per-file bookkeeping keys and the processed-files entry.
    @redis.del "fscan:id:#{filename}", "fscan:trigramsOnFile:#{fid}", "fscan:id2filename:#{fid}"
    @redis.srem "fscan:processedFiles", filename
  end
  nil
end
- (Object) removeAll
Remove all the keys
285 286 287 288 289 290 291 292 293 294 |
# File 'lib/code_zauker.rb', line 285
#
# Remove all the keys: everything matching "fscan:*" and "trigram*" is
# deleted one key at a time, followed by "fscan:processedFiles".
#
# @return [Object] the result of the final @redis.del call
def removeAll
  doomed = @redis.keys("fscan:*")
  doomed.concat(@redis.keys("trigram*"))
  doomed.each { |key| @redis.del key }
  @redis.del "fscan:processedFiles"
end
- (Object) search(term)
search
Find a list of candidate files for a search string. The search string is split into trigrams
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 |
# File 'lib/code_zauker.rb', line 260
#
# Case-sensitive search: the term is converted to keys with split_in_trigrams
# under the "trigram" prefix, and the file ids common to all of those sets
# are mapped back to file names.
#
# @param term [String] text to look for; must be at least GRAM_SIZE long
# @return [Array] candidate file names; [] when the term yields no trigram key
# @raise [RuntimeError] when the term is shorter than GRAM_SIZE
def search(term)
  if term.length < GRAM_SIZE
    raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character"
  end

  keys = split_in_trigrams(term, "trigram")
  return [] if keys.length == 0

  candidate_ids = @redis.sinter(*keys)
  map_ids_to_files(candidate_ids)
end