# -*- mode:ruby ; -*- -* require "code_zauker/version" require "code_zauker/constants" require 'redis/connection/hiredis' require 'redis' require 'set' # This module implements a simple reverse indexer # based on Redis # The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html module CodeZauker GRAM_SIZE=3 SPACE_GUY=" "*GRAM_SIZE # Scan a file and push it inside redis... # then it can provide handy methods to find files containing the trigram... class FileScanner def initialize(redisConnection=nil) if redisConnection==nil @redis=Redis.new else @redis=redisConnection end end def disconnect() @redis.quit end def pushTrigramsSet(s, fid, filename) error=false if s.length > 5000 puts " >Pushing...#{s.length} for id #{fid}=#{filename}" end s.each do | trigram | @redis.sadd "trigram:#{trigram}",fid @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram # Add the case-insensitive-trigram begin @redis.sadd "trigram:ci:#{trigram.downcase}",fid rescue ArgumentError error=true end end if s.length > 5000 puts " adaptiveSize pushTrigramsSet(s,fid,filename) s=Set.new() end trigramScanned += 1 #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}" end end end if s.length > 0 pushTrigramsSet(s,fid,filename) s=nil #puts "Final push of #{s.length}" end trigramsOnFile=@redis.scard "fscan:trigramsOnFile:#{fid}" @redis.sadd "fscan:processedFiles", "#{filename}" trigramRatio=( (trigramsOnFile*1.0) / trigramScanned )* 100.0 if trigramRatio < 10 or trigramRatio >75 puts "#{filename}\n\tRatio:#{trigramRatio.round}% Unique Trigrams:#{trigramsOnFile} Total Scanned: #{trigramScanned} " end return nil end # = search # Find a list of file candidates for a search string # The search string is padded into trigrams def search(term) if term.length < GRAM_SIZE raise "FATAL: #{term} is shorter then the minimum size of #{GRAM_SIZE} character" end #puts " ** Searching: #{term}" # split the term in a padded trigram trigramInAnd=[] # Search=> Sea AND ear AND arc AND rch for j in 0...term.length 
currentTrigram=term[j,GRAM_SIZE] if currentTrigram.length