lib/code_zauker.rb in code_zauker-0.0.3 vs lib/code_zauker.rb in code_zauker-0.0.4

- old
+ new

@@ -2,10 +2,11 @@
 require "code_zauker/version"
 require "code_zauker/constants"
 require 'redis/connection/hiredis'
 require 'redis'
 require 'set'
+require 'pdf/reader'
 # This module implements a simple reverse indexer
 # based on Redis
 # The idea is ispired by http://swtch.com/~rsc/regexp/regexp4.html
 module CodeZauker
   GRAM_SIZE=3
@@ -70,10 +71,41 @@
       else
         return untrusted_string
       end
     end
+    def is_pdf?(filename)
+      return filename.downcase().end_with?(".pdf")
+    end
+
+    # Obtain lines from a filename
+    # It works even with pdf files
+    def get_lines(filename)
+      lines=[]
+      if self.is_pdf?(filename)
+        # => enable pdf processing....
+        #puts "PDF..."
+        File.open(filename, "rb") do |io|
+          reader = PDF::Reader.new(io)
+          #puts "PDF Scanning...#{reader.info}"
+          reader.pages.each do |page|
+            linesToTrim=page.text.split("\n")
+            linesToTrim.each do |l|
+              lines.push(l.strip())
+            end
+          end
+          #puts "PDF Lines:#{lines.length}"
+        end
+      else
+        File.open(filename,"r") { |f|
+          lines=f.readlines()
+        }
+      end
+      return lines
+    end
+
+
   end
   # Scan a file and push it inside redis...
   # then it can provide handy method to find file scontaining the trigram...
   class FileScanner
@@ -126,22 +158,26 @@
       puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if case_insensitive_trigram_failed
     end
     def pushTrigramsSetRecoverable(s, fid, filename)
       error=false
-      @redis.multi do
+      # @redis.multi do
+      # From 5.8
+      # to 7.6 Files per sec
+      # changing multi into pipielined
+      @redis.pipelined do
         s.each do | trigram |
           @redis.sadd "trigram:#{trigram}",fid
           @redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
           # Add the case-insensitive-trigram
           begin
             @redis.sadd "trigram:ci:#{trigram.downcase}",fid
           rescue ArgumentError
             error=true
           end
         end
-      end # multi
+      end # multi/pipelined
       return error
     end
     private :pushTrigramsSetRecoverable
@@ -167,34 +203,34 @@
       # The ratio is below 13% of total trigrams are unique for very big files
       # So we avoid a huge roundtrip to redis, and store the trigram on a memory-based set
       # before sending it to redis. This avoid
       # a lot of spourios work
       s=Set.new
-      File.open(filename,"r") { |f|
-        lines=f.readlines()
-        adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
-        util=Util.new()
-        lines.each do |lineNotUTF8|
-          l= util.ensureUTF8(lineNotUTF8)
-          # Split each line into 3-char chunks, and store in a redis set
-          i=0
-          for istart in 0...(l.length-GRAM_SIZE)
-            trigram = l[istart, GRAM_SIZE]
-            # Avoid storing the 3space guy enterely
-            if trigram==SPACE_GUY
-              next
-            end
-            # push the trigram to redis (highly optimized)
-            s.add(trigram)
-            if s.length > adaptiveSize
-              pushTrigramsSet(s,fid,filename)
-              s=Set.new()
-            end
-            trigramScanned += 1
-            #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
+      util=Util.new()
+      lines=util.get_lines(filename)
+      adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
+
+      lines.each do |lineNotUTF8|
+        l= util.ensureUTF8(lineNotUTF8)
+        # Split each line into 3-char chunks, and store in a redis set
+        i=0
+        for istart in 0...(l.length-GRAM_SIZE)
+          trigram = l[istart, GRAM_SIZE]
+          # Avoid storing the 3space guy enterely
+          if trigram==SPACE_GUY
+            next
           end
+          # push the trigram to redis (highly optimized)
+          s.add(trigram)
+          if s.length > adaptiveSize
+            pushTrigramsSet(s,fid,filename)
+            s=Set.new()
+          end
+          trigramScanned += 1
+          #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
         end
-      }
+      end
+
       if s.length > 0
         pushTrigramsSet(s,fid,filename)
         s=nil
         #puts "Final push of #{s.length}"