lib/code_zauker.rb in code_zauker-0.0.3 vs lib/code_zauker.rb in code_zauker-0.0.4
- old
+ new
@@ -2,10 +2,11 @@
require "code_zauker/version"
require "code_zauker/constants"
require 'redis/connection/hiredis'
require 'redis'
require 'set'
+require 'pdf/reader'
# This module implements a simple reverse indexer
# based on Redis
# The idea is inspired by http://swtch.com/~rsc/regexp/regexp4.html
module CodeZauker
GRAM_SIZE=3
@@ -70,10 +71,41 @@
else
return untrusted_string
end
end
+ # Tells whether +filename+ ends with ".pdf", ignoring letter case.
+ # Only the name is inspected; the file itself is never opened.
+ def is_pdf?(filename)
+   lowered = filename.downcase
+   lowered.end_with?(".pdf")
+ end
+
+ # Obtain lines from a filename
+ # It works even with pdf files
+ def get_lines(filename)
+   # Plain files: hand back the lines exactly as read, line endings included.
+   unless self.is_pdf?(filename)
+     return File.open(filename, "r") { |f| f.readlines }
+   end
+
+   # PDF files: walk every page, split its extracted text on newlines and
+   # keep each piece with surrounding whitespace stripped.
+   collected = []
+   File.open(filename, "rb") do |io|
+     PDF::Reader.new(io).pages.each do |page|
+       page.text.split("\n").each do |raw|
+         collected.push(raw.strip)
+       end
+     end
+   end
+   return collected
+ end
+
+
end
# Scan a file and push it inside redis...
# then it can provide handy methods to find files containing the trigram...
class FileScanner
@@ -126,22 +158,26 @@
puts "WARN: Some invalid UTF-8 char on #{filename} Case insensitive search will be compromised" if case_insensitive_trigram_failed
end
# Sends every trigram of the set +s+ to redis for the file id +fid+,
# issuing all the commands inside a single @redis.pipelined block.
# Returns true when an ArgumentError was rescued while adding a
# case-insensitive entry (that entry is then skipped), false otherwise.
# NOTE(review): the ArgumentError presumably comes from downcase on
# invalid UTF-8 -- confirm against the caller's warning message.
# +filename+ is accepted but not used in this method.
def pushTrigramsSetRecoverable(s, fid, filename)
error=false
- @redis.multi do
+ # @redis.multi do
+ # Throughput went from 5.8
+ # to 7.6 files per sec
+ # after changing multi into pipelined
+ @redis.pipelined do
s.each do | trigram |
# Record this file id in the set kept for the trigram
@redis.sadd "trigram:#{trigram}",fid
# Record the trigram in the set kept for this file id
@redis.sadd "fscan:trigramsOnFile:#{fid}", trigram
# Add the case-insensitive-trigram
begin
@redis.sadd "trigram:ci:#{trigram.downcase}",fid
rescue ArgumentError
# Remember the failure and keep going with the remaining trigrams
error=true
end
end
- end # multi
+ end # multi/pipelined
return error
end
private :pushTrigramsSetRecoverable
@@ -167,34 +203,34 @@
# For very big files, fewer than 13% of the total trigrams are unique.
# So we avoid a huge number of round trips to redis by storing the trigrams
# in an in-memory set before sending them to redis. This avoids
# a lot of spurious work
s=Set.new
- File.open(filename,"r") { |f|
- lines=f.readlines()
- adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
- util=Util.new()
- lines.each do |lineNotUTF8|
- l= util.ensureUTF8(lineNotUTF8)
- # Split each line into 3-char chunks, and store in a redis set
- i=0
- for istart in 0...(l.length-GRAM_SIZE)
- trigram = l[istart, GRAM_SIZE]
- # Avoid storing the 3space guy enterely
- if trigram==SPACE_GUY
- next
- end
- # push the trigram to redis (highly optimized)
- s.add(trigram)
- if s.length > adaptiveSize
- pushTrigramsSet(s,fid,filename)
- s=Set.new()
- end
- trigramScanned += 1
- #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
+ util=Util.new()
+ lines=util.get_lines(filename)
+ adaptiveSize= TRIGRAM_DEFAULT_PUSH_SIZE
+
+ lines.each do |lineNotUTF8|
+ l= util.ensureUTF8(lineNotUTF8)
+ # Split each line into 3-char chunks, and store in a redis set
+ i=0
+ for istart in 0...(l.length-GRAM_SIZE)
+ trigram = l[istart, GRAM_SIZE]
+ # Avoid storing the 3space guy enterely
+ if trigram==SPACE_GUY
+ next
end
+ # push the trigram to redis (highly optimized)
+ s.add(trigram)
+ if s.length > adaptiveSize
+ pushTrigramsSet(s,fid,filename)
+ s=Set.new()
+ end
+ trigramScanned += 1
+ #puts "#{istart} Trigram fscan:#{trigram}/ FileId: #{fid}"
end
- }
+ end
+
if s.length > 0
pushTrigramsSet(s,fid,filename)
s=nil
#puts "Final push of #{s.length}"