Coverage Report

File | Lines | Lines Of Code | Untested Lines of Code | Tested %
lib/ocrsdk/pdf.rb | 27 | 12 | 0 | 100.0%
 
Legend
This line was executed.
This line was not executed!
This line doesn't matter.
 
1
class OCRSDK::PDF < OCRSDK::Image
23
2
  # Heuristic check for whether this PDF still needs OCR. The rule is
3
  # admittedly rough: the document is flagged when its embedded images
4
  # outweigh the text that can be extracted from it, i.e. when
5
  #   images * 20 > length of text
6
  # The factor of 20 tolerates a small amount of real text per image:
7
  # the assumption is that there might be a title, page numbers or
8
  # credits along with images. Malformed/unsupported PDFs yield false.
9
  def recognizeable?
23
10
    reader = PDF::Reader.new @image_path
20
11

              
12
    images = 0
12
13
    text   = 0
12
14
    chars  = Set.new
12
15
    reader.pages.each do |page|
12
16
      text   += page.text.length
51
17
      chars  += page.text.split('').map(&:ord).uniq
51
18
      images += page.xobjects.map {|k, v| v.hash[:Subtype]}.count(:Image)
99
19
    end
20

              
21
    # Also flag documents with fewer than 10 distinct character codes:
22
    # this catches a "searchable" but incorrectly recognized document.
23
    images * 20 > text || chars.length < 10
12
24
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError
25
    false
3
26
  end
27
end

Generated on: 2012-12-01 01:40:30 +0400