lib/ocrsdk/pdf.rb

File	Lines	Lines Of Code	Untested Lines of Code	Tested %
lib/ocrsdk/pdf.rb	27	12	0	100.0%

Legend
This line was executed.
This line was not executed!
This line doesn't matter.

class OCRSDK::PDF < OCRSDK::Image
  # Each embedded image is allowed this many characters of accompanying
  # text (a title, page numbers, credits) before the document is
  # considered to be mostly text.
  TEXT_CHARS_PER_IMAGE = 20

  # A document whose text uses fewer distinct characters than this is
  # treated as "searchable" but incorrectly recognized.
  MIN_DISTINCT_CHARS = 10

  # Heuristically decides whether this PDF might need recognition.
  #
  # We are on shaky ground regarding which kinds of PDFs should be
  # recognized and which should not. Currently, if
  #   images * TEXT_CHARS_PER_IMAGE > length of text
  # the document might need recognition; the assumption is that there
  # might be a title, page numbers or credits along with the images.
  # The number of distinct characters is checked as well, to catch a
  # "searchable" but incorrectly recognized document.
  #
  # @return [Boolean] true when either heuristic above matches; false
  #   otherwise, and also false when PDF::Reader raises
  #   MalformedPDFError or UnsupportedFeatureError for the file.
  def recognizeable?
    # Anchor the lookup at the top level: this class is itself named PDF,
    # so a bare `PDF::Reader` only reaches the reader library through
    # ancestor constant lookup.
    reader = ::PDF::Reader.new(@image_path)

    image_count    = 0
    text_length    = 0
    distinct_chars = Set.new

    reader.pages.each do |page|
      # Fetch the page text once and reuse it for both measurements.
      page_text = page.text

      text_length += page_text.length
      # Merge in place instead of building a new Set for every page.
      distinct_chars.merge(page_text.codepoints)
      image_count += page.xobjects.count { |_name, stream| stream.hash[:Subtype] == :Image }
    end

    image_count * TEXT_CHARS_PER_IMAGE > text_length ||
      distinct_chars.size < MIN_DISTINCT_CHARS
  rescue ::PDF::Reader::MalformedPDFError, ::PDF::Reader::UnsupportedFeatureError
    false
  end
end

1	class OCRSDK::PDF < OCRSDK::Image	23
2	# We're on a shaky ground regarding what kind of pdfs
3	# should be recognized and what shouldn't.
4	# Currently we count that if there are
5	# images * 20 > length of text
6	# then this document might need recognition.
7	# Assumption is that there might be a title,
8	# page numbers or credits along with images.
9	def recognizeable?	23
10	reader = PDF::Reader.new @image_path	20
11
12	images = 0	12
13	text = 0	12
14	chars = Set.new	12
15	reader.pages.each do \|page\|	12
16	text += page.text.length	51
17	chars += page.text.split('').map(&:ord).uniq	51
18	images += page.xobjects.map {\|k, v\| v.hash[:Subtype]}.count(:Image)	99
19	end
20
21	# count number of distinct characters
22	# in case of "searchable", but incorrectly recognized document
23	images * 20 > text \|\| chars.length < 10	12
24	rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError
25	false	3
26	end
27	end

Coverage Report