1 |
# PDF flavour of OCRSDK::Image that can estimate, from the document's own
# content, whether it still needs to be sent through recognition.
class OCRSDK::PDF < OCRSDK::Image
  # Each embedded image is weighed against this many characters of text.
  IMAGE_TEXT_WEIGHT = 20

  # A document with fewer distinct characters than this is treated as
  # needing recognition regardless of how much text it contains.
  MIN_DISTINCT_CHARS = 10

  # Heuristically decides whether this PDF should be recognized.
  #
  # There is no firm rule for which PDFs need recognition, so two signals
  # are combined:
  #
  # * images * 20 > length of text -- the document is dominated by images.
  #   The factor leaves room for a title, page numbers or credits that may
  #   accompany the images.
  # * fewer than 10 distinct characters across all pages -- covers a
  #   "searchable" document whose text layer was recognized incorrectly.
  #   Note that a document with no text at all also satisfies this test.
  #
  # Reads the file at @image_path (NOTE(review): presumably assigned by
  # OCRSDK::Image -- not visible here, confirm).
  #
  # @return [Boolean] true when the document might need recognition; false
  #   when it does not, or when PDF::Reader raises MalformedPDFError or
  #   UnsupportedFeatureError while reading it.
  def recognizeable?
    # Leading :: makes it explicit that this is the top-level PDF namespace,
    # not the enclosing OCRSDK::PDF class.
    reader = ::PDF::Reader.new(@image_path)

    image_count = 0
    text_length = 0
    distinct_chars = Set.new

    reader.pages.each do |page|
      # Fetch the page text once and reuse it for both measurements.
      page_text = page.text

      text_length += page_text.length
      distinct_chars.merge(page_text.codepoints)
      image_count += page.xobjects.count { |_name, xobject| xobject.hash[:Subtype] == :Image }
    end

    image_count * IMAGE_TEXT_WEIGHT > text_length || distinct_chars.length < MIN_DISTINCT_CHARS
  rescue ::PDF::Reader::MalformedPDFError, ::PDF::Reader::UnsupportedFeatureError
    false
  end
end
|
|