lib/ocr-file/document.rb in ocr-file-0.0.3 vs lib/ocr-file/document.rb in ocr-file-0.0.4
- old
+ new
@@ -3,10 +3,11 @@
# TODO: Skewness / text orientation detection
# TODO: Better handwriting analysis
ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
PAGE_BREAK = "\n\r\n" # TODO: Make configurable
+ EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw']
DEFAULT_CONFIG = {
# Images from PDF
filetype: 'png',
quality: 100,
dpi: 300,
@@ -21,24 +22,28 @@
type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
ocr_engine: 'tesseract', # 'cloud-vision'
# Image Pre-Processing
image_preprocess: true,
effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'],
+ automatic_reprocess: true,
# PDF to Image Processing
optimise_pdf: true,
extract_pdf_images: true, # if false will screenshot each PDF page
temp_filename_prefix: 'image',
# Console Output
verbose: true,
+ timing: true,
}
attr_reader :original_file_path,
:filename,
:save_file_path,
:final_save_file,
:config,
- :ocr_engine
+ :ocr_engine,
+ :start_time,
+ :end_time
# save_file_path will also generate a tmp path for tmp files. Expected folder path
# TODO: Add in more input validation
def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG)
@original_file_path = original_file_path
@@ -67,80 +72,54 @@
!pdf? && !image?
end
# Trigger OCR pipeline
def to_pdf
- if pdf?
- create_temp_folder
- image_paths = extract_image_paths_from_pdf(@original_file_path)
+ # Converts the source file into "#{@final_save_file}.pdf".
+ # PDF and image inputs are OCR'd; text inputs are rendered straight to PDF.
+ # @start_time / @end_time bracket the work and print_time reports it last.
+ @start_time = Time.now
+ # Pass save: false explicitly: this pass only searches for a usable set of
+ # effects, the PDF itself is produced by the branch below.
+ find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
- pdfs_to_merge = []
-
- image_paths.each do |image_path|
- pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
- end
-
- merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
-
- OcrFile::ImageEngines::PdfEngine
- .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+ if pdf?
+ ocr_pdf_to_searchable_pdf
elsif text?
- text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
- pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
-
- OcrFile::ImageEngines::PdfEngine
- .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+ text_to_pdf
else # is an image
ocr_image_to_pdf
end
close
+
+ @end_time = Time.now
+ print_time
end
def to_text
- if pdf?
- create_temp_folder
- image_paths = extract_image_paths_from_pdf(@original_file_path)
+ # Text inputs are read with FileHelpers.open_text_file and returned at once;
+ # that early return happens before close, @end_time and print_time.
+ # Any other input is handed to find_best_image_processing(save: true), after
+ # which close runs, @end_time is recorded and print_time is the last call.
+ @start_time = Time.now
+ return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
- image_paths.each do |image_path|
- text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
- ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
- end
- elsif text?
- ::OcrFile::FileHelpers.open_text_file(@original_file_path)
- else # is an image
- ocr_image_to_text(save: true)
- end
-
+ find_best_image_processing(save: true)
close
+
+ @end_time = Time.now
+ print_time
end
def to_s
- if pdf?
- create_temp_folder
- image_paths = extract_image_paths_from_pdf(@original_file_path)
+ # Returns the document's text without asking for it to be saved.
+ # Text inputs are read with FileHelpers.open_text_file and returned at once,
+ # before close, @end_time and print_time are reached.
+ # Any other input goes through find_best_image_processing(save: false); its
+ # result is returned after close, the @end_time stamp and print_time.
+ @start_time = Time.now
+ return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text?
- text = ''
+ text = find_best_image_processing(save: false)
- image_paths.each do |image_path|
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
- end
+ close
- close
- text
- elsif text?
- ::OcrFile::FileHelpers.open_text_file(@original_file_path)
- else # is an image
- text = ocr_image_to_text(save: false)
+ @end_time = Time.now
+ print_time
- close
- text
- end
+ text
end
def close
- ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
+ # NOTE(review): the clear_folder call is commented out, so this method currently
+ # does nothing and no code here empties @temp_folder_path - confirm this is
+ # intentional and not a debugging leftover.
+ # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
end
private
def extract_image_paths_from_pdf(file_path)
@@ -183,23 +162,99 @@
)
image_processor.convert!
end
+ # OCRs every page image extracted from the source PDF, merges the per-page
+ # results and saves them as "#{@final_save_file}.pdf".
+ def ocr_pdf_to_searchable_pdf
+ create_temp_folder
+
+ page_pdfs = extract_image_paths_from_pdf(@original_file_path).map do |page_image|
+ @ocr_engine.ocr_to_pdf(process_image(page_image), options: @config)
+ end
+
+ pdf_engine = OcrFile::ImageEngines::PdfEngine
+ pdf_engine.save_pdf(
+ pdf_engine.merge(page_pdfs),
+ "#{@final_save_file}.pdf",
+ optimise: @config[:optimise_pdf]
+ )
+ end
+
+ # Reads the source text file, renders it with PdfEngine.pdf_from_text and
+ # saves the result as "#{@final_save_file}.pdf".
+ def text_to_pdf
+ pdf_engine = OcrFile::ImageEngines::PdfEngine
+ contents = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
+ rendered = pdf_engine.pdf_from_text(contents, @config)
+
+ pdf_engine.save_pdf(rendered, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
+ end
+
def ocr_image_to_pdf
+ # OCRs the (pre-processed) source image and saves it as "#{@final_save_file}.pdf".
+ # The effect search is run with save: false because only its choice of effects
+ # matters here; the PDF is written by the save_pdf call below.
+ find_best_image_processing(save: false) if config[:automatic_reprocess]
+
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
OcrFile::ImageEngines::PdfEngine
.save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
end
- def ocr_image_to_text(save: true)
+ # OCRs every page image extracted from the source PDF and concatenates the
+ # results, each page preceded by PAGE_BREAK.
+ # save: true  -> appends the text plus a trailing PAGE_BREAK to
+ #               "#{@final_save_file}.txt" and returns append_file's result.
+ # save: false -> returns the concatenated text.
+ def ocr_pdf_to_text(save:)
+ create_temp_folder
+
+ combined = extract_image_paths_from_pdf(@original_file_path).inject('') do |so_far, page_image|
+ "#{so_far}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(page_image), options: @config)}"
+ end
+
+ return combined unless save
+
+ ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{combined}#{PAGE_BREAK}")
+ end
+
+ # OCRs the (pre-processed) source image to text.
+ # save: true  -> appends the text to "#{@final_save_file}.txt" and returns
+ #               append_file's result.
+ # save: false -> returns the recognised text.
+ def ocr_image_to_text(save:)
+ create_temp_folder
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
if save
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
else
text
end
+ end
+
+ # OCRs the source file to text, dispatching on input type:
+ # PDFs go to ocr_pdf_to_text, everything else is treated as an image.
+ # Returns whatever the chosen helper returns for the given save: flag.
+ def ocr_file_to_text(save:)
+ # The condition is pdf? alone so that the helper call is the branch body
+ # and its value becomes this method's return value.
+ if pdf?
+ ocr_pdf_to_text(save: save)
+ else # is an image
+ ocr_image_to_text(save: save)
+ end
+ end
+
+ # OCRs the source file, retrying with progressively fewer pre-processing
+ # effects until OcrFile::TextEngines::ResultProcessor#valid_words? accepts
+ # the recognised text or the removable effects are exhausted.
+ # The effects left in config[:effects] afterwards are the ones last tried.
+ #
+ # save: true  -> the final result is written via ocr_file_to_text(save: true).
+ # save: false -> (default) the recognised text is returned.
+ def find_best_image_processing(save: false)
+ # Nothing to search when reprocessing is off: OCR once and stop here.
+ return ocr_file_to_text(save: save) unless config[:automatic_reprocess]
+
+ text = ''
+ # '' removes nothing, so the first pass uses the configured effects as-is.
+ # After that, only effects that are both removable and configured are
+ # dropped, one more on each pass.
+ effects_to_test = [''] + (EFFECTS_TO_REMOVE & Array(config[:effects]))
+ effects_to_test.each do |effect|
+ # Replace @config with an updated copy rather than writing into the hash
+ # that was passed in, which may be the shared DEFAULT_CONFIG constant.
+ @config = config.merge(effects: Array(config[:effects]) - [effect])
+
+ text = ocr_file_to_text(save: false)
+ break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
+ end
+
+ # Saving re-runs OCR with the chosen effects, which costs one extra pass.
+ save ? ocr_file_to_text(save: true) : text
+ end
+
+ # Prints the difference between end_time and start_time when config[:timing]
+ # is truthy; does nothing otherwise.
+ def print_time
+ return unless config[:timing]
+
+ elapsed = end_time - start_time
+ puts "Total Time: #{elapsed} secs.\n\n"
end
def find_ocr_engine(engine_id)
ocr_engine_constants
.map { |c| ocr_module(c) }