lib/ocr-file/document.rb in ocr-file-0.0.3 vs lib/ocr-file/document.rb in ocr-file-0.0.4

- old
+ new

@@ -3,10 +3,11 @@ # TODO: Skewness / text orientation detection # TODO: Better handwriting analysis ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp'] PAGE_BREAK = "\n\r\n" # TODO: Make configurable + EFFECTS_TO_REMOVE = ['', 'norm', 'remove_shadow', 'bw'] DEFAULT_CONFIG = { # Images from PDF filetype: 'png', quality: 100, dpi: 300, @@ -21,24 +22,28 @@ type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION, ocr_engine: 'tesseract', # 'cloud-vision' # Image Pre-Processing image_preprocess: true, effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'remove_shadow', 'bw'], + automatic_reprocess: true, # PDF to Image Processing optimise_pdf: true, extract_pdf_images: true, # if false will screenshot each PDF page temp_filename_prefix: 'image', # Console Output verbose: true, + timing: true, } attr_reader :original_file_path, :filename, :save_file_path, :final_save_file, :config, - :ocr_engine + :ocr_engine, + :start_time, + :end_time # save_file_path will also generate a tmp path for tmp files. Expected folder path # TODO: Add in more input validation def initialize(original_file_path:, save_file_path:, config: DEFAULT_CONFIG) @original_file_path = original_file_path @@ -67,80 +72,54 @@ !pdf? && !image? end # Trigger OCR pipeline def to_pdf - if pdf? - create_temp_folder - image_paths = extract_image_paths_from_pdf(@original_file_path) + @start_time = Time.now + find_best_image_processing if config[:automatic_reprocess] && !text? - pdfs_to_merge = [] - - image_paths.each do |image_path| - pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config) - end - - merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge) - - OcrFile::ImageEngines::PdfEngine - .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]) + if pdf? + ocr_pdf_to_searchable_pdf elsif text? - text = ::OcrFile::FileHelpers.open_text_file(@original_file_path) - pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config) - - OcrFile::ImageEngines::PdfEngine - .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]) + text_to_pdf else # is an image ocr_image_to_pdf end close + + @end_time = Time.now + print_time end def to_text - if pdf? - create_temp_folder - image_paths = extract_image_paths_from_pdf(@original_file_path) + @start_time = Time.now + return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text? - image_paths.each do |image_path| - text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) - ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}") - end - elsif text? - ::OcrFile::FileHelpers.open_text_file(@original_file_path) - else # is an image - ocr_image_to_text(save: true) - end - + find_best_image_processing(save: true) close + + @end_time = Time.now + print_time end def to_s - if pdf? - create_temp_folder - image_paths = extract_image_paths_from_pdf(@original_file_path) + @start_time = Time.now + return ::OcrFile::FileHelpers.open_text_file(@original_file_path) if text? - text = '' + text = find_best_image_processing(save: false) - image_paths.each do |image_path| - text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}" - end + close - close - text - elsif text? - ::OcrFile::FileHelpers.open_text_file(@original_file_path) - else # is an image - text = ocr_image_to_text(save: false) + @end_time = Time.now + print_time - close - text - end + text end def close - ::OcrFile::FileHelpers.clear_folder(@temp_folder_path) + # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path) end private def extract_image_paths_from_pdf(file_path) @@ -183,23 +162,99 @@ ) image_processor.convert! end + def ocr_pdf_to_searchable_pdf + create_temp_folder + image_paths = extract_image_paths_from_pdf(@original_file_path) + + pdfs_to_merge = [] + + image_paths.each do |image_path| + pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config) + end + + merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge) + + OcrFile::ImageEngines::PdfEngine + .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]) + end + + def text_to_pdf + text = ::OcrFile::FileHelpers.open_text_file(@original_file_path) + pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config) + + OcrFile::ImageEngines::PdfEngine + .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]) + end + def ocr_image_to_pdf + find_best_image_processing if config[:automatic_reprocess] + pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config) OcrFile::ImageEngines::PdfEngine .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]) end - def ocr_image_to_text(save: true) + def ocr_pdf_to_text(save:) + create_temp_folder + image_paths = extract_image_paths_from_pdf(@original_file_path) + + text = '' + + image_paths.each do |image_path| + text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}" + end + + if save + ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}") + else + text + end + end + + def ocr_image_to_text(save:) + create_temp_folder text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config) if save ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text) else text end + end + + def ocr_file_to_text(save:) + if pdf? && + ocr_pdf_to_text(save: save) + else # is an image + ocr_image_to_text(save: save) + end + end + + def find_best_image_processing(save:) + ocr_file_to_text(save: save) if !config[:automatic_reprocess] + + text = '' + effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects])) + effects_to_test.each do |effect| + config[:effects] = config[:effects] - [effect] + + text = ocr_file_to_text(save: false) + break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words? + end + + # Adds in extra operations which is unfortunately inefficient + if save + ocr_file_to_text(save: save) + else + text + end + end + + def print_time + puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing] end def find_ocr_engine(engine_id) ocr_engine_constants .map { |c| ocr_module(c) }