lib/ocr-file/document.rb in ocr-file-0.0.4 vs lib/ocr-file/document.rb in ocr-file-0.0.6

- old
+ new

@@ -27,10 +27,11 @@
       automatic_reprocess: true,
       # PDF to Image Processing
       optimise_pdf: true,
       extract_pdf_images: true, # if false will screenshot each PDF page
       temp_filename_prefix: 'image',
+      spelling_correction: true,
       # Console Output
       verbose: true,
       timing: true,
     }
@@ -73,11 +74,11 @@
     end
     # Trigger OCR pipeline
     def to_pdf
       @start_time = Time.now
-      find_best_image_processing if config[:automatic_reprocess] && !text?
+      find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
       if pdf?
         ocr_pdf_to_searchable_pdf
       elsif text?
         text_to_pdf
@@ -115,11 +116,11 @@
       text
     end
     def close
-      # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
+      ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
     end
     private
     def extract_image_paths_from_pdf(file_path)
@@ -169,10 +170,11 @@
       image_paths = extract_image_paths_from_pdf(@original_file_path)
       pdfs_to_merge = []
       image_paths.each do |image_path|
+        puts image_path
         pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
       end
       merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
@@ -180,18 +182,20 @@
         .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
     end
     def text_to_pdf
       text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
+      text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
+      pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
       OcrFile::ImageEngines::PdfEngine
         .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
     end
     def ocr_image_to_pdf
-      find_best_image_processing if config[:automatic_reprocess]
+      find_best_image_processing(save: false) if config[:automatic_reprocess]
       pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
       OcrFile::ImageEngines::PdfEngine
         .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
     end
@@ -201,11 +205,15 @@
       image_paths = extract_image_paths_from_pdf(@original_file_path)
       text = ''
       image_paths.each do |image_path|
-        text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
+        puts image_path
+        text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
+
+        text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
+        text = "#{text}#{PAGE_BREAK}#{text}"
       end
       if save
         ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
       else
@@ -213,44 +221,65 @@
       end
     end
     def ocr_image_to_text(save:)
       create_temp_folder
+      text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
+      text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
       if save
         ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
       else
         text
       end
     end
     def ocr_file_to_text(save:)
-      if pdf? &&
+      if pdf?
         ocr_pdf_to_text(save: save)
       else # is an image
         ocr_image_to_text(save: save)
       end
     end
     def find_best_image_processing(save:)
-      ocr_file_to_text(save: save) if !config[:automatic_reprocess]
+      ocr_file_to_text(save: save) unless config[:automatic_reprocess]
       text = ''
+      best_text_count = 0
+      best_effects = config[:effects]
+      effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
       effects_to_test.each do |effect|
-        config[:effects] = config[:effects] - [effect]
+        text = test_ocr_settings(effect)
+        processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
+        if processed_result.count_of_issues < best_text_count
+          best_text_count = processed_result.count_of_issues
+          best_effects = config[:effects]
+        end
+
+        break if processed_result.valid_words?
+      end
+
+      # Fallback
+      if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
+        config[:effects] = best_effects
         text = ocr_file_to_text(save: false)
-        break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
       end
       # Adds in extra operations which is unfortunately inefficient
       if save
         ocr_file_to_text(save: save)
       else
         text
       end
+    end
+
+    def test_ocr_settings(effect)
+      config[:effects] = config[:effects] - [effect]
+      ocr_file_to_text(save: false)
     end
     def print_time
       puts "Total Time: #{end_time-start_time} secs.\n\n" if config[:timing]
     end