lib/ocr-file/document.rb in ocr-file-0.0.4 vs lib/ocr-file/document.rb in ocr-file-0.0.6
- old
+ new
@@ -27,10 +27,11 @@
automatic_reprocess: true,
# PDF to Image Processing
optimise_pdf: true,
extract_pdf_images: true, # if false will screenshot each PDF page
temp_filename_prefix: 'image',
+ spelling_correction: true,
# Console Output
verbose: true,
timing: true,
}
@@ -73,11 +74,11 @@
end
# Trigger OCR pipeline
def to_pdf
@start_time = Time.now
- find_best_image_processing if config[:automatic_reprocess] && !text?
+ find_best_image_processing(save: false) if config[:automatic_reprocess] && !text?
if pdf?
ocr_pdf_to_searchable_pdf
elsif text?
text_to_pdf
@@ -115,11 +116,11 @@
text
end
# Hands @temp_folder_path to FileHelpers.clear_folder.
# In this diff the old (-) side had the call commented out, so `close` was a
# no-op there; the new (+) side makes the call live again.
def close
- # ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
+ ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
end
private
def extract_image_paths_from_pdf(file_path)
@@ -169,10 +170,11 @@
image_paths = extract_image_paths_from_pdf(@original_file_path)
pdfs_to_merge = []
image_paths.each do |image_path|
+ puts image_path
pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
end
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
@@ -180,18 +182,20 @@
.save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
end
# Reads the text file at @original_file_path, builds a PDF from it with
# PdfEngine.pdf_from_text and saves it as "<@final_save_file>.pdf".
# The added (+) lines pass the text through ResultProcessor#correct first,
# but only when config[:spelling_correction] is truthy.
def text_to_pdf
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
+
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
# optimise_pdf from the config decides whether save_pdf is asked to optimise.
OcrFile::ImageEngines::PdfEngine
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
end
# OCRs the image at @original_file_path into a PDF and saves it as
# "<@final_save_file>.pdf".
# When config[:automatic_reprocess] is truthy, find_best_image_processing runs
# first; its return value is discarded here, so it is called only for whatever
# it leaves in config[:effects].
# find_best_image_processing declares `save:` as a required keyword, so the
# old (-) call without it would raise ArgumentError; the new (+) call passes
# save: false.
def ocr_image_to_pdf
- find_best_image_processing if config[:automatic_reprocess]
+ find_best_image_processing(save: false) if config[:automatic_reprocess]
pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
OcrFile::ImageEngines::PdfEngine
.save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
end
@@ -201,11 +205,15 @@
image_paths = extract_image_paths_from_pdf(@original_file_path)
text = ''
image_paths.each do |image_path|
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
+ puts image_path
+ text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) || ''
+
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
+ text = "#{text}#{PAGE_BREAK}#{text}"
end
if save
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
else
@@ -213,44 +221,65 @@
end
end
# OCRs the single image at @original_file_path to text.
#
# save: when truthy, the text is appended to "<@final_save_file>.txt" and the
#       method returns whatever FileHelpers.append_file returns; otherwise the
#       text itself is returned.
def ocr_image_to_text(save:)
create_temp_folder
+
text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
# NOTE(review): the PDF text path guards the OCR result with `|| ''` before
# handing it to ResultProcessor; this path does not. If ocr_to_text can return
# nil, confirm ResultProcessor.new(nil) is safe.
+ text = OcrFile::TextEngines::ResultProcessor.new(text).correct if config[:spelling_correction]
if save
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
else
text
end
end
# Dispatches text extraction by input type: ocr_pdf_to_text when pdf? is
# truthy, ocr_image_to_text otherwise. `save:` is forwarded unchanged.
#
# The old (-) line ended in a dangling `&&`, which made the following line part
# of the condition and left the PDF branch with an empty body; the new (+)
# line restores a plain `if pdf?`.
def ocr_file_to_text(save:)
- if pdf? &&
+ if pdf?
ocr_pdf_to_text(save: save)
else # is an image
ocr_image_to_text(save: save)
end
end
# Tries OCR with progressively fewer image effects and keeps the text from the
# last attempt (or from a fallback run), judged by ResultProcessor.
#
# save: when truthy, a final ocr_file_to_text(save: save) run produces the
#       return value; otherwise the text gathered during the search is returned.
#
# Side effect: config[:effects] is reassigned while searching (on the new side
# via test_ocr_settings, on the old side inline) and is not restored unless the
# fallback branch runs.
def find_best_image_processing(save:)
# NOTE(review): there is no `return` here, so when automatic_reprocess is off
# the OCR runs once and execution still continues into the search below.
# The callers visible in this diff only call this method when the flag is on.
- ocr_file_to_text(save: save) if !config[:automatic_reprocess]
+ ocr_file_to_text(save: save) unless config[:automatic_reprocess]
text = ''
# NOTE(review): best_text_count starts at 0 and is only replaced when
# count_of_issues is strictly smaller. Unless count_of_issues can be negative,
# that branch never runs and best_effects stays at its initial value - confirm
# whether a large initial value was intended.
+ best_text_count = 0
+ best_effects = config[:effects]
+
# A - (A - B) keeps the entries of EFFECTS_TO_REMOVE that also appear in
# config[:effects]. The leading '' is presumably a "remove nothing" first pass.
effects_to_test = [''] + (EFFECTS_TO_REMOVE - (EFFECTS_TO_REMOVE - config[:effects]))
effects_to_test.each do |effect|
- config[:effects] = config[:effects] - [effect]
+ text = test_ocr_settings(effect)
+ processed_result = OcrFile::TextEngines::ResultProcessor.new(text)
+ if processed_result.count_of_issues < best_text_count
+ best_text_count = processed_result.count_of_issues
+ best_effects = config[:effects]
+ end
+
# Stop at the first effect set whose text ResultProcessor reports as valid.
+ break if processed_result.valid_words?
+ end
+
+ # Fallback
# `text` is from the last attempt; if it is still invalid, restore the
# recorded best effects and OCR once more without saving.
+ if OcrFile::TextEngines::ResultProcessor.new(text).invalid_words?
+ config[:effects] = best_effects
text = ocr_file_to_text(save: false)
- break if OcrFile::TextEngines::ResultProcessor.new(text).valid_words?
end
# Adds in extra operations which is unfortunately inefficient
if save
ocr_file_to_text(save: save)
else
text
end
+ end
+
# Removes `effect` from config[:effects] and returns the text of an OCR run
# with save: false. The new array is stored back into config[:effects], so the
# removal persists for later calls. Array#- builds a new array, so the previous
# array object is left unmodified.
+ def test_ocr_settings(effect)
+ config[:effects] = config[:effects] - [effect]
+ ocr_file_to_text(save: false)
end
# Prints the elapsed time (end_time minus start_time) to stdout, followed by a
# blank line. Does nothing unless config[:timing] is truthy. Always returns nil.
def print_time
  return unless config[:timing]

  elapsed = end_time - start_time
  puts "Total Time: #{elapsed} secs.\n\n"
end