lib/ocr-file/document.rb in ocr-file-0.0.1 vs lib/ocr-file/document.rb in ocr-file-0.0.2

- old
+ new

@@ -1,7 +1,10 @@ module OcrFile class Document + # TODO: Skewness / text orientation detection + # TODO: Better handwriting analysis + ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp'] PAGE_BREAK = "\n\r\n" # TODO: Make configurable DEFAULT_CONFIG = { # Images from PDF filetype: 'png', @@ -16,13 +19,12 @@ # Cloud-Vision OCR image_annotator: nil, # Needed for Cloud-Vision type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION, ocr_engine: 'tesseract', # 'cloud-vision' # Image Pre-Processing - image_pre_preprocess: true, - effects: ['bw', 'norm'], - threshold: 0.25, + image_preprocess: true, + effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'], # PDF to Image Processing optimise_pdf: true, extract_pdf_images: true, # if false will screenshot each PDF page temp_filename_prefix: 'image', # Console Output @@ -50,86 +52,90 @@ @config = config @ocr_engine = find_ocr_engine(config[:ocr_engine]) end def pdf? - @original_file_path.include?('.pdf') + @original_file_path.downcase.include?('.pdf') end def image? return false if pdf? - ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")} + ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")} end # Treat anything which isnt a PDF or image as text def text? !pdf? && !image? end + # Trigger OCR pipeline def to_pdf if pdf? create_temp_folder image_paths = extract_image_paths_from_pdf(@original_file_path) pdfs_to_merge = [] image_paths.each do |image_path| - pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config) + pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config) end merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge) OcrFile::ImageEngines::PdfEngine .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]) - - close elsif text? text = ::OcrFile::FileHelpers.open_text_file(@original_file_path) pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config) OcrFile::ImageEngines::PdfEngine .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]) else # is an image ocr_image_to_pdf end + + close end def to_text if pdf? create_temp_folder image_paths = extract_image_paths_from_pdf(@original_file_path) image_paths.each do |image_path| - text = @ocr_engine.ocr_to_text(image_path, options: @config) + text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config) ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}") end - - close elsif text? ::OcrFile::FileHelpers.open_text_file(@original_file_path) else # is an image ocr_image_to_text(save: true) end + + close end def to_s if pdf? create_temp_folder image_paths = extract_image_paths_from_pdf(@original_file_path) text = '' image_paths.each do |image_path| - text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}" + text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}" end close text elsif text? ::OcrFile::FileHelpers.open_text_file(@original_file_path) else # is an image - ocr_image_to_text(save: false) + text = ocr_image_to_text(save: false) + + close + text end end def close ::OcrFile::FileHelpers.clear_folder(@temp_folder_path) @@ -155,22 +161,39 @@ ) end end def create_temp_folder - # TODO: Make this a bit more robust - @temp_folder_path = "#{save_file_path}/temp/".gsub(' ', '\ ') + date = Time.now.to_s.split(' ').first + + @temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ') ::OcrFile::FileHelpers.make_directory(@temp_folder_path) end + def process_image(path) + return path unless @config[:image_preprocess] + + create_temp_folder + save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}" + + image_processor = OcrFile::ImageEngines::ImageMagick.new( + image_path: path, + temp_path: @temp_folder_path, + save_file_path: save_file_path, + config: @config + ) + + image_processor.convert! + end + def ocr_image_to_pdf - pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config) + pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config) OcrFile::ImageEngines::PdfEngine .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf]) end def ocr_image_to_text(save: true) - text = @ocr_engine.ocr_to_text(@original_file_path, options: @config) + text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config) if save ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text) else text