document.rb in ocr-file-0.0.2

- old
+ new

@@ -1,7 +1,10 @@
 module OcrFile
   class Document
+    # TODO: Skewness / text orientation detection
+    # TODO: Better handwriting analysis
+
     ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
     PAGE_BREAK = "\n\r\n" # TODO: Make configurable
     DEFAULT_CONFIG = {
       # Images from PDF
       filetype: 'png',
@@ -16,13 +19,12 @@
       # Cloud-Vision OCR
       image_annotator: nil, # Needed for Cloud-Vision
       type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
       ocr_engine: 'tesseract', # 'cloud-vision'
       # Image Pre-Processing
-      image_pre_preprocess: true,
-      effects: ['bw', 'norm'],
-      threshold: 0.25,
+      image_preprocess: true,
+      effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'],
       # PDF to Image Processing
       optimise_pdf: true,
       extract_pdf_images: true, # if false will screenshot each PDF page
       temp_filename_prefix: 'image',
       # Console Output
@@ -50,86 +52,90 @@
       @config = config
       @ocr_engine = find_ocr_engine(config[:ocr_engine])
     end
 
     # Returns true when @original_file_path contains the substring '.pdf'.
     # The removed (-) line matched case-sensitively; the added (+) line
     # downcases the path first, so e.g. '.PDF' is recognised as well.
     # NOTE(review): this is a substring test, not an extension test — any
     # path that merely contains '.pdf' somewhere will count as a PDF.
     def pdf?
-      @original_file_path.include?('.pdf')
+      @original_file_path.downcase.include?('.pdf')
     end
 
     # Returns true when the file is not a PDF and its path contains
     # ".<type>" for at least one entry of ACCEPTED_IMAGE_TYPES.
     # The PDF check wins: a path matching both '.pdf' and an image type is
     # reported as a PDF, not an image.
     # The added (+) line downcases the path so upper-case extensions match.
     # NOTE(review): like pdf?, this is a substring test, not an extension test.
     def image?
       return false if pdf?
-      ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")}
+      ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
     end
 
     # Treat anything which isn't a PDF or image as text.
     # Returns true only when both pdf? and image? are false.
     def text?
       !pdf? && !image?
     end
 
+    # Trigger OCR pipeline
     # Writes "#{@final_save_file}.pdf" from the original file, choosing the
     # route by file kind:
     #   * PDF   - extract page images, OCR each one to a PDF, merge, save.
     #   * text  - read the text file and render it with PdfEngine.pdf_from_text.
     #   * image - delegate to ocr_image_to_pdf.
     # In the added (+) lines each page image is passed through process_image
     # before OCR, and `close` moves out of the PDF branch so it runs after
     # every branch. The method's value is therefore whatever `close` returns.
     # NOTE(review): only the PDF branch calls create_temp_folder here; on the
     # text branch (and the image branch when preprocessing is off)
     # @temp_folder_path may still be unset when `close` runs - confirm that
     # FileHelpers.clear_folder tolerates that.
     def to_pdf
       if pdf?
         create_temp_folder
         image_paths = extract_image_paths_from_pdf(@original_file_path)
 
         pdfs_to_merge = []
 
         image_paths.each do |image_path|
-          pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
+          pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
         end
 
         merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
 
         OcrFile::ImageEngines::PdfEngine
           .save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
-
-        close
       elsif text?
         text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
         pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
 
         OcrFile::ImageEngines::PdfEngine
           .save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
       else # is an image
         ocr_image_to_pdf
       end
+
+      close
     end
 
     # Produces the text form of the original file:
     #   * PDF   - OCR every extracted page image and append each page's text,
     #             followed by PAGE_BREAK, to "#{@final_save_file}.txt".
     #   * text  - only reads the text file via FileHelpers.open_text_file.
     #   * image - delegate to ocr_image_to_text(save: true).
     # In the added (+) lines page images go through process_image first and
     # `close` runs after every branch instead of only the PDF branch.
     # NOTE(review): because `close` is now the last expression, the method's
     # value is whatever `close` returns; on the text branch the file contents
     # read above are discarded rather than returned as before - confirm this
     # is intended.
     # NOTE(review): @temp_folder_path may be unset when `close` runs on the
     # text branch - confirm clear_folder tolerates that.
     def to_text
       if pdf?
         create_temp_folder
         image_paths = extract_image_paths_from_pdf(@original_file_path)
 
         image_paths.each do |image_path|
-          text = @ocr_engine.ocr_to_text(image_path, options: @config)
+          text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
           ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
         end
-
-        close
       elsif text?
         ::OcrFile::FileHelpers.open_text_file(@original_file_path)
       else # is an image
         ocr_image_to_text(save: true)
       end
+
+      close
     end
 
     # Returns the document's text without saving it to a file:
     #   * PDF   - OCR every extracted page image and concatenate the results;
     #             each page is preceded by PAGE_BREAK, so a non-empty result
     #             starts with PAGE_BREAK.
     #   * text  - return what FileHelpers.open_text_file gives for the file.
     #   * image - return the result of ocr_image_to_text(save: false).
     # In the added (+) lines page images go through process_image first, and
     # the image branch now calls `close` before returning the text.
     # The text branch does not call `close`.
     def to_s
       if pdf?
         create_temp_folder
         image_paths = extract_image_paths_from_pdf(@original_file_path)
 
         text = ''
 
         image_paths.each do |image_path|
-          text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
+          text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
         end
 
         close
         text
       elsif text?
         ::OcrFile::FileHelpers.open_text_file(@original_file_path)
       else # is an image
-        ocr_image_to_text(save: false)
+        text = ocr_image_to_text(save: false)
+
+        close
+        text
       end
     end
 
     def close
       ::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
@@ -155,22 +161,39 @@
         )
       end
     end
 
     # Sets @temp_folder_path and creates that directory through
     # FileHelpers.make_directory.
     # The removed (-) line used a fixed "temp/" folder under save_file_path;
     # the added (+) lines use "temp-<date>/", where <date> is the first
     # space-separated token of Time.now.to_s.
     # Spaces in the resulting path are replaced by the gsub call
     # (presumably to backslash-escape them for shell use - confirm).
     # NOTE(review): the name only has day granularity, so every run on the
     # same day under the same save_file_path shares one folder.
     def create_temp_folder
-      # TODO: Make this a bit more robust
-      @temp_folder_path = "#{save_file_path}/temp/".gsub(' ', '\ ')
+      date = Time.now.to_s.split(' ').first
+
+      @temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
       ::OcrFile::FileHelpers.make_directory(@temp_folder_path)
     end
 
     # Optionally pre-processes the image at `path` before OCR (method added
     # in the new version).
     # Returns `path` unchanged when @config[:image_preprocess] is falsy;
     # otherwise returns the result of ImageMagick#convert! (presumably the
     # path of the processed image, since callers pass it to the OCR engine -
     # confirm).
+    def process_image(path)
+      return path unless @config[:image_preprocess]
+
+      create_temp_folder
       # NOTE(review): this local shares its name with the save_file_path
       # method that create_temp_folder calls.
       # NOTE(review): @temp_folder_path already ends in '/', so the path below
       # contains a doubled slash.
       # NOTE(review): Time.now.to_i has one-second resolution; two images
       # processed within the same second get the same file name - confirm
       # this cannot overwrite an earlier page.
+      save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"
+
+      image_processor = OcrFile::ImageEngines::ImageMagick.new(
+        image_path: path,
+        temp_path: @temp_folder_path,
+        save_file_path: save_file_path,
+        config: @config
+      )
+
+      image_processor.convert!
+    end
+
     # OCRs the original image file into a PDF document with @ocr_engine and
     # saves it as "#{@final_save_file}.pdf", optimising according to
     # @config[:optimise_pdf].
     # The added (+) line passes the image through process_image first.
     def ocr_image_to_pdf
-      pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config)
+      pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
       OcrFile::ImageEngines::PdfEngine
         .save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
     end
 
     def ocr_image_to_text(save: true)
-      text = @ocr_engine.ocr_to_text(@original_file_path, options: @config)
+      text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
 
       if save
         ::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
       else
         text