lib/ocr-file/document.rb in ocr-file-0.0.1 vs lib/ocr-file/document.rb in ocr-file-0.0.2
- old
+ new
@@ -1,7 +1,10 @@
module OcrFile
class Document
+ # TODO: Skewness / text orientation detection
+ # TODO: Better handwriting analysis
+
ACCEPTED_IMAGE_TYPES = ['png', 'jpeg', 'jpg', 'tiff', 'bmp']
PAGE_BREAK = "\n\r\n" # TODO: Make configurable
DEFAULT_CONFIG = {
# Images from PDF
filetype: 'png',
@@ -16,13 +19,12 @@
# Cloud-Vision OCR
image_annotator: nil, # Needed for Cloud-Vision
type_of_ocr: OcrFile::OcrEngines::CloudVision::DOCUMENT_TEXT_DETECTION,
ocr_engine: 'tesseract', # 'cloud-vision'
# Image Pre-Processing
- image_pre_preprocess: true,
- effects: ['bw', 'norm'],
- threshold: 0.25,
+ image_preprocess: true,
+ effects: ['despeckle', 'deskew', 'enhance', 'sharpen', 'bw'],
# PDF to Image Processing
optimise_pdf: true,
extract_pdf_images: true, # if false will screenshot each PDF page
temp_filename_prefix: 'image',
# Console Output
@@ -50,86 +52,90 @@
@config = config
@ocr_engine = find_ocr_engine(config[:ocr_engine])
end
# Reports whether the source file is treated as a PDF.
# The check is a substring match for '.pdf' anywhere in @original_file_path;
# the newer (+) line downcases the path first, so '.PDF' matches as well.
# NOTE(review): this is not an extension check, so a path such as
# 'scan.pdf.txt' also counts as a PDF — confirm that is acceptable.
def pdf?
- @original_file_path.include?('.pdf')
+ @original_file_path.downcase.include?('.pdf')
end
# Reports whether the source file is treated as an image.
# PDFs are ruled out first; otherwise the path must contain ".<type>" for at
# least one entry of ACCEPTED_IMAGE_TYPES. The newer (+) line downcases the
# path before matching, making the check case-insensitive.
# NOTE(review): substring match rather than an extension check.
def image?
return false if pdf?
- ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.include?(".#{type}")}
+ ACCEPTED_IMAGE_TYPES.any? { |type| @original_file_path.downcase.include?(".#{type}")}
end
# Any file that is neither a PDF nor an accepted image is handled as text.
def text?
  return false if pdf?

  !image?
end
+ # Trigger OCR pipeline
# Writes "#{@final_save_file}.pdf" from the source file, branching on file kind:
#   PDF   - OCR every extracted page image and merge the per-page PDFs;
#   text  - render the text file's contents straight into a PDF;
#   image - delegate to ocr_image_to_pdf.
# In the newer (+) version `close` runs after every branch (previously only
# the PDF branch), which also makes it the method's last expression.
def to_pdf
if pdf?
create_temp_folder
image_paths = extract_image_paths_from_pdf(@original_file_path)
pdfs_to_merge = []
image_paths.each do |image_path|
# Newer version passes each page image through process_image before OCR.
- pdfs_to_merge << @ocr_engine.ocr_to_pdf(image_path, options: @config)
+ pdfs_to_merge << @ocr_engine.ocr_to_pdf(process_image(image_path), options: @config)
end
merged_pdf = OcrFile::ImageEngines::PdfEngine.merge(pdfs_to_merge)
OcrFile::ImageEngines::PdfEngine
.save_pdf(merged_pdf, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
-
- close
elsif text?
text = ::OcrFile::FileHelpers.open_text_file(@original_file_path)
pdf_file = OcrFile::ImageEngines::PdfEngine.pdf_from_text(text, @config)
OcrFile::ImageEngines::PdfEngine
.save_pdf(pdf_file, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
else # is an image
ocr_image_to_pdf
end
+
# NOTE(review): on the text branch nothing visible here assigns
# @temp_folder_path before close is called — confirm clear_folder copes.
+ close
end
# Runs OCR and appends the recognised text to "#{@final_save_file}.txt".
#   PDF   - each extracted page image is OCR'd and appended, followed by PAGE_BREAK;
#   text  - the source text file is only opened (see note below);
#   image - delegates to ocr_image_to_text(save: true).
# In the newer (+) version `close` runs after every branch and is the last expression.
def to_text
if pdf?
create_temp_folder
image_paths = extract_image_paths_from_pdf(@original_file_path)
image_paths.each do |image_path|
# Newer version passes each page image through process_image before OCR.
- text = @ocr_engine.ocr_to_text(image_path, options: @config)
+ text = @ocr_engine.ocr_to_text(process_image(image_path), options: @config)
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", "#{text}#{PAGE_BREAK}")
end
-
- close
elsif text?
# NOTE(review): the result is not written anywhere in this branch, and in the
# newer version it is no longer the method's value because `close` follows.
::OcrFile::FileHelpers.open_text_file(@original_file_path)
else # is an image
ocr_image_to_text(save: true)
end
+
+ close
end
# Returns the document's text without saving it to a file.
#   PDF   - OCRs every extracted page image and joins the results;
#   text  - returns whatever FileHelpers.open_text_file yields for the source file;
#   image - returns the result of ocr_image_to_text(save: false).
# The newer (+) version also calls `close` on the image branch before returning.
# The text branch does not call `close`.
def to_s
if pdf?
create_temp_folder
image_paths = extract_image_paths_from_pdf(@original_file_path)
text = ''
image_paths.each do |image_path|
# PAGE_BREAK is placed before each page's text, so the result also starts with one.
- text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(image_path, options: @config)}"
+ text = "#{text}#{PAGE_BREAK}#{@ocr_engine.ocr_to_text(process_image(image_path), options: @config)}"
end
close
text
elsif text?
::OcrFile::FileHelpers.open_text_file(@original_file_path)
else # is an image
- ocr_image_to_text(save: false)
# Keep the text in a local so `close` can run before it is returned.
+ text = ocr_image_to_text(save: false)
+
+ close
+ text
end
end
def close
::OcrFile::FileHelpers.clear_folder(@temp_folder_path)
@@ -155,22 +161,39 @@
)
end
end
# Sets @temp_folder_path and asks FileHelpers to create that directory.
# Older (-) version: a fixed "temp/" folder under save_file_path.
# Newer (+) version: "temp-<date>/", where <date> is the first space-separated
# field of Time.now.to_s (the calendar date part).
# Each space in the path is prefixed with a backslash — presumably for shell
# use; TODO confirm against FileHelpers.
def create_temp_folder
- # TODO: Make this a bit more robust
- @temp_folder_path = "#{save_file_path}/temp/".gsub(' ', '\ ')
+ date = Time.now.to_s.split(' ').first
+
+ @temp_folder_path = "#{save_file_path}/temp-#{date}/".gsub(' ', '\ ')
::OcrFile::FileHelpers.make_directory(@temp_folder_path)
end
# Added in the newer version: optional image pre-processing step run before OCR.
# Returns `path` untouched when @config[:image_preprocess] is falsy; otherwise
# builds an ImageMagick processor for the image and returns whatever
# `convert!` returns (presumably the processed image's path — TODO confirm).
+ def process_image(path)
+ return path unless @config[:image_preprocess]
+
+ create_temp_folder
# NOTE(review): the file name is the current Unix time in whole seconds, so two
# images processed within the same second get the same name. @temp_folder_path
# already ends in "/", so this path contains "//". The local also shadows the
# `save_file_path` referenced by create_temp_folder.
+ save_file_path = "#{@temp_folder_path}/#{Time.now.to_i}.#{@config[:filetype]}"
+
+ image_processor = OcrFile::ImageEngines::ImageMagick.new(
+ image_path: path,
+ temp_path: @temp_folder_path,
+ save_file_path: save_file_path,
+ config: @config
+ )
+
+ image_processor.convert!
+ end
+
# OCRs the original image file into a PDF document and saves it as
# "#{@final_save_file}.pdf", honouring @config[:optimise_pdf].
# The newer (+) line passes the image through process_image before OCR.
def ocr_image_to_pdf
- pdf_document = @ocr_engine.ocr_to_pdf(@original_file_path, options: @config)
+ pdf_document = @ocr_engine.ocr_to_pdf(process_image(@original_file_path), options: @config)
OcrFile::ImageEngines::PdfEngine
.save_pdf(pdf_document, "#{@final_save_file}.pdf", optimise: @config[:optimise_pdf])
end
def ocr_image_to_text(save: true)
- text = @ocr_engine.ocr_to_text(@original_file_path, options: @config)
+ text = @ocr_engine.ocr_to_text(process_image(@original_file_path), options: @config)
if save
::OcrFile::FileHelpers.append_file("#{@final_save_file}.txt", text)
else
text