require 'rtesseract'

# We use Tempfile because RTesseract doesn't work directly with Magick::Image (any longer... apparently, maybe)
# https://ruby-doc.org/stdlib-2.6.1/libdoc/tempfile/rdoc/Tempfile.html
require 'tempfile'

# encoding: UTF-8
#
# Given a single image return all text in that image.
#
# For reference
#   http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
#   https://code.google.com/p/tesseract-ocr/wiki/FAQ
#   http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
#
# "There is a minimum text size for reasonable accuracy.
#  You have to consider resolution as well as point size.
#  Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi.
#  A quick check is to count the pixels of the x-height of your characters.
#  (X-height is the height of the lower case x.)
#  At 10pt x 300dpi x-heights are typically about 20 pixels, although this
#  can vary dramatically from font to font.
#  Below an x-height of 10 pixels, you have very little chance of accurate results,
#  and below about 8 pixels, most of the text will be 'noise removed'."
#
class Sqed::Parser::OcrParser < Sqed::Parser

  TYPE = :text

  # Other experimented with default params
  #   classify_debug_level: 5,
  #   lang: 'eng',
  #   load_system_dawg: 0,
  #   load_unambig_dawg: 0,
  #   load_freq_dawg: 0,
  #   load_fixed_length_dawgs: 0,
  #   load_number_dawg: 0,
  #   load_punc_dawg: 1, ## important
  #   load_unambig_dawg: 1,
  #   chop_enable: 0,
  #   enable_new_segsearch: 1,
  #   tessedit_debug_quality_metrics: 1,
  #   tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
  #   tessedit_write_images: 1,
  #   equationdetect_save_merged_image: 1,
  #   tessedit_dump_pageseg_images: 1,
  #   equationdetect_save_bi_image: 1

  # Tesseract parameters default/specific to section type,
  # default is merged into the type
  SECTION_PARAMS = {
    default: {
      psm: 3
    },
    annotated_specimen: {
      # was 45, significantly improves annotated_specimen for odontates
      edges_children_count_limit: 3000
    },
    identifier: {
      psm: 1,
      # tessedit_char_whitelist: '0123456789'
      # edges_children_count_limit: 4000
    },
    curator_metadata: {
      psm: 3
    },
    labels: {
      psm: 3, # may need to be 6
    },
    determination_labels: {
      psm: 3
    },
    other_labels: {
      psm: 3
    },
    collecting_event_labels: {
      psm: 3
    }
  }.freeze

  # future consideration
  # def enhance_image(img)
  #   get potential border pixel color (based on quadrant?)
  #   new_color = img.pixel_color(1, 1)
  #   img = img.scale(2)
  #   img.write('foo0.jpg.jpg')
  #   img = img.enhance
  #   img.write('foo1.jpg')
  #   img = img.quantize(8, Magick::GRAYColorspace)
  #   img.write('foo1.jpg')
  #   img = img.sharpen(1.0, 0.2)
  #   img.write('foo2.jpg')
  #   border_color = img.pixel_color(img.columns - 1, img.rows - 1)
  #   img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
  #   img.write('tmp/foo4.jpg')
  #   img = img.quantize(2, Magick::GRAYColorspace)
  #   #img = img.threshold(0.5)
  #   img.write('foo4.jpg')
  #   img = img.equalize #(32, Magick::GRAYColorspace)
  #   img.write('foo5.jpg')
  #   #img.write('foo3.jpg')
  #   # img.write('foo.jpg')
  #   img = img.white_threshold(245)
  #   img
  # end

  # OCR the parser's image, trying progressively more processed variants
  # until one of them yields text:
  #
  #   1. the image itself (resampled first when it is small)
  #   2. a white-thresholded copy
  #   3. a copy quantized to 256 gray levels
  #
  # The result is also stored in @extracted_text.
  #
  # TODO: very kludge
  #
  # @param section_type [Symbol]
  #   key into SECTION_PARAMS; its params are merged over the defaults.
  #   A type with no entry simply uses the defaults.
  # @return [String]
  #   the ocr text, stripped; '' when no pass recognizes anything
  def get_text(section_type: :default)
    img = image

    # resample if an image 4"x4" is less than 300dpi
    # NOTE(review): 4"x4" at 300dpi is 1200 x 1200 = 1_440_000 pixels, ten
    # times this threshold - confirm which value was intended.
    img = img.resample(300) if img.columns * img.rows < 144_000

    # fetch with a default so a section type without specific params does
    # not try to merge nil (TypeError)
    params = SECTION_PARAMS[:default].merge(SECTION_PARAMS.fetch(section_type, {}))

    # Lazily built so the later variants are only computed when needed.
    # The first pass uses `img` (not `image`) so the resample above applies
    # to every pass.
    variants = [
      -> { img },
      -> { img.dup.white_threshold(245) },
      -> { img.dup.quantize(256, Magick::GRAYColorspace) }
    ]

    @extracted_text = ''
    variants.each do |variant|
      @extracted_text = tesseract_text_for(variant.call, params)
      break unless @extracted_text.empty?
    end

    @extracted_text
  end

  private

  # Write an image to a temporary file and run Tesseract over that file.
  #
  # May be able to overcome this hacky kludge mess by providing
  # `processor:` to RTesseract.new.
  #
  # @param img [Magick::Image] the image to recognize
  # @param params [Hash] options passed to RTesseract.new
  # @return [String] the recognized text, stripped
  def tesseract_text_for(img, params)
    # Block form closes and deletes the temp file even if OCR raises.
    Tempfile.create('sqed_ocr') do |file|
      file.binmode            # the blob is raw image bytes, not text
      file.write(img.to_blob)
      file.flush              # make sure the bytes are on disk before tesseract opens the path
      RTesseract.new(file.path, params).to_s.strip
    end
  end

end