ocr_parser.rb in sqed-0.4.0

- old
+ new

@@ -1,5 +1,7 @@
+require 'rtesseract'
+
 # encoding: UTF-8
 #
 # Given a single image return all text in that image.
 #
 # For reference 
@@ -15,68 +17,65 @@
 # At 10pt x 300dpi x-heights are typically about 20 pixels, although this
 # can vary dramatically from font to font. 
 # Below an x-height of 10 pixels, you have very little chance of accurate results, 
 # and below about 8 pixels, most of the text will be "noise removed". 
 #
-require 'rtesseract' 
-
 class Sqed::Parser::OcrParser < Sqed::Parser
 
   TYPE = :text
 
+  # Other default params that have been experimented with:
+  #      classify_debug_level: 5,
+  #      lang: 'eng',
+  #      load_system_dawg: 0,
+  #      load_unambig_dawg: 0,
+  #      load_freq_dawg: 0,
+  #      load_fixed_length_dawgs: 0,
+  #      load_number_dawg: 0,
+  #      load_punc_dawg: 1, ## important
+  #      load_unambig_dawg: 1,
+  #      chop_enable: 0,
+  #     enable_new_segsearch: 1,
+  #     tessedit_debug_quality_metrics: 1,
+  #     tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
+  #     tessedit_write_images: 1,
+  #     equationdetect_save_merged_image: 1,
+  #     tessedit_dump_pageseg_images: 1,
+  #     equationdetect_save_bi_image: 1
+
   # Tesseract parameters: defaults, plus overrides specific to each section type.
   # The defaults are merged with the parameters for the requested type.
   SECTION_PARAMS = {
     default: {
-      psm: 3,
-#      classify_debug_level: 5,
-#      lang: 'eng', 
-#      load_system_dawg: 0,
-#      load_unambig_dawg: 0,
-#      load_freq_dawg: 0,
-#      load_fixed_length_dawgs: 0,
-#      load_number_dawg: 0,
-#      load_punc_dawg: 1, ## important
-#      load_unambig_dawg: 1,
-#      chop_enable: 0,
-#     enable_new_segsearch: 1,
-#     tessedit_debug_quality_metrics: 1,
-#     tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
-#     tessedit_write_images: 1,
-#     equationdetect_save_merged_image: 1,
-#     tessedit_dump_pageseg_images: 1,
-#     equationdetect_save_bi_image: 1
+      psm: 3
     },
     annotated_specimen: {
-      edges_children_count_limit: 3000 # was 45, significantly improves annotated_specimen for odontates
+      # was 45, significantly improves annotated_specimen for odonates
+      edges_children_count_limit: 3000 
     },
     identifier: {
       psm: 1,
       # tessedit_char_whitelist: '0123456789'
       #  edges_children_count_limit: 4000
-    }, 
+    },
     curator_metadata: {
+      psm: 3
     },
     labels: {
       psm: 3, # may need to be 6
     },
-    deterimination_labels: {
+    determination_labels: {
       psm: 3
     },
     other_labels: {
       psm: 3
     },
     collecting_event_labels: {
       psm: 3
     }
+  }.freeze
 
-
-  }
-
-  # the text extracted from the image
-  attr_accessor :text
-
   # future consideration 
   # def enhance_image(img)
   # get potential border pixel color (based on quadrant?)
   # new_color = img.pixel_color(1, 1)
 
@@ -100,37 +99,38 @@
   #
   # img.write('foo.jpg') 
   # img = img.white_threshold(245)
   # img
   # end
- 
+
   # @return [String]
-  #   the ocr text 
-  def text(section_type: :default)
-    img = @image 
- 
+  #   the OCR text, stripped of surrounding whitespace (may be empty)
+  def get_text(section_type: :default)
+    img = image
+
     # resample if a 4"x4" image is below 300dpi; NOTE(review): 4"x4" at 300dpi is 1,440,000 px but the check uses 144000 -- confirm
     if img.columns * img.rows < 144000
       img = img.resample(300)
     end
-    
-    params = SECTION_PARAMS[:default].merge(SECTION_PARAMS[section_type])
-    r = RTesseract.new(img, params) 
-    @text = r.to_s.strip
 
-    if @text == ""
+    params = SECTION_PARAMS[:default].dup
+    params.merge!(SECTION_PARAMS[section_type])
+
+    r = RTesseract.new(img, params)
+    @extracted_text = r.to_s.strip
+
+    if @extracted_text == ''
       img = img.white_threshold(245)
-      r = RTesseract.new(img, params) 
-      @text = r.to_s.strip
+      r = RTesseract.new(img, params)
+      @extracted_text = r.to_s.strip
     end
 
-    if @text == ""
+    if @extracted_text == ''
       img = img.quantize(256,Magick::GRAYColorspace)
-      r = RTesseract.new(img, params) 
-      @text = r.to_s.strip
+      r = RTesseract.new(img, params)
+      @extracted_text = r.to_s.strip
     end
 
-    @text
+    @extracted_text
   end
-
 
 end