ocr_parser.rb in sqed-0.1.0

- old
+ new

@@ -5,25 +5,27 @@
 # For past reference http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
 #
 require 'rtesseract' 
 
 class Sqed::Parser::OcrParser < Sqed::Parser
-  attr_accessor  :text
 
+  TYPE = :text
+
+  # the text extracted from the image
+  attr_accessor :text
+
+  # https://code.google.com/p/tesseract-ocr/wiki/FAQ
   def text
     img = @image #.white_threshold(245)
 
     # @jrflood: this is where you will have to do some research, tuning images so that they can be better ocr-ed,
     # all of these methods are from RMagick.
     # get potential border pixel color (based on quadrant?)
     new_color = img.pixel_color(1, 1)
     # img = img.scale(2)
     # img.write('foo0.jpg.jpg')
     # img = img.enhance
-    # img = img.enhance
-    # img = img.enhance
-    # img = img.enhance
     # img.write('foo1.jpg')
     # img = img.quantize(8, Magick::GRAYColorspace)
     # img.write('foo1.jpg')
     # img = img.sharpen(1.0, 0.2)
     # img.write('foo2.jpg')
@@ -37,15 +39,31 @@
     # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
     # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
     #
     # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
 
-    r = RTesseract.new(img, lang: 'eng', psm: 3)
 
+    # From https://code.google.com/p/tesseract-ocr/wiki/FAQ
+    # "There is a minimum text size for reasonable accuracy. You have to consider resolution as well as point size. Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi. A quick check is to count the pixels of the x-height of your characters. (X-height is the height of the lower case x.) At 10pt x 300dpi x-heights are typically about 20 pixels, although this can vary dramatically from font to font. Below an x-height of 10 pixels, you have very little chance of accurate results, and below about 8 pixels, most of the text will be 'noise removed'."
 
+
+    # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
+    # doesn't support outputbase
+    r = RTesseract.new(img, lang: 'eng', psm: 1, 
+                       load_system_dawg: 0,
+                       tessedit_debug_quality_metrics: 1,
+                       load_freq_dawg: 1 ,
+                       chop_enable: 1,
+                       tessedit_write_images: 1,
+                       equationdetect_save_merged_image: 1,
+                       tessedit_dump_pageseg_images: 1,
+                       equationdetect_save_bi_image: 1,
+                       load_unambig_dawg: 0,
+                       tessedit_write_params_to_file: 'tmp/ocr_config_file.txt' ) # psm: 3,
+
     # img = img.white_threshold(245)
 
-    @text = r.to_s 
+    @text = r.to_s.strip 
   end
 
   # Need to provide tuning methods here, i.e. image transformations that facilitate OCR
 
 end