lib/sqed/parser/ocr_parser.rb in sqed-0.0.4 vs lib/sqed/parser/ocr_parser.rb in sqed-0.1.0

- old
+ new

@@ -5,25 +5,27 @@ # For past reference http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/ # require 'rtesseract' class Sqed::Parser::OcrParser < Sqed::Parser - attr_accessor :text + TYPE = :text + + # the text extracted from the image + attr_accessor :text + + # https://code.google.com/p/tesseract-ocr/wiki/FAQ def text img = @image #.white_threshold(245) # @jrflood: this is where you will have to do some research, tuning images so that they can be better ocr-ed, # all of these methods are from RMagick. # get potential border pixel color (based on quadrant?) new_color = img.pixel_color(1, 1) # img = img.scale(2) # img.write('foo0.jpg.jpg') # img = img.enhance - # img = img.enhance - # img = img.enhance - # img = img.enhance # img.write('foo1.jpg') # img = img.quantize(8, Magick::GRAYColorspace) # img.write('foo1.jpg') # img = img.sharpen(1.0, 0.2) # img.write('foo2.jpg') @@ -37,15 +39,31 @@ # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR # # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR - r = RTesseract.new(img, lang: 'eng', psm: 3) + # From https://code.google.com/p/tesseract-ocr/wiki/FAQ + # " There is a minimum text size for reasonable accuracy. You have to consider resolution as well as point size. Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi. A quick check is to count the pixels of the x-height of your characters. (X-height is the height of the lower case x.) At 10pt x 300dpi x-heights are typically about 20 pixels, although this can vary dramatically from font to font. Below an x-height of 10 pixels, you have very little chance of accurate results, and below about 8 pixels, most of the text will be "noise removed". 
+ + # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version + # doesn't support outputbase + r = RTesseract.new(img, lang: 'eng', psm: 1, + load_system_dawg: 0, + tessedit_debug_quality_metrics: 1, + load_freq_dawg: 1 , + chop_enable: 1, + tessedit_write_images: 1, + equationdetect_save_merged_image: 1, + tessedit_dump_pageseg_images: 1, + equationdetect_save_bi_image: 1, + load_unambig_dawg: 0, + tessedit_write_params_to_file: 'tmp/ocr_config_file.txt' ) # psm: 3, + # img = img.white_threshold(245) - @text = r.to_s + @text = r.to_s.strip end # Need to provide tuning methods here, i.e. image transformations that facilitate OCR end