lib/sqed/parser/ocr_parser.rb in sqed-0.1.5 vs lib/sqed/parser/ocr_parser.rb in sqed-0.1.6
- old
+ new
@@ -1,69 +1,106 @@
# encoding: UTF-8
#
# Given a single image return all text in that image.
#
-# For past reference http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
+# For reference
+# http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
+# https://code.google.com/p/tesseract-ocr/wiki/FAQ
+# http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
#
+# From the Tesseract FAQ: "There is a minimum text size for reasonable accuracy.
+# You have to consider resolution as well as point size.
+# Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi.
+# A quick check is to count the pixels of the x-height of your characters.
+# (X-height is the height of the lower case x.)
+# At 10pt x 300dpi x-heights are typically about 20 pixels, although this
+# can vary dramatically from font to font.
+# Below an x-height of 10 pixels, you have very little chance of accurate results,
+# and below about 8 pixels, most of the text will be 'noise removed'."
+#
require 'rtesseract'
class Sqed::Parser::OcrParser < Sqed::Parser
TYPE = :text
+ # Tesseract parameters, keyed by section type. The :default entry is the base set of options;
+ # #text merges the entry for the requested section type over it, so per-type values win on conflict.
+ SECTION_PARAMS = { # each value is an options hash that ends up as the second argument to RTesseract.new
+ default: { # always applied; the commented-out entries below are options that are currently disabled
+ psm: 3, # presumably Tesseract's page segmentation mode -- confirm against the Tesseract docs
+# classify_debug_level: 5,
+# lang: 'eng',
+# load_system_dawg: 0,
+# load_unambig_dawg: 0,
+# load_freq_dawg: 0,
+# load_fixed_length_dawgs: 0,
+# load_number_dawg: 0,
+# load_punc_dawg: 1, ## important
+# load_unambig_dawg: 1,
+# chop_enable: 0,
+# enable_new_segsearch: 1,
+# tessedit_debug_quality_metrics: 1,
+# tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
+# tessedit_write_images: 1,
+# equationdetect_save_merged_image: 1,
+# tessedit_dump_pageseg_images: 1,
+# equationdetect_save_bi_image: 1
+ },
+ annotated_specimen: {
+ edges_children_count_limit: 3000 # was 45, significantly improves annotated_specimen for odontates
+ },
+ identifier: {
+ psm: 1, # overrides the default psm of 3 for this section type
+ # tessedit_char_whitelist: '0123456789'
+ # edges_children_count_limit: 4000
+ },
+ curator_metadata: { # no overrides: this section type uses the defaults only
+ },
+ labels: {
+ psm: 3, # may need to be 6
+ },
+ deterimination_labels: { # NOTE(review): key looks like a misspelling of :determination_labels -- confirm against the section types callers pass
+ psm: 3
+ }
+ }
+
# the text extracted from the image; NOTE(review): the reader generated here is replaced by the explicit #text method defined later in this class, so only the writer (text=) comes from this line
attr_accessor :text
- # https://code.google.com/p/tesseract-ocr/wiki/FAQ
- def text
- img = @image #.white_threshold(245)
+ # future consideration
+ # def enhance_image(img)
+ # get potential border pixel color (based on quadrant?)
+ # new_color = img.pixel_color(1, 1)
- # @jrflood: this is where you will have to do some research, tuning images so that they can be better ocr-ed,
- # get potential border pixel color (based on quadrant?)
- new_color = img.pixel_color(1, 1)
- # img = img.scale(2)
- # img.write('foo0.jpg.jpg')
- # img = img.enhance
- # img.write('foo1.jpg')
- # img = img.quantize(8, Magick::GRAYColorspace)
- # img.write('foo1.jpg')
- # img = img.sharpen(1.0, 0.2)
- # img.write('foo2.jpg')
- # border_color = img.pixel_color(img.columns - 1, img.rows - 1)
- # img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
- # img.write('tmp/foo4.jpg')
- # img = img.quantize(2, Magick::GRAYColorspace)
- # #img = img.threshold(0.5)
- # img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
- # img = img.equalize #(32, Magick::GRAYColorspace)
- # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
- # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
- #
- # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
-
-
- # From https://code.google.com/p/tesseract-ocr/wiki/FAQ
- # " There is a minimum text size for reasonable accuracy. You have to consider resolution as well as point size. Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi. A quick check is to count the pixels of the x-height of your characters. (X-height is the height of the lower case x.) At 10pt x 300dpi x-heights are typically about 20 pixels, although this can vary dramatically from font to font. Below an x-height of 10 pixels, you have very little chance of accurate results, and below about 8 pixels, most of the text will be "noise removed".
-
-
- # http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
- # doesn't supprot outputbase
- r = RTesseract.new(img, lang: 'eng', psm: 1,
- load_system_dawg: 0,
- tessedit_debug_quality_metrics: 1,
- load_freq_dawg: 1 ,
- chop_enable: 1,
- tessedit_write_images: 1,
- equationdetect_save_merged_image: 1,
- tessedit_dump_pageseg_images: 1,
- equationdetect_save_bi_image: 1,
- load_unambig_dawg: 0,
- tessedit_write_params_to_file: 'tmp/ocr_config_file.txt' ) # psm: 3,
-
- # img = img.white_threshold(245)
-
+ # img = img.scale(2)
+ # img.write('foo0.jpg.jpg')
+ # img = img.enhance
+ # img.write('foo1.jpg')
+ # img = img.quantize(8, Magick::GRAYColorspace)
+ # img.write('foo1.jpg')
+ # img = img.sharpen(1.0, 0.2)
+ # img.write('foo2.jpg')
+ # border_color = img.pixel_color(img.columns - 1, img.rows - 1)
+ # img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
+ # img.write('tmp/foo4.jpg')
+ # img = img.quantize(2, Magick::GRAYColorspace)
+ # #img = img.threshold(0.5)
+ # img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
+ # img = img.equalize #(32, Magick::GRAYColorspace)
+ # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
+ # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
+ #
+ # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR
+ # img = img.white_threshold(245)
+ # img
+ # end
+
+ # Runs OCR on the parser's image and returns the recognized text.
+ #
+ # section_type - Symbol key into SECTION_PARAMS selecting the per-section
+ #                Tesseract options; they are merged over SECTION_PARAMS[:default].
+ #
+ # Returns the OCR result as a String with surrounding whitespace stripped;
+ # the same value is stored in @text.
+ def text(section_type: :default)
+ img = @image
+ # Section types with no entry in SECTION_PARAMS fall back to the defaults alone;
+ # a bare [] lookup would hand nil to Hash#merge, which raises TypeError.
+ params = SECTION_PARAMS[:default].merge(SECTION_PARAMS.fetch(section_type, {}))
+ r = RTesseract.new(img, params)
@text = r.to_s.strip
end
- # Need to provide tuning methods here, i.e. image transormations that facilitate OCR
end