lib/sqed/parser/ocr_parser.rb in sqed-0.3.2 vs lib/sqed/parser/ocr_parser.rb in sqed-0.4.0
- old
+ new
@@ -1,5 +1,7 @@
+require 'rtesseract'
+
# encoding: UTF-8
#
# Given a single image return all text in that image.
#
# For reference
@@ -15,68 +17,65 @@
# At 10pt x 300dpi x-heights are typically about 20 pixels, although this
# can vary dramatically from font to font.
# Below an x-height of 10 pixels, you have very little chance of accurate results,
# and below about 8 pixels, most of the text will be "noise removed".
#
-require 'rtesseract'
-
class Sqed::Parser::OcrParser < Sqed::Parser
TYPE = :text
+ # Other default params that have been experimented with:
+ # classify_debug_level: 5,
+ # lang: 'eng',
+ # load_system_dawg: 0,
+ # load_unambig_dawg: 0,
+ # load_freq_dawg: 0,
+ # load_fixed_length_dawgs: 0,
+ # load_number_dawg: 0,
+ # load_punc_dawg: 1, ## important
+ # load_unambig_dawg: 1,
+ # chop_enable: 0,
+ # enable_new_segsearch: 1,
+ # tessedit_debug_quality_metrics: 1,
+ # tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
+ # tessedit_write_images: 1,
+ # equationdetect_save_merged_image: 1,
+ # tessedit_dump_pageseg_images: 1,
+ # equationdetect_save_bi_image: 1
+
# Tesseract parameters keyed by section type; the :default entry is
# merged with (and overridden by) the entry for the requested section type
SECTION_PARAMS = {
default: {
- psm: 3,
-# classify_debug_level: 5,
-# lang: 'eng',
-# load_system_dawg: 0,
-# load_unambig_dawg: 0,
-# load_freq_dawg: 0,
-# load_fixed_length_dawgs: 0,
-# load_number_dawg: 0,
-# load_punc_dawg: 1, ## important
-# load_unambig_dawg: 1,
-# chop_enable: 0,
-# enable_new_segsearch: 1,
-# tessedit_debug_quality_metrics: 1,
-# tessedit_write_params_to_file: 'tmp/ocr_config_file.txt',
-# tessedit_write_images: 1,
-# equationdetect_save_merged_image: 1,
-# tessedit_dump_pageseg_images: 1,
-# equationdetect_save_bi_image: 1
+ psm: 3
},
annotated_specimen: {
- edges_children_count_limit: 3000 # was 45, significantly improves annotated_specimen for odontates
+ # was 45, significantly improves annotated_specimen for odonates
+ edges_children_count_limit: 3000
},
identifier: {
psm: 1,
# tessedit_char_whitelist: '0123456789'
# edges_children_count_limit: 4000
- },
+ },
curator_metadata: {
+ psm: 3
},
labels: {
psm: 3, # may need to be 6
},
- deterimination_labels: {
+ determination_labels: {
psm: 3
},
other_labels: {
psm: 3
},
collecting_event_labels: {
psm: 3
}
+ }.freeze
-
- }
-
- # the text extracted from the image
- attr_accessor :text
-
# future consideration
# def enhance_image(img)
# get potential border pixel color (based on quadrant?)
# new_color = img.pixel_color(1, 1)
@@ -100,37 +99,38 @@
#
# img.write('foo.jpg')
# img = img.white_threshold(245)
# img
# end
-
+
# @return [String]
- # the ocr text
- def text(section_type: :default)
- img = @image
-
+ # the ocr text
+ def get_text(section_type: :default)
+ img = image
+
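# resample at 300dpi if the image is small (intended: a 4"x4" image at less than 300dpi)
# NOTE(review): 4"x4" at 300dpi is 1200x1200 = 1440000 pixels, but the check below uses 144000 - confirm the threshold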
if img.columns * img.rows < 144000
img = img.resample(300)
end
-
- params = SECTION_PARAMS[:default].merge(SECTION_PARAMS[section_type])
- r = RTesseract.new(img, params)
- @text = r.to_s.strip
- if @text == ""
+ params = SECTION_PARAMS[:default].dup
+ params.merge!(SECTION_PARAMS[section_type])
+
+ r = RTesseract.new(img, params)
+ @extracted_text = r.to_s.strip
+
+ if @extracted_text == ''
img = img.white_threshold(245)
- r = RTesseract.new(img, params)
- @text = r.to_s.strip
+ r = RTesseract.new(img, params)
+ @extracted_text = r.to_s.strip
end
- if @text == ""
+ if @extracted_text == ''
img = img.quantize(256,Magick::GRAYColorspace)
- r = RTesseract.new(img, params)
- @text = r.to_s.strip
+ r = RTesseract.new(img, params)
+ @extracted_text = r.to_s.strip
end
- @text
+ @extracted_text
end
-
end