# encoding: UTF-8
#
# Given a single image return all text in that image.
#
# For past reference http://misteroleg.wordpress.com/2012/12/19/ocr-using-tesseract-and-imagemagick-as-pre-processing-task/
#
require 'rtesseract' 

# Parser that extracts text from an image by handing it to Tesseract (through
# the RTesseract gem).
#
# The image is read from @image, which is not assigned in this class
# (presumably set by the Sqed::Parser initializer -- confirm there).
class Sqed::Parser::OcrParser < Sqed::Parser

  TYPE = :text

  # The text extracted from the image. Only the writer is generated here: the
  # reader is the hand-written #text below, so attr_accessor would define a
  # reader that is immediately overridden ("method redefined" warning).
  attr_writer :text

  # Runs OCR on @image and returns the recognized text.
  #
  # OCR is performed on every call and the result overwrites @text, so a value
  # previously assigned through #text= is not what this reader returns.
  #
  # See https://code.google.com/p/tesseract-ocr/wiki/FAQ
  #
  # @return [String] the OCR output with leading/trailing whitespace stripped
  def text
    img = @image # .white_threshold(245)

    # @jrflood: this is where you will have to do some research, tuning images so that they can be better OCR-ed.
    # Experiments tried so far (all currently disabled):
    #
    # # get potential border pixel color (based on quadrant?)
    # new_color = img.pixel_color(1, 1)
    # img = img.scale(2)
    # img.write('foo0.jpg.jpg')
    # img = img.enhance
    # img.write('foo1.jpg')
    # img = img.quantize(8, Magick::GRAYColorspace)
    # img.write('foo1.jpg')
    # img = img.sharpen(1.0, 0.2)
    # img.write('foo2.jpg')
    # border_color = img.pixel_color(img.columns - 1, img.rows - 1)
    # img = img.color_floodfill(img.columns - 1, img.rows - 1, new_color)
    # img.write('tmp/foo4.jpg')
    # img = img.quantize(2, Magick::GRAYColorspace)
    # #img = img.threshold(0.5)
    # img.write('foo4.jpg') # for debugging purposes, this is the image that is sent to OCR
    # img = img.equalize #(32, Magick::GRAYColorspace)
    # img.write('foo5.jpg') # for debugging purposes, this is the image that is sent to OCR
    # #img.write('foo3.jpg') # for debugging purposes, this is the image that is sent to OCR
    #
    # img.write('foo.jpg') # for debugging purposes, this is the image that is sent to OCR

    # From https://code.google.com/p/tesseract-ocr/wiki/FAQ
    # "There is a minimum text size for reasonable accuracy. You have to consider resolution as well as point size.
    # Accuracy drops off below 10pt x 300dpi, rapidly below 8pt x 300dpi. A quick check is to count the pixels of
    # the x-height of your characters. (X-height is the height of the lower case x.) At 10pt x 300dpi x-heights are
    # typically about 20 pixels, although this can vary dramatically from font to font. Below an x-height of 10
    # pixels, you have very little chance of accurate results, and below about 8 pixels, most of the text will be
    # "noise removed"."

    # Parameter reference: http://www.sk-spell.sk.cx/tesseract-ocr-parameters-in-302-version
    # (outputbase is not supported here).
    #
    # NOTE(review): going by their names, several of the parameters below
    # (tessedit_write_images, tessedit_dump_pageseg_images, equationdetect_save_*,
    # tessedit_write_params_to_file) are debugging aids that make Tesseract write
    # extra files -- confirm whether they should stay enabled outside of tuning.
    r = RTesseract.new(img,
                       lang: 'eng',
                       psm: 1, # psm: 3 was also tried
                       load_system_dawg: 0,
                       tessedit_debug_quality_metrics: 1,
                       load_freq_dawg: 1,
                       chop_enable: 1,
                       tessedit_write_images: 1,
                       equationdetect_save_merged_image: 1,
                       tessedit_dump_pageseg_images: 1,
                       equationdetect_save_bi_image: 1,
                       load_unambig_dawg: 0,
                       tessedit_write_params_to_file: 'tmp/ocr_config_file.txt')

    # img = img.white_threshold(245)

    @text = r.to_s.strip
  end

  # TODO: provide tuning methods here, i.e. image transformations that facilitate OCR.

end