lib/docsplit.rb in docsplit-0.4.1 vs lib/docsplit.rb in docsplit-0.5.0

- old
+ new

@@ -1,9 +1,9 @@
  # The Docsplit module delegates to the Java PDF extractors.
  module Docsplit

-   VERSION = '0.4.1' # Keep in sync with gemspec.
+   VERSION = '0.5.0' # Keep in sync with gemspec.

    ROOT = File.expand_path(File.dirname(__FILE__) + '/..')

    CLASSPATH = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"

@@ -70,11 +70,16 @@
          InfoExtractor.new.extract(:#{key}, pdfs, opts)
        end
      EOS
    end

+   # Utility method to clean OCR'd text with garbage characters.
+   def self.clean_text(text)
+     TextCleaner.new.clean(text)
+   end

    private

    # Runs a Java command, with quieted logging, and the classpath set properly.
    def self.run(command, pdfs, opts, return_output=false)
      pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
@@ -101,5 +106,6 @@
  require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
  require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
  require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
  require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
  require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
+ require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"