lib/docsplit.rb in docsplit-0.4.1 vs lib/docsplit.rb in docsplit-0.5.0
- old
+ new
@@ -1,9 +1,9 @@
# The Docsplit module delegates to the Java PDF extractors.
module Docsplit
- VERSION = '0.4.1' # Keep in sync with gemspec.
+ VERSION = '0.5.0' # Keep in sync with gemspec.
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
CLASSPATH = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"
@@ -70,11 +70,16 @@
InfoExtractor.new.extract(:#{key}, pdfs, opts)
end
EOS
end
+ # Utility method to clean OCR'd text with garbage characters.
+ def self.clean_text(text)
+ TextCleaner.new.clean(text)
+ end
+
private
# Runs a Java command, with quieted logging, and the classpath set properly.
def self.run(command, pdfs, opts, return_output=false)
pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
@@ -101,5 +106,6 @@
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"
+require "#{Docsplit::ROOT}/lib/docsplit/text_cleaner"