lib/yomu.rb in yomu-0.2.1 vs lib/yomu.rb in yomu-0.2.2

- old
+ new

@@ -2,48 +2,82 @@ require 'net/http' require 'mime/types' require 'json' +require 'socket' +require 'stringio' + class Yomu GEMPATH = File.dirname(File.dirname(__FILE__)) JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.6.jar') + DEFAULT_SERVER_PORT = 9293 # an arbitrary, but perfectly cromulent, port + @@server_port = nil + @@server_pid = nil + # Read text or metadata from a data buffer. # # data = File.read 'sample.pages' # text = Yomu.read :text, data # metadata = Yomu.read :metadata, data def self.read(type, data) + result = @@server_pid ? self._server_read(type, data) : self._client_read(type, data) + + case type + when :text + result + when :html + result + when :metadata + JSON.parse(result) + when :mimetype + MIME::Types[JSON.parse(result)['Content-Type']].first + end + end + + def self._client_read(type, data) switch = case type when :text '-t' when :html '-h' when :metadata '-m -j' when :mimetype '-m -j' end - - result = IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io| + + IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io| io.write data io.close_write io.read end + end - case type - when :text - result - when :html - result - when :metadata - JSON.parse(result) - when :mimetype - MIME::Types[JSON.parse(result)['Content-Type']].first + + def self._server_read(_, data) + s = TCPSocket.new('localhost', @@server_port) + file = StringIO.new(data, 'r') + + while 1 + chunk = file.read(65536) + break unless chunk + s.write(chunk) end + + # tell Tika that we're done sending data + s.shutdown(Socket::SHUT_WR) + + resp = '' + while 1 + chunk = s.recv(65536) + break if chunk.empty? || !chunk + resp << chunk + end + resp end # Create a new instance of Yomu with a given document. # # Using a file path: @@ -135,11 +169,10 @@ else nil end end - def path? defined? @path end # Returns +true+ if the Yomu document was specified using a URI. @@ -176,9 +209,57 @@ elsif stream? @data = @stream.read end @data + end + + # Returns pid of Tika server, started as a new spawned process. + # + # type :html, :text or :metadata + # custom_port e.g. 9293 + # + # Yomu.server(:text, 9294) + # + def self.server(type, custom_port=nil) + switch = case type + when :text + '-t' + when :html + '-h' + when :metadata + '-m -j' + when :mimetype + '-m -j' + end + + @@server_port = custom_port || DEFAULT_SERVER_PORT + + @@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}") + sleep(2) # Give the server 2 seconds to spin up. + @@server_pid + end + + # Kills server started by Yomu.server + # + # Always run this when you're done, or else Tika might run until you kill it manually + # You might try putting your extraction in a begin..rescue...ensure...end block and + # putting this method in the ensure block. + # + # Yomu.server(:text) + # reports = ["report1.docx", "report2.doc", "report3.pdf"] + # begin + # my_texts = reports.map{|report_path| Yomu.new(report_path).text } + # rescue + # ensure + # Yomu.kill_server! + # end + def self.kill_server! + if @@server_pid + Process.kill('INT', @@server_pid) + @@server_pid = nil + @@server_port = nil + end end def self.java ENV['JAVA_HOME'] ? ENV['JAVA_HOME'] + '/bin/java' : 'java' end