lib/yomu.rb in yomu-0.2.1 vs lib/yomu.rb in yomu-0.2.2
- old
+ new
@@ -2,48 +2,82 @@
require 'net/http'
require 'mime/types'
require 'json'
+require 'socket'
+require 'stringio'
+
class Yomu
GEMPATH = File.dirname(File.dirname(__FILE__)) # parent of the directory that contains this file
JARPATH = File.join(Yomu::GEMPATH, 'jar', 'tika-app-1.6.jar') # Tika jar expected under GEMPATH/jar
+ DEFAULT_SERVER_PORT = 9293 # port used by Yomu.server when no custom port is given; an arbitrary choice
+ @@server_port = nil # port passed to the spawned Tika server; set by Yomu.server, reset to nil by Yomu.kill_server!
+ @@server_pid = nil # pid returned by Process.spawn in Yomu.server; nil means Yomu.read runs the jar per call
+
# Read text, HTML, metadata or the MIME type from a data buffer.
#
# type is one of :text, :html, :metadata or :mimetype; any other value yields nil.
# data is the raw document content (wrapped in a StringIO in server mode, so a String is expected there).
#
# data = File.read 'sample.pages'
# text = Yomu.read :text, data
# metadata = Yomu.read :metadata, data
#
# Returns the raw extractor output for :text and :html, the parsed JSON for :metadata,
# and the first MIME::Types entry for the reported 'Content-Type' for :mimetype.
#
# While a server pid is set (see Yomu.server) the data is sent to that server instead of
# starting a new java process. _server_read ignores +type+, so the output format is the
# one the server was started with.
# NOTE(review): a server started with one type and a read with another (e.g. :text vs
# :metadata) would presumably hand non-JSON output to JSON.parse - confirm with callers.
def self.read(type, data)
+ result = @@server_pid ? self._server_read(type, data) : self._client_read(type, data) # server mode only while a pid is recorded
+
+ case type
+ when :text
+ result
+ when :html
+ result
+ when :metadata
+ JSON.parse(result)
+ when :mimetype
+ MIME::Types[JSON.parse(result)['Content-Type']].first # look the Content-Type string up in MIME::Types and take the first match
+ end
+ end
+
# Runs the Tika jar once as a child process, feeds +data+ to its standard input
# and returns everything the process writes to its standard output.
#
# type selects the command-line switch: :text => '-t', :html => '-h',
# :metadata and :mimetype => '-m -j'. Any other type leaves the switch nil,
# which interpolates as an empty string in the command line.
# data is written to the child process as-is.
#
# Returns the raw output as a String; Yomu.read does the per-type post-processing.
+ def self._client_read(type, data)
switch = case type
when :text
'-t'
when :html
'-h'
when :metadata
'-m -j'
when :mimetype
'-m -j' # same switches as :metadata; Yomu.read picks 'Content-Type' out of the parsed JSON
end
-
- result = IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
+
+ IO.popen "#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} #{switch}", 'r+' do |io|
io.write data
io.close_write # close our writing end so the child sees end of input
io.read # the block's value is what IO.popen returns, and therefore what this method returns
end
+ end
- case type
- when :text
- result
- when :html
- result
- when :metadata
- JSON.parse(result)
- when :mimetype
- MIME::Types[JSON.parse(result)['Content-Type']].first
+
# Sends +data+ to the Tika server on localhost:@@server_port and returns the
# server's complete response as a String.
#
# The first parameter (the requested type) is ignored here; the response format
# depends on how the server process was started (see Yomu.server).
# data is wrapped in a StringIO and written to the socket in chunks of up to 65536 bytes.
#
# Raises whatever TCPSocket.new raises when the connection cannot be opened
# (for example when no server is listening on the port).
+ def self._server_read(_, data)
+ s = TCPSocket.new('localhost', @@server_port)
+ file = StringIO.new(data, 'r')
+
+ while 1
+ chunk = file.read(65536)
+ break unless chunk # StringIO#read(length) returns nil once the buffer is exhausted
+ s.write(chunk)
end
+
+ # tell Tika that we're done sending data (closes only the writing half of the connection)
+ s.shutdown(Socket::SHUT_WR)
+
+ resp = ''
+ while 1
+ chunk = s.recv(65536)
+ break if chunk.empty? || !chunk # an empty chunk ends the loop. NOTE(review): `!chunk` is tested after `chunk.empty?`, so it cannot guard against a nil chunk - confirm intent
+ resp << chunk
+ end
+ resp # NOTE(review): the socket `s` is never closed in this method - confirm whether that is intended
end
# Create a new instance of Yomu with a given document.
#
# Using a file path:
@@ -135,11 +169,10 @@
else
nil
end
end
-
# Tells whether the @path instance variable has been set on this object.
#
# Returns a truthy value (the String "instance-variable") when @path is
# defined, even if it holds nil, and nil when it has never been assigned.
def path?
  defined?(@path)
end
# Returns +true+ if the Yomu document was specified using a URI.
@@ -176,9 +209,57 @@
elsif stream?
@data = @stream.read
end
@data
+ end
+
+ # Spawns a Tika server as a new process and returns its pid. Yomu.read uses the server while the pid is set.
+ #
+ # type :html, :text, :metadata or :mimetype (selects the switch the server is started with)
+ # custom_port e.g. 9293; DEFAULT_SERVER_PORT is used when nil
+ #
+ # Yomu.server(:text, 9294)
+ #
+ def self.server(type, custom_port=nil)
+ switch = case type
+ when :text
+ '-t'
+ when :html
+ '-h'
+ when :metadata
+ '-m -j'
+ when :mimetype
+ '-m -j'
+ end
+
+ @@server_port = custom_port || DEFAULT_SERVER_PORT # NOTE(review): a second call overwrites port and pid without stopping the earlier process - confirm callers never do that
+
+ @@server_pid = Process.spawn("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --server --port #{@@server_port} #{switch}")
+ sleep(2) # Give the server 2 seconds to spin up. NOTE(review): a fixed delay does not confirm the server is accepting connections
+ @@server_pid
+ end
+
+ # Kills the server started by Yomu.server and clears the stored pid and port. Does nothing when no pid is stored.
+ #
+ # Always run this when you're done, or else Tika might keep running until you kill it manually.
+ # Consider putting your extraction in a begin...rescue...ensure...end block and
+ # calling this method in the ensure block.
+ #
+ # Yomu.server(:text)
+ # reports = ["report1.docx", "report2.doc", "report3.pdf"]
+ # begin
+ # my_texts = reports.map{|report_path| Yomu.new(report_path).text }
+ # rescue
+ # ensure
+ # Yomu.kill_server!
+ # end
+ def self.kill_server!
+ if @@server_pid
+ Process.kill('INT', @@server_pid) # sends SIGINT. NOTE(review): the child is not reaped with Process.wait/Process.detach here - confirm that is acceptable
+ @@server_pid = nil # with the pid cleared, Yomu.read goes back to running the jar per call
+ @@server_port = nil
+ end
end
# Locates the java executable used to launch Tika.
#
# Returns the path "<JAVA_HOME>/bin/java" when the JAVA_HOME environment
# variable is set to a non-empty value, otherwise the bare command name 'java'.
def self.java
  java_home = ENV['JAVA_HOME']
  # An empty JAVA_HOME is treated like an unset one; concatenating it would yield "/bin/java".
  return 'java' if java_home.nil? || java_home.empty?

  # File.join avoids a doubled separator when JAVA_HOME ends with a slash.
  File.join(java_home, 'bin', 'java')
end