Sha256: f7582a61c0acf0464e5ef06dd449367b1a2d3a163780332cc3cb19f0fca8f3d1

Contents?: true

Size: 1.65 KB

Versions: 2

Compression:

Stored size: 1.65 KB

Contents

# frozen_string_literal: true

# Based on the rake remote task code

require 'rubygems'
require 'open4'
require 'shellwords'
require 'stringio'

# Wrapper that shells out to the Tika app jar (+/opt/tika-app.jar+) through a
# fixed Java binary and returns whatever the command prints on standard output.
#
# @example
#   RubyTikaApp.new('/tmp/report.pdf').to_text
#   RubyTikaApp.new('https://example.com/report.pdf').to_metadata
class RubyTikaApp
  # Base class for every error raised by this wrapper.
  class Error < RuntimeError; end

  # Raised when the command printed nothing on standard output but did write
  # to standard error.
  class CommandFailedError < Error
    # @return [String] the stripped standard-error text of the failed command
    #   (despite the name, this is not a numeric exit status)
    attr_reader :status

    # @param status [String] standard-error text captured from the command
    def initialize(status)
      super()
      @status = status
    end
  end

  # Absolute path of the Java executable used to launch the jar.
  JAVA_CMD = '/opt/lib/jvm/java-1.8.0-openjdk-1.8.0.222.b10-0.lambda2.0.1.x86_64/jre/bin/java'.freeze
  # JVM flags passed on every invocation.
  JAVA_ARGS = '-server -Djava.awt.headless=true -Dfile.encoding=UTF-8'.freeze
  # Location of the Tika app jar.
  TIKA_JAR_PATH = '/opt/tika-app.jar'.freeze
  # A document is treated as a URL when it starts with http:// or https://.
  URL_PATTERN = %r{\Ahttps?://\S+}.freeze

  # @param document [#to_s] an http(s) URL, or a file path; anything that is
  #   not an http(s) URL is prefixed with +file://+
  def initialize(document)
    # Coerce first so Pathname-like objects work on every Ruby version.
    source = document.to_s
    @document = source.match?(URL_PATTERN) ? source : "file://#{source}"

    config_path = File.join(File.dirname(__FILE__), '..', 'ext', 'tika-config.xml')
    @tika_cmd = "#{JAVA_CMD} #{JAVA_ARGS} -jar #{Shellwords.escape(TIKA_JAR_PATH)} " \
                "--config=#{Shellwords.escape(config_path)}"
  end

  # @return [String] output of the command run with +--xml+
  # @raise [CommandFailedError] see {#run_tika}
  def to_xml
    run_tika('--xml')
  end

  # @return [String] output of the command run with +--html+
  # @raise [CommandFailedError] see {#run_tika}
  def to_html
    run_tika('--html')
  end

  # Accepts and ignores any arguments so it stays call-compatible with the
  # conventional +to_json(*args)+ signature.
  #
  # @return [String] output of the command run with +--json+
  # @raise [CommandFailedError] see {#run_tika}
  def to_json(*_args)
    run_tika('--json')
  end

  # @return [String] output of the command run with +--text+
  # @raise [CommandFailedError] see {#run_tika}
  def to_text
    run_tika('--text')
  end

  # @return [String] output of the command run with +--text-main+
  # @raise [CommandFailedError] see {#run_tika}
  def to_text_main
    run_tika('--text-main')
  end

  # @return [String] output of the command run with +--metadata+
  # @raise [CommandFailedError] see {#run_tika}
  def to_metadata
    run_tika('--metadata')
  end

  private

  # Runs the prepared command with +option+ and the document appended.
  #
  # @param option [String] command-line flag selecting the output mode
  # @return [String] stripped standard output of the command
  # @raise [CommandFailedError] when standard output is empty and standard
  #   error is not
  def run_tika(option)
    # Escape the document so quotes or shell metacharacters in a path or URL
    # cannot break (or inject into) the command line.
    final_cmd = "#{@tika_cmd} #{option} #{Shellwords.escape(@document)}"

    pid, stdin, stdout, stderr = Open4.popen4(final_cmd)

    # Drain stderr concurrently: reading the pipes one after the other can
    # deadlock once the child fills the pipe that is not being read.
    stderr_reader = Thread.new { stderr.read }
    stdout_result = stdout.read.strip
    stderr_result = stderr_reader.value.strip

    if stdout_result.empty? && !stderr_result.empty?
      raise(CommandFailedError.new(stderr_result), "execution failed with status #{stderr_result}: #{final_cmd}")
    end

    stdout_result
  ensure
    # Everything here is nil-safe so a failure to spawn the process surfaces
    # as the original error instead of a NoMethodError on nil.
    stderr_reader&.kill
    [stdin, stdout, stderr].each { |io| io&.close }
    reap(pid)
  end

  # Waits for the child so it does not linger as a zombie process.
  def reap(pid)
    Process.waitpid(pid) if pid
  rescue Errno::ECHILD
    # Already reaped elsewhere (e.g. SIGCHLD is ignored); nothing left to do.
    nil
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
ruby_tika_app_lambda-1.25.2 lib/ruby_tika_app.rb
ruby_tika_app_lambda-1.25.1 lib/ruby_tika_app.rb