# frozen_string_literal: true # This file is the top level file for the Rika gem. # It requires the other files in the gem and provides the top level API. # It also provides the top level module for the gem. require 'rika/version' require_relative 'rika/parser' require_relative 'rika/tika_loader' # The top level module for the Rika gem. module Rika PROJECT_URL = 'https://github.com/keithrbennett/rika' # Loads the Tika jar file and imports the needed Java classes. # @return [Module] the Rika module, for chaining def self.init return if @initialized Rika.raise_unless_jruby Rika::TikaLoader.require_tika import java.io.FileInputStream import java.net.URL import org.apache.tika.Tika import org.apache.tika.detect.DefaultDetector import org.apache.tika.io.TikaInputStream import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector import org.apache.tika.language.detect.LanguageDetector import org.apache.tika.language.detect.LanguageResult import org.apache.tika.metadata.Metadata @initialized = true self end # Gets a ParseResult from parsing a document. # # @param [String] data_source file path or HTTP(s) URL # @param [Boolean] key_sort whether to sort the keys in the metadata hash, defaults to true # @param [Integer] max_content_length maximum content length to return, defaults to all # @param [Detector] detector Tika detector, defaults to DefaultDetector # @return [ParseResult] def self.parse(data_source, key_sort: true, max_content_length: -1, detector: DefaultDetector.new) init parser = Parser.new(data_source, key_sort: key_sort, max_content_length: max_content_length, detector: detector) parser.parse end # @return [String] version of loaded Tika jar file def self.tika_version init Tika.java_class.package.implementation_version end # @param [String] text text to detect language of # @return [String] language of passed text, as 2-character ISO 639-1 code def self.language(text) init tika_language_detector.detect(text.to_java_string).get_language end # @param [String] data_source file path or HTTP URL # @return [Array] content and metadata of file at specified location # # @deprecated Instead, get a ParseResult and access the content and metadata fields. def self.parse_content_and_metadata(data_source, max_content_length: -1) init result = parse(data_source, max_content_length: max_content_length) [result.content, result.metadata] end # @param [String] data_source file path or HTTP URL # @return [Hash] content and metadata of file at specified location # # @deprecated Instead, use a ParseResult or its to_h method. def self.parse_content_and_metadata_as_hash(data_source, max_content_length: -1) init result = parse(data_source, max_content_length: max_content_length) { content: result.content, metadata: result.metadata } end # @param [String] data_source file path or HTTP URL # @return [Parser] parser for resource at specified location # # @deprecated Instead, get a ParseResult and access the content field def self.parse_content(data_source, max_content_length: -1) init parse(data_source, max_content_length: max_content_length).content end # Regarding max_content_length, the default is set at 0 to save unnecessary processing, # since the content is being ignored. However, the PDF metadata "pdf:unmappedUnicodeCharsPerPage" # and "pdf:charsPerPage" will be absent if the max_content_length is 0, and otherwise may differ # depending on the number of characters read. # # @deprecated Instead, get a ParseResult and access the metadata field def self.parse_metadata(data_source, max_content_length: -1) init parse(data_source, max_content_length: max_content_length).metadata end # @return [Detector] Tika detector def self.tika_language_detector init @tika_language_detector ||= OptimaizeLangDetector.new.loadModels end # Raise an error if not running under JRuby. def self.raise_unless_jruby unless RUBY_PLATFORM.match(/java/) raise "\n\n\nRika can only be run with JRuby! It needs access to the Java Virtual Machine.\n\n\n" end end end