# Copyright (C) 2013-2019  Kouhei Sutou <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

require "pathname"
require "uri"

module ChupaText
  # Extracts text data from an input by applying registered
  # decomposers to it, and then to whatever they produce, until only
  # text data remains.
  class Extractor
    include Loggable

    # Creates an extractor that has no decomposers yet. Use
    # {#apply_configuration} or {#add_decomposer} to register them.
    def initialize
      @decomposers = []
    end

    # Sets the extractor up by the configuration. It adds decomposers
    # enabled in the configuration.
    #
    # @param [Configuration] configuration The configuration to be
    #   applied.
    #
    # @return [void]
    def apply_configuration(configuration)
      decomposers = Decomposers.create(Decomposer.registry,
                                       configuration.decomposer)
      decomposers.each do |decomposer|
        add_decomposer(decomposer)
      end
    end

    # Registers a decomposer as a candidate for later extractions.
    #
    # @param [#target_score, #decompose] decomposer The decomposer to
    #   be added. The extractor calls `target_score(data)` to choose a
    #   decomposer and `decompose(data) {|decomposed| ...}` to run it.
    #
    # @return [void]
    def add_decomposer(decomposer)
      @decomposers << decomposer
    end

    # Extracts texts from input. Each extracted text is passed to the
    # given block.
    #
    # @param [Data, String] input The input to be extracted texts.
    #   If `input` is `String`, it is treated as the local file path or URI
    #   of input data.
    #
    # @yield [text_data] Gives extracted text data to the block.
    #   The block may be called zero or more times.
    # @yieldparam [Data] text_data The extracted text data.
    #   You can get text data by `text_data.body`.
    #
    # @return [void]
    def extract(input, &block)
      extract_recursive(ensure_data(input), &block)
    end

    private
    # Normalizes the input of {#extract}.
    #
    # @param [Data, Object] input The raw input.
    #
    # @return [Data] `input` itself when it is already a `Data`,
    #   otherwise a new `InputData` created from `input`.
    def ensure_data(input)
      if input.is_a?(Data)
        input
      else
        InputData.new(input)
      end
    end

    # Chooses the registered decomposer that reports the highest
    # target score for the data.
    #
    # @param [Data] data The data to be decomposed.
    #
    # @return [Object, nil] The decomposer with the highest score.
    #   `nil` when no decomposer is registered or every decomposer
    #   returns `nil` from `target_score`.
    def find_decomposer(data)
      candidates = []
      @decomposers.each do |decomposer|
        score = decomposer.target_score(data)
        # A nil score means that the decomposer isn't a candidate.
        next if score.nil?
        candidates << [score, decomposer]
      end
      # max_by returns nil for no candidates. The multiple assignment
      # then leaves best_decomposer as nil.
      _, best_decomposer = candidates.max_by {|score, _| score}
      best_decomposer
    end

    # Extracts texts from the target. When a decomposer is found, each
    # data it produces is extracted in the same way. When no decomposer
    # is found, the target is given to the block as UTF-8 data if it is
    # text, and is skipped otherwise.
    #
    # @param [Data] target The data to be extracted.
    #
    # @yieldparam [Data] text_data The extracted text data.
    #
    # @return [void]
    def extract_recursive(target, &block)
      debug do
        "#{log_tag}[extract][target] <#{target.uri}>:<#{target.mime_type}>"
      end
      decomposer = find_decomposer(target)
      if decomposer.nil?
        if target.text_plain?
          debug {"#{log_tag}[extract][text-plain]"}
          yield(target.to_utf8_body_data)
        else
          debug {"#{log_tag}[extract][decomposer] not found"}
          # Text that isn't plain text is still given to the block
          # as is.
          if target.text?
            yield(target.to_utf8_body_data)
          end
        end
      else
        debug {"#{log_tag}[extract][decomposer] #{decomposer.class}"}
        decomposer.decompose(target) do |decomposed|
          debug do
            "#{log_tag}[extract][decomposed] " +
              "#{decomposer.class}: " +
              "<#{target.uri}>: " +
              "<#{target.mime_type}> -> <#{decomposed.mime_type}>"
          end
          extract_recursive(decomposed, &block)
        end
      end
    end

    # @return [String] The prefix of log messages written by this class.
    def log_tag
      "[extractor]"
    end
  end
end