Sha256: f5cfd221f69d08153809a773a5369e85df6067014ec14c373d52f3089e24fb38

Contents?: true

Size: 1.94 KB

Versions: 5

Compression:

Stored size: 1.94 KB

Contents

module Hydra::Derivatives::Processors
  # Extract the full text from the content using Solr's extract handler
  class FullText < Processor
    # Run the full text extraction and pass the extracted text, together with
    # the directives, to the output file service.
    # @return [Object] whatever +output_file_service.call+ returns
    #   (NOTE(review): historically documented as true/false -- confirm against the service in use)
    # @raise [RuntimeError] if the extract service does not answer with HTTP 200
    def process
      output_file_service.call(extract, directives)
    end

    private

      ##
      # Extract full text from the content using Solr's extract handler.
      # The text is read from the empty-string key of the JSON response and
      # trailing whitespace is stripped.
      #
      # @return [String] The extracted text
      def extract
        JSON.parse(fetch)[''].rstrip
      end

      # Send the request to the extract service and return the response body if it was successful.
      # TODO: this pulls the whole file into memory. We should stream it from Fedora instead
      # @return [String] the body returned by the extract service
      # @raise [RuntimeError] if the service responds with anything other than HTTP 200
      def fetch
        http = Net::HTTP.new(uri.host, uri.port)
        # Without this an https:// Solr URL would be spoken to in plain HTTP and fail.
        http.use_ssl = uri.scheme == 'https'
        # Net::HTTP#post takes the request path (plus query), not the absolute URL.
        resp = http.post(uri.request_uri, file_content, request_headers)
        raise "Solr Extract service was unsuccessful. '#{uri}' returned code #{resp.code} for #{source_path}\n#{resp.body}" unless resp.code == '200'
        # file_content is a String here, so this is a no-op unless file_content
        # is overridden to return an IO-like object.
        file_content.rewind if file_content.respond_to?(:rewind)

        resp.body
      end

      # Read (and memoize) the raw bytes of the source file.
      # File.binread closes the file handle itself and returns the bytes
      # untranslated, so the body length matches the Content-Length header
      # computed from File.size.
      # @return [String] the binary content of the file at source_path
      def file_content
        @content ||= File.binread(source_path)
      end

      # @return [Hash] the request headers to send to the Solr extract service
      def request_headers
        { Faraday::Request::UrlEncoded::CONTENT_TYPE => mime_type.to_s,
          Faraday::Adapter::CONTENT_LENGTH => original_size.to_s }
      end

      # @return [Object] the mime type reported by MimeTypeService for source_path
      #   (converted with #to_s before being used as a header value)
      def mime_type
        Hydra::Derivatives::MimeTypeService.mime_type(source_path)
      end

      # @return [Integer] size of the source file in bytes
      def original_size
        File.size(source_path)
      end

      # @return [URI] path to the extract service
      def uri
        @uri ||= URI("#{connection_url}/update/extract?extractOnly=true&wt=json&extractFormat=text")
      end

      # @return [String] base URL of the Solr instance, taken from the ActiveFedora Solr configuration
      def connection_url
        ActiveFedora.solr_config[:url]
      end
  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
hydra-derivatives-3.3.0 lib/hydra/derivatives/processors/full_text.rb
hydra-derivatives-3.2.2 lib/hydra/derivatives/processors/full_text.rb
hydra-derivatives-3.2.1 lib/hydra/derivatives/processors/full_text.rb
hydra-derivatives-3.2.0 lib/hydra/derivatives/processors/full_text.rb
hydra-derivatives-3.1.4 lib/hydra/derivatives/processors/full_text.rb