Sha256: 5257b45a1409e6aabe2fa7d2b5cfd5872d824a7010d8a5fda0d2351d4dec1ff8

Contents?: true

Size: 1.97 KB

Versions: 1

Compression:

Stored size: 1.97 KB

Contents

# Copyright (C) 2013  Kouhei Sutou <kou@clear-code.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

require "time"

require "poppler"

module ChupaText
  module Decomposers
    # Decomposer that extracts the text and the document-level metadata
    # of a PDF through Poppler and yields them as one TextData.
    class PDF < Decomposer
      registry.register("pdf", self)

      # Document properties copied to the yielded text data. Each name is
      # sent to the Poppler document as a method; underscores become
      # hyphens in the resulting attribute name.
      ATTRIBUTE_NAMES = [
        :title,
        :author,
        :subject,
        :keywords,
        :creator,
        :producer,
        :creation_date,
      ].freeze

      # Reports whether this decomposer handles +data+.
      #
      # @param data the input; must respond to +extension+ and +mime_type+.
      # @return [Boolean] true when the extension is "pdf" or the MIME
      #   type is "application/pdf".
      def target?(data)
        data.extension == "pdf" ||
          data.mime_type == "application/pdf"
      end

      # Extracts the text of every page of the PDF held in +data.body+ and
      # yields a single TextData carrying that text, the URI of +data+ and
      # the non-nil document properties listed in ATTRIBUTE_NAMES.
      #
      # @param data the input; must respond to +body+ and +uri+.
      # @yieldparam text_data [TextData] the extracted text and attributes.
      def decompose(data)
        document = Poppler::Document.new(data.body)
        text = ""
        document.each do |page|
          page_text = page.get_text.to_s
          # Pages that yield no text contribute nothing, not even a
          # separator.
          next if page_text.empty?
          text << page_text
          # Keep pages apart: without a separator the last word of one
          # page runs straight into the first word of the next one.
          text << "\n" unless page_text.end_with?("\n")
        end
        text_data = TextData.new(text)
        text_data.uri = data.uri
        ATTRIBUTE_NAMES.each do |name|
          add_attribute(text_data, document, name)
        end
        yield(text_data)
      end

      private
      # Copies the document property +name+ to +text_data+ unless it is nil.
      #
      # @param text_data the object receiving the attribute via +[]=+.
      # @param document the object that responds to +name+.
      # @param name [Symbol] the property to read, e.g. +:creation_date+.
      def add_attribute(text_data, document, name)
        value = document.send(name)
        return if value.nil?
        attribute_name = name.to_s.tr("_", "-")
        # Integer values are treated as seconds since the Unix epoch and
        # stored as an ISO 8601 UTC timestamp.
        value = Time.at(value).utc.iso8601 if value.is_a?(Integer)
        text_data[attribute_name] = value
      end
    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
chupa-text-decomposer-pdf-1.0.0 lib/chupa-text/decomposers/pdf.rb