Sha256: 9e95753912c9ba176dbc3cccfb886492e39d18a96328d04a0c1942c4f682ea30

Contents?: true

Size: 1.75 KB

Versions: 23

Compression:

Stored size: 1.75 KB

Contents

# Detects PDF documents by their header and makes a best-effort attempt at
# extracting a page count by scanning the raw bytes for known markers.
class FormatParser::PDFParser
  include FormatParser::IOUtils

  # First 9 bytes of a PDF should be in this format, according to:
  #
  #  https://stackoverflow.com/questions/3108201/detect-if-pdf-file-is-correct-header-pdf
  #
  # There are however exceptions, which are left out for now.
  #
  PDF_MARKER = /%PDF-1\.[0-8]{1}/

  # Page counts have different markers depending on the PDF type. There is
  # not a single common way of solving this. The only way of solving this
  # correctly is by adding different types of PDFs to the specs.
  #
  # Each marker is matched when it directly follows a "/".
  COUNT_MARKERS = ['Count ']

  # Matched when it directly follows a "%", i.e. the scanner stops at "%%EOF".
  EOF_MARKER    = '%EOF'

  # Checks the header of the given IO and, if it looks like a PDF, scans the
  # remainder for attributes.
  #
  # @param io the IO-like object to parse; it is wrapped in a
  #   FormatParser::IOConstraint before any read takes place
  # @return [FormatParser::Document, nil] a document with format :pdf and the
  #   detected page count (nil when no count marker was found), or nil when
  #   the first 9 bytes do not match PDF_MARKER
  def call(io)
    io = FormatParser::IOConstraint.new(io)

    return unless safe_read(io, 9) =~ PDF_MARKER

    attributes = scan_for_attributes(io)

    FormatParser::Document.new(
      format: :pdf,
      page_count: attributes[:page_count]
    )
  end

  private

  # Walks the IO one byte at a time, keeping a small look-behind window, until
  # the window ends with "%" + EOF_MARKER or no more bytes can be read.
  #
  # Whenever the window ends with "/" + one of COUNT_MARKERS, the digits that
  # follow are stored as the page count. If several markers occur, the last
  # one that is followed by digits wins.
  #
  # Bytes are consumed strictly one at a time on purpose: a fixed-size
  # look-ahead read that fails to match would discard bytes which may contain
  # the "/" or "%" that starts the next marker (e.g. "/N 3/Count 5").
  #
  # @param io the IO-like object, positioned after the PDF header
  # @return [Hash] attributes found so far; may contain :page_count
  def scan_for_attributes(io)
    result = {}

    eof_sequence    = "%#{EOF_MARKER}"
    count_sequences = COUNT_MARKERS.map { |marker| "/#{marker}" }
    window_size     = [eof_sequence, *count_sequences].map(&:bytesize).max
    window          = String.new

    while (byte = safe_read(io, 1))
      window << byte
      # Only the last window_size bytes can ever take part in a match
      window = window.byteslice(-window_size, window_size) if window.bytesize > window_size

      break if window.end_with?(eof_sequence)
      next unless count_sequences.any? { |sequence| window.end_with?(sequence) }

      digits, terminator = read_digits(io)

      # A marker that is not followed by digits (e.g. "/Count -3") carries no
      # usable page count and must not overwrite one that was found earlier.
      result[:page_count] = digits.to_i unless digits.empty?

      # The byte that ended the digit run has already been consumed; feed it
      # back into the window since it may be the "/" or "%" of the next marker.
      window = String.new
      window << terminator if terminator
    end

    result
  end

  # Reads ahead byte by byte for as long as decimal digits are found.
  #
  # @param io the IO-like object, expected to be positioned at the first digit
  # @return [Array(String, String|nil)] the digits that were read (possibly
  #   empty) and the first non-digit byte that was consumed, or nil for the
  #   latter when reading stopped because no more bytes were returned
  def read_digits(io)
    digits = String.new

    while (char = safe_read(io, 1))
      return [digits, char] unless char =~ /\d/

      digits << char
    end

    [digits, nil]
  end

  FormatParser.register_parser self, natures: :document, formats: :pdf
end

Version data entries

23 entries across 23 versions & 1 rubygems

Version Path
format_parser-0.13.6 lib/parsers/pdf_parser.rb
format_parser-0.13.5 lib/parsers/pdf_parser.rb
format_parser-0.13.4 lib/parsers/pdf_parser.rb
format_parser-0.13.3 lib/parsers/pdf_parser.rb
format_parser-0.13.2 lib/parsers/pdf_parser.rb
format_parser-0.13.1 lib/parsers/pdf_parser.rb
format_parser-0.13.0 lib/parsers/pdf_parser.rb
format_parser-0.12.4 lib/parsers/pdf_parser.rb
format_parser-0.12.2 lib/parsers/pdf_parser.rb
format_parser-0.12.1 lib/parsers/pdf_parser.rb
format_parser-0.12.0 lib/parsers/pdf_parser.rb
format_parser-0.11.0 lib/parsers/pdf_parser.rb
format_parser-0.10.0 lib/parsers/pdf_parser.rb
format_parser-0.9.4 lib/parsers/pdf_parser.rb
format_parser-0.9.3 lib/parsers/pdf_parser.rb
format_parser-0.9.0 lib/parsers/pdf_parser.rb
format_parser-0.8.0 lib/parsers/pdf_parser.rb
format_parser-0.7.0 lib/parsers/pdf_parser.rb
format_parser-0.6.0 lib/parsers/pdf_parser.rb
format_parser-0.5.2 lib/parsers/pdf_parser.rb