Sha256: 6cddcf2af1e32de6eed62c084ac7d796f7249c03e669d4bdcf09a7e487aa6b46

Contents?: true

Size: 1.84 KB

Versions: 14

Compression:

Stored size: 1.84 KB

Contents

#!/usr/bin/env ruby
# coding: utf-8

# A sample script that attempts to extract bates numbers from a PDF file.
# Bates numbers are often used to mark up documents being used in legal
# cases. For more info, see http://en.wikipedia.org/wiki/Bates_numbering
#
# Acrobat 9 introduced a markup syntax that directly specifies the bates
# number for each page. For earlier versions, the easiest way to find
# the number is to look for words that match a pattern.
#
# This example attempts to extract numbers using the Acrobat 9 syntax.
# As a fallback, you can provide a regular expression that will be
# used to look for words that look like the numbers you expect in the
# page content.

require 'rubygems'
require 'pdf/reader'

# Callback receiver that collects bates numbers while a PDF is being parsed.
#
# Numbers declared through marked content (an :Artifact tag whose property
# hash has a :Subtype of :BatesN) are preferred. When a regexp is supplied,
# text drawn on the page is scanned with it as well, and those matches are
# reported only if no marked-content numbers were recorded.
class BatesReceiver

  # regexp - optional pattern used to pick candidate numbers out of the page
  #          text. When nil, page text is ignored entirely.
  def initialize(regexp = nil)
    @numbers = []
    @backup  = []
    @regexp  = regexp
  end

  # Returns the numbers taken from marked content, or the regexp matches
  # when no marked-content numbers were recorded.
  def numbers
    @numbers.empty? ? @backup : @numbers
  end

  # Called when a marked content sequence begins. args holds the tag and,
  # optionally, the properties that accompany it.
  def begin_marked_content(*args)
    tag, properties = args
    return unless tag == :Artifact
    # The properties operand may be missing or may not be a hash (a bare
    # name, for instance). Indexing such a value with a symbol raises, so
    # anything that is not a hash is skipped.
    return unless properties.is_a?(Hash)
    return unless properties[:Subtype] == :BatesN

    @numbers << properties[:Contents]
  end
  alias :begin_marked_content_with_pl :begin_marked_content

  # Record text that is drawn on the page. Every match of the fallback
  # regexp inside string is kept; nothing happens when no regexp was given.
  def show_text(string, *params)
    return if @regexp.nil?

    @backup.concat(string.scan(@regexp))
  end

  # there's a few text callbacks, so make sure we process them all
  alias :super_show_text :show_text
  alias :move_to_next_line_and_show_text :show_text
  alias :set_spacing_next_line_show_text :show_text

  # This final text callback takes slightly different arguments: its first
  # argument is a list mixing strings with other values. Only the strings
  # are scanned. A missing first argument is treated as an empty list.
  def show_text_with_positioning(*params)
    Array(params.first).each { |item| show_text(item) if item.is_a?(String) }
  end
end

# Parse bates.pdf with a receiver whose text pattern is /CC.+/, then print
# the collected numbers in inspect form.
bates_receiver = BatesReceiver.new(/CC.+/)
PDF::Reader.file("bates.pdf", bates_receiver)
p bates_receiver.numbers

Version data entries

14 entries across 14 versions & 1 rubygems

Version Path
pdf-reader-0.10.1 examples/extract_bates.rb
pdf-reader-0.10.0 examples/extract_bates.rb
pdf-reader-0.9.3 examples/extract_bates.rb
pdf-reader-0.9.2 examples/extract_bates.rb
pdf-reader-0.9.1 examples/extract_bates.rb
pdf-reader-0.9.0 examples/extract_bates.rb
pdf-reader-0.8.6 examples/extract_bates.rb
pdf-reader-0.8.5 examples/extract_bates.rb
pdf-reader-0.8.4 examples/extract_bates.rb
pdf-reader-0.8.3 examples/extract_bates.rb
pdf-reader-0.8.2 examples/extract_bates.rb
pdf-reader-0.8.1 examples/extract_bates.rb
pdf-reader-0.8.0 examples/extract_bates.rb
pdf-reader-0.7.7 examples/extract_bates.rb