lib/podoff.rb in podoff-0.9.0 vs lib/podoff.rb in podoff-1.0.0

- old
+ new

@@ -24,253 +24,524 @@ #++ module Podoff - VERSION = '0.9.0' + VERSION = '1.0.0' - def self.load(path) + def self.load(path, encoding='iso-8859-1') - Podoff::Document.new( - File.open(path, 'r:iso8859-1') { |f| f.read }) + Podoff::Document.load(path, encoding) end + def self.parse(s) + + Podoff::Document.new(s) + end + + #OBJ_ATTRIBUTES = + # { type: 'Type', subtype: 'Subtype', + # parent: 'Parent', kids: 'Kids', contents: 'Contents', annots: 'Annots', + # pagenum: 'pdftk_PageNum' } + OBJ_ATTRIBUTES = + { type: 'Type', contents: 'Contents', pagenum: 'pdftk_PageNum' } + class Document - attr_reader :header + def self.load(path, encoding='iso-8859-1') + + Podoff::Document.new(File.open(path, 'r:' + encoding) { |f| f.read }) + end + + def self.parse(s) + + Podoff::Document.new(s) + end + + attr_reader :source + attr_reader :xref attr_reader :objs - attr_reader :footer + attr_reader :obj_counters + attr_reader :root + # + attr_reader :additions def initialize(s) fail ArgumentError.new('not a PDF file') \ unless s.match(/\A%PDF-\d+\.\d+\n/) - @header = [] - # + @source = s + @xref = nil @objs = {} - cur = nil + @obj_counters = {} + @root = nil + + @additions = {} + + index = 0 + matches = {} # - @footer = nil + loop do - s.split("\n").each do |l| + matches[:obj] ||= s.match(/^(\d+ \d+) obj\b/, index) + matches[:endobj] ||= s.match(/\bendobj\b/, index) + # + OBJ_ATTRIBUTES.each do |k, v| + matches[k] ||= s.match(/\/#{v} (\/?[^\/\n<>]+)/, index) + end + # + matches[:startxref] ||= s.match(/\bstartxref\s+(\d+)\s*%%EOF/, index) - if @footer - @footer << l - elsif m = /^(\d+ \d+) obj\b/.match(l) - cur = (@objs[m[1]] = Obj.new(self, m[1])) - cur << l - elsif m = /^xref\b/.match(l) - @footer = [] - @footer << l - elsif cur - cur << l + objm = matches[:obj] + sxrm = matches[:startxref] + + break unless sxrm || objm + + fail ArgumentError.new('failed to find "startxref"') unless sxrm + + @root = nil if @root && index > @root.offset(0).last + @root ||= s.match(/\/Root (\d+ \d+) R\b/, index) + + sxri = sxrm.offset(0).first + obji = objm ? objm.offset(0).first : sxri + 1 + + if obji < sxri + obj = Podoff::Obj.extract(self, matches) + @objs[obj.ref] = obj + @obj_counters[obj.ref] = (@obj_counters[obj.ref] || 0) + 1 + index = obj.end_index + 1 else - @header << l + @xref = sxrm[1].to_i + index = sxrm.offset(0).last + 1 + matches.delete(:startxref) end end + + fail ArgumentError.new('found no /Root') unless @root + @root = @root[1] end - def fonts; @objs.values.select(&:is_font?); end - def pages; @objs.values.select(&:is_page?); end + def updated? - def page(i) - - i < 1 ? nil : @objs.values.find { |o| o.page_number == i } + @additions.any? end def dup - d0 = self + o = self - d = d0.class.allocate + self.class.allocate.instance_eval do - d.instance_eval do - @header = d0.header.dup - @footer = d0.footer.dup - @objs = d0.objs.values.inject({}) { |h, v| h[v.ref] = v.dup(d); h } + @source = o.source + @xref = o.xref + + @objs = o.objs.inject({}) { |h, (k, v)| h[k] = v.dup(self); h } + @obj_counters = o.obj_counters.dup + + @root = o.root + + @additions = + o.additions.inject({}) { |h, (k, v)| h[k] = v.dup(self); h } + + self end + end - d + def pages + + @objs.values.select { |o| o.type == '/Page' } end + def page(index) + + return nil if index == 0 + + pas = pages + return nil if pas.empty? + + return ( + index > 0 ? pas.at(index - 1) : pas.at(index) + ) unless pas.first.attributes[:pagenum] + + if index < 0 + max = pas.inject(0) { |n, pa| [ n, pa.page_number ].max } + index = max + 1 + index + end + + pas.find { |pa| pa.page_number == index } + end + + def new_ref + + "#{ + @objs.keys.inject(-1) { |i, r| [ i, r.split(' ').first.to_i ].max } + 1 + } 0" + end + + def add(obj) + + @objs[obj.ref] = obj + @additions[obj.ref] = obj + + obj + end + + def add_base_font(name) + + name = name[1..-1] if name[0] == '/' + + ref = new_ref + + add( + Obj.create( + self, + ref, + [ + "#{ref} obj", + "<< /Type /Font /Subtype /Type1 /BaseFont /#{name} >>", + "endobj" + ].join(' '))) + end + + def add_stream(s=nil, &block) + + ref = new_ref + + s = s || make_stream(&block) + + s = [ + "#{ref} obj", + "<< /Length #{s.length} >>", + "stream\n#{s}\nendstream", + "endobj" + ].join("\n") if s.is_a?(String) + + o = add(Obj.create(self, ref, s)) + + s.is_a?(Podoff::Stream) ? s : o + end + + def re_add(obj_or_ref) + + obj = obj_or_ref.is_a?(String) ? @objs[obj_or_ref] : obj_or_ref + + obj = obj.replicate unless obj.replica? + + add(obj) + end + def write(path) - File.open(path, 'wb') do |f| + f = (path == :string) ? StringIO.new : File.open(path, 'wb') - @header.each { |l| f.print(l); f.print("\n") } + f.write(@source) - @objs.values.each do |o| - o.lines.each { |l| f.print(l); f.print("\n") } + if @additions.any? + + pointers = {} + + @additions.values.each do |o| + f.write("\n") + pointers[o.ref] = f.pos + 1 + if o.source.is_a?(String) + f.write(o.source) + else # Stream + s = o.source.to_s + f.write("#{o.ref} obj\n<< /Length #{s.length} >>\n") + f.write("stream\n#{s}\nendstream\nendobj") + end end + f.write("\n\n") - @footer.each { |l| f.print(l); f.print("\n") } + xref = f.pos + 1 + + f.write("xref\n") + f.write("0 1\n") + f.write("0000000000 65535 f\n") + + pointers.each do |k, v| + f.write("#{k.split(' ').first} 1\n") + f.write(sprintf("%010d 00000 n\n", v)) + end + + f.write("trailer\n") + f.write("<<\n") + f.write("/Prev #{self.xref}\n") + f.write("/Size #{objs.size}\n") + f.write("/Root #{root} R\n") + f.write(">>\n") + f.write("startxref #{xref}\n") + f.write("%%EOF\n") end + + f.close + + path == :string ? f.string : nil end + + private + + def make_stream(&block) + + s = Stream.new + s.instance_exec(&block) if block + + s + end end class Obj + def self.extract(doc, matches) + + re = matches[:obj][1] + st = matches[:obj].offset(0).first + en = matches[:endobj].offset(0).last - 1 + + atts = {} + + OBJ_ATTRIBUTES.keys.each do |k| + m = matches[k] + if m && m.offset(0).last < en + atts[k] = m[1].strip + matches.delete(k) + end + end + + matches.delete(:obj) + matches.delete(:endobj) + + Podoff::Obj.new(doc, re, st, en, atts) + end + attr_reader :document attr_reader :ref - attr_reader :lines + attr_reader :start_index, :end_index + attr_reader :attributes - def initialize(doc, ref) + def initialize(doc, ref, st, en, atts, source=nil) @document = doc @ref = ref - @lines = [] + @start_index = st + @end_index = en + @attributes = atts + @source = source + + recompute_attributes if @source.is_a?(String) + @source.obj = self if @source.is_a?(Podoff::Stream) end - def <<(l) + def dup(new_doc) - @lines << l + self.class.new(new_doc, ref, start_index, end_index, attributes.dup) end - def lookup(k) + def self.create(doc, ref, source) - @lines.each do |l| + self.new(doc, ref, nil, nil, nil, source) + end - m = l.match(/^\/#{k} (.*)$/) - return m[1] if m - end + def replicate - nil + self.class.create(document, ref, source.dup) end - def index(o, start=0) + def to_a - @lines[start..-1].each_with_index do |l, i| + [ @ref, @start_index, @end_index, @attributes ] + end - if o.is_a?(String) - return start + i if l == o - else - return start + i if l.match(o) - end - end + def source - nil + @source || @document.source[@start_index..@end_index] end + def replica? + + @source != nil + end + def type - t = lookup('Type') - t ? t[1..-1] : nil + @attributes && @attributes[:type] end def page_number - r = lookup('pdftk_PageNum') + r = @attributes && @attributes[:pagenum] r ? r.to_i : nil end - def is_page? +# def parent +# +# r = @attributes[:parent] +# r ? r[0..-2].strip : nil +# end +# +# def kids +# +# r = @attributes[:kids] +# (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?) +# end +# +# def contents +# +# r = @attributes[:contents] +# (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?) +# end - page_number != nil - end +# def add_annotation(ref) +# +# if annots = @attributes[:annots] +# fail "implement me!" +# else +# i = @source.index('/Type ') +# @source.insert(i, "/Annots [#{ref} R]\n") +# end +# recompute_attributes +# end - def is_font? +# def add_free_text(x, y, text, font, size) +# +# fail ArgumentError.new('target is not a page') unless type == '/Page' +# +# nref = document.new_ref +# +# s = [ +# "#{nref} obj <<", +# "/Type /Annot", +# "/Subtype /FreeText", +# "/Da (/F1 70 Tf 0 100 Td)", +# "/Rect [0 0 500 600]", +# "/Contents (#{text})", +# ">>", +# "endobj" +# ].join("\n") +# anno = Obj.create(document, nref, s) +# +# page = self.replicate +# page.add_annotation(nref) +# +# document.add(anno) +# document.add(page) +# +# anno +# end - type() == 'Font' - end + def insert_font(nick, obj_or_ref) - def parent + fail ArgumentError.new("target '#{ref}' not a replica") \ + unless @source - # /Parent 2 0 R + nick = nick[1..-1] if nick[0] == '/' - r = lookup('Parent') + re = obj_or_ref + re = re.ref if re.respond_to?(:ref) - r ? r[0..-2].strip : nil + @source = @source.gsub(/\/Font\s*<</, "/Font\n<<\n/#{nick} #{re} R") end - def kids + def insert_contents(obj_or_ref) - # /Kids [1 0 R 16 0 R 33 0 R] + fail ArgumentError.new("target '#{ref}' not a replica") \ + unless @source + fail ArgumentError.new("target '#{ref}' doesn't have /Contents") \ + unless @attributes[:contents] - r = lookup('Kids') - (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?) - end + re = obj_or_ref + re = re.obj if re.respond_to?(:obj) # Stream + re = re.ref if re.respond_to?(:ref) - def contents - - r = lookup('Contents') - r ? r[0..-2].strip : nil + add_to_attribute(:contents, re) end + alias :insert_content :insert_contents - def font_names + protected - @lines.inject(nil) do |names, l| + def recompute_attributes - if names - return names if l == '>>' - if m = l.match(/\/([^ ]+) /); names << m[1]; end - elsif l.match(/\/Font\s*$/) - names = [] + @attributes = + OBJ_ATTRIBUTES.inject({}) do |h, (k, v)| + m = @source.match(/\/#{v} (\/?[^\/\n<>]+)/) + h[k] = m[1] if m + h end + end - names - end + def concat(refs, ref) - [] + refs = refs.strip + refs = refs[1..-2] if refs[0] == '[' + + "[#{refs} #{ref} R]" end - def dup(new_doc) + def add_to_attribute(key, ref) - o0 = self - o = o0.class.new(new_doc, @ref) - o.instance_eval { @lines = o0.lines.dup } + fail ArgumentError.new("obj not replicated") unless @source - o + pkey = OBJ_ATTRIBUTES[key] + + if v = @attributes[key] + v = concat(v, ref) + @source = @source.gsub(/#{pkey} ([\[\]0-9 R]+)/, "#{pkey} #{v}") + else + i = @source.index('/Type ') + @source.insert(i, "/#{pkey} [#{ref} R]\n") + end + recompute_attributes end + end - def find(opts={}, &block) + class Stream - return self if block.call(self) + attr_accessor :obj - [ *kids, contents ].compact.each do |k| - o = @document.objs[k] - return o if o && block.call(o) - end + def initialize - nil + @font = nil + @content = StringIO.new end - def crop_box + #def document; obj.document; end + #def ref; obj.ref; end + #def source; self; end - r = lookup('CropBox') || lookup('MediaBox') + def tf(font_name, font_size) - r ? r.strip[1..-2].split(' ').collect(&:strip).collect(&:to_f) : nil + n = font_name[0] == '/' ? font_name[1..-1] : font_name + + @font = "/#{n} #{font_size} Tf " end + alias :font :tf - def crop_dims + def bt(x, y, text) - x, y, w, h = crop_box - - x ? [ w - x, h - y ] : nil + @content.write "\n" if @content.size > 0 + @content.write "BT " + @content.write @font if @font + @content.write "#{x} #{y} Td (#{escape(text)}) Tj" + @content.write " ET" end + alias :text :bt - def prepend_text(x, y, text, opts={}) + def write(text) - o = find { |o| o.index('BT') } - fail ArgumentError.new('found no BT in the tree') unless o + @content.write(text) + end - font = opts[:font] || o.font_names.first || 'TT0' - size = opts[:size] || 10 - comm = opts[:comment] + def to_s - i = o.index('BT') - bt = [] - bt << 'BT' - bt << "#{x} #{y} Td" - bt << "/#{font} #{size} Tf" - bt << "(#{text})Tj" - bt << 'ET' - bt << " % #{comm}" if comm - bt = bt.join(' ') + @content.string + end - o.lines.insert(i, bt) + protected - o + def escape(s) + + s.gsub(/\(/, '\(').gsub(/\)/, '\)') end end end