lib/podoff.rb in podoff-1.0.0 vs lib/podoff.rb in podoff-1.1.0

- old
+ new

@@ -21,14 +21,17 @@ # THE SOFTWARE. # # Made in Japan. #++ +require 'strscan' +require 'stringio' + module Podoff - VERSION = '1.0.0' + VERSION = '1.1.0' def self.load(path, encoding='iso-8859-1') Podoff::Document.load(path, encoding) end @@ -36,17 +39,10 @@ def self.parse(s) Podoff::Document.new(s) end - #OBJ_ATTRIBUTES = - # { type: 'Type', subtype: 'Subtype', - # parent: 'Parent', kids: 'Kids', contents: 'Contents', annots: 'Annots', - # pagenum: 'pdftk_PageNum' } - OBJ_ATTRIBUTES = - { type: 'Type', contents: 'Contents', pagenum: 'pdftk_PageNum' } - class Document def self.load(path, encoding='iso-8859-1') Podoff::Document.new(File.open(path, 'r:' + encoding) { |f| f.read }) @@ -56,73 +52,69 @@ Podoff::Document.new(s) end attr_reader :source + attr_reader :version attr_reader :xref attr_reader :objs attr_reader :obj_counters attr_reader :root # attr_reader :additions def initialize(s) fail ArgumentError.new('not a PDF file') \ - unless s.match(/\A%PDF-\d+\.\d+\n/) + unless s.match(/\A%PDF-\d+\.\d+\s/) @source = s + @version = nil @xref = nil @objs = {} @obj_counters = {} @root = nil @additions = {} - index = 0 - matches = {} - # + sca = ::StringScanner.new(s) + @version = sca.scan(/%PDF-\d+\.\d+/) + loop do - matches[:obj] ||= s.match(/^(\d+ \d+) obj\b/, index) - matches[:endobj] ||= s.match(/\bendobj\b/, index) - # - OBJ_ATTRIBUTES.each do |k, v| - matches[k] ||= s.match(/\/#{v} (\/?[^\/\n<>]+)/, index) - end - # - matches[:startxref] ||= s.match(/\bstartxref\s+(\d+)\s*%%EOF/, index) + i = sca.skip_until( + /(startxref\s+\d+|\d+\s+\d+\s+obj|\/Root\s+\d+\s+\d+\s+R)/) - objm = matches[:obj] - sxrm = matches[:startxref] + m = sca.matched + break unless m - break unless sxrm || objm - - fail ArgumentError.new('failed to find "startxref"') unless sxrm - - @root = nil if @root && index > @root.offset(0).last - @root ||= s.match(/\/Root (\d+ \d+) R\b/, index) - - sxri = sxrm.offset(0).first - obji = objm ? objm.offset(0).first : sxri + 1 - - if obji < sxri - obj = Podoff::Obj.extract(self, matches) + if m[0] == 's' + @xref = m.split(' ').last.to_i + elsif m[0] == '/' + @root = extract_ref(m) + else + obj = Podoff::Obj.extract(self, sca) @objs[obj.ref] = obj @obj_counters[obj.ref] = (@obj_counters[obj.ref] || 0) + 1 - index = obj.end_index + 1 - else - @xref = sxrm[1].to_i - index = sxrm.offset(0).last + 1 - matches.delete(:startxref) end end - fail ArgumentError.new('found no /Root') unless @root - @root = @root[1] + if @root == nil + sca.pos = 0 + loop do + i = sca.skip_until(/\/Root\s+\d+\s+\d+\s+R/) + break unless sca.matched + @root = extract_ref(sca.matched) + end + end end + def extract_ref(s) + + s.gsub(/\s+/, ' ').gsub(/[^0-9 ]+/, '').strip + end + def updated? @additions.any? end @@ -230,11 +222,16 @@ add(obj) end def write(path) - f = (path == :string) ? StringIO.new : File.open(path, 'wb') + f = + case path + when :string, '-' then StringIO.new + when String then File.open(path, 'wb') + else path + end f.write(@source) if @additions.any? @@ -272,15 +269,78 @@ f.write(">>\n") f.write("startxref #{xref}\n") f.write("%%EOF\n") end - f.close + f.close if path.is_a?(String) || path.is_a?(Symbol) - path == :string ? f.string : nil + f.is_a?(StringIO) ? f.string : nil end + def rewrite(path=:string) + + f = + case path + when :string, '-' then StringIO.new + when String then File.open(path, 'wb') + else path + end + + v = source.match(/%PDF-\d+\.\d+/)[0] + f.write(v) + f.write("\n") + + ptrs = {} + + objs.keys.sort.each do |k| + ptrs[k] = f.pos + 1 + f.write(objs[k].source) + f.write("\n") + end + + xref = f.pos + 1 + max = objs.keys.inject(-1) { |i, k| [ i, k.split(' ')[0].to_i ].max } + + #f.write("xref\n0 #{max}\n0000000000 65535 f\n") + f.write("xref\n0 1\n0000000000 65535 f\n") + + partitions = [ [] ] + # + (1..max).each do |i| + k = "#{i} 0" + last = partitions.last + if ptrs.has_key?(k) + last << i + else + partitions << [] unless last == [] + end + end + # + partitions.each do |part| + + f.write("#{part.first} #{part.size}\n") + + part.each do |i| + k = "#{i} 0" + #f.write(sprintf("%010d 00000 n %% %s\n", ptrs[k], k)) + f.write(sprintf("%010d 00000 n\n", ptrs[k])) + end + end + + f.write("trailer\n") + f.write("<<\n") + f.write("/Size #{objs.size}\n") + f.write("/Root #{root} R\n") + f.write(">>\n") + f.write("startxref #{xref}\n") + f.write("%%EOF\n") + + f.close if path.is_a?(String) || path.is_a?(Symbol) + + f.is_a?(StringIO) ? f.string : nil + end + private def make_stream(&block) s = Stream.new @@ -290,28 +350,30 @@ end end class Obj - def self.extract(doc, matches) + ATTRIBUTES = + { type: 'Type', contents: 'Contents', pagenum: 'pdftk_PageNum' } - re = matches[:obj][1] - st = matches[:obj].offset(0).first - en = matches[:endobj].offset(0).last - 1 + def self.extract(doc, sca) - atts = {} + re = sca.matched[0..-4].strip + st = sca.pos - sca.matched.length - OBJ_ATTRIBUTES.keys.each do |k| - m = matches[k] - if m && m.offset(0).last < en - atts[k] = m[1].strip - matches.delete(k) - end + i = sca.skip_until(/endobj/); return nil unless i + en = sca.pos - 1 + + atts = {} + ATTRIBUTES.each do |k, v| + sca.pos = st + i = sca.skip_until(/\/#{v}\b/); next unless i + next if sca.pos > en + atts[k] = sca.scan(/ *\/?[^\n\r\/>]+/).strip end - matches.delete(:obj) - matches.delete(:endobj) + sca.pos = en Podoff::Obj.new(doc, re, st, en, atts) end attr_reader :document @@ -371,66 +433,10 @@ r = @attributes && @attributes[:pagenum] r ? r.to_i : nil end -# def parent -# -# r = @attributes[:parent] -# r ? r[0..-2].strip : nil -# end -# -# def kids -# -# r = @attributes[:kids] -# (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?) -# end -# -# def contents -# -# r = @attributes[:contents] -# (r || '').split(/[\[\]R]/).collect(&:strip).reject(&:empty?) -# end - -# def add_annotation(ref) -# -# if annots = @attributes[:annots] -# fail "implement me!" -# else -# i = @source.index('/Type ') -# @source.insert(i, "/Annots [#{ref} R]\n") -# end -# recompute_attributes -# end - -# def add_free_text(x, y, text, font, size) -# -# fail ArgumentError.new('target is not a page') unless type == '/Page' -# -# nref = document.new_ref -# -# s = [ -# "#{nref} obj <<", -# "/Type /Annot", -# "/Subtype /FreeText", -# "/Da (/F1 70 Tf 0 100 Td)", -# "/Rect [0 0 500 600]", -# "/Contents (#{text})", -# ">>", -# "endobj" -# ].join("\n") -# anno = Obj.create(document, nref, s) -# -# page = self.replicate -# page.add_annotation(nref) -# -# document.add(anno) -# document.add(page) -# -# anno -# end - def insert_font(nick, obj_or_ref) fail ArgumentError.new("target '#{ref}' not a replica") \ unless @source @@ -460,13 +466,13 @@ protected def recompute_attributes @attributes = - OBJ_ATTRIBUTES.inject({}) do |h, (k, v)| - m = @source.match(/\/#{v} (\/?[^\/\n<>]+)/) - h[k] = m[1] if m + ATTRIBUTES.inject({}) do |h, (k, v)| + m = @source.match(/\/#{v}\s+(\/?[^\/\n<>]+)/) + h[k] = m[1].strip if m h end end def concat(refs, ref) @@ -479,11 +485,11 @@ def add_to_attribute(key, ref) fail ArgumentError.new("obj not replicated") unless @source - pkey = OBJ_ATTRIBUTES[key] + pkey = ATTRIBUTES[key] if v = @attributes[key] v = concat(v, ref) @source = @source.gsub(/#{pkey} ([\[\]0-9 R]+)/, "#{pkey} #{v}") else @@ -501,13 +507,9 @@ def initialize @font = nil @content = StringIO.new end - - #def document; obj.document; end - #def ref; obj.ref; end - #def source; self; end def tf(font_name, font_size) n = font_name[0] == '/' ? font_name[1..-1] : font_name