# -*- encoding : utf-8 -*-
########################################################
## Thoughts from reading the ISO 32000-1:2008
## this file is part of the CombinePDF library and the code
## is subject to the same license.
########################################################
module CombinePDF

  # PDF class is the PDF object that can save itself to
  # a file and that can be used as a container for a full
  # PDF file data, including version, information etc'.
  #
  # PDF objects can be used to combine or to inject data.
  # == Combine/Merge PDF files or Pages
  # To combine PDF files (or data):
  #   pdf = CombinePDF.new
  #   pdf << CombinePDF.new("file1.pdf") # one way to combine, very fast.
  #   pdf << CombinePDF.new("file2.pdf")
  #   pdf.save "combined.pdf"
  # or even a one liner:
  #   (CombinePDF.new("file1.pdf") << CombinePDF.new("file2.pdf") << CombinePDF.new("file3.pdf")).save("combined.pdf")
  # you can also add just odd or even pages:
  #   pdf = CombinePDF.new
  #   i = 0
  #   CombinePDF.new("file.pdf").pages.each do |page|
  #     i += 1
  #     pdf << page if i.even?
  #   end
  #   pdf.save "even_pages.pdf"
  # notice that adding all the pages one by one is slower than adding the whole file.
  # == Add content to existing pages (Stamp / Watermark)
  # To add content to existing PDF pages, first import the new content from an existing PDF file.
  # after that, add the content to each of the pages in your existing PDF.
  #
  # in this example, we will add a company logo to each page:
  #   company_logo = CombinePDF.new("company_logo.pdf").pages[0]
  #   pdf = CombinePDF.new "content_file.pdf"
  #   pdf.pages.each {|page| page << company_logo} # notice the << operator is on a page and not a PDF object.
  #   pdf.save "content_with_logo.pdf"
  # Notice the << operator is on a page and not a PDF object. The << operator acts differently on PDF objects and on Pages.
  #
  # The << operator defaults to secure injection by renaming references to avoid conflicts.
  # For overlaying pages using compressed data that might not be editable
  # (due to limited filter support), you can use:
  #   pdf.pages(nil, false).each {|page| page << stamp_page}
  #
  # == Page Numbering
  # adding page numbers to a PDF object or file is as simple as can be:
  #   pdf = CombinePDF.new "file_to_number.pdf"
  #   pdf.number_pages
  #   pdf.save "file_with_numbering.pdf"
  #
  # numbering can be done with many different options, with different formatting,
  # with or without a box object, and even with opacity values.
  #
  # == Loading PDF data
  # Loading PDF data can be done from file system or directly from the memory.
  #
  # Loading data from a file is easy:
  #   pdf = CombinePDF.new("file.pdf")
  # you can also parse PDF files from memory:
  #   pdf_data = IO.read 'file.pdf' # for this demo, load a file to memory
  #   pdf = CombinePDF.parse(pdf_data)
  # Loading from the memory is especially effective for importing PDF data received
  # through the internet or from a different authoring library such as Prawn.
  class PDF
    # the objects attribute is an Array containing all the PDF sub-objects for the class.
    attr_reader :objects
    # the info attribute is a Hash that sets the Info data for the PDF.
    # use, for example:
    #   pdf.info[:Title] = "title"
    attr_reader :info
    # gets/sets the string output format (PDF files store strings in two types of formats).
    #
    # Accepts:
    # - :literal
    # - :hex
    attr_accessor :string_output
    # set/get the PDF version of the file (1.1-1.7) - should be type Float.
    attr_accessor :version

    # Creates a new PDF container.
    #
    # args:: the first argument selects the data source:
    #   - a PDFParser: its parsed objects, version (only when a Float) and info object are used.
    #   - an Array: used directly as the objects list; a Float second argument sets the version.
    #   - a Hash: the whole argument list becomes the objects list.
    #   - anything else (or nothing): an empty PDF is created.
    def initialize(*args)
      # defaults, used when the source doesn't supply a value
      @objects = []
      @version = 0
      @info = {}
      source = args[0]
      case source
      when PDFParser
        @objects = source.parse
        @version = source.version if source.version.is_a? Float
        @info = source.info_object || {}
      when Array
        # object initialization
        @objects = source
        @version = args[1] if args[1].is_a? Float
      when Hash
        @objects = args
      end
      # connecting references with original objects
      serialize_objects_and_references
      # general globals
      @string_output = :literal
      @need_to_rebuild_resources = false
      @set_start_id = 1
      @info[:Producer] = "Ruby CombinePDF Library by Boaz Segev"
      @info.delete :CreationDate
      @info.delete :ModDate
      warn "finished to initialize PDF object."
    end

    # Formats the data to PDF formats and returns a binary string that represents the PDF file content.
    #
    # This method is used by the save(file_name) method to save the content to a file.
    #
    # use this to export the PDF file without saving to disk (such as sending through HTTP etc').
    def to_pdf
      # reset version if not specified
      @version = 1.5 if @version.to_f == 0.0
      # set creation date for merged file
      @info[:CreationDate] = Time.now.strftime "D:%Y%m%d%H%M%S%:::z'00"
      # rebuild resources if needed
      rebuild_resources if @need_to_rebuild_resources
      catalog = rebuild_catalog_and_objects
      warn "Formatting PDF output"
      out = []
      xref = []
      indirect_object_count = 1 # the first object is the null object
      # write head (version and binary-code)
      out << "%PDF-#{@version.to_s}\n%\x00\x00\x00\x00".force_encoding(Encoding::ASCII_8BIT)
      # collect objects and set xref table locations.
      # every element of `out` is followed by one "\n" when joined, hence the +1.
      # offsets are counted in BYTES (bytesize), not characters, so multi-byte
      # object output doesn't shift the table.
      loc = out.last.bytesize + 1
      @objects.each do |o|
        indirect_object_count += 1
        xref << loc
        out << PDFOperations._object_to_pdf(o)
        loc += out.last.bytesize + 1
      end
      warn "Building XREF"
      # the xref table starts right after the last object
      xref_location = loc
      # NOTE(review): the "\n\r" line endings are kept exactly as this library
      # has always written them - confirm against the xref entry format in ISO 32000-1.
      xref_table = "xref\n\r0 #{indirect_object_count.to_s}\n\r0000000000 65535 f \n\r"
      xref.each { |offset| xref_table << ("%010d 00000 n \n\r" % offset) }
      xref_table << "trailer"
      out << xref_table
      out << "<<\n/Root #{catalog[:indirect_reference_id]} #{catalog[:indirect_generation_number]} R"
      out << "/Size #{indirect_object_count.to_s}"
      if @info.is_a?(Hash)
        # make sure the dictionary is rendered inline, without stream
        PRIVATE_HASH_KEYS.each { |key| @info.delete key }
        out << "/Info #{PDFOperations._object_to_pdf @info}"
      end
      out << ">>\nstartxref\n#{xref_location.to_s}\n%%EOF"
      # when finished, remove the numbering system and keep only pointers
      PDFOperations.remove_old_ids @objects
      # output the pdf stream
      out.join("\n").force_encoding(Encoding::ASCII_8BIT)
    end

    # Save the PDF to file.
    #
    # file_name:: is a string or path object for the output.
    #
    # Notice! if the file exists, it WILL be overwritten.
    def save(file_name)
      IO.binwrite file_name, to_pdf
    end

    # this method returns all the pages cataloged in the catalog.
    #
    # if no catalog is passed, it seeks the existing catalog(s) and searches
    # for any registered Page objects.
    #
    # This method also adds the << operator to each page instance, so that content can be
    # injected to the pages, as described above.
    #
    # if the secure_injection is false, then the << operator will not alter any of the information added to the page.
    # this might cause conflicts in the added content, but is available for situations in which
    # the content added is compressed using unsupported filters or options.
    #
    # the default is for the << operator to attempt a secure copy, by attempting to rename the content references and avoiding conflicts.
    # because not all PDF files are created equal (some might have formatting errors or variations),
    # it is impossible to learn if the attempt was successful.
    #
    # (page objects are Hash class objects. the << operator is added to the specific instances without changing the class)
    #
    # catalogs:: a catalog, or an Array of catalog objects. defaults to the existing catalog.
    # secure_injection:: a boolean (true / false) controlling the behavior of the << operator.
    def pages(catalogs = nil, secure_injection = true)
      page_list = []
      if catalogs.nil?
        catalogs = @objects.select { |obj| obj.is_a?(Hash) && obj[:Type] == :Catalog }
      end
      case catalogs
      when Array
        # secure_injection is forwarded on every recursive step, otherwise the
        # caller's choice would be lost before any page is reached.
        catalogs.each { |c| page_list.concat(pages(c, secure_injection)) unless c.nil? }
      when Hash
        if catalogs[:is_reference_only]
          # connect the reference to the object itself (not to a list of pages),
          # so the link stays usable by the rest of the library.
          catalogs[:referenced_object] ||= PDFOperations.get_refernced_object(@objects, catalogs)
          if catalogs[:referenced_object]
            page_list.concat(pages(catalogs[:referenced_object], secure_injection))
          else
            warn "couldn't follow reference!!! #{catalogs} not found!"
          end
        else
          case catalogs[:Type]
          when :Page
            holder = self
            # inside the singleton method `self` is the page, `holder` is this PDF.
            catalogs.define_singleton_method(:<<) do |obj|
              obj = if secure_injection
                      PDFOperations.copy_and_secure_for_injection obj
                    else
                      PDFOperations.create_deep_copy obj
                    end
              PDFOperations.inject_to_page self, obj
              holder.add_referenced self # add new referenced objects
              self
            end
            page_list << catalogs
          when :Pages
            page_list.concat(pages(catalogs[:Kids], secure_injection)) unless catalogs[:Kids].nil?
          when :Catalog
            page_list.concat(pages(catalogs[:Pages], secure_injection)) unless catalogs[:Pages].nil?
          end
        end
      end
      page_list
    end

    # this function adds pages or CombinePDF objects at the end of the file (merge)
    # for example:
    #
    #   pdf = CombinePDF.new "first_file.pdf"
    #
    #   pdf << CombinePDF.new "second_file.pdf"
    #
    #   pdf.save "both_files_merged.pdf"
    #
    # obj:: a Hash (a Page or a top level indirect object), a PDF or an Array of Page objects.
    #
    # returns self, so that injections can be chained (pdf << page << page << page),
    # or false (which also stops any chaining) when obj cannot be added.
    def <<(obj)
      if obj.is_a?(PDF)
        @version = [@version, obj.version].max
        obj.renumber_object_ids @set_start_id + @objects.length
        @objects.concat obj.objects
        @need_to_rebuild_resources = true
      elsif (obj.is_a?(Hash) && obj[:Type] == :Page) ||
            (obj.is_a?(Array) && obj.all? { |i| i.is_a?(Hash) && i[:Type] == :Page })
        # set obj parameter to array if it's only one page
        obj = [obj] if obj.is_a?(Hash)
        # add page(s) to objects
        @objects.concat obj
        # add page dependencies to objects
        add_referenced(obj)
        # add page(s) to Catalog(s)
        rebuild_catalog obj
        @need_to_rebuild_resources = true
      elsif obj.is_a?(Hash) && obj[:indirect_reference_id] && obj[:referenced_object].nil?
        # only let top level indirect objects into the PDF tree.
        @objects << obj
        @need_to_rebuild_resources = true
      else
        warn "Shouldn't add objects to the file if they are not top-level indirect PDF objects."
        return false # return false, which will also stop any chaining.
      end
      self # return self object for injection chaining (pdf << page << page << page)
    end

    # add page numbers to the PDF
    #
    # For unicode text, a unicode font(s) must first be registered. the registered font(s) must supply the
    # subset of characters used in the text. UNICODE IS AN ISSUE WITH THE PDF FORMAT - USE CAUTION.
    #
    # options:: a Hash of options setting the behavior and format of the page numbers:
    # - :number_format a string representing the format for page number. defaults to ' - %s - ' (allows for letter numbering as well, such as "a", "b"...).
    # - :number_location an Array containing the location for the page numbers, can be :top, :bottom, :top_left, :top_right, :bottom_left, :bottom_right. defaults to [:top, :bottom]. a single location may be given without wrapping it in an Array.
    # - :start_at a Fixnum that sets the number for first page number. also accepts a letter ("a") for letter numbering. defaults to 1.
    # - :margin_from_height a number (PDF points) for the top and bottom margins. defaults to 45.
    # - :margin_from_side a number (PDF points) for the left and right margins. defaults to 15.
    # the options Hash can also take all the options for PDFWriter.textbox.
    # defaults to font: :Helvetica, font_size: 12 and no box (:border_width => 0, :box_color => nil).
    def number_pages(options = {})
      opt = {
        number_format: ' - %s - ',
        number_location: [:top, :bottom],
        start_at: 1,
        font_size: 12,
        font: :Helvetica,
        margin_from_height: 45,
        margin_from_side: 15
      }
      opt.update options
      locations = Array(opt[:number_location])
      page_number = opt[:start_at]
      pages.each do |page|
        # create a "stamp" PDF page with the same size as the target page
        mediabox = page[:MediaBox]
        stamp = PDFWriter.new mediabox
        # set the visible dimensions to the CropBox, if it exists.
        cropbox = page[:CropBox]
        mediabox = cropbox if cropbox
        # set stamp text
        text = opt[:number_format] % page_number
        # compute locations for text boxes
        text_dimensions = stamp.dimensions_of(text, opt[:font], opt[:font_size])
        box_width = text_dimensions[0] * 1.2
        box_height = text_dimensions[1] * 2
        opt[:width] = box_width
        opt[:height] = box_height
        # the margins come from the options (they used to be hard-coded here).
        from_height = opt[:margin_from_height]
        from_side = opt[:margin_from_side]
        page_width = mediabox[2]
        page_height = mediabox[3]
        center_position = (page_width - box_width) / 2
        left_position = from_side
        right_position = page_width - from_side - box_width
        top_position = page_height - from_height
        bottom_position = from_height + box_height
        # location name => [x, y]; the order is the order in which boxes are written.
        placements = {
          top: [center_position, top_position],
          bottom: [center_position, bottom_position],
          top_left: [left_position, top_position],
          bottom_left: [left_position, bottom_position],
          top_right: [right_position, top_position],
          bottom_right: [right_position, bottom_position]
        }
        placements.each do |location, (x, y)|
          next unless locations.include? location
          stamp.textbox text, { x: x, y: y }.merge(opt)
        end
        page << stamp
        page_number = page_number.succ
      end
    end

    # get the title for the pdf
    # The title is stored in the information dictionary and isn't required
    def title
      @info[:Title]
    end

    # set the title for the pdf
    # The title is stored in the information dictionary and isn't required
    #
    # new_title:: a string that is the new title value.
    def title=(new_title = nil)
      @info[:Title] = new_title
    end

    # get the author value for the pdf.
    # The author is stored in the information dictionary and isn't required
    def author
      @info[:Author]
    end

    # set the author value for the pdf.
    # The author is stored in the information dictionary and isn't required
    #
    # new_author:: a string that is the new author value.
    def author=(new_author = nil)
      @info[:Author] = new_author
    end
  end

  #:nodoc: all
  class PDF
    # @private
    # Some PDF objects contain references to other PDF objects.
    #
    # this function adds the references contained in "object", but DOESN'T add the object itself.
    #
    # this is used for internal operations, such as injecting data using the << operator.
    def add_referenced(object)
      # add references but not root
      case object
      when Array
        object.each { |item| add_referenced(item) }
      when Hash
        if object[:is_reference_only] && object[:referenced_object]
          found_at = @objects.find_index object[:referenced_object]
          if found_at
            # if the objects are equal, they might still be different objects!
            # so, we need to make sure they are the same object for the pointers to effect id numbering
            # and formatting operations.
            object[:referenced_object] = @objects[found_at]
          else
            # the object wasn't found - add it to the @objects array
            @objects << object[:referenced_object]
            # :Parent is skipped so that the walk never climbs back up the page tree.
            object[:referenced_object].each do |k, v|
              add_referenced(v) unless k == :Parent
            end
          end
        else
          object.each do |k, v|
            add_referenced(v) unless k == :Parent
          end
        end
      end
    end

    # @private
    # run block of code on every PDF object (PDF objects are class Hash)
    def each_object(&block)
      PDFOperations._each_object(@objects, &block)
    end

    protected

    # @private
    # this function returns all the Page objects - regardless of order and even if not cataloged
    # could be used for finding "lost" pages... but actually rather useless.
    def all_pages
      #########
      ## Only return the page item, but make sure all references are connected so that
      ## referenced items can be reached through the connections.
      found = []
      each_object { |obj| found << obj if obj.is_a?(Hash) && obj[:Type] == :Page }
      found
    end

    # @private
    # connects every reference object (is_reference_only) with the top level object that
    # carries the same id and generation number, and then removes the old ids.
    #
    # History, as recorded by the original author's benchmarks (1000 runs on one test file):
    # versions 3 and 3.5 resolved each reference with PDFOperations.get_refernced_object
    # (3.568246 sec; 3.5 also introduced a page cataloging bug); version 4, below, uses a
    # lookup Hash keyed by [id, generation] (0.980651 sec).
    #
    # object:: unused by this implementation; kept for interface compatibility.
    def serialize_objects_and_references(object = nil)
      warn "connecting objects with their references (serialize_objects_and_references)."
      objects_reference_hash = {}
      @objects.each do |o|
        objects_reference_hash[[o[:indirect_reference_id], o[:indirect_generation_number]]] = o
      end
      each_object do |obj|
        next unless obj[:is_reference_only]
        obj[:referenced_object] = objects_reference_hash[[obj[:indirect_reference_id], obj[:indirect_generation_number]]]
        warn "couldn't connect a reference!!! could be a null object, Silent error!!!" unless obj[:referenced_object]
      end
      # when finished, remove the old numbering system and keep only pointers
      PDFOperations.remove_old_ids @objects
    end

    # @private
    # gives every top level object a new, sequential, indirect_reference_id.
    #
    # start:: the first id to use. when nil, the last used start id is used again.
    def renumber_object_ids(start = nil)
      warn "Resetting Object Reference IDs"
      @set_start_id = start || @set_start_id
      next_id = @set_start_id
      all_indirect_object.each do |obj|
        obj[:indirect_reference_id] = next_id
        next_id += 1
      end
      warn "Finished serializing IDs"
    end

    # @private
    # returns an Array with the reference objects (is_reference_only) found in the PDF.
    # when both arguments are nil, all the references are returned; otherwise only the
    # references matching the id and generation number are returned.
    def references(indirect_reference_id = nil, indirect_generation_number = nil)
      ref = { indirect_reference_id: indirect_reference_id, indirect_generation_number: indirect_generation_number }
      match_all = indirect_reference_id.nil? && indirect_generation_number.nil?
      out = []
      each_object do |obj|
        next unless obj[:is_reference_only]
        out << obj if match_all || compare_reference_values(ref, obj)
      end
      out
    end

    # @private
    # returns the top level objects Array (every top level object is treated as an indirect object).
    def all_indirect_object
      @objects
    end

    # @private
    # sorts the top level objects, in place, by their indirect_reference_id.
    # objects that have no id (or that are only references) compare as equal to anything.
    def sort_objects_by_id
      @objects.sort! do |a, b|
        both_numbered = [a, b].all? do |o|
          o.is_a?(Hash) && o[:indirect_reference_id] && o[:is_reference_only].nil?
        end
        # the block's value is the comparison result; a `return` here would
        # leave the whole method at the first comparison, sorting nothing.
        both_numbered ? (a[:indirect_reference_id] <=> b[:indirect_reference_id]) : 0
      end
    end

    # @private
    # replaces the existing Catalog and Pages objects with one new Catalog and one new
    # Pages object that lists every cataloged page.
    #
    # with_pages:: page objects (or Arrays of page objects) to add to the new page list.
    #
    # returns the new Catalog object.
    def rebuild_catalog(*with_pages)
      warn "Re-Building Catalog"
      # walking the catalog with #pages was benchmarked by the original author at
      # 0.215114, against 26.708394 for scanning every object for Page items.
      page_list = pages
      # add pages to catalog, if requested. #<< hands over one Array of pages,
      # so the list is flattened to keep every kid a single page.
      page_list.concat(with_pages.flatten)
      # build new Pages object
      pages_object = {
        Type: :Pages,
        Count: page_list.length,
        Kids: page_list.map { |p| { referenced_object: p, is_reference_only: true } }
      }
      # build new Catalog object
      catalog_object = { Type: :Catalog, Pages: { referenced_object: pages_object, is_reference_only: true } }
      # point the pages' Parent pointers to the new Pages object
      page_list.each { |p| p[:Parent] = { referenced_object: pages_object, is_reference_only: true } }
      # remove old catalog and pages objects
      @objects.reject! { |obj| obj.is_a?(Hash) && (obj[:Type] == :Catalog || obj[:Type] == :Pages) }
      # inject new catalog and pages objects
      @objects << pages_object
      @objects << catalog_object
      catalog_object
    end

    # @private
    # this is an alternative to the rebuild_catalog method
    # this method is used by the to_pdf method, for streamlining the PDF output.
    # there is no point in calling the method before preparing the output.
    #
    # the objects list is rebuilt from scratch, so that it holds only the new catalog
    # and the objects reachable from it, and the ids are renumbered.
    def rebuild_catalog_and_objects
      catalog = rebuild_catalog
      @objects = []
      @objects << catalog
      add_referenced catalog
      renumber_object_ids
      catalog
    end

    # @private
    # disabled, don't use. simply returns true.
    #
    # everything after the early return is the disabled implementation, kept for reference.
    def rebuild_resources
      warn "Resources re-building disabled as it isn't worth the price in peformance as of yet."
      return true

      warn "Re-Building Resources"
      @need_to_rebuild_resources = false
      # what are resources?
      # anything at the top level of the file except catalogs, page lists (Pages) and pages...
      not_resources = [:Catalog, :Pages, :Page]
      # get old resources list
      old_resources = @objects.select { |obj| obj.is_a?(Hash) && !not_resources.include?(obj[:Type]) }
      # collect all unique resources while ignoring double values and resetting references
      # also ignore inner values (cannot use PRIVATE_HASH_KEYS because of stream and other issues)
      ignore_keys = [:indirect_reference_id, :indirect_generation_number, :is_reference_only, :referenced_object]
      new_resources = []
      all_references = references
      old_resources.each do |old_r|
        add = true
        new_resources.each do |new_r|
          # iterates one Hash instead of building two filtered copies (v.1.0 did that, slower)
          if ([].tap { |out| old_r.each { |k, v| out << true unless ((!ignore_keys.include?(k)) && new_r[k] == v) } }.empty?)
            # assumes all references are connected
            all_references.each { |ref| ref[:referenced_object] = new_r if ref[:referenced_object].object_id == old_r.object_id }
            add = false
          end
        end
        new_resources << old_r if add
      end
      # remove old resources
      @objects.reject! { |obj| old_resources.include?(obj) }
      # insert new resources
      @objects.push(*new_resources)
      # rebuild stream lengths?
    end

    # @private
    # the function returns true if the reference belongs to the object.
    #
    # either argument may be an object or a reference to one: when a :referenced_object
    # is connected, its id and generation number are the ones compared.
    def compare_reference_values(obj, ref)
      obj_target = obj[:referenced_object] || obj
      ref_target = ref[:referenced_object] || ref
      obj_target[:indirect_reference_id] == ref_target[:indirect_reference_id] &&
        obj_target[:indirect_generation_number] == ref_target[:indirect_generation_number]
    end
  end
end