module XML
  module SAX
    # Build Nokogiri::XML::Document fragments that match an XPath.
    #
    # Stream large (or small) record based XML documents, building each element that matches an XPath into a
    # document fragment so that further manipulation of each record is easier.
    #
    # ==== Notes
    # * In order to save memory, well balanced elements that do not match any XPath are unlinked. This means you
    #   *cannot* match records by position in relation to siblings.
    # * Because we are parsing a SAX stream there is no read ahead. You *cannot* match records by any children the
    #   element may have once further events are pushed.
    # * You can match by attributes of an element.
    #
    # ==== Example
    #
    #   builder = XML::SAX::FragmentBuilder.new(
    #     '//record' => lambda { |record| puts record.to_s } # Process each matched record element.
    #   )
    #   parser = Nokogiri::XML::SAX::PushParser.new(builder)
    #   parser << %q{
    #     <root>
    #       <record>record one</record>
    #       <record>record two</record>
    #     </root>
    #   }
    #   #=> <record>record one</record>
    #   #=> <record>record two</record>
    #   parser.finish
    #
    # ==== See
    # * XML::SAX::Builder
    # * XML::SAX::Filter
    #
    # --
    # TODO:
    # * Namespaces.
    class FragmentBuilder < Builder
      private :document # Would return an empty/partial document you really shouldn't mess with.

      # Create a fragment builder.
      #
      # The parent initializer is called without arguments, so no handler is passed at construction time.
      #
      # ==== Parameters
      # options<Hash>::
      #   {xpath => block} pairs. The first argument passed to the block is the matching
      #   Nokogiri::XML::Node. The block is called when the end of the matching element is reached.
      #   Keep in mind the node will be unlinked after your block returns.
      def initialize(options = {})
        super()
        @find   = options # xpath => callback pairs to look for.
        @found  = {}      # Path of each currently open match => its callback.
        @buffer = 0       # Number of currently open matches; events are only buffered while > 0.
      end

      def start_element_namespace(name, attributes = [], prefix = nil, uri = nil, ns = []) #:nodoc:
        super
        # Re-evaluate every XPath against the document built so far and register any match that is not
        # already being tracked.
        @find.each_pair do |xpath, block|
          match = @document.at(xpath)
          next unless match

          match_path = match.path
          next if @found[match_path]

          @buffer += 1
          @found[match_path] = block
        end
      end

      def end_element_namespace(name, prefix = nil, uri = nil) #:nodoc:
        # Remember the path of the element being closed before the parent handler runs.
        path = @context.path

        if @buffer > 0
          callback = @found.delete(path)
          if callback
            @buffer -= 1
            callback.call(@context)
          end
        end
        super

        # Only prune when no match is open and the closed node is not the document itself.
        return unless @buffer == 0 && path != '/'

        @document.at(path).unlink
        # Unlinked children are not garbage collected till the document they were created in is (I think).
        # This hack job halves memory usage but it still grows too fast for my liking :(
        @document = @document.dup
        # Re-locate the current context inside the duplicated document. Any StandardError raised by the
        # lookup (for example a nil context) deliberately leaves the context as nil.
        @context = begin
          @document.at(@context.path)
        rescue StandardError
          nil
        end
      end

      def characters(string) # :nodoc:
        # While a match is open the text is handed to the parent; otherwise it goes to the next filter, if any.
        return super if @buffer > 0

        filter && filter.characters(string)
      end

      def comment(string) # :nodoc:
        # While a match is open the comment is handed to the parent; otherwise it goes to the next filter, if any.
        return super if @buffer > 0

        filter && filter.comment(string)
      end

      def cdata_block(string) # :nodoc:
        # While a match is open the CDATA is handed to the parent; otherwise it goes to the next filter, if any.
        return super if @buffer > 0

        filter && filter.cdata_block(string)
      end
    end # FragmentBuilder
  end # SAX
end # XML