lib/xamplr/from-xml.rb in xamplr-1.9.13 vs lib/xamplr/from-xml.rb in xamplr-1.9.14

- old
+ new

@@ -1,96 +1,103 @@ # encoding utf-8 -require 'libxml' +require 'nokogiri' module Xampl class FromXML attr :checkWellFormed #1.9.1 , false attr :is_realising #1.9.1 , false attr :tokenise_content #1.9.1 , false - @reader = nil + @reader = nil - @@by_tag = {} + @@by_tag = {} @@by_ns_tag = {} def initialize(recovering=false) - @recovering = recovering + @recovering = recovering - @attribute_name = Array.new(32) - @attribute_namespace = Array.new(32) - @attribute_value = Array.new(32) + @attribute_name = Array.new(32) + @attribute_namespace = Array.new(32) + @attribute_value = Array.new(32) - @insert_end_element = false - @faking_an_end_element = false + @insert_end_element = false + @faking_an_end_element = false @just_opened_an_element = false end def FromXML.reset_registry - @@by_tag = {} + @@by_tag = {} @@by_ns_tag = {} end def FromXML.register(tag, ns_tag, klass) @@by_ns_tag[ns_tag] = [klass] - a = @@by_tag[tag] + a = @@by_tag[tag] if (nil == a) then @@by_tag[tag] = [klass] else found = false a.each { |thing| found = found | (thing == klass) } a << klass unless found end end def FromXML.registered(name) - #puts "registered by ns tag: #{ @@by_ns_tag.keys.sort.inspect }" klass = @@by_ns_tag[name] - #puts "registered by tag: #{ @@by_tag.keys.sort.inspect }" klass = @@by_tag[name] unless klass klass = [] unless klass return klass end def resolve(name) - #TODO -- ??? + #TODO -- ??? don't seem to need it, this is for specific named entities return name end def setup_parse(filename, tokenise_content=true, is_realising=false) - @resolver = self - - @is_realising = is_realising - @tokenise_content = tokenise_content - - @reader = LibXML::XML::Reader.file(filename, - :options => LibXML::XML::Parser::Options::NOENT | - LibXML::XML::Parser::Options::NONET | - LibXML::XML::Parser::Options::NOCDATA | - LibXML::XML::Parser::Options::DTDATTR | - # LibXML::XML::Parser::Options::COMPACT | - 0) - #TODO CLOSE THIS THING!! + xml = File.read(filename) + setup_parse_string(xml, tokenise_content, is_realising) end def setup_parse_string(string, tokenise_content=true, is_realising=false) - @resolver = self + @resolver = self - @is_realising = is_realising + @is_realising = is_realising @tokenise_content = tokenise_content - # setInput(string) - @reader = LibXML::XML::Reader.string(string, - :options => LibXML::XML::Parser::Options::NOENT | - LibXML::XML::Parser::Options::NONET | - LibXML::XML::Parser::Options::NOCDATA | - LibXML::XML::Parser::Options::DTDATTR | - # LibXML::XML::Parser::Options::COMPACT) | - 0) - #TODO CLOSE THIS THING!! +=begin + STRICT = 0 Strict parsing + RECOVER = 1 << 0 Recover from errors + NOENT = 1 << 1 Substitute entities + DTDLOAD = 1 << 2 Load external subsets + DTDATTR = 1 << 3 Default DTD attributes + DTDVALID = 1 << 4 validate with the DTD + NOERROR = 1 << 5 suppress error reports + NOWARNING = 1 << 6 suppress warning reports + PEDANTIC = 1 << 7 pedantic error reporting + NOBLANKS = 1 << 8 remove blank nodes + SAX1 = 1 << 9 use the SAX1 interface internally + XINCLUDE = 1 << 10 Implement XInclude substitition + NONET = 1 << 11 Forbid network access + NODICT = 1 << 12 Do not reuse the context dictionnary + NSCLEAN = 1 << 13 remove redundant namespaces declarations + NOCDATA = 1 << 14 merge CDATA as text nodes + NOXINCNODE = 1 << 15 do not generate XINCLUDE START/END nodes + DEFAULT_XML = RECOVER the default options used for parsing XML documents + DEFAULT_HTML = RECOVER | NOERROR | NOWARNING | NONET the default options used for parsing HTML documents +=end + + options = Nokogiri::XML::ParseOptions::RECOVER | Nokogiri::XML::ParseOptions::NOENT | Nokogiri::XML::ParseOptions::NONET | Nokogiri::XML::ParseOptions::NOCDATA | Nokogiri::XML::ParseOptions::DTDATTR + + utf8_string = string.force_encoding('utf-8') + url = nil + encoding = nil + + @reader = Nokogiri::XML::Reader.from_memory(utf8_string, url, encoding, options) end def parse(filename, tokenise_content=true, is_realising=false) begin setup_parse(filename, tokenise_content, is_realising) @@ -113,44 +120,51 @@ rescue => e raise RuntimeError, "trouble parsing string: '#{string}' -- #{ e }", e.backtrace end end - def FromXML.tokenise_string(str, strip=true) - return nil unless str - str.strip! if strip - str.gsub!(/[ \n\r\t][ \n\r\t]*/, " ") - return str + def chew + xml = @reader.outer_xml + depth = @reader.depth + @reader.read + while depth != @reader.depth do + @reader.read + end + return xml end + def parse_element(parent=nil, target=nil) +# puts caller(0)[0..5] + find_the_first_element return unless start_element? - namespace = @reader.namespace_uri - name = @reader.local_name - + namespace = @reader.namespace_uri + name = @reader.local_name existing_element = nil - element = nil + element = nil requires_caching = false build_attribute_arrays if ((nil != namespace) and (0 < namespace.size)) then klass_name = "{#{namespace}}#{name}" - klasses = FromXML.registered(klass_name) + klasses = FromXML.registered(klass_name) if (0 == klasses.size) then # The class has not been registered (either it was never generated, or it was never loaded) - puts "#{ __FILE__ }:#{ __LINE__ } [#{__method__}] Don't know about class name: #{ klass_name }" -# puts "#{ __FILE__ }:#{ __LINE__ } [#{__method__}] @@by_ns_tag: #{ @@by_ns_tag.inspect }" -# puts "#{ __FILE__ }:#{ __LINE__ } [#{__method__}] @@by_tag: #{ @@by_tag.inspect }" - xml_text = XMLText.new - xml_text.build(self) - xml_text = parent.note_adding_text_content(xml_text, @is_realising) - parent.add_content(xml_text, @tokenise_content) if xml_text - return xml_text, false + begin + #discard this node and all children, but say something + thing = chew + puts "#{ ::File.basename __FILE__ }:#{ __LINE__ } [#{__method__}] UNRECOGNISED CHILD ELEMENTS: class: #{ klass_name }\n#{ thing }" + return nil, true + rescue => e + puts "Ohhhh NO! #{ e }" + puts e.backtrace + raise e + end end if (1 < klasses.size) then raise XamplException.new("there is more than one '#{name}' tag in namespace '#{namespace}'\nplease report this error") end else @@ -168,13 +182,13 @@ FromXML.tokenise_string @attribute_value[i] end end if target then - element = target + element = target target.load_needed = false - target = nil + target = nil element.init_attributes(@attribute_name, @attribute_namespace, @attribute_value) element.note_attributes_initialised(@is_realising) else if klasses[0].persisted? then @attribute_name.each_index do |i| @@ -208,11 +222,11 @@ # puts "#{File.basename(__FILE__)} #{__LINE__} EXISTING ELEMENT: #{ existing_element }" # puts "#{File.basename(__FILE__)} #{__LINE__} WOW, must handle the existing element correctly" element = existing_element #TODO -- IS THIS RIGHT???????????????????????? end unless element then - element = klasses[0].new + element = klasses[0].new requires_caching = @recovering # puts "#{File.basename(__FILE__)} #{__LINE__} WOW, what about recovering????" #TODO -- IS THIS RIGHT???????????????????????? requires_caching = true #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! unless @recovering then @@ -233,194 +247,117 @@ element.note_initialise_attributes_with(@attribute_name, @attribute_namespace, @attribute_value, @is_realising) element.init_attributes(@attribute_name, @attribute_namespace, @attribute_value) element.note_attributes_initialised(@is_realising) - if requires_caching and element and element.persist_required then - Xampl.cache(element) - end - - #element = element.note_add_to_parent(parent, @is_realising) - #element.append_to(parent) if parent + Xampl.cache(element) if requires_caching && element && element.persist_required end - while next_reader_event - case current_node_type + while next_reader_event do + if @reader.value? then + text = @reader.value + text = text.force_encoding('utf-8') unless 'UTF-8' == text.encoding + the_text = element.note_adding_text_content(text, @is_realising) + if element.has_mixed_content then + element << the_text + else + element.add_content(the_text, false) + end + elsif Nokogiri::XML::Node::ELEMENT_NODE == @reader.node_type then + child, ignore_child = parse_element(element) -=begin -TODO -- can these ever happen? - when START_DOCUMENT - return element if @recovering - return existing_element || element - when END_DOCUMENT - return element if @recovering - return existing_element || element - -=end - - when LibXML::XML::Reader::TYPE_ELEMENT - child, ignore_child = parse_element(element) - - unless ignore_child then - case child - when XamplObject then - child = child.note_add_to_parent(element, @is_realising) if child - child = element.note_add_child(child, @is_realising) if element - child.append_to(element) if element and child - when XMLText then - #TODO -- get rid of this puts - puts "UNRECOGNISED Well-formed XML: #{child.to_s[0..25]}..." - else - #TODO -- get rid of this puts - puts "WHAT IS THIS??? #{child.class.name}" - end + unless ignore_child then + case child + when XamplObject then + child = child.note_add_to_parent(element, @is_realising) if child + child = element.note_add_child(child, @is_realising) if element + child.append_to(element) if element && child + when XMLText then + #TODO -- get rid of this puts + puts "UNRECOGNISED Well-formed XML: #{child.to_s[0..25]}..." + else + #TODO -- get rid of this puts + puts "WHAT IS THIS??? #{child.class.name}" end - when LibXML::XML::Reader::TYPE_END_ELEMENT - element = element.note_closed(@is_realising) - return element if @recovering - return existing_element || element - when LibXML::XML::Reader::TYPE_TEXT, LibXML::XML::Reader::TYPE_CDATA, LibXML::XML::Reader::TYPE_SIGNIFICANT_WHITESPACE, LibXML::XML::Reader::TYPE_ENTITY_REFERENCE - if element.has_mixed_content then - text = @reader.read_string.force_encoding('utf-8') -# puts "#{ File.basename __FILE__ }:#{ __LINE__ } [#{__method__}] #{ text.encoding } [[#{ text }]]" - the_text = element.note_adding_text_content(text, @is_realising) - element << the_text - else - text = @reader.read_string.force_encoding('utf-8') -# puts "#{ File.basename __FILE__ }:#{ __LINE__ } [#{__method__}] #{ text.encoding } [[#{ text }]] (#{ @reader.class })" - the_text = element.note_adding_text_content(text, @is_realising) - element.add_content(the_text, false) - end - else + end + elsif Nokogiri::XML::Node::ELEMENT_DECL == @reader.node_type then + element = element.note_closed(@is_realising) + return element if @recovering + return existing_element || element + else + puts "WTF??(#{ @reader.depth }) name: #{ @reader.name }, #{ say_node_type(@reader.node_type)}/#{ @reader.node_type }\n#{ @reader.outer_xml }" end end return element if @recovering return existing_element || element end + def FromXML.tokenise_string(str, strip=true) + return nil unless str + str.strip! if strip + str.gsub!(/[ \n\r\t][ \n\r\t]*/, " ") + return str + end + def current_node_type if @faking_an_end_element then - LibXML::XML::Reader::TYPE_END_ELEMENT + Nokogiri::XML::Node::ELEMENT_DECL else @reader.node_type end end -=begin - def describe_current_element_type() - case @reader.node_type - when LibXML::XML::Reader::TYPE_ATTRIBUTE - puts "ATTRIBUTE" - when LibXML::XML::Reader::TYPE_DOCUMENT - puts "DOCUMENT" - when LibXML::XML::Reader::TYPE_ELEMENT - attribute_count = @reader.attribute_count - puts "ELEMENT #{ @reader.local_name }, ns: #{ @reader.namespace_uri }, #attributes: #{ attribute_count }, depth: #{ @reader.depth }" - puts " FAKING END ELEMENT" if @faking_an_end_element - when LibXML::XML::Reader::TYPE_END_ELEMENT - puts "END ELEMENT" - when LibXML::XML::Reader::TYPE_TEXT - puts "TEXT [[#{ @reader.read_string }]]" - when LibXML::XML::Reader::TYPE_CDATA - puts "CDATA [[#{ @reader.read_string }]]" - when LibXML::XML::Reader::TYPE_SIGNIFICANT_WHITESPACE - puts "SIGNIFICANT white space [[#{ @reader.read_string }]]" - when LibXML::XML::Reader::TYPE_ENTITY_REFERENCE - puts "entity ref" - when LibXML::XML::Reader::TYPE_WHITESPACE - puts "whitespace" - when LibXML::XML::Reader::TYPE_PROCESSING_INSTRUCTION - puts "processing instruction" - when LibXML::XML::Reader::TYPE_COMMENT - puts "comment" - when LibXML::XML::Reader::TYPE_DOCUMENT_TYPE - puts "doc type" - - when LibXML::XML::Reader::TYPE_XML_DECLARATION - puts "xml decl" - when LibXML::XML::Reader::TYPE_NONE - puts "NONE!!" - when LibXML::XML::Reader::TYPE_NOTATION - puts "notifiation" - when LibXML::XML::Reader::TYPE_DOCUMENT_FRAGMENT - puts "doc fragment" - when LibXML::XML::Reader::TYPE_ENTITY - puts "entity" - when LibXML::XML::Reader::TYPE_END_ENTITY - puts "end entity" - else - puts "UNKNOWN: #{@reader.node_type}" - end - end -=end - def next_reader_event if @insert_end_element then @faking_an_end_element = true - @insert_end_element = false + @insert_end_element = false return end - @faking_an_end_element = false + @faking_an_end_element = false - #describe_current_element_type - begin -#TODO -- get rid of this?? -#TODO -- really? okay = @reader.read rescue => e raise RuntimeError, "WHAT?? -- #{ e }", e.backtrace end - @just_opened_an_element = start_element? - @insert_end_element = (@just_opened_an_element and @reader.empty_element?) - - #describe_current_element_type - + @just_opened_an_element = self.start_element? + @insert_end_element = (@just_opened_an_element and @reader.empty_element?) okay end def start_element? - current_node_type == LibXML::XML::Reader::TYPE_ELEMENT + current_node_type == Nokogiri::XML::Node::ELEMENT_NODE end def whitespace? - current_note_type == LibXML::XML::Reader::TYPE_WHITESPACE + #there is no whitespace type with nokogiri + #TODO -- this is not actually called, so... + @reader.value? && @reader.value.match(/\S/).nil? end def find_the_first_element while true do break if start_element? break unless next_reader_event end @just_opened_an_element = start_element? + @insert_end_element = (@just_opened_an_element and @reader.empty_element?) end def build_attribute_arrays - @attribute_name.clear @attribute_namespace.clear @attribute_value.clear - return unless LibXML::XML::Reader::TYPE_ELEMENT == current_node_type + return unless @reader.attributes? - if @reader.has_attributes? then - attribute_count = @reader.attribute_count - @reader.move_to_first_attribute - attribute_count.times do |i| - if @reader.namespace_declaration? then - @reader.move_to_next_attribute - next - end - - @attribute_name << @reader.local_name - @attribute_namespace << @reader.namespace_uri - @attribute_value << @reader.value - - @reader.move_to_next_attribute - end + @reader.attributes.each do |name, value| + @attribute_name << name + @attribute_namespace << nil + @attribute_value << value end end def attributeCount return @attribute_name.length