#include static void dealloc(xmlTextReaderPtr reader) { NOKOGIRI_DEBUG_START(reader); xmlFreeTextReader(reader); NOKOGIRI_DEBUG_END(reader); } static int has_attributes(xmlTextReaderPtr reader) { /* * this implementation of xmlTextReaderHasAttributes explicitly includes * namespaces and properties, because some earlier versions ignore * namespaces. */ xmlNodePtr node ; node = xmlTextReaderCurrentNode(reader); if (node == NULL) return(0); if ((node->type == XML_ELEMENT_NODE) && ((node->properties != NULL) || (node->nsDef != NULL))) return(1); return(0); } static void Nokogiri_xml_node_namespaces(xmlNodePtr node, VALUE attr_hash) { xmlNsPtr ns; static char buffer[XMLNS_BUFFER_LEN] ; char *key ; size_t keylen ; if (node->type != XML_ELEMENT_NODE) return ; ns = node->nsDef; while (ns != NULL) { keylen = XMLNS_PREFIX_LEN + (ns->prefix ? (strlen((const char*)ns->prefix) + 1) : 0) ; if (keylen > XMLNS_BUFFER_LEN) { key = (char*)malloc(keylen) ; } else { key = buffer ; } if (ns->prefix) { sprintf(key, "%s:%s", XMLNS_PREFIX, ns->prefix); } else { sprintf(key, "%s", XMLNS_PREFIX); } rb_hash_aset(attr_hash, NOKOGIRI_STR_NEW2(key), (ns->href ? NOKOGIRI_STR_NEW2(ns->href) : Qnil) ); if (key != buffer) { free(key); } ns = ns->next ; } } /* * call-seq: * default? * * Was an attribute generated from the default value in the DTD or schema? */ static VALUE default_eh(VALUE self) { xmlTextReaderPtr reader; int eh; Data_Get_Struct(self, xmlTextReader, reader); eh = xmlTextReaderIsDefault(reader); if(eh == 0) return Qfalse; if(eh == 1) return Qtrue; return Qnil; } /* * call-seq: * value? * * Does this node have a text value? */ static VALUE value_eh(VALUE self) { xmlTextReaderPtr reader; int eh; Data_Get_Struct(self, xmlTextReader, reader); eh = xmlTextReaderHasValue(reader); if(eh == 0) return Qfalse; if(eh == 1) return Qtrue; return Qnil; } /* * call-seq: * attributes? * * Does this node have attributes? */ static VALUE attributes_eh(VALUE self) { xmlTextReaderPtr reader; int eh; Data_Get_Struct(self, xmlTextReader, reader); eh = has_attributes(reader); if(eh == 0) return Qfalse; if(eh == 1) return Qtrue; return Qnil; } /* * call-seq: * namespaces * * Get a hash of namespaces for this Node */ static VALUE namespaces(VALUE self) { xmlTextReaderPtr reader; xmlNodePtr ptr; VALUE attr ; Data_Get_Struct(self, xmlTextReader, reader); attr = rb_hash_new() ; if (! has_attributes(reader)) return attr ; ptr = xmlTextReaderExpand(reader); if(ptr == NULL) return Qnil; Nokogiri_xml_node_namespaces(ptr, attr); return attr ; } /* * call-seq: * attribute_nodes * * Get a list of attributes for this Node */ static VALUE attribute_nodes(VALUE self) { xmlTextReaderPtr reader; xmlNodePtr ptr; VALUE attr ; Data_Get_Struct(self, xmlTextReader, reader); attr = rb_ary_new() ; if (! has_attributes(reader)) return attr ; ptr = xmlTextReaderExpand(reader); if(ptr == NULL) return Qnil; Nokogiri_xml_node_properties(ptr, attr); return attr ; } /* * call-seq: * attribute_at(index) * * Get the value of attribute at +index+ */ static VALUE attribute_at(VALUE self, VALUE index) { xmlTextReaderPtr reader; xmlChar *value; VALUE rb_value; Data_Get_Struct(self, xmlTextReader, reader); if(NIL_P(index)) return Qnil; index = rb_Integer(index); value = xmlTextReaderGetAttributeNo( reader, (int)NUM2INT(index) ); if(value == NULL) return Qnil; rb_value = NOKOGIRI_STR_NEW2(value); xmlFree(value); return rb_value; } /* * call-seq: * attribute(name) * * Get the value of attribute named +name+ */ static VALUE reader_attribute(VALUE self, VALUE name) { xmlTextReaderPtr reader; xmlChar *value ; VALUE rb_value; Data_Get_Struct(self, xmlTextReader, reader); if(NIL_P(name)) return Qnil; name = StringValue(name) ; value = xmlTextReaderGetAttribute(reader, (xmlChar*)StringValuePtr(name)); if(value == NULL) { /* this section is an attempt to workaround older versions of libxml that don't handle namespaces properly in all attribute-and-friends functions */ xmlChar *prefix = NULL ; xmlChar *localname = xmlSplitQName2((xmlChar*)StringValuePtr(name), &prefix); if (localname != NULL) { value = xmlTextReaderLookupNamespace(reader, localname); xmlFree(localname) ; } else { value = xmlTextReaderLookupNamespace(reader, prefix); } xmlFree(prefix); } if(value == NULL) return Qnil; rb_value = NOKOGIRI_STR_NEW2(value); xmlFree(value); return rb_value; } /* * call-seq: * attribute_count * * Get the number of attributes for the current node */ static VALUE attribute_count(VALUE self) { xmlTextReaderPtr reader; int count; Data_Get_Struct(self, xmlTextReader, reader); count = xmlTextReaderAttributeCount(reader); if(count == -1) return Qnil; return INT2NUM((long)count); } /* * call-seq: * depth * * Get the depth of the node */ static VALUE depth(VALUE self) { xmlTextReaderPtr reader; int depth; Data_Get_Struct(self, xmlTextReader, reader); depth = xmlTextReaderDepth(reader); if(depth == -1) return Qnil; return INT2NUM((long)depth); } /* * call-seq: * xml_version * * Get the XML version of the document being read */ static VALUE xml_version(VALUE self) { xmlTextReaderPtr reader; const char *version; Data_Get_Struct(self, xmlTextReader, reader); version = (const char *)xmlTextReaderConstXmlVersion(reader); if(version == NULL) return Qnil; return NOKOGIRI_STR_NEW2(version); } /* * call-seq: * lang * * Get the xml:lang scope within which the node resides. */ static VALUE lang(VALUE self) { xmlTextReaderPtr reader; const char *lang; Data_Get_Struct(self, xmlTextReader, reader); lang = (const char *)xmlTextReaderConstXmlLang(reader); if(lang == NULL) return Qnil; return NOKOGIRI_STR_NEW2(lang); } /* * call-seq: * value * * Get the text value of the node if present. Returns a utf-8 encoded string. */ static VALUE value(VALUE self) { xmlTextReaderPtr reader; const char *value; Data_Get_Struct(self, xmlTextReader, reader); value = (const char *)xmlTextReaderConstValue(reader); if(value == NULL) return Qnil; return NOKOGIRI_STR_NEW2(value); } /* * call-seq: * prefix * * Get the shorthand reference to the namespace associated with the node. */ static VALUE prefix(VALUE self) { xmlTextReaderPtr reader; const char *prefix; Data_Get_Struct(self, xmlTextReader, reader); prefix = (const char *)xmlTextReaderConstPrefix(reader); if(prefix == NULL) return Qnil; return NOKOGIRI_STR_NEW2(prefix); } /* * call-seq: * namespace_uri * * Get the URI defining the namespace associated with the node */ static VALUE namespace_uri(VALUE self) { xmlTextReaderPtr reader; const char *uri; Data_Get_Struct(self, xmlTextReader, reader); uri = (const char *)xmlTextReaderConstNamespaceUri(reader); if(uri == NULL) return Qnil; return NOKOGIRI_STR_NEW2(uri); } /* * call-seq: * local_name * * Get the local name of the node */ static VALUE local_name(VALUE self) { xmlTextReaderPtr reader; const char *name; Data_Get_Struct(self, xmlTextReader, reader); name = (const char *)xmlTextReaderConstLocalName(reader); if(name == NULL) return Qnil; return NOKOGIRI_STR_NEW2(name); } /* * call-seq: * name * * Get the name of the node. Returns a utf-8 encoded string. */ static VALUE name(VALUE self) { xmlTextReaderPtr reader; const char *name; Data_Get_Struct(self, xmlTextReader, reader); name = (const char *)xmlTextReaderConstName(reader); if(name == NULL) return Qnil; return NOKOGIRI_STR_NEW2(name); } /* * call-seq: * base_uri * * Get the xml:base of the node */ static VALUE base_uri(VALUE self) { xmlTextReaderPtr reader; const char * base_uri; Data_Get_Struct(self, xmlTextReader, reader); base_uri = (const char *)xmlTextReaderBaseUri(reader); if (base_uri == NULL) return Qnil; return NOKOGIRI_STR_NEW2(base_uri); } /* * call-seq: * state * * Get the state of the reader */ static VALUE state(VALUE self) { xmlTextReaderPtr reader; Data_Get_Struct(self, xmlTextReader, reader); return INT2NUM((long)xmlTextReaderReadState(reader)); } /* * call-seq: * node_type * * Get the type of readers current node */ static VALUE node_type(VALUE self) { xmlTextReaderPtr reader; Data_Get_Struct(self, xmlTextReader, reader); return INT2NUM((long)xmlTextReaderNodeType(reader)); } /* * call-seq: * read * * Move the Reader forward through the XML document. */ static VALUE read_more(VALUE self) { xmlTextReaderPtr reader; xmlErrorPtr error; VALUE error_list; int ret; Data_Get_Struct(self, xmlTextReader, reader); error_list = rb_funcall(self, rb_intern("errors"), 0); xmlSetStructuredErrorFunc((void *)error_list, Nokogiri_error_array_pusher); ret = xmlTextReaderRead(reader); xmlSetStructuredErrorFunc(NULL, NULL); if(ret == 1) return self; if(ret == 0) return Qnil; error = xmlGetLastError(); if(error) rb_exc_raise(Nokogiri_wrap_xml_syntax_error(error)); else rb_raise(rb_eRuntimeError, "Error pulling: %d", ret); return Qnil; } /* * call-seq: * inner_xml * * Read the contents of the current node, including child nodes and markup. * Returns a utf-8 encoded string. */ static VALUE inner_xml(VALUE self) { xmlTextReaderPtr reader; xmlChar* value; VALUE str; Data_Get_Struct(self, xmlTextReader, reader); value = xmlTextReaderReadInnerXml(reader); str = Qnil; if(value) { str = NOKOGIRI_STR_NEW2((char*)value); xmlFree(value); } return str; } /* * call-seq: * outer_xml * * Read the current node and its contents, including child nodes and markup. * Returns a utf-8 encoded string. */ static VALUE outer_xml(VALUE self) { xmlTextReaderPtr reader; xmlChar *value; VALUE str = Qnil; Data_Get_Struct(self, xmlTextReader, reader); value = xmlTextReaderReadOuterXml(reader); if(value) { str = NOKOGIRI_STR_NEW2((char*)value); xmlFree(value); } return str; } /* * call-seq: * from_memory(string, url = nil, encoding = nil, options = 0) * * Create a new reader that parses +string+ */ static VALUE from_memory(int argc, VALUE *argv, VALUE klass) { VALUE rb_buffer, rb_url, encoding, rb_options; xmlTextReaderPtr reader; const char * c_url = NULL; const char * c_encoding = NULL; int c_options = 0; VALUE rb_reader, args[3]; rb_scan_args(argc, argv, "13", &rb_buffer, &rb_url, &encoding, &rb_options); if (!RTEST(rb_buffer)) rb_raise(rb_eArgError, "string cannot be nil"); if (RTEST(rb_url)) c_url = StringValuePtr(rb_url); if (RTEST(encoding)) c_encoding = StringValuePtr(encoding); if (RTEST(rb_options)) c_options = (int)NUM2INT(rb_options); reader = xmlReaderForMemory( StringValuePtr(rb_buffer), (int)RSTRING_LEN(rb_buffer), c_url, c_encoding, c_options ); if(reader == NULL) { xmlFreeTextReader(reader); rb_raise(rb_eRuntimeError, "couldn't create a parser"); } rb_reader = Data_Wrap_Struct(klass, NULL, dealloc, reader); args[0] = rb_buffer; args[1] = rb_url; args[2] = encoding; rb_obj_call_init(rb_reader, 3, args); return rb_reader; } /* * call-seq: * from_io(io, url = nil, encoding = nil, options = 0) * * Create a new reader that parses +io+ */ static VALUE from_io(int argc, VALUE *argv, VALUE klass) { VALUE rb_io, rb_url, encoding, rb_options; xmlTextReaderPtr reader; const char * c_url = NULL; const char * c_encoding = NULL; int c_options = 0; VALUE rb_reader, args[3]; rb_scan_args(argc, argv, "13", &rb_io, &rb_url, &encoding, &rb_options); if (!RTEST(rb_io)) rb_raise(rb_eArgError, "io cannot be nil"); if (RTEST(rb_url)) c_url = StringValuePtr(rb_url); if (RTEST(encoding)) c_encoding = StringValuePtr(encoding); if (RTEST(rb_options)) c_options = (int)NUM2INT(rb_options); reader = xmlReaderForIO( (xmlInputReadCallback)io_read_callback, (xmlInputCloseCallback)io_close_callback, (void *)rb_io, c_url, c_encoding, c_options ); if(reader == NULL) { xmlFreeTextReader(reader); rb_raise(rb_eRuntimeError, "couldn't create a parser"); } rb_reader = Data_Wrap_Struct(klass, NULL, dealloc, reader); args[0] = rb_io; args[1] = rb_url; args[2] = encoding; rb_obj_call_init(rb_reader, 3, args); return rb_reader; } /* * call-seq: * reader.empty_element? # => true or false * * Returns true if the current node is empty, otherwise false. */ static VALUE empty_element_p(VALUE self) { xmlTextReaderPtr reader; Data_Get_Struct(self, xmlTextReader, reader); if(xmlTextReaderIsEmptyElement(reader)) return Qtrue; return Qfalse; } VALUE cNokogiriXmlReader; void init_xml_reader() { VALUE module = rb_define_module("Nokogiri"); VALUE xml = rb_define_module_under(module, "XML"); /* * The Reader parser allows you to effectively pull parse an XML document. * Once instantiated, call Nokogiri::XML::Reader#each to iterate over each * node. Note that you may only iterate over the document once! */ VALUE klass = rb_define_class_under(xml, "Reader", rb_cObject); cNokogiriXmlReader = klass; rb_define_singleton_method(klass, "from_memory", from_memory, -1); rb_define_singleton_method(klass, "from_io", from_io, -1); rb_define_method(klass, "read", read_more, 0); rb_define_method(klass, "inner_xml", inner_xml, 0); rb_define_method(klass, "outer_xml", outer_xml, 0); rb_define_method(klass, "state", state, 0); rb_define_method(klass, "node_type", node_type, 0); rb_define_method(klass, "name", name, 0); rb_define_method(klass, "local_name", local_name, 0); rb_define_method(klass, "namespace_uri", namespace_uri, 0); rb_define_method(klass, "prefix", prefix, 0); rb_define_method(klass, "value", value, 0); rb_define_method(klass, "lang", lang, 0); rb_define_method(klass, "xml_version", xml_version, 0); rb_define_method(klass, "depth", depth, 0); rb_define_method(klass, "attribute_count", attribute_count, 0); rb_define_method(klass, "attribute", reader_attribute, 1); rb_define_method(klass, "namespaces", namespaces, 0); rb_define_method(klass, "attribute_at", attribute_at, 1); rb_define_method(klass, "empty_element?", empty_element_p, 0); rb_define_method(klass, "attributes?", attributes_eh, 0); rb_define_method(klass, "value?", value_eh, 0); rb_define_method(klass, "default?", default_eh, 0); rb_define_method(klass, "base_uri", base_uri, 0); rb_define_private_method(klass, "attr_nodes", attribute_nodes, 0); }