#include
/* Class object for the HTML4 document; assigned in noko_init_html_document(). */
VALUE cNokogiriHtml4Document ;
/* Interned "encoding_found"; read_io sends it to the IO object after parsing. */
static ID id_encoding_found;
/* Interned "to_s"; used to turn the first collected parse error into a message. */
static ID id_to_s;
/*
 *  call-seq:
 *    new(uri = nil, external_id = nil, *rest)
 *
 *  Create a new document.
 *
 *  The first two arguments are optional. When one of them is truthy it is
 *  converted to a C string and handed to htmlNewDoc() (first argument as the
 *  URI, second as the external ID); when it is missing, +nil+ or +false+,
 *  NULL is passed in its place. Every argument received is forwarded
 *  unchanged, together with the new document, to
 *  noko_xml_document_wrap_with_init_args().
 */
static VALUE
rb_html_document_s_new(int argc, VALUE *argv, VALUE klass)
{
  VALUE rb_args;
  VALUE rb_uri;
  VALUE rb_external_id;
  const xmlChar *c_uri = NULL;
  const xmlChar *c_external_id = NULL;
  htmlDocPtr c_doc;

  /* "0*": no required arguments, everything is collected into rb_args. */
  rb_scan_args(argc, argv, "0*", &rb_args);
  rb_uri = rb_ary_entry(rb_args, 0L);
  rb_external_id = rb_ary_entry(rb_args, 1L);

  if (RTEST(rb_uri)) {
    c_uri = (const xmlChar *)StringValueCStr(rb_uri);
  }
  if (RTEST(rb_external_id)) {
    c_external_id = (const xmlChar *)StringValueCStr(rb_external_id);
  }

  c_doc = htmlNewDoc(c_uri, c_external_id);

  return noko_xml_document_wrap_with_init_args(klass, c_doc, argc, argv);
}
/*
 *  call-seq:
 *    read_io(io, url, encoding, options)
 *
 *  Read the HTML document from +io+ with given +url+, +encoding+,
 *  and +options+. See Nokogiri::HTML4.parse
 *
 *  +url+ and +encoding+ may be +nil+, in which case NULL is passed to
 *  htmlReadIO(). The array registered with the structured error handler
 *  during parsing is stored on the returned document as <tt>@errors</tt>.
 *
 *  Raises whatever +io.encoding_found+ returns when +io+ responds to that
 *  method and the result is not +nil+. Otherwise raises when no document was
 *  produced, or when XML_PARSE_RECOVER is not set in +options+ and the error
 *  array is non-empty: a RuntimeError if the array is empty, else an instance
 *  of cNokogiriXmlSyntaxError built from the first entry.
 */
static VALUE
rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
{
  VALUE rb_errors = rb_ary_new();
  VALUE rb_document;
  htmlDocPtr c_document;
  const char *c_url = NULL;
  const char *c_encoding = NULL;
  int parse_options;
  int recovering;
  int parse_failed;

  if (!NIL_P(rb_url)) {
    c_url = StringValueCStr(rb_url);
  }
  if (!NIL_P(rb_encoding)) {
    c_encoding = StringValueCStr(rb_encoding);
  }
  parse_options = NUM2INT(rb_options);

  /* Install the error handler only for the duration of the parse. */
  xmlSetStructuredErrorFunc((void *)rb_errors, Nokogiri_error_array_pusher);
  c_document = htmlReadIO(noko_io_read, noko_io_close, (void *)rb_io, c_url, c_encoding, parse_options);
  xmlSetStructuredErrorFunc(NULL, NULL);

  /*
   * If EncodingFound has occurred in EncodingReader, make sure to do
   * a cleanup and propagate the error.
   */
  if (rb_respond_to(rb_io, id_encoding_found)) {
    VALUE rb_encoding_found = rb_funcall(rb_io, id_encoding_found, 0);

    if (!NIL_P(rb_encoding_found)) {
      xmlFreeDoc(c_document);
      rb_exc_raise(rb_encoding_found);
    }
  }

  recovering = (parse_options & XML_PARSE_RECOVER) != 0;
  parse_failed = (c_document == NULL) || (!recovering && (RARRAY_LEN(rb_errors) > 0));

  if (parse_failed) {
    VALUE rb_first_error;
    VALUE rb_message;

    xmlFreeDoc(c_document);

    rb_first_error = rb_ary_entry(rb_errors, 0);
    if (NIL_P(rb_first_error)) {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }

    rb_message = rb_funcall(rb_first_error, id_to_s, 0);
    rb_message = rb_str_concat(rb_str_new2("Parser without recover option encountered error or warning: "),
                               rb_message);
    /* Does not return. */
    rb_exc_raise(rb_class_new_instance(1, &rb_message, cNokogiriXmlSyntaxError));
  }

  rb_document = noko_xml_document_wrap(klass, c_document);
  rb_iv_set(rb_document, "@errors", rb_errors);

  return rb_document;
}
/*
 *  call-seq:
 *    read_memory(string, url, encoding, options)
 *
 *  Read the HTML document contained in +string+ with given +url+, +encoding+,
 *  and +options+. See Nokogiri::HTML4.parse
 *
 *  +url+ and +encoding+ may be +nil+, in which case NULL is passed to
 *  htmlReadMemory(). The array registered with the structured error handler
 *  during parsing is stored on the returned document as <tt>@errors</tt>.
 *
 *  Raises RangeError when the byte length of +string+ does not fit in a C
 *  +int+, which is the size type htmlReadMemory() accepts. Raises when no
 *  document was produced, or when XML_PARSE_RECOVER is not set in +options+
 *  and the error array is non-empty: a RuntimeError if the array is empty,
 *  else an instance of cNokogiriXmlSyntaxError built from the first entry.
 */
static VALUE
rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options)
{
  VALUE rb_doc;
  VALUE rb_error_list = rb_ary_new();
  htmlDocPtr c_doc;
  const char *c_buffer = StringValuePtr(rb_html);
  const char *c_url = NIL_P(rb_url) ? NULL : StringValueCStr(rb_url);
  const char *c_encoding = NIL_P(rb_encoding) ? NULL : StringValueCStr(rb_encoding);
  /*
   * Checked long -> int conversion: a plain (int) cast would wrap for
   * strings longer than INT_MAX bytes and hand libxml2 a negative or
   * truncated length. This raises before any libxml2 state is touched.
   */
  int html_len = RSTRING_LENINT(rb_html);
  int options = NUM2INT(rb_options);

  /* Install the error handler only for the duration of the parse. */
  xmlSetStructuredErrorFunc((void *)rb_error_list, Nokogiri_error_array_pusher);
  c_doc = htmlReadMemory(c_buffer, html_len, c_url, c_encoding, options);
  xmlSetStructuredErrorFunc(NULL, NULL);

  if ((c_doc == NULL) || (!(options & XML_PARSE_RECOVER) && (RARRAY_LEN(rb_error_list) > 0))) {
    VALUE rb_error;
    VALUE exception_message;

    xmlFreeDoc(c_doc);

    rb_error = rb_ary_entry(rb_error_list, 0);
    if (NIL_P(rb_error)) {
      rb_raise(rb_eRuntimeError, "Could not parse document");
    }

    exception_message = rb_funcall(rb_error, id_to_s, 0);
    exception_message = rb_str_concat(rb_str_new2("Parser without recover option encountered error or warning: "),
                                      exception_message);
    /* Does not return. */
    rb_exc_raise(rb_class_new_instance(1, &exception_message, cNokogiriXmlSyntaxError));
  }

  rb_doc = noko_xml_document_wrap(klass, c_doc);
  rb_iv_set(rb_doc, "@errors", rb_error_list);

  return rb_doc;
}
/*
 *  call-seq:
 *    type
 *
 *  The type for this document, i.e. the +type+ field of the wrapped
 *  libxml2 document structure converted to a Ruby Integer.
 */
static VALUE
rb_html_document_type(VALUE self)
{
  htmlDocPtr c_document;

  Data_Get_Struct(self, xmlDoc, c_document);

  return INT2NUM(c_document->type);
}
/*
 * Define the "Document" class under mNokogiriHtml4 (inheriting from
 * cNokogiriXmlDocument), register its singleton and instance methods, and
 * intern the method IDs used by the functions in this file.
 *
 * cNokogiriXmlDocument must already be initialized when this runs; that is
 * checked with assert(), so the check is absent from NDEBUG builds.
 */
void
noko_init_html_document(void)
{
  assert(cNokogiriXmlDocument);
  cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument);

  /* Class-level constructors and readers. */
  rb_define_singleton_method(cNokogiriHtml4Document, "read_memory", rb_html_document_s_read_memory, 4);
  rb_define_singleton_method(cNokogiriHtml4Document, "read_io", rb_html_document_s_read_io, 4);
  rb_define_singleton_method(cNokogiriHtml4Document, "new", rb_html_document_s_new, -1);

  /* Instance methods. */
  rb_define_method(cNokogiriHtml4Document, "type", rb_html_document_type, 0);

  id_encoding_found = rb_intern("encoding_found");
  id_to_s = rb_intern("to_s");
}