/* $Id: ruby_xml_html_parser.c 737 2009-01-23 01:23:08Z walltndr $ */
/* Please see the LICENSE file for copyright and distribution information */
#include "ruby_libxml.h"
/* Document-class: LibXML::XML::HTMLParser
*
* The HTML parser implements an HTML 4.0 non-verifying parser with an API
* compatible with the XML::Parser. In contrast with the XML::Parser,
* it can parse "real world" HTML, even if it is severely broken from a
* specification point of view.
*
* The HTML parser creates an in-memory document object
* that consists of any number of XML::Node instances. This is a simple
* and powerful model, but has the major limitation that the size of
* the document that can be processed is limited by the amount of
* memory available.
*
* Using the html parser is simple:
*
* parser = XML::HTMLParser.file('my_file')
* doc = parser.parse
*
* You can also parse documents (see XML::HTMLParser.document),
* strings (see XML::HTMLParser.string) and io objects (see
* XML::HTMLParser.io).
*/
/* Ruby class object for XML::HTMLParser; assigned in ruby_init_html_parser. */
VALUE cXMLHtmlParser;
/* Interned ID of the "@context" instance variable; assigned in
 * ruby_init_html_parser and used to store/fetch the parser's context. */
static ID CONTEXT_ATTR;
/* call-seq:
 *    XML::HTMLParser.new(context = nil) -> parser
 *
 * Sets up a new parser instance around the given parser context and
 * stores it in the @context instance variable.
 *
 * The context argument is optional. When it is omitted (or nil) a
 * deprecation warning is emitted and a fresh XML::Parser::Context
 * instance is created with no arguments and used instead.
 */
static VALUE rxml_html_parser_initialize(int argc, VALUE *argv, VALUE self)
{
  VALUE parser_ctx = Qnil;

  /* "01": zero mandatory arguments, at most one optional argument. */
  rb_scan_args(argc, argv, "01", &parser_ctx);

  if (Qnil == parser_ctx)
  {
    /* Legacy call style: warn, then fall back to a default context. */
    rb_warn("Passing no parameters to XML::HTMLParser.new is deprecated. Pass an instance of XML::Parser::Context instead.");
    parser_ctx = rb_class_new_instance(0, NULL, cXMLParserContext);
  }

  rb_ivar_set(self, CONTEXT_ATTR, parser_ctx);

  return self;
}
/*
 * call-seq:
 *    parser.parse -> XML::Document
 *
 * Parse the input HTML held by the parser's context and create an
 * XML::Document with its content.
 *
 * If htmlParseDocument reports failure (-1) and the context is not in
 * recovery mode, any partially built document is freed and the
 * context's last error is raised through rxml_raise. In recovery mode
 * the (possibly partial) document is wrapped and returned regardless
 * of the parse result.
 */
static VALUE rxml_html_parser_parse(VALUE self)
{
  xmlParserCtxtPtr ctxt;
  VALUE context = rb_ivar_get(self, CONTEXT_ATTR);

  Data_Get_Struct(context, xmlParserCtxt, ctxt);

  if (htmlParseDocument(ctxt) == -1 && !ctxt->recovery)
  {
    if (ctxt->myDoc)
    {
      xmlFreeDoc(ctxt->myDoc);
      /* The context object is kept in @context and outlives this call.
       * Clear the pointer so nothing that touches the context later
       * (another parse, context cleanup) sees or frees the released
       * document a second time. */
      ctxt->myDoc = NULL;
    }

    /* rxml_raise is expected not to return (it raises a Ruby
     * exception), so the wrap below is only reached on success or in
     * recovery mode. */
    rxml_raise(&ctxt->lastError);
  }

  return rxml_document_wrap(ctxt->myDoc);
}
// RDoc needs to know which modules XML::HTMLParser is nested under.
// This block exists only for the documentation tool to read: it is
// excluded from compilation as long as RDOC_NEVER_DEFINED is not defined
// (presumably it never is, as the name suggests).
#ifdef RDOC_NEVER_DEFINED
mLibXML = rb_define_module("LibXML");
mXML = rb_define_module_under(mLibXML, "XML");
#endif
/*
 * Registers the XML::HTMLParser class with the Ruby runtime: defines
 * the class under mXML, caches the "@context" instance-variable ID and
 * binds the attribute reader and instance methods implemented in this
 * file.
 */
void ruby_init_html_parser(void)
{
  /* Class object, a direct subclass of Object nested under mXML. */
  cXMLHtmlParser = rb_define_class_under(mXML, "HTMLParser", rb_cObject);

  /* Cache the instance-variable ID used by initialize and parse. */
  CONTEXT_ATTR = rb_intern("@context");

  /* Attributes: "input" gets a reader (1) but no writer (0). */
  rb_define_attr(cXMLHtmlParser, "input", 1, 0);

  /* Instance methods: initialize takes a variable argument list (-1),
   * parse takes no arguments (0). */
  rb_define_method(cXMLHtmlParser, "initialize", rxml_html_parser_initialize, -1);
  rb_define_method(cXMLHtmlParser, "parse", rxml_html_parser_parse, 0);
}