/* $Id: ruby_xml_html_parser.c 604 2008-11-19 18:01:55Z cfis $ */ /* Please see the LICENSE file for copyright and distribution information */ #include "ruby_libxml.h" VALUE cXMLHTMLParser; static ID INPUT_ATTR; static ID CONTEXT_ATTR; /* * Document-class: LibXML::XML::HTMLParser * * The HTML parser implements an HTML 4.0 non-verifying parser with an API * compatible with the XML::Parser. In contrast with the XML::Parser, * it can parse "real world" HTML, even if it severely broken from a * specification point of view. */ /* * call-seq: * XML::HTMLParser.initialize -> parser * * Initializes a new parser instance with no pre-determined source. */ VALUE ruby_xml_html_parser_initialize(VALUE self) { VALUE input = rb_class_new_instance(0, NULL, cXMLInput); rb_iv_set(self, "@input", input); rb_iv_set(self, "@context", Qnil); return self; } htmlParserCtxtPtr ruby_xml_html_parser_file_ctxt(VALUE input) { VALUE file = rb_ivar_get(input, FILE_ATTR); VALUE encoding = rb_ivar_get(input, ENCODING_ATTR); VALUE encodingStr = ruby_xml_encoding_to_s(Qnil, encoding); return htmlCreateFileParserCtxt(StringValuePtr(file), StringValuePtr(encodingStr)); } htmlParserCtxtPtr ruby_xml_html_parser_str_ctxt(VALUE input) { VALUE data = rb_ivar_get(input, STRING_ATTR); return htmlCreateMemoryParserCtxt(StringValuePtr(data), RSTRING_LEN(data)); } /* htmlParserCtxtPtr ruby_xml_html_parser_io_ctxt(VALUE input) { VALUE io = rb_ivar_get(input, IO_ATTR); VALUE encoding = rb_ivar_get(input, ENCODING_ATTR); xmlCharEncoding xmlEncoding = NUM2INT(encoding); OpenFile *fptr; FILE *f; GetOpenFile(io, fptr); rb_io_check_readable(fptr); f = GetWriteFile(fptr); return htmlCreateIOParserCtxt(NULL, NULL, (xmlInputReadCallback) ctxtRead, NULL, f, xmlEncoding); } */ /* * call-seq: * parser.parse -> document * * Parse the input XML and create an XML::Document with * it's content. If an error occurs, XML::Parser::ParseError * is thrown. */ VALUE ruby_xml_html_parser_parse(VALUE self) { xmlParserCtxtPtr ctxt; VALUE context; VALUE input = rb_ivar_get(self, INPUT_ATTR); context = rb_ivar_get(self, CONTEXT_ATTR); if (context != Qnil) rb_raise(rb_eRuntimeError, "You cannot parse a data source twice"); if (rb_ivar_get(input, FILE_ATTR) != Qnil) ctxt = ruby_xml_html_parser_file_ctxt(input); else if (rb_ivar_get(input, STRING_ATTR) != Qnil) ctxt = ruby_xml_html_parser_str_ctxt(input); /*else if (rb_ivar_get(input, DOCUMENT_ATTR) != Qnil) ctxt = ruby_xml_html_parser_parse_document(input); else if (rb_ivar_get(input, IO_ATTR) != Qnil) ctxt = ruby_xml_html_parser_io_ctxt(input);*/ else rb_raise(rb_eArgError, "You must specify a parser data source"); if (!ctxt) ruby_xml_raise(&xmlLastError); context = ruby_xml_parser_context_wrap(ctxt); rb_ivar_set(self, CONTEXT_ATTR, context); if (htmlParseDocument(ctxt) == -1 || !ctxt->wellFormed) { xmlFreeDoc(ctxt->myDoc); ruby_xml_raise(&ctxt->lastError); } return ruby_xml_document_wrap(ctxt->myDoc); } // Rdoc needs to know #ifdef RDOC_NEVER_DEFINED mLibXML = rb_define_module("LibXML"); mXML = rb_define_module_under(mLibXML, "XML"); #endif void ruby_init_html_parser(void) { INPUT_ATTR = rb_intern("@input"); CONTEXT_ATTR = rb_intern("@context"); cXMLHTMLParser = rb_define_class_under(mXML, "HTMLParser", rb_cObject); /* Atributes */ rb_define_attr(cXMLHTMLParser, "input", 1, 0); rb_define_attr(cXMLHTMLParser, "context", 1, 0); /* Instance methods */ rb_define_method(cXMLHTMLParser, "initialize", ruby_xml_html_parser_initialize, 0); rb_define_method(cXMLHTMLParser, "parse", ruby_xml_html_parser_parse, 0); }