#include "markup.h"

#include <ruby/encoding.h>

// Ragel specification for the Trenni markup parser, with C host code.
//
// The grammar itself is pulled in from "trenni/markup.rl"; this file supplies
// the actions that grammar refers to, plus the entry point that drives the
// generated state machine. The actions use the local variables declared in
// Trenni_Native_parse_markup (p, s, pcdata, identifier, ...), because Ragel
// expands them inline at the `write exec` site.
//
// NOTE(review): Token, the Trenni_* helpers and the id_* method ids are not
// defined in this file; presumably they come from markup.h.

%%{
	machine Trenni_markup_parser;

	# Track the location of an identifier (tag name, attribute name, etc)
	action identifier_begin {
		identifier.begin = p;
	}

	action identifier_end {
		identifier.end = p;
	}

	# Start a fresh accumulator for parsed character data.
	action pcdata_begin {
		pcdata = Qnil;
	}

	action pcdata_end {
	}

	# A run of literal characters is appended to pcdata once it ends.
	action characters_begin {
		characters.begin = p;
	}

	action characters_end {
		characters.end = p;

		Trenni_append_token(&pcdata, encoding, characters);
	}

	action entity_error {
		Trenni_raise_error("could not parse entity", buffer, p-s);
	}

	action entity_begin {
		entity.begin = p;
	}

	# Named entity: look the name up with entities[name] and append the result.
	action entity_name {
		entity.end = p;

		Trenni_append_string(&pcdata, encoding,
			rb_funcall(entities, rb_intern("[]"), 1, Trenni_token(entity, encoding))
		);
	}

	# Hexadecimal character reference.
	action entity_hex {
		entity.end = p;

		// strtoul stores its own stop position back into entity.end.
		codepoint = strtoul(entity.begin, (char **)&entity.end, 16);

		Trenni_append_codepoint(&pcdata, encoding, codepoint);
	}

	# Decimal character reference.
	action entity_number {
		entity.end = p;

		// strtoul stores its own stop position back into entity.end.
		codepoint = strtoul(entity.begin, (char **)&entity.end, 10);

		Trenni_append_codepoint(&pcdata, encoding, codepoint);
	}

	action doctype_begin {
		doctype.begin = p;
	}

	action doctype_end {
		doctype.end = p;

		rb_funcall(delegate, id_doctype, 1, Trenni_token(doctype, encoding));
	}

	action doctype_error {
		Trenni_raise_error("could not parse doctype", buffer, p-s);
	}

	action comment_begin {
		comment.begin = p;
	}

	action comment_end {
		comment.end = p;

		rb_funcall(delegate, id_comment, 1, Trenni_token(comment, encoding));
	}

	action comment_error {
		Trenni_raise_error("could not parse comment", buffer, p-s);
	}

	action instruction_begin {
		instruction.begin = p;
	}

	action instruction_text_begin {
	}

	action instruction_text_end {
	}

	action instruction_end {
		instruction.end = p;

		rb_funcall(delegate, id_instruction, 1, Trenni_token(instruction, encoding));
	}

	action instruction_error {
		Trenni_raise_error("could not parse instruction", buffer, p-s);
	}

	# Report the start of an opening tag: its name and the byte offset of the
	# name relative to the start of the input.
	action tag_name {
		// Reset self-closing state; it is not known until the tag ends.
		self_closing = 0;

		rb_funcall(delegate, id_open_tag_begin, 2, Trenni_token(identifier, encoding), ULONG2NUM(identifier.begin-s));
	}

	action tag_opening_begin {
	}

	action tag_self_closing {
		self_closing = 1;
	}

	# has_value records which form the current attribute took:
	# 0 = name only, 1 = name with a value (held in pcdata), 2 = empty value.
	action attribute_begin {
		has_value = 0;
	}

	action attribute_value {
		has_value = 1;
	}

	action attribute_empty {
		has_value = 2;
	}

	# Report the attribute with pcdata, the shared empty string, or true.
	action attribute {
		if (has_value == 1) {
			rb_funcall(delegate, id_attribute, 2, Trenni_token(identifier, encoding), pcdata);
		} else if (has_value == 2) {
			rb_funcall(delegate, id_attribute, 2, Trenni_token(identifier, encoding), empty_string);
		} else {
			rb_funcall(delegate, id_attribute, 2, Trenni_token(identifier, encoding), Qtrue);
		}
	}

	action tag_opening_end {
		rb_funcall(delegate, id_open_tag_end, 1, self_closing == 1 ? Qtrue : Qfalse);
	}

	action tag_closing_begin {
	}

	# Report a closing tag: its name and the byte offset of the name.
	action tag_closing_end {
		rb_funcall(delegate, id_close_tag, 2, Trenni_token(identifier, encoding), ULONG2NUM(identifier.begin-s));
	}

	action tag_error {
		Trenni_raise_error("could not parse tag", buffer, p-s);
	}

	action cdata_begin {
		cdata.begin = p;
	}

	action cdata_end {
		cdata.end = p;

		rb_funcall(delegate, id_cdata, 1, Trenni_token(cdata, encoding));
	}

	action cdata_error {
		Trenni_raise_error("could not parse cdata", buffer, p-s);
	}

	action text_begin {
	}

	action text_end {
		// Entities are handled separately:
		rb_funcall(delegate, id_text, 1, pcdata);
	}

	include markup "trenni/markup.rl";

	write data;
}%%

// Parse the markup held by `buffer`, reporting each construct to `delegate`.
//
// self     - receiver of the Ruby method; not used here.
// buffer   - object whose id_read method is called (no arguments) to obtain
//            the input; the result must be a String, otherwise a TypeError is
//            raised. Also passed to Trenni_raise_error on failure.
// delegate - object that receives the id_doctype, id_comment, id_instruction,
//            id_open_tag_begin, id_attribute, id_open_tag_end, id_close_tag,
//            id_cdata and id_text calls made by the actions above.
// entities - object indexed with `[]` to resolve named entities.
//
// Returns Qnil. Calls Trenni_raise_error if a construct cannot be parsed or
// if the state machine stops before the end of the input.
VALUE Trenni_Native_parse_markup(VALUE self, VALUE buffer, VALUE delegate, VALUE entities) {
	VALUE string = rb_funcall(buffer, id_read, 0);

	rb_encoding *encoding;

	VALUE pcdata = Qnil;

	VALUE empty_string;

	const char *s, *p, *pe, *eof;
	unsigned long cs, top = 0, stack[2] = {0};
	unsigned long codepoint = 0;

	Token identifier = {0}, cdata = {0}, characters = {0}, entity = {0}, doctype = {0}, comment = {0}, instruction = {0};
	unsigned self_closing = 0, has_value = 0;

	// RSTRING_PTR/RSTRING_LEN are only valid on a String, so reject anything
	// else (e.g. nil) up front instead of reading through a bad pointer.
	Check_Type(string, T_STRING);

	encoding = rb_enc_get(string);

	// Shared, frozen value reported for attributes with an empty value.
	empty_string = rb_obj_freeze(rb_enc_str_new("", 0, encoding));

	// s stays at the start of the input so that p-s is a byte offset.
	s = p = RSTRING_PTR(string);
	eof = pe = p + RSTRING_LEN(string);

	%%{
		write init;
		write exec;
	}%%

	if (p != eof) {
		Trenni_raise_error("could not parse all input", buffer, p-s);
	}

	// Only raw pointers into `string` are used above while delegate callbacks
	// run Ruby code; keep the object itself visible to the garbage collector.
	RB_GC_GUARD(string);

	return Qnil;
}