#include
#include "html_tokenizer.h"
#include "tokenizer.h"
/*
 * Ruby VALUE slot for the Tokenizer class object; starts out as Qnil.
 * Nothing in the visible part of this file assigns or reads it —
 * presumably it is set by the extension's init routine (TODO confirm).
 */
static VALUE cTokenizer = Qnil;
/*
 * GC mark callback registered in ht_tokenizer_data_type.
 * Intentionally does nothing with the pointer it receives.
 */
static void tokenizer_mark(void *ptr)
{
  (void)ptr; /* unused */
}
/*
 * Free callback registered in ht_tokenizer_data_type.
 *
 * Releases the two buffers owned by the tokenizer (current_tag and
 * scan.string) when they are set, then releases the struct itself.
 * A NULL pointer is accepted and ignored.
 */
static void tokenizer_free(void *ptr)
{
  struct tokenizer_t *tk = ptr;

  if(tk == NULL) {
    return;
  }

  if(tk->current_tag != NULL) {
    DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
    xfree(tk->current_tag);
    tk->current_tag = NULL;
  }

  if(tk->scan.string != NULL) {
    DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
    xfree(tk->scan.string);
    tk->scan.string = NULL;
  }

  DBG_PRINT("tk=%p xfree(tk)", tk);
  xfree(tk);
}
/*
 * Size callback registered in ht_tokenizer_data_type.
 *
 * Returns the size of struct tokenizer_t for a non-NULL pointer and 0
 * for NULL. Buffers the struct points at are not included in the figure.
 */
static size_t tokenizer_memsize(const void *ptr)
{
  if(ptr == NULL) {
    return 0;
  }
  return sizeof(struct tokenizer_t);
}
/*
 * Typed-data descriptor for struct tokenizer_t: a type name followed by
 * the mark / free / memsize callbacks defined above. It is the descriptor
 * handed to TypedData_Make_Struct in tokenizer_allocate.
 */
const rb_data_type_t ht_tokenizer_data_type = {
"ht_tokenizer_data_type",
{ tokenizer_mark, tokenizer_free, tokenizer_memsize, },
/*
 * Only when the Ruby headers define RUBY_TYPED_FREE_IMMEDIATELY are the
 * trailing members spelled out (parent, data, flags per Ruby's
 * rb_data_type_t — verify against the targeted Ruby version); otherwise
 * they are left to default zero-initialization.
 */
#if defined(RUBY_TYPED_FREE_IMMEDIATELY)
NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
#endif
};
/*
 * Allocator for the tokenizer class.
 *
 * Wraps a freshly allocated struct tokenizer_t in a typed-data object of
 * class `klass` and fills the struct with the TOKENIZER_NONE byte value.
 *
 * klass:   class the new object is created for.
 * returns: the new wrapping object.
 */
static VALUE tokenizer_allocate(VALUE klass)
{
  VALUE obj;
  struct tokenizer_t *tokenizer = NULL;

  obj = TypedData_Make_Struct(klass, struct tokenizer_t, &ht_tokenizer_data_type, tokenizer);
  DBG_PRINT("tk=%p allocate", tokenizer);

  /*
   * Fill from the base of the struct, with the size taken from the same
   * object. The previous code started at &tokenizer->context while still
   * writing sizeof(struct tokenizer_t) bytes, which stays in bounds only
   * if `context` is the very first member. The fill byte is unchanged
   * (TOKENIZER_NONE — presumably 0; confirm in tokenizer.h).
   */
  memset(tokenizer, TOKENIZER_NONE, sizeof(*tokenizer));

  return obj;
}
/*
 * Put a tokenizer into its starting state.
 *
 * The context stack is reset to a single TOKENIZER_HTML entry, the scan
 * buffer is cleared (pointer set to NULL, cursor and length to 0), all
 * tag/attribute bookkeeping is cleared, and no callback is installed.
 * Pointers are overwritten without being freed.
 */
void tokenizer_init(struct tokenizer_t *tk)
{
  /* Context stack: one entry, HTML. */
  tk->context[0] = TOKENIZER_HTML;
  tk->current_context = 0;

  /* Scan buffer: empty. */
  tk->scan.string = NULL;
  tk->scan.length = 0;
  tk->scan.cursor = 0;

  /* Tag and attribute bookkeeping. */
  tk->current_tag = NULL;
  tk->is_closing_tag = 0;
  tk->found_attribute = 0;
  tk->attribute_value_start = 0;
  tk->last_token = TOKEN_NONE;

  /* No callback until the caller installs one. */
  tk->f_callback = NULL;
  tk->callback_data = NULL;
}
/*
 * Map a token type to the Ruby symbol that names it
 * (e.g. TOKEN_TAG_NAME -> :tag_name).
 *
 * returns: the symbol, or Qnil when `type` matches none of the cases.
 */
VALUE token_type_to_symbol(enum token_type type)
{
  VALUE symbol = Qnil;

  switch(type) {
  case TOKEN_NONE:
    symbol = ID2SYM(rb_intern("none"));
    break;
  case TOKEN_TEXT:
    symbol = ID2SYM(rb_intern("text"));
    break;
  case TOKEN_WHITESPACE:
    symbol = ID2SYM(rb_intern("whitespace"));
    break;
  case TOKEN_COMMENT_START:
    symbol = ID2SYM(rb_intern("comment_start"));
    break;
  case TOKEN_COMMENT_END:
    symbol = ID2SYM(rb_intern("comment_end"));
    break;
  case TOKEN_TAG_NAME:
    symbol = ID2SYM(rb_intern("tag_name"));
    break;
  case TOKEN_TAG_START:
    symbol = ID2SYM(rb_intern("tag_start"));
    break;
  case TOKEN_TAG_END:
    symbol = ID2SYM(rb_intern("tag_end"));
    break;
  case TOKEN_ATTRIBUTE_NAME:
    symbol = ID2SYM(rb_intern("attribute_name"));
    break;
  case TOKEN_ATTRIBUTE_QUOTED_VALUE_START:
    symbol = ID2SYM(rb_intern("attribute_quoted_value_start"));
    break;
  case TOKEN_ATTRIBUTE_QUOTED_VALUE:
    symbol = ID2SYM(rb_intern("attribute_quoted_value"));
    break;
  case TOKEN_ATTRIBUTE_QUOTED_VALUE_END:
    symbol = ID2SYM(rb_intern("attribute_quoted_value_end"));
    break;
  case TOKEN_ATTRIBUTE_UNQUOTED_VALUE:
    symbol = ID2SYM(rb_intern("attribute_unquoted_value"));
    break;
  case TOKEN_CDATA_START:
    symbol = ID2SYM(rb_intern("cdata_start"));
    break;
  case TOKEN_CDATA_END:
    symbol = ID2SYM(rb_intern("cdata_end"));
    break;
  case TOKEN_SOLIDUS:
    symbol = ID2SYM(rb_intern("solidus"));
    break;
  case TOKEN_EQUAL:
    symbol = ID2SYM(rb_intern("equal"));
    break;
  case TOKEN_MALFORMED:
    symbol = ID2SYM(rb_intern("malformed"));
    break;
  }

  return symbol;
}
/*
 * Token callback that yields to the current Ruby block.
 *
 * Records `type` as the tokenizer's last token, then yields three values:
 * the token's symbol, the current scan cursor, and cursor + length.
 *
 * tk:     tokenizer whose scan cursor marks the token's start.
 * type:   kind of token being reported.
 * length: length of the token, added to the cursor for the third value.
 * data:   unused; present to match the callback signature.
 */
static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
{
  (void)data;

  tk->last_token = type;

  /*
   * `length` is an unsigned long, so cursor + length is at least that
   * wide. INT2NUM would narrow both offsets to int and corrupt them for
   * large inputs; ULONG2NUM converts the full unsigned long range.
   */
  rb_yield_values(3, token_type_to_symbol(type),
    ULONG2NUM(tk->scan.cursor), ULONG2NUM(tk->scan.cursor + length));
}
/*
 * Report a token of `length` characters at the current cursor, then
 * advance the cursor past it.
 *
 * The installed callback, if any, is called before the cursor moves, so
 * it sees the cursor at the start of the token. The cursor advances
 * whether or not a callback is installed.
 */
static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
{
  if(tk->f_callback != NULL) {
    tk->f_callback(tk, type, length, tk->callback_data);
  }

  tk->scan.cursor = tk->scan.cursor + length;
}
/*
 * Initializer method body: fetches the struct wrapped by `self`, resets
 * it with tokenizer_init, and installs tokenizer_yield_tag as its token
 * callback. Always returns Qnil.
 */
static VALUE tokenizer_initialize_method(VALUE self)
{
  struct tokenizer_t *tokenizer;

  tokenizer = NULL;
  Tokenizer_Get_Struct(self, tokenizer);
  DBG_PRINT("tk=%p initialize", tokenizer);

  /* Reset first: tokenizer_init clears f_callback, so install it after. */
  tokenizer_init(tokenizer);
  tokenizer->f_callback = tokenizer_yield_tag;

  return Qnil;
}
/*
 * End-of-scan test: returns 1 when the cursor is at or past the scan
 * length, 0 while it is still below it.
 */
static inline int eos(struct scan_t *scan)
{
  if(scan->cursor < scan->length) {
    return 0;
  }
  return 1;
}
/*
 * Distance from the cursor to the scan length (length - cursor).
 * No check is made that the cursor is within the length.
 */
static inline long unsigned int length_remaining(struct scan_t *scan)
{
  long unsigned int remaining = scan->length - scan->cursor;

  return remaining;
}
/*
 * Push `ctx` onto the context stack: advance current_context by one and
 * store `ctx` in the new top slot. No bounds check is performed here.
 */
static inline void push_context(struct tokenizer_t *tk, enum tokenizer_context ctx)
{
  tk->current_context++;
  tk->context[tk->current_context] = ctx;
}
/*
 * Pop the context stack: overwrite the current top slot with
 * TOKENIZER_NONE, then step current_context down by one.
 * No underflow check is performed here.
 */
static inline void pop_context(struct tokenizer_t *tk)
{
  tk->context[tk->current_context] = TOKENIZER_NONE;
  tk->current_context--;
}
/*
 * Measure the run of characters from the cursor up to (not including)
 * the next '<', or up to the scan length if there is none.
 *
 * length:  out-parameter; receives the size of that run (may be 0).
 * returns: non-zero when the run is non-empty, 0 otherwise.
 */
static int is_text(struct scan_t *scan, long unsigned int *length)
{
  long unsigned int pos;

  *length = 0;
  pos = scan->cursor;

  while(pos < scan->length && scan->string[pos] != '<') {
    pos++;
    (*length)++;
  }

  return *length > 0;
}
static inline int is_comment_start(struct scan_t *scan)
{
return (length_remaining(scan) >= 4) &&
!strncmp((const char *)&scan->string[scan->cursor], "