#include <ruby.h>
#include <ruby/encoding.h>

#include "html_tokenizer.h"
#include "tokenizer.h"

static VALUE cTokenizer = Qnil;

/* the tokenizer holds no Ruby objects of its own, so there is nothing to mark for GC */
static void tokenizer_mark(void *ptr)
{}

/* release the buffers owned by the tokenizer, then the struct itself */
static void tokenizer_free(void *ptr)
{
  struct tokenizer_t *tk = ptr;
  if(tk) {
    if(tk->current_tag) {
      DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
      xfree(tk->current_tag);
      tk->current_tag = NULL;
    }
    if(tk->scan.string) {
      DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
      xfree(tk->scan.string);
      tk->scan.string = NULL;
    }
    DBG_PRINT("tk=%p xfree(tk)", tk);
    xfree(tk);
  }
}

static size_t tokenizer_memsize(const void *ptr)
{
  return ptr ? sizeof(struct tokenizer_t) : 0;
}

/* TypedData description so Ruby's GC knows how to mark, free and size a tokenizer */
const rb_data_type_t ht_tokenizer_data_type = {
  "ht_tokenizer_data_type",
  { tokenizer_mark, tokenizer_free, tokenizer_memsize, },
#if defined(RUBY_TYPED_FREE_IMMEDIATELY)
  NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
#endif
};

static VALUE tokenizer_allocate(VALUE klass)
{
  VALUE obj;
  struct tokenizer_t *tokenizer = NULL;

  obj = TypedData_Make_Struct(klass, struct tokenizer_t, &ht_tokenizer_data_type, tokenizer);
  DBG_PRINT("tk=%p allocate", tokenizer);

  /* zero-fill the entire struct (and with it the context stack) before first use */
  memset(tokenizer, TOKENIZER_NONE, sizeof(struct tokenizer_t));

  return obj;
}

void tokenizer_init(struct tokenizer_t *tk)
{
  tk->current_context = 0;
  tk->context[0] = TOKENIZER_HTML;

  tk->scan.string = NULL;
  tk->scan.cursor = 0;
  tk->scan.length = 0;
  tk->scan.mb_cursor = 0;
  tk->scan.enc_index = 0;

  tk->attribute_value_start = 0;
  tk->found_attribute = 0;
  tk->current_tag = NULL;
  tk->is_closing_tag = 0;
  tk->last_token = TOKEN_NONE;
  tk->callback_data = NULL;
  tk->f_callback = NULL;

  return;
}

/* map a token_type to the Ruby symbol yielded to the caller */
VALUE token_type_to_symbol(enum token_type type)
{
  switch(type) {
  case TOKEN_NONE: return ID2SYM(rb_intern("none"));
  case TOKEN_TEXT: return ID2SYM(rb_intern("text"));
  case TOKEN_WHITESPACE: return ID2SYM(rb_intern("whitespace"));
  case TOKEN_COMMENT_START: return ID2SYM(rb_intern("comment_start"));
  case TOKEN_COMMENT_END: return ID2SYM(rb_intern("comment_end"));
  case TOKEN_TAG_NAME: return ID2SYM(rb_intern("tag_name"));
  case TOKEN_TAG_START: return ID2SYM(rb_intern("tag_start"));
  case TOKEN_TAG_END: return ID2SYM(rb_intern("tag_end"));
  case TOKEN_ATTRIBUTE_NAME: return ID2SYM(rb_intern("attribute_name"));
  case TOKEN_ATTRIBUTE_QUOTED_VALUE_START: return ID2SYM(rb_intern("attribute_quoted_value_start"));
  case TOKEN_ATTRIBUTE_QUOTED_VALUE: return ID2SYM(rb_intern("attribute_quoted_value"));
  case TOKEN_ATTRIBUTE_QUOTED_VALUE_END: return ID2SYM(rb_intern("attribute_quoted_value_end"));
  case TOKEN_ATTRIBUTE_UNQUOTED_VALUE: return ID2SYM(rb_intern("attribute_unquoted_value"));
  case TOKEN_CDATA_START: return ID2SYM(rb_intern("cdata_start"));
  case TOKEN_CDATA_END: return ID2SYM(rb_intern("cdata_end"));
  case TOKEN_SOLIDUS: return ID2SYM(rb_intern("solidus"));
  case TOKEN_EQUAL: return ID2SYM(rb_intern("equal"));
  case TOKEN_MALFORMED: return ID2SYM(rb_intern("malformed"));
  }
  return Qnil;
}

/* convert a byte length at the current cursor into a character count for the scanner's encoding */
static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
{
  rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
  const char *buf = tk->scan.string + tk->scan.cursor;
  return rb_enc_strlen(buf, buf + length, enc);
}

/* default callback: yield (type, start, stop) character offsets to the Ruby block */
static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
{
  long unsigned int mb_length = tokenizer_mblength(tk, length);
  tk->last_token = type;
  rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
}
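/*
 * Note on offsets: scan.cursor and the `length` arguments are byte offsets
 * into the raw input, while scan.mb_cursor and the values yielded to Ruby
 * above are character offsets computed with rb_enc_strlen().  For UTF-8
 * input such as "é<p>", the leading text token is 2 bytes but 1 character,
 * so cursor advances by 2 while mb_cursor advances by 1.
 */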
/* invoke the installed callback for a token, then advance both the byte and character cursors */
static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
{
  long unsigned int mb_length = tokenizer_mblength(tk, length);
  if(tk->f_callback)
    tk->f_callback(tk, type, length, tk->callback_data);
  tk->scan.cursor += length;
  tk->scan.mb_cursor += mb_length;
}

static VALUE tokenizer_initialize_method(VALUE self)
{
  struct tokenizer_t *tk = NULL;

  Tokenizer_Get_Struct(self, tk);

  DBG_PRINT("tk=%p initialize", tk);

  tokenizer_init(tk);
  tk->f_callback = tokenizer_yield_tag;

  return Qnil;
}

static inline int eos(struct scan_t *scan)
{
  return scan->cursor >= scan->length;
}

static inline long unsigned int length_remaining(struct scan_t *scan)
{
  return scan->length - scan->cursor;
}

static inline void push_context(struct tokenizer_t *tk, enum tokenizer_context ctx)
{
  tk->context[++tk->current_context] = ctx;
}

static inline void pop_context(struct tokenizer_t *tk)
{
  tk->context[tk->current_context--] = TOKENIZER_NONE;
}

/* measure the run of text from the cursor up to the next '<', if any */
static int is_text(struct scan_t *scan, long unsigned int *length)
{
  long unsigned int i;

  *length = 0;
  for(i = scan->cursor; i < scan->length; i++, (*length)++) {
    if(scan->string[i] == '<')
      break;
  }
  return *length != 0;
}

/* true when the cursor sits on a "<!--" comment opener */
static inline int is_comment_start(struct scan_t *scan)
{
  return (length_remaining(scan) >= 4) &&
    !strncmp((const char *)&scan->string[scan->cursor], "<!--", 4);
}
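/*
 * The extension's Init function is not part of this excerpt.  As a rough,
 * illustrative sketch only (the entry-point name, the "HtmlTokenizer" module
 * and the "Tokenizer" class name are assumptions, not taken from this file),
 * the allocator and initialize method above would typically be registered
 * along these lines:
 *
 *   void Init_tokenizer(void)
 *   {
 *     VALUE mHtmlTokenizer = rb_define_module("HtmlTokenizer");
 *     cTokenizer = rb_define_class_under(mHtmlTokenizer, "Tokenizer", rb_cObject);
 *     rb_define_alloc_func(cTokenizer, tokenizer_allocate);
 *     rb_define_method(cTokenizer, "initialize", tokenizer_initialize_method, 0);
 *   }
 */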