Sha256: 9d710e06ca6aa9a1da742ec251e20923f6b504cbd80b59df0a21519ef1774fd1

Contents?: true

Size: 1.76 KB

Versions: 2

Stored size: 1.76 KB

Contents

#pragma once

#include <ruby.h>   // VALUE, rb_data_type_t, TypedData_Get_Struct
#include <stdint.h> // uint32_t

enum tokenizer_context {
  TOKENIZER_NONE = 0,
  TOKENIZER_HTML,
  TOKENIZER_OPEN_TAG,
  TOKENIZER_SOLIDUS_OR_TAG_NAME,
  TOKENIZER_TAG_NAME,
  TOKENIZER_CDATA,
  TOKENIZER_RCDATA, // title, textarea
  TOKENIZER_RAWTEXT, // style, xmp, iframe, noembed, noframes
  TOKENIZER_SCRIPT_DATA, // script
  TOKENIZER_PLAINTEXT, // plaintext
  TOKENIZER_COMMENT,
  TOKENIZER_ATTRIBUTE_NAME,
  TOKENIZER_ATTRIBUTE_VALUE,
  TOKENIZER_ATTRIBUTE_UNQUOTED,
  TOKENIZER_ATTRIBUTE_QUOTED,
};

enum token_type {
  TOKEN_NONE = 0,
  TOKEN_TEXT,
  TOKEN_WHITESPACE,
  TOKEN_COMMENT_START,
  TOKEN_COMMENT_END,
  TOKEN_TAG_START,
  TOKEN_TAG_NAME,
  TOKEN_TAG_END,
  TOKEN_ATTRIBUTE_NAME,
  TOKEN_ATTRIBUTE_QUOTED_VALUE_START,
  TOKEN_ATTRIBUTE_QUOTED_VALUE,
  TOKEN_ATTRIBUTE_QUOTED_VALUE_END,
  TOKEN_ATTRIBUTE_UNQUOTED_VALUE,
  TOKEN_CDATA_START,
  TOKEN_CDATA_END,
  TOKEN_SOLIDUS,
  TOKEN_EQUAL,
  TOKEN_MALFORMED,
};

struct scan_t {
  char *string;                // input buffer being scanned
  long unsigned int cursor;    // current byte offset into string
  long unsigned int length;    // length of string in bytes

  int enc_index;               // Ruby encoding index of the input
  long unsigned int mb_cursor; // cursor position counted in characters (multibyte-aware)
};

struct tokenizer_t {
  enum tokenizer_context context[1000]; // stack of tokenizer contexts
  uint32_t current_context;             // index of the top of the context stack

  void *callback_data;                  // opaque pointer handed back to f_callback
  void (*f_callback)(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data);

  char attribute_value_start;           // quote character that opened the current attribute value
  int found_attribute;

  char *current_tag;                    // name of the tag currently being tokenized

  int is_closing_tag;
  enum token_type last_token;

  struct scan_t scan;                   // scanner state (buffer, cursors, encoding)
};


void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
void tokenizer_init(struct tokenizer_t *tk);
void tokenizer_scan_all(struct tokenizer_t *tk);
VALUE token_type_to_symbol(enum token_type type);

extern const rb_data_type_t ht_tokenizer_data_type;
#define Tokenizer_Get_Struct(obj, sval) TypedData_Get_Struct(obj, struct tokenizer_t, &ht_tokenizer_data_type, sval)
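
The header describes a callback-driven C API: tokenizer_init() prepares a struct tokenizer_t, the caller installs f_callback (and optionally callback_data), and tokenizer_scan_all() walks the scan buffer, invoking the callback once per token. Below is a minimal sketch of driving it directly from C. Note the assumptions: the scan buffer is populated by hand, and the tokenizer is run without the Ruby VM, whereas the gem normally feeds it through the Ruby bindings registered by Init_html_tokenizer_tokenizer.

#include <stdio.h>
#include <string.h>
#include "tokenizer.h"

/* Print the numeric token type and its length; the callback receives the
   tokenizer itself, so token text could also be recovered from tk->scan. */
static void print_token(struct tokenizer_t *tk, enum token_type type,
                        long unsigned int length, void *data)
{
  (void)tk;
  (void)data;
  printf("token type=%d length=%lu\n", (int)type, length);
}

int main(void)
{
  static char html[] = "<p class=\"x\">hi</p>";

  struct tokenizer_t tk;
  tokenizer_init(&tk);

  tk.f_callback = print_token;
  tk.callback_data = NULL;

  /* Assumption: the scan buffer can be assigned directly when bypassing
     the Ruby bindings; the gem normally sets this up from Ruby strings. */
  tk.scan.string = html;
  tk.scan.length = strlen(html);

  tokenizer_scan_all(&tk);
  return 0;
}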

Version data entries

2 entries across 2 versions & 1 rubygem

Version               Path
html_tokenizer-0.0.3  ext/html_tokenizer_ext/tokenizer.h
html_tokenizer-0.0.2  ext/html_tokenizer_ext/tokenizer.h