#include #include #if defined(_WIN32) #include #endif #ifdef HAVE_RUBY_RE_H #include #else #include #endif #ifdef HAVE_RUBY_ENCODING_H #include #define ENCODED_STR_NEW(ptr, len) \ rb_enc_str_new(ptr, len, rb_utf8_encoding()) #else #define ENCODED_STR_NEW(ptr, len) \ rb_str_new(ptr, len) #endif #define LF_FLAG 0 #define CRLF_FLAG 1 #define LF "\n" #define CRLF "\r\n" #ifndef RSTRING_PTR #define RSTRING_PTR(s) (RSTRING(s)->ptr) #endif #ifndef RSTRING_LEN #define RSTRING_LEN(s) (RSTRING(s)->len) #endif #define DATA_GET(FROM, TYPE, NAME) \ Data_Get_Struct(FROM, TYPE, NAME); \ if (NAME == NULL) { \ rb_raise(rb_eArgError, "NULL found for " # NAME " when it shouldn't be."); \ } typedef struct lexer_state { int content_len; int line_number; int current_line; int start_col; int eol; size_t mark; size_t keyword_start; size_t keyword_end; size_t next_keyword_start; size_t content_start; size_t content_end; size_t query_start; size_t last_newline; size_t final_newline; } lexer_state; static VALUE mGherkin; static VALUE mGherkinLexer; static VALUE mCLexer; static VALUE cI18nLexer; static VALUE rb_eGherkinLexingError; #define LEN(AT, P) (P - data - lexer->AT) #define MARK(M, P) (lexer->M = (P) - data) #define PTR_TO(P) (data + lexer->P) #define STORE_KW_END_CON(EVENT) \ store_multiline_kw_con(listener, # EVENT, \ PTR_TO(keyword_start), LEN(keyword_start, PTR_TO(keyword_end - 1)), \ PTR_TO(content_start), LEN(content_start, PTR_TO(content_end)), \ lexer->current_line, lexer->eol); \ if (lexer->content_end != 0) { \ p = PTR_TO(content_end - 1); \ } \ lexer->content_end = 0 #define STORE_ATTR(ATTR) \ store_attr(listener, # ATTR, \ PTR_TO(content_start), LEN(content_start, p), \ lexer->line_number) %%{ machine lexer; action begin_content { MARK(content_start, p); lexer->current_line = lexer->line_number; } action begin_pystring_content { MARK(content_start, p); } action start_pystring { lexer->current_line = lexer->line_number; lexer->start_col = p - data - lexer->last_newline; } action store_pystring_content { int len = LEN(content_start, PTR_TO(final_newline)); if (len < 0) len = 0; store_pystring_content(listener, lexer->start_col, PTR_TO(content_start), len, lexer->current_line); } action store_feature_content { STORE_KW_END_CON(feature); } action store_background_content { STORE_KW_END_CON(background); } action store_scenario_content { STORE_KW_END_CON(scenario); } action store_scenario_outline_content { STORE_KW_END_CON(scenario_outline); } action store_examples_content { STORE_KW_END_CON(examples); } action store_step_content { store_kw_con(listener, "step", PTR_TO(keyword_start), LEN(keyword_start, PTR_TO(keyword_end)), PTR_TO(content_start), LEN(content_start, p), lexer->current_line); } action store_comment_content { STORE_ATTR(comment); lexer->mark = 0; } action store_tag_content { STORE_ATTR(tag); lexer->mark = 0; } action inc_line_number { lexer->line_number += 1; MARK(final_newline, p); } action last_newline { MARK(last_newline, p + 1); } action start_keyword { if (lexer->mark == 0) { MARK(mark, p); } } action end_keyword { MARK(keyword_end, p); MARK(keyword_start, PTR_TO(mark)); MARK(content_start, p + 1); lexer->mark = 0; } action next_keyword_start { MARK(content_end, p); } action start_row { p = p - 1; lexer->current_line = lexer->line_number; current_row = rb_ary_new(); } action begin_cell_content { MARK(content_start, p); } action store_cell_content { VALUE con = ENCODED_STR_NEW(PTR_TO(content_start), LEN(content_start, p)); rb_funcall(con, rb_intern("strip!"), 0); VALUE re_pipe = rb_reg_regcomp(rb_str_new2("\\\\\\|")); VALUE re_backslash = rb_reg_regcomp(rb_str_new2("\\\\\\\\")); rb_funcall(con, rb_intern("gsub!"), 2, re_pipe, rb_str_new2("|")); rb_funcall(con, rb_intern("gsub!"), 2, re_backslash, rb_str_new2("\\")); rb_ary_push(current_row, con); } action store_row { rb_funcall(listener, rb_intern("row"), 2, current_row, INT2FIX(lexer->current_line)); } action end_feature { if (cs < lexer_first_final) { if (raise_lexer_error != NULL) { size_t count = 0; VALUE newstr_val; char *newstr; int newstr_count = 0; size_t len; const char *buff; if (lexer->last_newline != 0) { len = LEN(last_newline, eof); buff = PTR_TO(last_newline); } else { len = strlen(data); buff = data; } // Allocate as a ruby string so that it gets cleaned up by GC newstr_val = rb_str_new(buff, len); newstr = RSTRING_PTR(newstr_val); for (count = 0; count < len; count++) { if(buff[count] == 10) { newstr[newstr_count] = '\0'; // terminate new string at first newline found break; } else { if (buff[count] == '%') { newstr[newstr_count++] = buff[count]; newstr[newstr_count] = buff[count]; } else { newstr[newstr_count] = buff[count]; } } newstr_count++; } int line = lexer->line_number; lexer_init(lexer); // Re-initialize so we can scan again with the same lexer raise_lexer_error(newstr, line); } } else { rb_funcall(listener, rb_intern("eof"), 0); } } include lexer_common "lexer_common.<%= @i18n.underscored_iso_code %>.rl"; }%% /** Data **/ %% write data; static VALUE strip_i(VALUE str, VALUE ary) { rb_funcall(str, rb_intern("strip!"), 0); rb_ary_push(ary, str); return Qnil; } static VALUE multiline_strip(VALUE text) { VALUE map = rb_ary_new(); VALUE split = rb_str_split(text, "\n"); rb_iterate(rb_each, split, strip_i, map); return split; } static void store_kw_con(VALUE listener, const char * event_name, const char * keyword_at, size_t keyword_length, const char * at, size_t length, int current_line) { VALUE con = Qnil, kw = Qnil; kw = ENCODED_STR_NEW(keyword_at, keyword_length); con = ENCODED_STR_NEW(at, length); rb_funcall(con, rb_intern("strip!"), 0); rb_funcall(listener, rb_intern(event_name), 3, kw, con, INT2FIX(current_line)); } static void store_multiline_kw_con(VALUE listener, const char * event_name, const char * keyword_at, size_t keyword_length, const char * at, size_t length, int current_line, int eol) { VALUE con = Qnil, kw = Qnil, name = Qnil, desc = Qnil; kw = ENCODED_STR_NEW(keyword_at, keyword_length); con = ENCODED_STR_NEW(at, length); VALUE split = multiline_strip(con); name = rb_funcall(split, rb_intern("shift"), 0); desc = rb_ary_join(split, rb_str_new2( \ eol == CRLF_FLAG ? CRLF : LF )); if( name == Qnil ) { name = rb_str_new2(""); } if( rb_funcall(desc, rb_intern("size"), 0) == 0) { desc = rb_str_new2(""); } rb_funcall(name, rb_intern("strip!"), 0); rb_funcall(desc, rb_intern("strip!"), 0); rb_funcall(listener, rb_intern(event_name), 4, kw, name, desc, INT2FIX(current_line)); } static void store_attr(VALUE listener, const char * attr_type, const char * at, size_t length, int line) { VALUE val = ENCODED_STR_NEW(at, length); rb_funcall(listener, rb_intern(attr_type), 2, val, INT2FIX(line)); } static void store_pystring_content(VALUE listener, int start_col, const char *at, size_t length, int current_line) { VALUE con = ENCODED_STR_NEW(at, length); // Gherkin will crash gracefully if the string representation of start_col pushes the pattern past 32 characters char pat[32]; snprintf(pat, 32, "^[\t ]{0,%d}", start_col); VALUE re = rb_reg_regcomp(rb_str_new2(pat)); VALUE re2 = rb_reg_regcomp(rb_str_new2("\r\\Z")); VALUE unescape_escaped_quotes = rb_reg_regcomp(rb_str_new2("\\\\\"\\\\\"\\\\\"")); rb_funcall(con, rb_intern("gsub!"), 2, re, rb_str_new2("")); rb_funcall(con, rb_intern("sub!"), 2, re2, rb_str_new2("")); rb_funcall(con, rb_intern("gsub!"), 2, unescape_escaped_quotes, rb_str_new2("\"\"\"")); rb_funcall(listener, rb_intern("py_string"), 2, con, INT2FIX(current_line)); } static void raise_lexer_error(const char * at, int line) { rb_raise(rb_eGherkinLexingError, "Lexing error on line %d: '%s'. See http://wiki.github.com/aslakhellesoy/gherkin/lexingerror for more information.", line, at); } static int count_char(char char_to_count, char *str) { int count = 0; int i = 0; while(str[i] != '\0') { if(str[i] == char_to_count) { count++; } i++; } return count; } static void lexer_init(lexer_state *lexer) { lexer->content_start = 0; lexer->content_end = 0; lexer->content_len = 0; lexer->mark = 0; lexer->keyword_start = 0; lexer->keyword_end = 0; lexer->next_keyword_start = 0; lexer->line_number = 1; lexer->last_newline = 0; lexer->final_newline = 0; lexer->start_col = 0; lexer->eol = LF_FLAG; } static VALUE CLexer_alloc(VALUE klass) { VALUE obj; lexer_state *lxr = ALLOC(lexer_state); lexer_init(lxr); obj = Data_Wrap_Struct(klass, NULL, -1, lxr); return obj; } static VALUE CLexer_init(VALUE self, VALUE listener) { rb_iv_set(self, "@listener", listener); lexer_state *lxr = NULL; DATA_GET(self, lexer_state, lxr); lexer_init(lxr); return self; } static VALUE CLexer_scan(VALUE self, VALUE input) { VALUE listener = rb_iv_get(self, "@listener"); lexer_state *lexer = NULL; DATA_GET(self, lexer_state, lexer); VALUE input_copy = rb_str_dup(input); rb_str_append(input_copy, rb_str_new2("\n%_FEATURE_END_%")); char *data = RSTRING_PTR(input_copy); size_t len = RSTRING_LEN(input_copy); if (count_char('\r', data) > (count_char('\n', data) / 2)) { lexer->eol = CRLF_FLAG; } if (len == 0) { rb_raise(rb_eGherkinLexingError, "No content to lex."); } else { const char *p, *pe, *eof; int cs = 0; VALUE current_row = Qnil; p = data; pe = data + len; eof = pe; assert(*pe == '\0' && "pointer does not end on NULL"); %% write init; %% write exec; assert(p <= pe && "data overflow after parsing execute"); assert(lexer->content_start <= len && "content starts after data end"); assert(lexer->mark < len && "mark is after data end"); // Reset lexer by re-initializing the whole thing lexer_init(lexer); if (cs == lexer_error) { rb_raise(rb_eGherkinLexingError, "Invalid format, lexing fails."); } else { return Qtrue; } } } void Init_gherkin_lexer_<%= @i18n.underscored_iso_code %>() { mGherkin = rb_define_module("Gherkin"); mGherkinLexer = rb_define_module_under(mGherkin, "Lexer"); rb_eGherkinLexingError = rb_const_get(mGherkinLexer, rb_intern("LexingError")); mCLexer = rb_define_module_under(mGherkin, "CLexer"); cI18nLexer = rb_define_class_under(mCLexer, "<%= @i18n.underscored_iso_code.capitalize %>", rb_cObject); rb_define_alloc_func(cI18nLexer, CLexer_alloc); rb_define_method(cI18nLexer, "initialize", CLexer_init, 1); rb_define_method(cI18nLexer, "scan", CLexer_scan, 1); }