ext/linguist/tokenizer.l in github-linguist-7.11.1 vs ext/linguist/tokenizer.l in github-linguist-7.12.0

- old
+ new

@@ -1,14 +1,39 @@
 %{
 
-#include "linguist.h"
+#include "ruby.h"
 
-#define feed_token(tok, typ) do { \
-    yyextra->token = (tok); \
-    yyextra->type = (typ); \
-  } while (0)
+// Anything longer is unlikely to be useful.
+#define MAX_TOKEN_LEN 32
+
+#define FEED2(s, l) do { \
+    const char* __s = (s); \
+    const size_t __l = (l); \
+    if ((__l) > MAX_TOKEN_LEN) \
+      break; \
+    *yyextra = rb_str_new(__s, __l); \
+  } while(0)
+
+#define FEED1(s) FEED2(s, strlen(s))
+
+#define FEED() FEED2(yytext, yyleng)
+
+#define FEED_SHEBANG(s) do { \
+    const size_t __l = strlen(s); \
+    if (__l > MAX_TOKEN_LEN) \
+      break; \
+    *yyextra = rb_str_new("SHEBANG#!", sizeof("SHEBANG#!") - 1); \
+    rb_str_cat(*yyextra, s, __l); \
+  } while(0)
+
+#define FEED_SGML() do { \
+    if (yyleng > MAX_TOKEN_LEN) \
+      break; \
+    *yyextra = rb_str_new(yytext, yyleng); \
+    rb_str_cat(*yyextra, ">", 1); \
+  } while(0)
 
 #define eat_until_eol() do { \
     int c; \
     while ((c = input(yyscanner)) != '\n' && c != EOF && c); \
     if (c == EOF || !c) \
       return 0; \
@@ -30,41 +55,41 @@
       return 0; \
   } while (0)
 
 %}
 
-%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
+%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="VALUE*" prefix="linguist_yy"
 %x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
 
 %%
 
 ^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
-    const char *off = strrchr(yytext, ' ');
-    if (!off)
-      off = yytext;
-    else
-      ++off;
-    feed_token(strdup(off), SHEBANG_TOKEN);
+  const char *off = strrchr(yytext, ' ');
+  if (!off)
+    off = yytext;
+  else
+    ++off;
+  FEED_SHEBANG(off);
+  eat_until_eol();
+  return 1;
+}
+
+^#![ \t]*[[:alpha:]_\/]+ {
+  const char *off = strrchr(yytext, '/');
+  if (!off)
+    off = yytext;
+  else
+    ++off;
+  if (strcmp(off, "env") == 0) {
     eat_until_eol();
+  } else {
+    FEED_SHEBANG(off);
+    eat_until_eol();
     return 1;
   }
+}
 
-^#![ \t]*[[:alpha:]_\/]+ {
-    const char *off = strrchr(yytext, '/');
-    if (!off)
-      off = yytext;
-    else
-      ++off;
-    if (strcmp(off, "env") == 0) {
-      eat_until_eol();
-    } else {
-      feed_token(strdup(off), SHEBANG_TOKEN);
-      eat_until_eol();
-      return 1;
-    }
-  }
-
 ^[ \t]*(\/\/|--|\#|%|\")" ".* { /* nothing */ }
 
 "/*" { BEGIN(c_comment); }
 /* See below for xml_comment start. */
 "{-" { BEGIN(haskell_comment); }
@@ -86,34 +111,34 @@
 (0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
 
 \<[[:alnum:]_!./?-]+ {
   if (strcmp(yytext, "<!--") == 0) {
     BEGIN(xml_comment);
   } else {
-    feed_token(strdup(yytext), SGML_TOKEN);
+    FEED_SGML();
     BEGIN(sgml);
     return 1;
   }
 }
-<sgml>[[:alnum:]_]+=\" { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('"'); return 1; }
-<sgml>[[:alnum:]_]+=' { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('\''); return 1; }
-<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
-<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+<sgml>[[:alnum:]_]+=\" { FEED2(yytext, yyleng - 1); eat_until_unescaped('"'); return 1; }
+<sgml>[[:alnum:]_]+=' { FEED2(yytext, yyleng - 1); eat_until_unescaped('\''); return 1; }
+<sgml>[[:alnum:]_]+=[[:alnum:]_]* { FEED2(yytext, strchr(yytext, '=') - yytext + 1); return 1; }
+<sgml>[[:alnum:]_]+ { FEED(); return 1; }
 <sgml>\> { BEGIN(INITIAL); }
 <sgml>.|\n { /* nothing */ }
-;|\{|\}|\(|\)|\[|\] { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+;|\{|\}|\(|\)|\[|\] { FEED(); return 1; }
 [[:alnum:]_.@#/*]+ {
   if (strncmp(yytext, "/*", 2) == 0) {
-    if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
+    if (yyleng >= 4 && strcmp(yytext + yyleng - 2, "*/") == 0) {
       /* nothing */
     } else {
      BEGIN(c_comment);
    }
  } else {
-    feed_token(strdup(yytext), REGULAR_TOKEN);
+    FEED();
    return 1;
  }
}
-\<\<?|\+|\-|\*|\/|%|&&?|\|\|? { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+\<\<?|\+|\-|\*|\/|%|&&?|\|\|? { FEED(); return 1; }
 .|\n { /* nothing */ }
 
 %%