ext/linguist/tokenizer.l in github-linguist-7.11.1 vs ext/linguist/tokenizer.l in github-linguist-7.12.0
- old (github-linguist 7.11.1)
+ new (github-linguist 7.12.0)
@@ -1,14 +1,39 @@
%{
-#include "linguist.h"
+#include "ruby.h"
-#define feed_token(tok, typ) do { \
- yyextra->token = (tok); \
- yyextra->type = (typ); \
- } while (0)
+// Anything longer is unlikely to be useful.
+#define MAX_TOKEN_LEN 32
+#define FEED2(s, l) do { \
+ const char* __s = (s); \
+ const size_t __l = (l); \
+ if ((__l) > MAX_TOKEN_LEN) \
+ break; \
+ *yyextra = rb_str_new(__s, __l); \
+ } while(0)
+
+#define FEED1(s) FEED2(s, strlen(s))
+
+#define FEED() FEED2(yytext, yyleng)
+
+#define FEED_SHEBANG(s) do { \
+ const size_t __l = strlen(s); \
+ if (__l > MAX_TOKEN_LEN) \
+ break; \
+ *yyextra = rb_str_new("SHEBANG#!", sizeof("SHEBANG#!") - 1); \
+ rb_str_cat(*yyextra, s, __l); \
+ } while(0)
+
+#define FEED_SGML() do { \
+ if (yyleng > MAX_TOKEN_LEN) \
+ break; \
+ *yyextra = rb_str_new(yytext, yyleng); \
+ rb_str_cat(*yyextra, ">", 1); \
+ } while(0)
+
#define eat_until_eol() do { \
int c; \
while ((c = input(yyscanner)) != '\n' && c != EOF && c); \
if (c == EOF || !c) \
return 0; \
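
Note on this hunk: the tokenizer no longer depends on linguist.h or a struct tokenizer_extra carrying a malloc'd token string plus a type tag. Each FEED* macro now builds a Ruby string with rb_str_new / rb_str_cat and stores it through the VALUE* installed as the scanner's extra data, and any lexeme longer than MAX_TOKEN_LEN (32 bytes) is silently dropped instead of fed. Below is a minimal sketch of how a caller could drive the reworked scanner; it assumes only the standard flex reentrant API implied by the %option line in the next hunk (prefix "linguist_yy", header lex.linguist_yy.h) and is not the actual driver in ext/linguist/linguist.c.

    #include "ruby.h"
    #include "lex.linguist_yy.h"

    /* Sketch only: collect every token fed by the scanner into a Ruby array. */
    static VALUE tokenize_sketch(const char *data, long len) {
        VALUE tokens = rb_ary_new();
        VALUE token = Qnil;                /* slot the FEED* macros write into */
        yyscan_t scanner;

        linguist_yylex_init_extra(&token, &scanner);
        linguist_yy_scan_bytes(data, (int)len, scanner);

        /* Each matching rule returns 1; a lexeme longer than MAX_TOKEN_LEN
           still returns 1 but leaves the slot untouched, hence the reset and
           the Qnil check. 0 means end of input. */
        while (linguist_yylex(scanner)) {
            if (!NIL_P(token))
                rb_ary_push(tokens, token);
            token = Qnil;
        }

        linguist_yylex_destroy(scanner);
        return tokens;
    }
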
@@ -30,41 +55,41 @@
return 0; \
} while (0)
%}
-%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="struct tokenizer_extra *" prefix="linguist_yy"
+%option never-interactive yywrap reentrant nounput warn nodefault header-file="lex.linguist_yy.h" extra-type="VALUE*" prefix="linguist_yy"
%x sgml c_comment xml_comment haskell_comment ocaml_comment python_dcomment python_scomment
%%
^#![ \t]*([[:alnum:]_\/]*\/)?env([ \t]+([^ \t=]*=[^ \t]*))*[ \t]+[[:alpha:]_]+ {
- const char *off = strrchr(yytext, ' ');
- if (!off)
- off = yytext;
- else
- ++off;
- feed_token(strdup(off), SHEBANG_TOKEN);
+ const char *off = strrchr(yytext, ' ');
+ if (!off)
+ off = yytext;
+ else
+ ++off;
+ FEED_SHEBANG(off);
+ eat_until_eol();
+ return 1;
+}
+
+^#![ \t]*[[:alpha:]_\/]+ {
+ const char *off = strrchr(yytext, '/');
+ if (!off)
+ off = yytext;
+ else
+ ++off;
+ if (strcmp(off, "env") == 0) {
eat_until_eol();
+ } else {
+ FEED_SHEBANG(off);
+ eat_until_eol();
return 1;
}
+}
-^#![ \t]*[[:alpha:]_\/]+ {
- const char *off = strrchr(yytext, '/');
- if (!off)
- off = yytext;
- else
- ++off;
- if (strcmp(off, "env") == 0) {
- eat_until_eol();
- } else {
- feed_token(strdup(off), SHEBANG_TOKEN);
- eat_until_eol();
- return 1;
- }
- }
-
^[ \t]*(\/\/|--|\#|%|\")" ".* { /* nothing */ }
"/*" { BEGIN(c_comment); }
/* See below for xml_comment start. */
"{-" { BEGIN(haskell_comment); }
@@ -86,34 +111,34 @@
(0x[0-9a-fA-F]([0-9a-fA-F]|\.)*|[0-9]([0-9]|\.)*)([uU][lL]{0,2}|([eE][-+][0-9]*)?[fFlL]*) { /* nothing */ }
\<[[:alnum:]_!./?-]+ {
if (strcmp(yytext, "<!--") == 0) {
BEGIN(xml_comment);
} else {
- feed_token(strdup(yytext), SGML_TOKEN);
+ FEED_SGML();
BEGIN(sgml);
return 1;
}
}
-<sgml>[[:alnum:]_]+=\" { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('"'); return 1; }
-<sgml>[[:alnum:]_]+=' { feed_token(strndup(yytext, strlen(yytext) - 1), REGULAR_TOKEN); eat_until_unescaped('\''); return 1; }
-<sgml>[[:alnum:]_]+=[[:alnum:]_]* { feed_token(strdup(yytext), REGULAR_TOKEN); *(strchr(yyextra->token, '=') + 1) = 0; return 1; }
-<sgml>[[:alnum:]_]+ { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+<sgml>[[:alnum:]_]+=\" { FEED2(yytext, yyleng - 1); eat_until_unescaped('"'); return 1; }
+<sgml>[[:alnum:]_]+=' { FEED2(yytext, yyleng - 1); eat_until_unescaped('\''); return 1; }
+<sgml>[[:alnum:]_]+=[[:alnum:]_]* { FEED2(yytext, strchr(yytext, '=') - yytext + 1); return 1; }
+<sgml>[[:alnum:]_]+ { FEED(); return 1; }
<sgml>\> { BEGIN(INITIAL); }
<sgml>.|\n { /* nothing */ }
-;|\{|\}|\(|\)|\[|\] { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+;|\{|\}|\(|\)|\[|\] { FEED(); return 1; }
[[:alnum:]_.@#/*]+ {
if (strncmp(yytext, "/*", 2) == 0) {
- if (strlen(yytext) >= 4 && strcmp(yytext + strlen(yytext) - 2, "*/") == 0) {
+ if (yyleng >= 4 && strcmp(yytext + yyleng - 2, "*/") == 0) {
/* nothing */
} else {
BEGIN(c_comment);
}
} else {
- feed_token(strdup(yytext), REGULAR_TOKEN);
+ FEED();
return 1;
}
}
-\<\<?|\+|\-|\*|\/|%|&&?|\|\|? { feed_token(strdup(yytext), REGULAR_TOKEN); return 1; }
+\<\<?|\+|\-|\*|\/|%|&&?|\|\|? { FEED(); return 1; }
.|\n { /* nothing */ }
%%
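
Note on this hunk: the SGML and operator rules match exactly what they did before; only the feeding changes. For <input type="text">, the opening rule feeds "<input>" (FEED_SGML appends the '>'), then in the sgml state type=" feeds "type=" via FEED2(yytext, yyleng - 1) and eat_until_unescaped('"') skips the value, just as the old strndup-based actions produced "type=". The behavioral difference introduced by the new macros is the length guard: the old actions copied every lexeme with strdup/strndup regardless of size, while FEED2 feeds nothing once the lexeme exceeds MAX_TOKEN_LEN. A hypothetical helper (not in the diff, assuming the definitions from the first hunk) that makes the new guard explicit:

    static VALUE feed_or_drop(const char *s, size_t len) {
        if (len > MAX_TOKEN_LEN)       /* 32 bytes, per the first hunk */
            return Qnil;               /* over-long token is dropped */
        return rb_str_new(s, len);     /* otherwise a fresh Ruby string */
    }
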