/*
 * shtok.l - a lex rule for shell scripts
 *
 * Copyright (C) 2005 Kenichi Ishibashi
 *     All rights reserved.
 *     This is free software with ABSOLUTELY NO WARRANTY.
 *
 * You can redistribute it and/or modify it under the terms of
 * the GNU General Public License version 2.
 */

%option reentrant
%option prefix="langscan_sh_lex_"
%option noyywrap
%option nodefault
%option stack

%s DQUOTE
%s BQUOTE
%s BRACE_SUBST
%s PAREN_SUBST
%s IN_BRACE
%s HEREDOC_DELIMITER
%s HEREDOC

space           [ \t]+
newline         \r\n|\r|\n
escseq          \\({newline}|.)
ident           [0-9A-Za-z_][0-9A-Za-z_\-\.]*
squote          \'[^\']*\'
specialvar      (\$|\#|\*|@|\?|\-|\!|\_)
var_ident       ([A-Za-z_][0-9A-Za-z_]*|[0-9]|{specialvar})

%{

#include "sh.h"

#define YY_EXTRA_TYPE langscan_sh_lex_extra_t *

/* The EOF rules and YY_INPUT below rely on YY_NULL being zero. */
#if YY_NULL != 0
#error "YY_NULL is not 0."
#endif

#define YY_DECL langscan_sh_token_t langscan_sh_lex_lex(yyscan_t yyscanner)

/*
 * Pull input through the user-supplied reader.  Once the reader has
 * returned 0 it is never called again; every later request reports 0
 * bytes explicitly instead of leaving `result` untouched.
 */
#define YY_INPUT(buf,result,max_size) \
  if (!yyextra->eof) { \
    result = yyextra->user_read(&(yyextra->user_data), (buf), (max_size)); \
    if (result == 0) \
      yyextra->eof = 1; \
  } else { \
    result = 0; \
  }

/* Advance the begin/end position counters over the current token. */
#define UPD update_pos(yyextra, yytext, yyleng)
static void update_pos(langscan_sh_lex_extra_t *, char *, int);

/* Publish the current token text/length in yyextra and return its type. */
#define report(token) \
  do { \
    yyextra->text = yytext; \
    yyextra->leng = yyleng; \
    return langscan_sh_##token; \
  } while (0)

#define PUSH_STATE(state) yy_push_state(state, yyscanner)
#define POP_STATE yy_pop_state(yyscanner)

static int ident_length(const char *ptr, int max);

/*
 * Here-document bookkeeping.
 * NOTE(review): these are file-scope statics, so they are shared by every
 * scanner instance even though the scanner is declared %option reentrant.
 * Moving them into langscan_sh_lex_extra_t would need a change to sh.h.
 */
static char *heredoc_delimiter;
static enum {
  HEREDOC_TAB_NO_STRIP,
  HEREDOC_TAB_STRIP
} heredoc_type;
static int set_heredoc_delimiter(const char *ptr, int max);

%}

%%

    /*
     * NOTE(review): the rule patterns below are reproduced exactly as they
     * appear in this copy of the file.  The start-condition prefixes
     * (<STATE>pattern and <STATE><<EOF>>) appear to have been stripped:
     * several patterns are duplicated with different actions and the
     * end-of-file rules read as a bare "<>".  Confirm against the upstream
     * file before feeding this to flex.
     */

\<\<\-?    {
        /* "<<" or "<<-": remember whether leading blanks are stripped. */
        if (yytext[yyleng - 1] == '-')
            heredoc_type = HEREDOC_TAB_STRIP;
        else
            heredoc_type = HEREDOC_TAB_NO_STRIP;
        UPD;
        PUSH_STATE(HEREDOC_DELIMITER);
        report(punct);
    }
[^ \t\r\n].*    {
        int delimiter_leng;

        /* Only the delimiter word is kept as the token; the rest of the
         * line is pushed back for rescanning. */
        delimiter_leng = set_heredoc_delimiter(yytext, yyleng);
        if (delimiter_leng == -1)
            YY_FATAL_ERROR("Can't allocate memory");
        yyless(delimiter_leng);
        PUSH_STATE(HEREDOC);
        UPD;
        report(heredoc_beg);
    }
^.+    {
        int sleng;

        sleng = 0;
        if (heredoc_type == HEREDOC_TAB_STRIP) {
            while (yytext[sleng] == ' ' || yytext[sleng] =='\t') {
                sleng++;
                if (sleng >= yyleng) {
                    /* the whole line is blank */
                    UPD;
                    report(space);
                }
            }
        }
        if (heredoc_delimiter != NULL &&
            strcmp((yytext + sleng), heredoc_delimiter) == 0) {
            /* end-of-heredoc */
            free(heredoc_delimiter);
            heredoc_delimiter = NULL;   /* do not leave a dangling pointer */
            POP_STATE;
            POP_STATE;
            UPD;
            report(heredoc_end);
        }
        else {
            UPD;
            report(string);
        }
    }
<>    {
        free(heredoc_delimiter);
        heredoc_delimiter = NULL;       /* do not leave a dangling pointer */
        BEGIN(INITIAL);
        UPD;
        report(string);
    }

\"    { PUSH_STATE(DQUOTE); UPD; report(punct); }
\"    { POP_STATE; UPD; report(punct); }
<>    { BEGIN(INITIAL); }
([^\"\`\$\\]|{escseq})+    { UPD; report(string); }

\`    { PUSH_STATE(BQUOTE); UPD; report(punct); }
\`    { POP_STATE; UPD; report(punct); }
<>    { BEGIN(INITIAL); }

\$\{    { PUSH_STATE(BRACE_SUBST); UPD; report(punct); }
\}    { POP_STATE; UPD; report(punct); }
{var_ident}    { UPD; report(ident); }
<>    { BEGIN(INITIAL); }

\{    { PUSH_STATE(IN_BRACE); UPD; report(punct); }
\}    { POP_STATE; UPD; report(punct); }
<>    { BEGIN(INITIAL); }

\$?\(    { PUSH_STATE(PAREN_SUBST); UPD; report(punct); }
\)    { POP_STATE; UPD; report(punct); }
<>    { BEGIN(INITIAL); }

^\#.*    { UPD; report(comment); }
{space}\#.*    { UPD; report(comment); }
{squote}    { UPD; report(string); }
{space}    { UPD; report(space); }
{newline}    { UPD; report(space); }
{ident}[ \t]*\([ \t]*\)    {
        /* Keep only the name as the token; "()" is rescanned. */
        yyless(ident_length(yytext, yyleng));
        UPD;
        report(fundef);
    }
{ident}    { UPD; report(ident); }
\${var_ident}    { UPD; report(ident); }
\\.    { UPD; report(punct); }
>=|<=|!=|\;\;|\<\<\<|&&|\|\||>&|<&    { UPD; report(punct); }
.    { UPD; report(punct); }

%%

/*
 * Advance the position counters in `extra` over a token.
 *
 * The previous end position becomes the new begin position; the end
 * position then moves forward by `leng` bytes.  Each '\n' in the token
 * bumps the line number and restarts the column count, so the final
 * column is the number of bytes following the last '\n'.
 */
static void update_pos(
  langscan_sh_lex_extra_t *extra,
  char *text,
  int leng)
{
  int i, j;

  extra->beg_byteno = extra->end_byteno;
  extra->beg_lineno = extra->end_lineno;
  extra->beg_columnno = extra->end_columnno;

  j = 0;        /* index just past the most recent '\n' */
  for (i = 0; i < leng; i++) {
    if (text[i] == '\n') {
      extra->end_lineno++;
      j = i + 1;
      extra->end_columnno = 0;
    }
  }
  extra->end_columnno += leng - j;
  extra->end_byteno += leng;
}

/*
 * Return the length of the leading run of identifier characters
 * ([0-9A-Za-z_.-]) in the first `max` bytes of `ptr`.
 */
static int ident_length(const char *ptr, int max)
{
  int len = 0;

  while (0 < max &&
         (('0' <= *ptr && *ptr <= '9') ||
          ('A' <= *ptr && *ptr <= 'Z') ||
          ('a' <= *ptr && *ptr <= 'z') ||
          *ptr == '_' || *ptr == '-' || *ptr == '.')) {
    ptr++;
    len++;
    max--;
  }
  return len;
}

/*
 * Parse a here-document delimiter word from the first `max` bytes of `ptr`
 * and store an unquoted, NUL-terminated copy in `heredoc_delimiter`.
 *
 * Single and double quotes are removed, a backslash outside quotes is
 * dropped and the following byte copied literally, and the word ends at
 * the first unquoted space or tab.
 *
 * Returns the number of input bytes consumed, or -1 if memory could not
 * be allocated.
 */
static int set_heredoc_delimiter(const char *ptr, int max)
{
  char *dst, quote_char;
  int in_quote, len;

  /* Release a delimiter left over from a here-document that never ended. */
  free(heredoc_delimiter);
  heredoc_delimiter = malloc(max + 1);
  if (heredoc_delimiter == NULL)
    return -1;

  dst = heredoc_delimiter;
  quote_char = '\0';
  len = 0;
  in_quote = 0;
  while (len < max) {
    if (in_quote == 0) {
      /* unquoted delimiter */
      if (*ptr == '\'' || *ptr == '\"') {
        quote_char = *ptr;
        in_quote = 1;
        ptr++;
        if (++len >= max)
          break;
        continue;
      }
      if (*ptr == ' ' || *ptr == '\t')
        break;
      if (*ptr == '\\') {
        /* skip the backslash; the next byte is copied below */
        ptr++;
        if (++len >= max)
          break;
      }
    }
    else {
      /* quoted delimiter */
      if (*ptr == quote_char) {
        in_quote = 0;
        ptr++;
        if (++len >= max)
          break;
        continue;
      }
    }
    *dst++ = *ptr++;
    len++;
  }
  *dst = '\0';
  return len;
}

/*
 * Create a tokenizer that pulls its input through `user_read`.
 *
 * `user_data` is stored and its address is handed to `user_read` on every
 * call.  Returns NULL if memory cannot be allocated or the scanner cannot
 * be initialised; nothing is leaked in that case.  Release the result with
 * langscan_sh_free_tokenizer().
 */
langscan_sh_tokenizer_t *langscan_sh_make_tokenizer(
  size_t (*user_read)(void **user_data_p, char *buf, size_t maxlen),
  void *user_data)
{
  langscan_sh_tokenizer_t *tokenizer;
  langscan_sh_lex_extra_t *extra;

  tokenizer = malloc(sizeof *tokenizer);
  if (tokenizer == NULL)
    return NULL;
  extra = malloc(sizeof *extra);
  if (extra == NULL) {
    free(tokenizer);
    return NULL;
  }
  extra->user_read = user_read;
  extra->user_data = user_data;
  extra->beg_lineno = 1;
  extra->beg_columnno = 0;
  extra->beg_byteno = 0;
  extra->end_lineno = 1;
  extra->end_columnno = 0;
  extra->end_byteno = 0;
  extra->eof = 0;
  tokenizer->extra = extra;
  /* yylex_init reports failure with a non-zero return value. */
  if (langscan_sh_lex_lex_init(&tokenizer->scanner) != 0) {
    free(extra);
    free(tokenizer);
    return NULL;
  }
  langscan_sh_lex_set_extra(extra, tokenizer->scanner);
  return tokenizer;
}

/* Scan and return the next token from `tokenizer`. */
langscan_sh_token_t langscan_sh_get_token(langscan_sh_tokenizer_t *tokenizer)
{
  return langscan_sh_lex_lex(tokenizer->scanner);
}

/*
 * Destroy a tokenizer made by langscan_sh_make_tokenizer(), releasing the
 * extra data, the scanner and the tokenizer itself.  NULL is ignored.
 */
void langscan_sh_free_tokenizer(langscan_sh_tokenizer_t *tokenizer)
{
  langscan_sh_lex_extra_t *extra;

  if (tokenizer == NULL)
    return;
  extra = langscan_sh_lex_get_extra(tokenizer->scanner);
  free(extra);
  langscan_sh_lex_lex_destroy(tokenizer->scanner);
  free(tokenizer);
}

/* Return the reader callback the tokenizer was created with. */
user_read_t langscan_sh_tokenizer_get_user_read(langscan_sh_tokenizer_t *tokenizer)
{
  return tokenizer->extra->user_read;
}

/* Return the tokenizer's current user_data pointer. */
void *langscan_sh_tokenizer_get_user_data(langscan_sh_tokenizer_t *tokenizer)
{
  return tokenizer->extra->user_data;
}

/*
 * Return the printable name of `token`, or NULL when `token` is outside
 * the name table.  Index 0 is "*eof*"; the remaining entries are generated
 * from LANGSCAN_SH_TOKEN_LIST.
 */
const char *langscan_sh_token_name(langscan_sh_token_t token)
{
  static const char *const token_names[] = {
    "*eof*",
#define LANGSCAN_SH_TOKEN(name) #name,
    LANGSCAN_SH_TOKEN_LIST
#undef LANGSCAN_SH_TOKEN
  };

  /* The unsigned comparison also rejects negative values. */
  if ((size_t)token >= sizeof token_names / sizeof token_names[0])
    return NULL;
  return token_names[token];
}