%option reentrant bison-bridge bison-locations
%option align
%option noyywrap
%option never-interactive
%option stack
%option nodefault
%option nounput noyy_top_state
%option extra-type="::nix::LexerState *"

/* Start conditions.  DEFAULT is inclusive (%s), so rules without a
   start-condition prefix are active in both INITIAL and DEFAULT.  The others
   are exclusive (%x): only rules explicitly prefixed with them are active
   while the scanner is in that state. */
%s DEFAULT
%x STRING
%x IND_STRING
%x INPATH
%x INPATH_SLASH
%x PATH_START

%{
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
#endif

#include "nixexpr.hh"
#include "parser-tab.hh"

// !!! FIXME !!!
#define YY_EXTRA_TYPE ::nix::LexerState *
int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
#undef YY_EXTRA_TYPE

using namespace nix;

namespace nix {

// Position index of the token currently being scanned; expects `state` and
// `yylloc` to be in scope at the point of use.
#define CUR_POS state->at(*yylloc)

/**
 * Reset every line/column field of `loc` to 0.
 * Installed as YY_USER_INIT below.
 */
static void initLoc(YYLTYPE * loc)
{
    loc->first_line = loc->last_line = 0;
    loc->first_column = loc->last_column = 0;
}

/**
 * Per-match bookkeeping, installed as YY_USER_ACTION below.
 *
 * @param yyscanner scanner handle; used to fetch the LexerState.
 * @param loc       location of the current token, updated in place.
 * @param s         matched text (currently unused).
 * @param len       length of the matched text.
 *
 * Steps:
 *  - calls loc->stash() (presumably saving the pre-match location so that
 *    actions which call yyless(0) can restore it via unstash() -- confirm in
 *    the ParserLocation definition);
 *  - if docCommentDistance is exactly 1, records a DocComment built from
 *    lastDocCommentLoc under the position of `loc` as it stands before it is
 *    advanced below;
 *  - increments docCommentDistance (rules that must not invalidate a doc
 *    comment decrement it again in their actions);
 *  - advances the column range: the new first_column is the old last_column,
 *    and last_column grows by `len`.
 */
static void adjustLoc(yyscan_t yyscanner, YYLTYPE * loc, const char * s, size_t len)
{
    loc->stash();

    LexerState & lexerState = *yyget_extra(yyscanner);

    if (lexerState.docCommentDistance == 1) {
        // Preceding token was a doc comment.
        ParserLocation doc;
        doc.first_column = lexerState.lastDocCommentLoc.first_column;
        ParserLocation docEnd;
        docEnd.first_column = lexerState.lastDocCommentLoc.last_column;
        DocComment docComment{lexerState.at(doc), lexerState.at(docEnd)};
        PosIdx locPos = lexerState.at(*loc);
        lexerState.positionToDocComment.emplace(locPos, docComment);
    }
    lexerState.docCommentDistance++;

    loc->first_column = loc->last_column;
    loc->last_column += len;
}

/**
 * Decode escape sequences in `s` in place and return the decoded span.
 *
 * Handles `\n`, `\r` and `\t`; any other escaped character stands for itself.
 * A bare CR, or a CR/LF pair, is normalised to a single LF.
 *
 * @param symbols unused by this function.
 * @param s       NUL-terminated, writable input; overwritten with the result.
 * @param length  unused; scanning stops at the first NUL.
 * @return token pointing at the start of `s` with the decoded length.
 */
// we make use of the fact that the parser receives a private copy of the input
// string and can munge around in it.
static StringToken unescapeStr(SymbolTable & symbols, char * s, size_t length)
{
    char * result = s;
    char * t = s;
    char c;
    // the input string is terminated with *two* NULs, so we can safely take
    // *one* character after the one being checked against.
    while ((c = *s++)) {
        if (c == '\\') {
            c = *s++;
            if (c == 'n') *t = '\n';
            else if (c == 'r') *t = '\r';
            else if (c == 't') *t = '\t';
            else *t = c;
        }
        else if (c == '\r') {
            /* Normalise CR and CR/LF into LF. */
            *t = '\n';
            if (*s == '\n') s++; /* cr/lf */
        }
        else *t = c;
        t++;
    }
    return {result, size_t(t - result)};
}

}

// yacc generates code that uses unannotated fallthrough.
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"

#define YY_USER_INIT initLoc(yylloc)
#define YY_USER_ACTION adjustLoc(yyscanner, yylloc, yytext, yyleng);

#define PUSH_STATE(state) yy_push_state(state, yyscanner)
#define POP_STATE() yy_pop_state(yyscanner)

%}

ANY         .|\n
ID          [a-zA-Z\_][a-zA-Z0-9\_\'\-]*
INT         [0-9]+
FLOAT       (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)?
PATH_CHAR   [a-zA-Z0-9\.\_\-\+]
PATH        {PATH_CHAR}*(\/{PATH_CHAR}+)+\/?
PATH_SEG    {PATH_CHAR}*\/
HPATH       \~(\/{PATH_CHAR}+)+\/?
HPATH_START \~\/
SPATH       \<{PATH_CHAR}+(\/{PATH_CHAR}+)*\>
URI         [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+


%%

  /* Keywords and multi-character operators (INITIAL and DEFAULT). */

if          { return IF; }
then        { return THEN; }
else        { return ELSE; }
assert      { return ASSERT; }
with        { return WITH; }
let         { return LET; }
in          { return IN_KW; }
rec         { return REC; }
inherit     { return INHERIT; }
or          { return OR_KW; }
\.\.\.      { return ELLIPSIS; }

\=\=        { return EQ; }
\!\=        { return NEQ; }
\<\=        { return LEQ; }
\>\=        { return GEQ; }
\&\&        { return AND; }
\|\|        { return OR; }
\-\>        { return IMPL; }
\/\/        { return UPDATE; }
\+\+        { return CONCAT; }

{ID}        { yylval->id = {yytext, (size_t) yyleng}; return ID; }
{INT}       { errno = 0;
              /* NOTE(review): std::optional and string2Int are written
                 without explicit template arguments here; if string2Int is a
                 template on the target integer type they need to be spelled
                 out -- confirm against its declaration. */
              std::optional numMay = string2Int(yytext);
              if (numMay.has_value()) {
                  yylval->n = *numMay;
              } else {
                  throw ParseError(ErrorInfo{
                      .msg = HintFmt("invalid integer '%1%'", yytext),
                      .pos = state->positions[CUR_POS],
                  });
              }
              return INT_LIT;
            }
{FLOAT}     { errno = 0;
              yylval->nf = strtod(yytext, 0);
              /* strtod reports range errors through errno. */
              if (errno != 0)
                  throw ParseError(ErrorInfo{
                      .msg = HintFmt("invalid float '%1%'", yytext),
                      .pos = state->positions[CUR_POS],
                  });
              return FLOAT_LIT;
            }

  /* Brace tracking: every opening brace pushes DEFAULT so that the matching
     closing brace returns the scanner to whatever state enclosed it (for
     example STRING after an interpolation). */

\$\{        { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }

\}          { /* State INITIAL only exists at the bottom of the stack and is
                 used as a marker. DEFAULT replaces it everywhere else.
                 Popping when in INITIAL state causes an empty stack exception,
                 so don't */
              if (YYSTATE != INITIAL)
                POP_STATE();
              return '}';
            }
\{          { PUSH_STATE(DEFAULT); return '{'; }

  /* Double-quoted strings.  The opening quote is recognised in normal code;
     everything up to the closing quote is scanned in the exclusive STRING
     state. */

<INITIAL,DEFAULT>\" {
                PUSH_STATE(STRING); return '"';
            }
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})*\$/\" |
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})+ {
                /* It is impossible to match strings ending with '$' with one
                   regex because trailing contexts are only valid at the end
                   of a rule. (A sane but undocumented limitation.) */
                yylval->str = unescapeStr(state->symbols, yytext, yyleng);
                return STR;
              }
<STRING>\$\{  { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
<STRING>\"  { POP_STATE(); return '"'; }
<STRING>\$|\\|\$\\ {
                /* This can only occur when we reach EOF, otherwise the above
                   (...|\$[^\{\"\\]|\\.|\$\\.)+ would have triggered.
                   This is technically invalid, but we leave the problem to the
                   parser who fails with exact location. */
                return EOF;
              }

  /* Indented ('' ... '') strings, scanned in the exclusive IND_STRING
     state. */

<INITIAL,DEFAULT>\'\'(\ *\n)?     { PUSH_STATE(IND_STRING); return IND_STRING_OPEN; }
<IND_STRING>([^\$\']|\$[^\{\']|\'[^\'\$])+ {
                   yylval->str = {yytext, (size_t) yyleng, true};
                   return IND_STR;
                 }
<IND_STRING>\'\'\$ |
<IND_STRING>\$   {
                   yylval->str = {"$", 1};
                   return IND_STR;
                 }
<IND_STRING>\'\'\' {
                   yylval->str = {"''", 2};
                   return IND_STR;
                 }
<IND_STRING>\'\'\\{ANY} {
                   /* Skip the two leading quotes; only the backslash escape
                      itself is decoded. */
                   yylval->str = unescapeStr(state->symbols, yytext + 2, yyleng - 2);
                   return IND_STR;
                 }
<IND_STRING>\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
<IND_STRING>\'\' { POP_STATE(); return IND_STRING_CLOSE; }
<IND_STRING>\'   {
                   yylval->str = {"'", 1};
                   return IND_STR;
                 }

  /* Paths.  A path that starts with an interpolation right after its first
     segment is detected here, pushed back with yyless(0), and re-scanned in
     PATH_START so that only the leading segment is consumed.  After a path
     token the scanner sits in INPATH, or in INPATH_SLASH when the text so far
     ends in a slash. */

{PATH_SEG}\$\{ |
{HPATH_START}\$\{ {
  PUSH_STATE(PATH_START);
  yyless(0);
  yylloc->unstash();
}

<PATH_START>{PATH_SEG} {
  POP_STATE();
  PUSH_STATE(INPATH_SLASH);
  yylval->path = {yytext, (size_t) yyleng};
  return PATH;
}

<PATH_START>{HPATH_START} {
  POP_STATE();
  PUSH_STATE(INPATH_SLASH);
  yylval->path = {yytext, (size_t) yyleng};
  return HPATH;
}

{PATH} {
  if (yytext[yyleng-1] == '/')
    PUSH_STATE(INPATH_SLASH);
  else
    PUSH_STATE(INPATH);
  yylval->path = {yytext, (size_t) yyleng};
  return PATH;
}
{HPATH} {
  if (yytext[yyleng-1] == '/')
    PUSH_STATE(INPATH_SLASH);
  else
    PUSH_STATE(INPATH);
  yylval->path = {yytext, (size_t) yyleng};
  return HPATH;
}

<INPATH,INPATH_SLASH>\$\{ {
  /* Replace the current path state with INPATH, then push DEFAULT for the
     interpolated expression; its closing brace pops back to INPATH. */
  POP_STATE();
  PUSH_STATE(INPATH);
  PUSH_STATE(DEFAULT);
  return DOLLAR_CURLY;
}
<INPATH,INPATH_SLASH>{PATH}|{PATH_SEG}|{PATH_CHAR}+ {
  POP_STATE();
  if (yytext[yyleng-1] == '/')
      PUSH_STATE(INPATH_SLASH);
  else
      PUSH_STATE(INPATH);
  yylval->str = {yytext, (size_t) yyleng};
  return STR;
}
<INPATH>{ANY} |
<INPATH><<EOF>> {
  /* if we encounter a non-path character we inform the parser that the path has
     ended with a PATH_END token and re-parse this character in the default
     context (it may be ')', ';', or something of that sort) */
  POP_STATE();
  yyless(0);
  yylloc->unstash();
  return PATH_END;
}

<INPATH_SLASH>{ANY} |
<INPATH_SLASH><<EOF>> {
  throw ParseError(ErrorInfo{
      .msg = HintFmt("path has a trailing slash"),
      .pos = state->positions[CUR_POS],
  });
}

{SPATH}     { yylval->path = {yytext, (size_t) yyleng}; return SPATH; }
{URI}       { yylval->uri = {yytext, (size_t) yyleng}; return URI; }

%{
// Doc comment rule
//
// \/\*\*                /**
// [^/*]                 reject /**/ (empty comment) and /***
// ([^*]|\*+[^*/])*\*+\/ same as the long comment rule
// (            )*       zero or more non-ending sequences
//               \*      end(1)
//                 \/    end(2)
%}
\/\*\*[^/*]([^*]|\*+[^*/])*\*+\/  /* doc comments */ {
  /* Remember where the doc comment is and reset the distance counter;
     adjustLoc attaches the comment to the next token that sees a distance
     of exactly 1. */
  LexerState & lexerState = *yyget_extra(yyscanner);
  lexerState.docCommentDistance = 0;
  lexerState.lastDocCommentLoc.first_line = yylloc->first_line;
  lexerState.lastDocCommentLoc.first_column = yylloc->first_column;
  lexerState.lastDocCommentLoc.last_column = yylloc->last_column;
}


%{
// The following rules have docCommentDistance--
// This compensates for the docCommentDistance++ which happens by default to
// make all the other rules invalidate the doc comment.
%}
[ \t\r\n]+    /* eat up whitespace */ { yyget_extra(yyscanner)->docCommentDistance--; }
\#[^\r\n]*    /* single-line comments */ { yyget_extra(yyscanner)->docCommentDistance--; }
\/\*([^*]|\*+[^*/])*\*+\/  /* long comments */ { yyget_extra(yyscanner)->docCommentDistance--; }

{ANY} {
  /* Don't return a negative number, as this will cause
     Bison to stop parsing without an error. */
  return (unsigned char) yytext[0];
}

%%