nix-super/src/libexpr/lexer.l

/* Scanner configuration: a reentrant scanner using the Bison bridge calling
   convention with location tracking (yylex receives yylval and yylloc). */
%option reentrant bison-bridge bison-locations
%option align
%option noyywrap
%option never-interactive
/* Enables the start-condition stack used by PUSH_STATE / POP_STATE. */
%option stack
/* No implicit default rule: every input character has to be matched by an
   explicit rule in whatever start condition is active. */
%option nodefault
%option nounput noyy_top_state
/* Type of the per-scanner user data returned by yyget_extra(). */
%option extra-type="::nix::LexerState *"

/* Start conditions.  DEFAULT is inclusive (%s): rules without an explicit
   start condition are active in it, just as they are in INITIAL.  The others
   are exclusive (%x): only rules tagged with them apply.
     STRING        inside a double-quoted string
     IND_STRING    inside an indented string (delimited by two single quotes)
     INPATH        inside a path literal, after a fragment without a trailing
                   slash or after an interpolation
     INPATH_SLASH  inside a path literal, after a fragment ending in a slash
     PATH_START    re-scanning the first segment of a path whose first slash
                   is directly followed by an interpolation */
%s DEFAULT
%x STRING
%x IND_STRING
%x INPATH
%x INPATH_SLASH
%x PATH_START


%{
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
#endif

#include "nixexpr.hh"
#include "parser-tab.hh"

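// FIXME: hand-written forward declarations of the reentrant scanner's
// yylex_init_extra()/yyget_extra(). adjustLoc() below calls yyget_extra(),
// presumably before flex's own declarations appear in the generated file
// (TODO confirm). YY_EXTRA_TYPE is defined only for these two prototypes and
// undefined again right after; it must stay in sync with `%option extra-type`.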
#define YY_EXTRA_TYPE ::nix::LexerState *
int yylex_init_extra ( YY_EXTRA_TYPE user_defined, yyscan_t* scanner);
YY_EXTRA_TYPE yyget_extra ( yyscan_t yyscanner );
#undef YY_EXTRA_TYPE

using namespace nix;

namespace nix {

// Resolves the current token location (*yylloc) through state->at().
// Only usable where `state` and `yylloc` are in scope, i.e. in rule actions.
#define CUR_POS state->at(*yylloc)

/**
 * Reset every line/column field of `loc` to zero.
 *
 * Wired up as YY_USER_INIT, so it runs once before the first token is scanned.
 */
static void initLoc(YYLTYPE * loc)
{
    loc->first_line = 0;
    loc->last_line = 0;
    loc->first_column = 0;
    loc->last_column = 0;
}

/**
 * Per-match location bookkeeping, run through YY_USER_ACTION before the
 * action of every matched rule.
 *
 * @param yyscanner  scanner handle; its extra data is the LexerState
 * @param loc        location to advance; on entry it still describes the
 *                   previous match
 * @param s          text of the current match (unused)
 * @param len        length of the current match
 *
 * Only the column fields are advanced here; the line fields are left alone.
 */
static void adjustLoc(yyscan_t yyscanner, YYLTYPE * loc, const char * s, size_t len)
{
    // Save the previous location first so that rules which push their match
    // back (yyless(0)) can undo the advance below with unstash().
    loc->stash();

    auto * lexer = yyget_extra(yyscanner);

    // A distance of 1 means exactly one counted match happened since the last
    // doc comment. `*loc` has not been advanced yet, so the comment is
    // recorded against the location of that previous match.
    if (lexer->docCommentDistance == 1) {
        ParserLocation commentBegin;
        commentBegin.first_column = lexer->lastDocCommentLoc.first_column;
        ParserLocation commentEnd;
        commentEnd.first_column = lexer->lastDocCommentLoc.last_column;
        // Keep this call order (begin, end, then *loc): at() may register
        // positions as a side effect.
        DocComment comment{lexer->at(commentBegin), lexer->at(commentEnd)};
        PosIdx ownerPos = lexer->at(*loc);
        lexer->positionToDocComment.emplace(ownerPos, comment);
    }
    lexer->docCommentDistance++;

    // The new match starts where the previous one ended and spans `len` columns.
    loc->first_column = loc->last_column;
    loc->last_column += len;
}


// Decode the escape sequences of a string token in place.
//
// We make use of the fact that the parser receives a private copy of the input
// string and can munge around in it: the decoded text is written over the raw
// text, which is safe because decoding never makes the text longer.
//
//   symbols  unused; kept so existing call sites stay valid
//   s        start of the raw token text (flex's yytext, possibly offset)
//   length   number of bytes of raw text to decode
//
// Decoding rules:
//   - backslash followed by n, r or t yields LF, CR or TAB;
//   - backslash followed by any other character yields that character;
//   - a bare CR or a CR/LF pair is normalised to a single LF.
//
// Exactly `length` bytes are consumed. The scan does not stop at a NUL byte,
// so a NUL embedded in the token no longer truncates the result, and nothing
// beyond s[length - 1] is ever read (no reliance on trailing NUL sentinels).
// A lone backslash as the very last byte is kept literally; the lexer rules
// never produce such a token, this only guards against reading past the end.
//
// Returns a StringToken built from the start of the buffer and the decoded
// length.
static StringToken unescapeStr(SymbolTable & symbols, char * s, size_t length)
{
    const char * in = s;
    const char * const end = s + length;
    char * out = s;

    // Invariant: out <= in, so bytes still to be read are never overwritten.
    while (in != end) {
        char c = *in++;
        if (c == '\\' && in != end) {
            c = *in++;
            switch (c) {
            case 'n': *out = '\n'; break;
            case 'r': *out = '\r'; break;
            case 't': *out = '\t'; break;
            default:  *out = c;    break;
            }
        }
        else if (c == '\r') {
            /* Normalise CR and CR/LF into LF. */
            *out = '\n';
            if (in != end && *in == '\n') in++; /* cr/lf */
        }
        else *out = c;
        out++;
    }
    return {s, size_t(out - s)};
}


}

// yacc generates code that uses unannotated fallthrough.
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"

// flex hooks: YY_USER_INIT runs once before the first scan, YY_USER_ACTION
// runs before the action of every matched rule. Because adjustLoc() runs
// first, an action can overwrite what it did (see the doc comment rule).
#define YY_USER_INIT initLoc(yylloc)
#define YY_USER_ACTION adjustLoc(yyscanner, yylloc, yytext, yyleng);

// Start-condition stack helpers (these rely on `%option stack`).
#define PUSH_STATE(state) yy_push_state(state, yyscanner)
#define POP_STATE() yy_pop_state(yyscanner)

%}


/* Any single character, newline included (a plain dot excludes newline). */
ANY         .|\n
/* Identifier: a letter or underscore, then letters, digits, underscores,
   apostrophes or hyphens. */
ID          [a-zA-Z\_][a-zA-Z0-9\_\'\-]*
INT         [0-9]+
/* Float: either a mantissa starting with a non-zero digit, a dot and optional
   fraction digits, or at most one leading zero, a dot and at least one
   fraction digit; an exponent may follow either form. */
FLOAT       (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)?
/* Characters allowed inside a path component. */
PATH_CHAR   [a-zA-Z0-9\.\_\-\+]
/* Path literal: contains at least one slash followed by a path character;
   a single trailing slash is accepted by the pattern as well. */
PATH        {PATH_CHAR}*(\/{PATH_CHAR}+)+\/?
/* Possibly empty run of path characters ending in a slash. */
PATH_SEG    {PATH_CHAR}*\/
/* Path starting with a tilde (home-relative). */
HPATH       \~(\/{PATH_CHAR}+)+\/?
HPATH_START \~\/
/* Path enclosed in angle brackets. */
SPATH       \<{PATH_CHAR}+(\/{PATH_CHAR}+)*\>
/* URI literal: scheme, colon, then at least one character of the listed set. */
URI         [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+


%%


%{
// Keywords. These rules come before the ID rule, so when a keyword and ID
// match the same text the keyword wins (flex prefers the earliest rule on
// ties). A longer identifier that merely starts with a keyword still lexes
// as ID, because the longest match is preferred.
%}
if          { return IF; }
then        { return THEN; }
else        { return ELSE; }
assert      { return ASSERT; }
with        { return WITH; }
let         { return LET; }
in          { return IN_KW; }
rec         { return REC; }
inherit     { return INHERIT; }
or          { return OR_KW; }
\.\.\.      { return ELLIPSIS; }

%{
// Two-character operators. Single-character operators have no rule of their
// own; they are returned as their character code by the catch-all rule at the
// end of the rules section.
%}
\=\=        { return EQ; }
\!\=        { return NEQ; }
\<\=        { return LEQ; }
\>\=        { return GEQ; }
\&\&        { return AND; }
\|\|        { return OR; }
\-\>        { return IMPL; }
\/\/        { return UPDATE; }
\+\+        { return CONCAT; }

{ID}        {
              /* Identifier: hand the matched text to the parser. */
              yylval->id = {yytext, (size_t) yyleng};
              return ID;
            }
{INT}       {
              /* Integer literal: anything string2Int cannot represent as a
                 64-bit signed integer is reported as a parse error at the
                 current position. */
              errno = 0;
              const std::optional<int64_t> parsed = string2Int<int64_t>(yytext);
              if (!parsed)
                  throw ParseError(ErrorInfo{
                      .msg = HintFmt("invalid integer '%1%'", yytext),
                      .pos = state->positions[CUR_POS],
                  });
              yylval->n = *parsed;
              return INT_LIT;
            }
{FLOAT}     {
              /* Float literal: converted with strtod; any errno it sets is
                 reported as a parse error at the current position. */
              errno = 0;
              yylval->nf = strtod(yytext, nullptr);
              if (errno == 0)
                  return FLOAT_LIT;
              throw ParseError(ErrorInfo{
                  .msg = HintFmt("invalid float '%1%'", yytext),
                  .pos = state->positions[CUR_POS],
              });
            }

\$\{        { /* start of an interpolation: its body is scanned in DEFAULT */ PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }

\}          { /* State INITIAL only exists at the bottom of the stack and is
                 used as a marker. DEFAULT replaces it everywhere else.
                 Popping when in INITIAL state causes an empty stack exception,
                 so don't */
              if (YYSTATE != INITIAL)
                POP_STATE();
              return '}';
            }
\{          { /* pushed so that the matching closing brace has a state to pop */ PUSH_STATE(DEFAULT); return '{'; }

\"          { /* opening double quote: switch to the exclusive STRING state */ PUSH_STATE(STRING); return '"'; }
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})*\$/\" |
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})+ {
                /* It is impossible to match strings ending with '$' with one
                   regex because trailing contexts are only valid at the end
                   of a rule. (A sane but undocumented limitation.) */
                /* A chunk of literal text: ordinary characters, a dollar sign
                   that does not start an interpolation, and backslash escapes
                   (optionally preceded by a dollar sign). The first pattern
                   additionally accepts a chunk ending in a dollar sign right
                   before the closing double quote. Escapes are decoded in
                   place by unescapeStr. */
                yylval->str = unescapeStr(state->symbols, yytext, yyleng);
                return STR;
              }
<STRING>\$\{  { /* interpolation inside a string */ PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
<STRING>\"    { /* closing double quote: leave the STRING state */ POP_STATE(); return '"'; }
<STRING>\$|\\|\$\\ {
                /* This can only occur when we reach EOF, otherwise the above
                   (...|\$[^\{\"\\]|\\.|\$\\.)+ would have triggered.
                   This is technically invalid, but we leave the problem to the
                   parser who fails with exact location. */
                return EOF;
              }

\'\'(\ *\n)?     { /* opening delimiter, together with the rest of its line when that is only spaces and a newline */ PUSH_STATE(IND_STRING); return IND_STRING_OPEN; }
<IND_STRING>([^\$\']|\$[^\{\']|\'[^\'\$])+ {
                   /* A chunk of raw text: no escape processing happens here.
                      NOTE(review): the third initializer is only passed for
                      these raw chunks; presumably it marks text that is
                      subject to indentation stripping. Confirm against
                      StringToken. */
                   yylval->str = {yytext, (size_t) yyleng, true};
                   return IND_STR;
                 }
<IND_STRING>\'\'\$ |
<IND_STRING>\$   {
                   /* Either the escape sequence for a dollar sign or a dollar
                      sign that no other rule claimed: a literal dollar. */
                   yylval->str = {"$", 1};
                   return IND_STR;
                 }
<IND_STRING>\'\'\' {
                   /* Three single quotes stand for two literal single quotes. */
                   yylval->str = {"''", 2};
                   return IND_STR;
                 }
<IND_STRING>\'\'\\{ANY} {
                   /* Two single quotes followed by a backslash escape: skip
                      the two quotes and decode the escape with unescapeStr. */
                   yylval->str = unescapeStr(state->symbols, yytext + 2, yyleng - 2);
                   return IND_STR;
                 }
<IND_STRING>\$\{ { /* interpolation inside an indented string */ PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
<IND_STRING>\'\' { /* closing delimiter: leave the IND_STRING state */ POP_STATE(); return IND_STRING_CLOSE; }
<IND_STRING>\'   {
                   /* A single quote that no other rule claimed: literal. */
                   yylval->str = {"'", 1};
                   return IND_STR;
                 }

{PATH_SEG}\$\{ |
{HPATH_START}\$\{ {
  /* A path whose first slash is directly followed by an interpolation.
     Nothing is consumed here: the whole match is pushed back with yyless
     and re-scanned in PATH_START, which emits the leading segment as a
     PATH or HPATH token. unstash is expected to restore the location that
     adjustLoc stashed before advancing it for this match. */
  PUSH_STATE(PATH_START);
  yyless(0);
  yylloc->unstash();
}

<PATH_START>{PATH_SEG} {
  /* Leading segment of a regular path. It ends in a slash, so PATH_START is
     replaced by INPATH_SLASH. */
  POP_STATE();
  PUSH_STATE(INPATH_SLASH);
  yylval->path = {yytext, (size_t) yyleng};
  return PATH;
}

<PATH_START>{HPATH_START} {
  /* Same as above for a path starting with a tilde and a slash. */
  POP_STATE();
  PUSH_STATE(INPATH_SLASH);
  yylval->path = {yytext, (size_t) yyleng};
  return HPATH;
}

{PATH} {
  /* Path literal. The state pushed here records whether the matched text
     ended in a slash, which decides how the following input is treated. */
  const bool trailingSlash = yytext[yyleng-1] == '/';
  PUSH_STATE(trailingSlash ? INPATH_SLASH : INPATH);
  yylval->path = {yytext, (size_t) yyleng};
  return PATH;
}
{HPATH} {
  /* Path literal starting with a tilde; handled exactly like PATH apart from
     the token returned. */
  const bool trailingSlash = yytext[yyleng-1] == '/';
  PUSH_STATE(trailingSlash ? INPATH_SLASH : INPATH);
  yylval->path = {yytext, (size_t) yyleng};
  return HPATH;
}

<INPATH,INPATH_SLASH>\$\{ {
  /* Interpolation inside a path. The current path state is replaced by
     INPATH, so a slash right before the interpolation is not treated as a
     trailing slash, and the embedded expression is scanned in DEFAULT. */
  POP_STATE();
  PUSH_STATE(INPATH);
  PUSH_STATE(DEFAULT);
  return DOLLAR_CURLY;
}
<INPATH,INPATH_SLASH>{PATH}|{PATH_SEG}|{PATH_CHAR}+ {
  /* Further literal path text after the first token. It is returned as STR,
     and the path state is refreshed according to whether this fragment ends
     in a slash. */
  POP_STATE();
  if (yytext[yyleng-1] == '/')
      PUSH_STATE(INPATH_SLASH);
  else
      PUSH_STATE(INPATH);
  yylval->str = {yytext, (size_t) yyleng};
  return STR;
}
<INPATH>{ANY} |
<INPATH><<EOF>> {
  /* if we encounter a non-path character we inform the parser that the path has
     ended with a PATH_END token and re-parse this character in the default
     context (it may be ')', ';', or something of that sort) */
  POP_STATE();
  yyless(0);
  yylloc->unstash();
  return PATH_END;
}

<INPATH_SLASH>{ANY} |
<INPATH_SLASH><<EOF>> {
  /* The path ended right after a slash: neither more path text nor an
     interpolation follows, which is rejected. */
  throw ParseError(ErrorInfo{
      .msg = HintFmt("path has a trailing slash"),
      .pos = state->positions[CUR_POS],
  });
}

{SPATH}     { /* angle-bracket path; the token text includes the brackets */ yylval->path = {yytext, (size_t) yyleng}; return SPATH; }
{URI}       { /* unquoted URI literal */ yylval->uri = {yytext, (size_t) yyleng}; return URI; }

%{
// Doc comment rule
//
//   \/\*\*                            /**
//         [^/*]                       reject /**/ (empty comment) and /***
//              ([^*]|\*+[^*/])*\*+\/  same as the long comment rule
//              (             )*       zero or more non-ending sequences
//                              \*     end(1)
//                                 \/  end(2)
%}
\/\*\*[^/*]([^*]|\*+[^*/])*\*+\/  /* doc comments */ {
    /* Remember where this doc comment sits; adjustLoc later records it in
       positionToDocComment. YY_USER_ACTION has already run for this match,
       so setting the distance to 0 here overrides the increment adjustLoc
       just made. No token is returned: scanning simply continues. */
    LexerState & lexerState = *yyget_extra(yyscanner);
    lexerState.docCommentDistance = 0;
    lexerState.lastDocCommentLoc.first_line = yylloc->first_line;
    lexerState.lastDocCommentLoc.first_column = yylloc->first_column;
    lexerState.lastDocCommentLoc.last_column = yylloc->last_column;
}


%{
// The following rules have docCommentDistance--
// This compensates for the docCommentDistance++ which happens by default to
// make all the other rules invalidate the doc comment.
// In other words: whitespace and ordinary comments do not count towards the
// distance between a doc comment and the token that follows it. None of
// these rules returns a token.
%}
[ \t\r\n]+    /* eat up whitespace */ { yyget_extra(yyscanner)->docCommentDistance--; }
\#[^\r\n]*    /* single-line comments */ { yyget_extra(yyscanner)->docCommentDistance--; }
\/\*([^*]|\*+[^*/])*\*+\/  /* long comments */ { yyget_extra(yyscanner)->docCommentDistance--; }

{ANY}       {
              /* Catch-all: any character not matched by an earlier rule is
                 returned as a token whose code is the character itself. */
              /* Don't return a negative number, as this will cause
                 Bison to stop parsing without an error. */
              return (unsigned char) yytext[0];
            }

%%