mirror of
https://github.com/privatevoid-net/nix-super.git
synced 2025-02-15 14:47:18 +02:00
![pennae](/assets/img/avatar_default.png)
we now keep not a table of all positions, but a table of all origins and their sizes. position indices are now direct pointers into the virtual concatenation of all parsed contents. this slightly reduces memory usage and time spent in the parser, at the cost of not being able to report positions if the total input size exceeds 4GiB. this limit is not unique to nix though, rustc and clang also limit their input to 4GiB (although at least clang refuses to process inputs that are larger, we will not). this new 4GiB limit probably will not cause any problems for quite a while, all of nixpkgs together is less than 100MiB in size and already needs over 700MiB of memory and multiple seconds just to parse. 4GiB worth of input will easily take multiple minutes and over 30GiB of memory without even evaluating anything. if problems *do* arise we can probably recover the old table-based system by adding some tracking to Pos::Origin (or increasing the size of PosIdx outright), but for the time being this looks like more complexity than it's worth. since we now need to read the entire input again to determine the line/column of a position we'll make unsafeGetAttrPos slightly lazy: mostly the set it returns is only used to determine the file of origin of an attribute, not its exact location. the thunks do not add measurable runtime overhead. notably this change is necessary to allow changing the parser since apparently nothing supports nix's very idiosyncratic line ending choice of "anything goes", making it very hard to calculate line/column positions in the parser (while byte offsets are very easy).
293 lines
8 KiB
Text
293 lines
8 KiB
Text
%option reentrant bison-bridge bison-locations
|
|
%option align
|
|
%option noyywrap
|
|
%option never-interactive
|
|
%option stack
|
|
%option nodefault
|
|
%option nounput noyy_top_state
|
|
|
|
|
|
%s DEFAULT
|
|
%x STRING
|
|
%x IND_STRING
|
|
%x INPATH
|
|
%x INPATH_SLASH
|
|
%x PATH_START
|
|
|
|
|
|
%{
|
|
#ifdef __clang__
|
|
#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
|
|
#endif
|
|
|
|
#include <boost/lexical_cast.hpp>
|
|
|
|
#include "nixexpr.hh"
|
|
#include "parser-tab.hh"
|
|
|
|
using namespace nix;
|
|
|
|
namespace nix {
|
|
|
|
#define CUR_POS state->at(*yylloc)
|
|
|
|
// Reset a scanner location: every line and column field of `loc` is set to 0.
// Installed as YY_USER_INIT, so it runs on the location object the scanner
// starts with.
static void initLoc(YYLTYPE * loc)
{
    loc->last_line = 0;
    loc->first_line = 0;
    loc->last_column = 0;
    loc->first_column = 0;
}
|
|
|
|
// Advance `loc` past a freshly matched token of `len` bytes: the new range
// begins where the previous one ended and extends `len` further.
// `s` (the matched text) is accepted for the YY_USER_ACTION call shape but is
// not inspected here.
static void adjustLoc(YYLTYPE * loc, const char * s, size_t len)
{
    // Remember the pre-match location first; rules that push their match back
    // call unstash() afterwards (presumably to restore it -- see YYLTYPE).
    loc->stash();

    const auto previousEnd = loc->last_column;
    loc->first_column = previousEnd;
    loc->last_column += len;
}
|
|
|
|
|
|
// we make use of the fact that the parser receives a private copy of the input
|
|
// string and can munge around in it.
|
|
static StringToken unescapeStr(SymbolTable & symbols, char * s, size_t length)
|
|
{
|
|
char * result = s;
|
|
char * t = s;
|
|
char c;
|
|
// the input string is terminated with *two* NULs, so we can safely take
|
|
// *one* character after the one being checked against.
|
|
while ((c = *s++)) {
|
|
if (c == '\\') {
|
|
c = *s++;
|
|
if (c == 'n') *t = '\n';
|
|
else if (c == 'r') *t = '\r';
|
|
else if (c == 't') *t = '\t';
|
|
else *t = c;
|
|
}
|
|
else if (c == '\r') {
|
|
/* Normalise CR and CR/LF into LF. */
|
|
*t = '\n';
|
|
if (*s == '\n') s++; /* cr/lf */
|
|
}
|
|
else *t = c;
|
|
t++;
|
|
}
|
|
return {result, size_t(t - result)};
|
|
}
|
|
|
|
|
|
}
|
|
|
|
// yacc generates code that uses unannotated fallthrough.
|
|
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
|
|
|
|
#define YY_USER_INIT initLoc(yylloc)
|
|
#define YY_USER_ACTION adjustLoc(yylloc, yytext, yyleng);
|
|
|
|
#define PUSH_STATE(state) yy_push_state(state, yyscanner)
|
|
#define POP_STATE() yy_pop_state(yyscanner)
|
|
|
|
%}
|
|
|
|
|
|
ANY .|\n
|
|
ID [a-zA-Z\_][a-zA-Z0-9\_\'\-]*
|
|
INT [0-9]+
|
|
FLOAT (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)?
|
|
PATH_CHAR [a-zA-Z0-9\.\_\-\+]
|
|
PATH {PATH_CHAR}*(\/{PATH_CHAR}+)+\/?
|
|
PATH_SEG {PATH_CHAR}*\/
|
|
HPATH \~(\/{PATH_CHAR}+)+\/?
|
|
HPATH_START \~\/
|
|
SPATH \<{PATH_CHAR}+(\/{PATH_CHAR}+)*\>
|
|
URI [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+
|
|
|
|
|
|
%%
|
|
|
|
|
|
if { return IF; }
|
|
then { return THEN; }
|
|
else { return ELSE; }
|
|
assert { return ASSERT; }
|
|
with { return WITH; }
|
|
let { return LET; }
|
|
in { return IN_KW; }
|
|
rec { return REC; }
|
|
inherit { return INHERIT; }
|
|
or { return OR_KW; }
|
|
\.\.\. { return ELLIPSIS; }
|
|
|
|
\=\= { return EQ; }
|
|
\!\= { return NEQ; }
|
|
\<\= { return LEQ; }
|
|
\>\= { return GEQ; }
|
|
\&\& { return AND; }
|
|
\|\| { return OR; }
|
|
\-\> { return IMPL; }
|
|
\/\/ { return UPDATE; }
|
|
\+\+ { return CONCAT; }
|
|
|
|
{ID} { yylval->id = {yytext, (size_t) yyleng}; return ID; }
|
|
{INT} { errno = 0;
|
|
/* Integer literal: convert the matched text to int64_t. A conversion
   failure arrives as boost::bad_lexical_cast and is re-thrown as a
   ParseError whose position is taken from CUR_POS. Note that the failure
   path here is the exception, not the errno value cleared above. */
try {
|
|
yylval->n = boost::lexical_cast<int64_t>(yytext);
|
|
} catch (const boost::bad_lexical_cast &) {
|
|
throw ParseError(ErrorInfo{
|
|
.msg = HintFmt("invalid integer '%1%'", yytext),
|
|
.pos = state->positions[CUR_POS],
|
|
});
|
|
}
|
|
return INT_LIT;
|
|
}
|
|
{FLOAT} { errno = 0;
|
|
/* Float literal: errno was cleared above so that a non-zero value after
   strtod() can be attributed to this conversion; in that case a
   ParseError positioned via CUR_POS is thrown instead of returning a
   token. */
yylval->nf = strtod(yytext, 0);
|
|
if (errno != 0)
|
|
throw ParseError(ErrorInfo{
|
|
.msg = HintFmt("invalid float '%1%'", yytext),
|
|
.pos = state->positions[CUR_POS],
|
|
});
|
|
return FLOAT_LIT;
|
|
}
|
|
|
|
\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
|
|
|
|
\} { /* State INITIAL only exists at the bottom of the stack and is
|
|
used as a marker. DEFAULT replaces it everywhere else.
|
|
Popping when in INITIAL state causes an empty stack exception,
|
|
so don't */
|
|
if (YYSTATE != INITIAL)
|
|
POP_STATE();
|
|
return '}';
|
|
}
|
|
\{ { PUSH_STATE(DEFAULT); return '{'; }
|
|
|
|
\" { PUSH_STATE(STRING); return '"'; }
|
|
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})*\$/\" |
|
|
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})+ {
|
|
/* It is impossible to match strings ending with '$' with one
|
|
regex because trailing contexts are only valid at the end
|
|
of a rule. (A sane but undocumented limitation.) */
|
|
yylval->str = unescapeStr(state->symbols, yytext, yyleng);
|
|
return STR;
|
|
}
|
|
<STRING>\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
|
|
<STRING>\" { POP_STATE(); return '"'; }
|
|
<STRING>\$|\\|\$\\ {
|
|
/* This can only occur when we reach EOF, otherwise the above
|
|
(...|\$[^\{\"\\]|\\.|\$\\.)+ would have triggered.
|
|
This is technically invalid, but we leave the problem to the
|
|
parser who fails with exact location. */
|
|
return EOF;
|
|
}
|
|
|
|
\'\'(\ *\n)? { PUSH_STATE(IND_STRING); return IND_STRING_OPEN; }
|
|
<IND_STRING>([^\$\']|\$[^\{\']|\'[^\'\$])+ {
|
|
yylval->str = {yytext, (size_t) yyleng, true};
|
|
return IND_STR;
|
|
}
|
|
<IND_STRING>\'\'\$ |
|
|
<IND_STRING>\$ {
|
|
yylval->str = {"$", 1};
|
|
return IND_STR;
|
|
}
|
|
<IND_STRING>\'\'\' {
|
|
yylval->str = {"''", 2};
|
|
return IND_STR;
|
|
}
|
|
<IND_STRING>\'\'\\{ANY} {
|
|
yylval->str = unescapeStr(state->symbols, yytext + 2, yyleng - 2);
|
|
return IND_STR;
|
|
}
|
|
<IND_STRING>\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
|
|
<IND_STRING>\'\' { POP_STATE(); return IND_STRING_CLOSE; }
|
|
<IND_STRING>\' {
|
|
yylval->str = {"'", 1};
|
|
return IND_STR;
|
|
}
|
|
|
|
{PATH_SEG}\$\{ |
|
|
{HPATH_START}\$\{ {
|
|
/* A path prefix directly followed by an interpolation opener. Nothing is
   consumed and no token is returned here: PATH_START is pushed, the whole
   match is handed back with yyless(0) so the PATH_START rules rescan the
   prefix, and unstash() is called on the location (presumably undoing the
   advance adjustLoc() applied for this match -- confirm against the
   YYLTYPE definition). */
PUSH_STATE(PATH_START);
|
|
yyless(0);
|
|
yylloc->unstash();
|
|
}
|
|
|
|
<PATH_START>{PATH_SEG} {
|
|
POP_STATE();
|
|
PUSH_STATE(INPATH_SLASH);
|
|
yylval->path = {yytext, (size_t) yyleng};
|
|
return PATH;
|
|
}
|
|
|
|
<PATH_START>{HPATH_START} {
|
|
POP_STATE();
|
|
PUSH_STATE(INPATH_SLASH);
|
|
yylval->path = {yytext, (size_t) yyleng};
|
|
return HPATH;
|
|
}
|
|
|
|
{PATH} {
|
|
if (yytext[yyleng-1] == '/')
|
|
PUSH_STATE(INPATH_SLASH);
|
|
else
|
|
PUSH_STATE(INPATH);
|
|
yylval->path = {yytext, (size_t) yyleng};
|
|
return PATH;
|
|
}
|
|
{HPATH} {
|
|
if (yytext[yyleng-1] == '/')
|
|
PUSH_STATE(INPATH_SLASH);
|
|
else
|
|
PUSH_STATE(INPATH);
|
|
yylval->path = {yytext, (size_t) yyleng};
|
|
return HPATH;
|
|
}
|
|
|
|
<INPATH,INPATH_SLASH>\$\{ {
|
|
/* Interpolation inside a path: whichever of INPATH / INPATH_SLASH is
   active is replaced by INPATH, then DEFAULT is pushed on top of it for
   the body of the interpolation. */
POP_STATE();
|
|
PUSH_STATE(INPATH);
|
|
PUSH_STATE(DEFAULT);
|
|
return DOLLAR_CURLY;
|
|
}
|
|
<INPATH,INPATH_SLASH>{PATH}|{PATH_SEG}|{PATH_CHAR}+ {
|
|
POP_STATE();
|
|
if (yytext[yyleng-1] == '/')
|
|
PUSH_STATE(INPATH_SLASH);
|
|
else
|
|
PUSH_STATE(INPATH);
|
|
yylval->str = {yytext, (size_t) yyleng};
|
|
return STR;
|
|
}
|
|
<INPATH>{ANY} |
|
|
<INPATH><<EOF>> {
|
|
/* if we encounter a non-path character we inform the parser that the path has
|
|
ended with a PATH_END token and re-parse this character in the default
|
|
context (it may be ')', ';', or something of that sort) */
|
|
POP_STATE();
|
|
/* yyless(0) hands the matched input back to the scanner so it is scanned
   again in the state restored by the pop above; unstash() is then called
   on the location (presumably reverting the advance adjustLoc() applied
   for this match -- confirm against the YYLTYPE definition). */
yyless(0);
|
|
yylloc->unstash();
|
|
return PATH_END;
|
|
}
|
|
|
|
<INPATH_SLASH>{ANY} |
|
|
<INPATH_SLASH><<EOF>> {
|
|
throw ParseError(ErrorInfo{
|
|
.msg = HintFmt("path has a trailing slash"),
|
|
.pos = state->positions[CUR_POS],
|
|
});
|
|
}
|
|
|
|
{SPATH} { yylval->path = {yytext, (size_t) yyleng}; return SPATH; }
|
|
{URI} { yylval->uri = {yytext, (size_t) yyleng}; return URI; }
|
|
|
|
[ \t\r\n]+ /* eat up whitespace */
|
|
\#[^\r\n]* /* single-line comments */
|
|
\/\*([^*]|\*+[^*/])*\*+\/ /* long comments */
|
|
|
|
{ANY} {
|
|
/* Don't return a negative number, as this will cause
|
|
Bison to stop parsing without an error. */
|
|
return (unsigned char) yytext[0];
|
|
}
|
|
|
|
%%
|