nix-super/src/libexpr/lexer.l

294 lines
8 KiB
Text
Raw Normal View History

%option reentrant bison-bridge bison-locations
%option align
%option noyywrap
%option never-interactive
%option stack
%option nodefault
%option nounput noyy_top_state
2016-06-14 18:42:46 +03:00
%s DEFAULT
%x STRING
%x IND_STRING
2021-07-29 19:03:07 +03:00
%x INPATH
%x INPATH_SLASH
%x PATH_START
%{
2020-12-02 15:33:20 +02:00
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
2020-12-02 15:33:20 +02:00
#endif
libexpr: Use int64_t for NixInt Using a 64bit integer on 32bit systems will come with a bit of a performance overhead, but given that Nix doesn't use a lot of integers compared to other types, I think the overhead is negligible also considering that 32bit systems are in decline. The biggest advantage however is that when we use a consistent integer size across all platforms it's less likely that we miss things that we break due to that. One example would be: https://github.com/NixOS/nixpkgs/pull/44233 On Hydra it will evaluate, because the evaluator runs on a 64bit machine, but when evaluating the same on a 32bit machine it will fail, so using 64bit integers should make that consistent. While the change of the type in value.hh is rather easy to do, we have a few more options available for doing the conversion in the lexer: * Via an #ifdef on the architecture and using strtol() or strtoll() accordingly depending on which architecture we are. For the #ifdef we would need another AX_COMPILE_CHECK_SIZEOF in configure.ac. * Using istringstream, which would involve copying the value. * As we're already using boost, lexical_cast might be a good idea. Spoiler: I went for the latter, first of all because lexical_cast does have an overload for const char* and second of all, because it doesn't involve copying around the input string. Also, because istringstream seems to come with a bigger overhead than boost::lexical_cast: https://www.boost.org/doc/libs/release/doc/html/boost_lexical_cast/performance.html The first method (still using strtol/strtoll) also wasn't something I pursued further, because it is also locale-aware which I doubt is what we want, given that the regex for int is [0-9]+. Signed-off-by: aszlig <aszlig@nix.build> Fixes: #2339
2018-08-29 01:23:51 +03:00
#include <boost/lexical_cast.hpp>
#include "nixexpr.hh"
#include "parser-tab.hh"
using namespace nix;
namespace nix {
#define CUR_POS state->at(*yylloc)
static void initLoc(YYLTYPE * loc)
{
use byte indexed locations for PosIdx we now keep not a table of all positions, but a table of all origins and their sizes. position indices are now direct pointers into the virtual concatenation of all parsed contents. this slightly reduces memory usage and time spent in the parser, at the cost of not being able to report positions if the total input size exceeds 4GiB. this limit is not unique to nix though, rustc and clang also limit their input to 4GiB (although at least clang refuses to process inputs that are larger, we will not). this new 4GiB limit probably will not cause any problems for quite a while, all of nixpkgs together is less than 100MiB in size and already needs over 700MiB of memory and multiple seconds just to parse. 4GiB worth of input will easily take multiple minutes and over 30GiB of memory without even evaluating anything. if problems *do* arise we can probably recover the old table-based system by adding some tracking to Pos::Origin (or increasing the size of PosIdx outright), but for time being this looks like more complexity than it's worth. since we now need to read the entire input again to determine the line/column of a position we'll make unsafeGetAttrPos slightly lazy: mostly the set it returns is only used to determine the file of origin of an attribute, not its exact location. the thunks do not add measurable runtime overhead. notably this change is necessary to allow changing the parser since apparently nothing supports nix's very idiosyncratic line ending choice of "anything goes", making it very hard to calculate line/column positions in the parser (while byte offsets are very easy).
2024-01-29 07:19:23 +02:00
loc->first_line = loc->last_line = 0;
loc->first_column = loc->last_column = 0;
}
static void adjustLoc(YYLTYPE * loc, const char * s, size_t len)
{
loc->stash();
2021-09-30 02:37:51 +03:00
loc->first_column = loc->last_column;
use byte indexed locations for PosIdx we now keep not a table of all positions, but a table of all origins and their sizes. position indices are now direct pointers into the virtual concatenation of all parsed contents. this slightly reduces memory usage and time spent in the parser, at the cost of not being able to report positions if the total input size exceeds 4GiB. this limit is not unique to nix though, rustc and clang also limit their input to 4GiB (although at least clang refuses to process inputs that are larger, we will not). this new 4GiB limit probably will not cause any problems for quite a while, all of nixpkgs together is less than 100MiB in size and already needs over 700MiB of memory and multiple seconds just to parse. 4GiB worth of input will easily take multiple minutes and over 30GiB of memory without even evaluating anything. if problems *do* arise we can probably recover the old table-based system by adding some tracking to Pos::Origin (or increasing the size of PosIdx outright), but for time being this looks like more complexity than it's worth. since we now need to read the entire input again to determine the line/column of a position we'll make unsafeGetAttrPos slightly lazy: mostly the set it returns is only used to determine the file of origin of an attribute, not its exact location. the thunks do not add measurable runtime overhead. notably this change is necessary to allow changing the parser since apparently nothing supports nix's very idiosyncratic line ending choice of "anything goes", making it very hard to calculate line/column positions in the parser (while byte offsets are very easy).
2024-01-29 07:19:23 +02:00
loc->last_column += len;
}
// we make use of the fact that the parser receives a private copy of the input
// string and can munge around in it.
2022-01-19 14:39:42 +02:00
static StringToken unescapeStr(SymbolTable & symbols, char * s, size_t length)
{
char * result = s;
char * t = s;
char c;
// the input string is terminated with *two* NULs, so we can safely take
// *one* character after the one being checked against.
while ((c = *s++)) {
if (c == '\\') {
c = *s++;
if (c == 'n') *t = '\n';
else if (c == 'r') *t = '\r';
else if (c == 't') *t = '\t';
else *t = c;
}
else if (c == '\r') {
/* Normalise CR and CR/LF into LF. */
*t = '\n';
if (*s == '\n') s++; /* cr/lf */
}
else *t = c;
t++;
}
2022-01-19 14:39:42 +02:00
return {result, size_t(t - result)};
}
2013-09-02 17:29:15 +03:00
}
// yacc generates code that uses unannotated fallthrough.
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
#define YY_USER_INIT initLoc(yylloc)
#define YY_USER_ACTION adjustLoc(yylloc, yytext, yyleng);
#define PUSH_STATE(state) yy_push_state(state, yyscanner)
#define POP_STATE() yy_pop_state(yyscanner)
%}
ANY .|\n
ID [a-zA-Z\_][a-zA-Z0-9\_\'\-]*
INT [0-9]+
FLOAT (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)?
2021-07-29 19:03:07 +03:00
PATH_CHAR [a-zA-Z0-9\.\_\-\+]
PATH {PATH_CHAR}*(\/{PATH_CHAR}+)+\/?
PATH_SEG {PATH_CHAR}*\/
HPATH \~(\/{PATH_CHAR}+)+\/?
HPATH_START \~\/
SPATH \<{PATH_CHAR}+(\/{PATH_CHAR}+)*\>
URI [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+
%%
if { return IF; }
then { return THEN; }
else { return ELSE; }
assert { return ASSERT; }
with { return WITH; }
let { return LET; }
in { return IN_KW; }
rec { return REC; }
inherit { return INHERIT; }
or { return OR_KW; }
\.\.\. { return ELLIPSIS; }
\=\= { return EQ; }
\!\= { return NEQ; }
\<\= { return LEQ; }
\>\= { return GEQ; }
\&\& { return AND; }
\|\| { return OR; }
\-\> { return IMPL; }
\/\/ { return UPDATE; }
\+\+ { return CONCAT; }
{ID} { yylval->id = {yytext, (size_t) yyleng}; return ID; }
{INT} { errno = 0;
libexpr: Use int64_t for NixInt Using a 64bit integer on 32bit systems will come with a bit of a performance overhead, but given that Nix doesn't use a lot of integers compared to other types, I think the overhead is negligible also considering that 32bit systems are in decline. The biggest advantage however is that when we use a consistent integer size across all platforms it's less likely that we miss things that we break due to that. One example would be: https://github.com/NixOS/nixpkgs/pull/44233 On Hydra it will evaluate, because the evaluator runs on a 64bit machine, but when evaluating the same on a 32bit machine it will fail, so using 64bit integers should make that consistent. While the change of the type in value.hh is rather easy to do, we have a few more options available for doing the conversion in the lexer: * Via an #ifdef on the architecture and using strtol() or strtoll() accordingly depending on which architecture we are. For the #ifdef we would need another AX_COMPILE_CHECK_SIZEOF in configure.ac. * Using istringstream, which would involve copying the value. * As we're already using boost, lexical_cast might be a good idea. Spoiler: I went for the latter, first of all because lexical_cast does have an overload for const char* and second of all, because it doesn't involve copying around the input string. Also, because istringstream seems to come with a bigger overhead than boost::lexical_cast: https://www.boost.org/doc/libs/release/doc/html/boost_lexical_cast/performance.html The first method (still using strtol/strtoll) also wasn't something I pursued further, because it is also locale-aware which I doubt is what we want, given that the regex for int is [0-9]+. Signed-off-by: aszlig <aszlig@nix.build> Fixes: #2339
2018-08-29 01:23:51 +03:00
try {
yylval->n = boost::lexical_cast<int64_t>(yytext);
} catch (const boost::bad_lexical_cast &) {
throw ParseError(ErrorInfo{
2024-02-04 06:35:19 +02:00
.msg = HintFmt("invalid integer '%1%'", yytext),
.pos = state->positions[CUR_POS],
});
libexpr: Use int64_t for NixInt Using a 64bit integer on 32bit systems will come with a bit of a performance overhead, but given that Nix doesn't use a lot of integers compared to other types, I think the overhead is negligible also considering that 32bit systems are in decline. The biggest advantage however is that when we use a consistent integer size across all platforms it's less likely that we miss things that we break due to that. One example would be: https://github.com/NixOS/nixpkgs/pull/44233 On Hydra it will evaluate, because the evaluator runs on a 64bit machine, but when evaluating the same on a 32bit machine it will fail, so using 64bit integers should make that consistent. While the change of the type in value.hh is rather easy to do, we have a few more options available for doing the conversion in the lexer: * Via an #ifdef on the architecture and using strtol() or strtoll() accordingly depending on which architecture we are. For the #ifdef we would need another AX_COMPILE_CHECK_SIZEOF in configure.ac. * Using istringstream, which would involve copying the value. * As we're already using boost, lexical_cast might be a good idea. Spoiler: I went for the latter, first of all because lexical_cast does have an overload for const char* and second of all, because it doesn't involve copying around the input string. Also, because istringstream seems to come with a bigger overhead than boost::lexical_cast: https://www.boost.org/doc/libs/release/doc/html/boost_lexical_cast/performance.html The first method (still using strtol/strtoll) also wasn't something I pursued further, because it is also locale-aware which I doubt is what we want, given that the regex for int is [0-9]+. Signed-off-by: aszlig <aszlig@nix.build> Fixes: #2339
2018-08-29 01:23:51 +03:00
}
return INT_LIT;
}
{FLOAT} { errno = 0;
2016-01-05 10:46:37 +02:00
yylval->nf = strtod(yytext, 0);
if (errno != 0)
throw ParseError(ErrorInfo{
2024-02-04 06:35:19 +02:00
.msg = HintFmt("invalid float '%1%'", yytext),
.pos = state->positions[CUR_POS],
});
return FLOAT_LIT;
}
2016-06-14 18:42:46 +03:00
\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
2016-06-14 18:42:46 +03:00
\} { /* State INITIAL only exists at the bottom of the stack and is
used as a marker. DEFAULT replaces it everywhere else.
Popping when in INITIAL state causes an empty stack exception,
so don't */
if (YYSTATE != INITIAL)
POP_STATE();
return '}';
}
\{ { PUSH_STATE(DEFAULT); return '{'; }
2016-06-14 18:42:46 +03:00
\" { PUSH_STATE(STRING); return '"'; }
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})*\$/\" |
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})+ {
/* It is impossible to match strings ending with '$' with one
regex because trailing contexts are only valid at the end
of a rule. (A sane but undocumented limitation.) */
yylval->str = unescapeStr(state->symbols, yytext, yyleng);
return STR;
}
2016-06-14 18:42:46 +03:00
<STRING>\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
<STRING>\" { POP_STATE(); return '"'; }
<STRING>\$|\\|\$\\ {
/* This can only occur when we reach EOF, otherwise the above
(...|\$[^\{\"\\]|\\.|\$\\.)+ would have triggered.
This is technically invalid, but we leave the problem to the
parser who fails with exact location. */
return EOF;
}
2016-06-14 18:42:46 +03:00
\'\'(\ *\n)? { PUSH_STATE(IND_STRING); return IND_STRING_OPEN; }
<IND_STRING>([^\$\']|\$[^\{\']|\'[^\'\$])+ {
2022-01-19 14:39:42 +02:00
yylval->str = {yytext, (size_t) yyleng, true};
return IND_STR;
2007-12-06 12:20:58 +02:00
}
<IND_STRING>\'\'\$ |
<IND_STRING>\$ {
2022-01-19 14:39:42 +02:00
yylval->str = {"$", 1};
2007-12-06 12:20:58 +02:00
return IND_STR;
}
<IND_STRING>\'\'\' {
2022-01-19 14:39:42 +02:00
yylval->str = {"''", 2};
2007-12-06 12:20:58 +02:00
return IND_STR;
}
<IND_STRING>\'\'\\{ANY} {
yylval->str = unescapeStr(state->symbols, yytext + 2, yyleng - 2);
2007-12-06 12:20:58 +02:00
return IND_STR;
}
2016-06-14 18:42:46 +03:00
<IND_STRING>\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
<IND_STRING>\'\' { POP_STATE(); return IND_STRING_CLOSE; }
<IND_STRING>\' {
2022-01-19 14:39:42 +02:00
yylval->str = {"'", 1};
return IND_STR;
}
2021-07-29 19:03:07 +03:00
{PATH_SEG}\$\{ |
{HPATH_START}\$\{ {
PUSH_STATE(PATH_START);
yyless(0);
yylloc->unstash();
2021-07-29 19:03:07 +03:00
}
<PATH_START>{PATH_SEG} {
POP_STATE();
PUSH_STATE(INPATH_SLASH);
yylval->path = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return PATH;
}
<PATH_START>{HPATH_START} {
POP_STATE();
PUSH_STATE(INPATH_SLASH);
yylval->path = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return HPATH;
}
{PATH} {
if (yytext[yyleng-1] == '/')
PUSH_STATE(INPATH_SLASH);
else
PUSH_STATE(INPATH);
yylval->path = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return PATH;
}
{HPATH} {
if (yytext[yyleng-1] == '/')
PUSH_STATE(INPATH_SLASH);
else
PUSH_STATE(INPATH);
yylval->path = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return HPATH;
}
<INPATH,INPATH_SLASH>\$\{ {
POP_STATE();
PUSH_STATE(INPATH);
PUSH_STATE(DEFAULT);
return DOLLAR_CURLY;
}
<INPATH,INPATH_SLASH>{PATH}|{PATH_SEG}|{PATH_CHAR}+ {
POP_STATE();
if (yytext[yyleng-1] == '/')
PUSH_STATE(INPATH_SLASH);
else
PUSH_STATE(INPATH);
2022-01-19 14:39:42 +02:00
yylval->str = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return STR;
}
<INPATH>{ANY} |
<INPATH><<EOF>> {
/* if we encounter a non-path character we inform the parser that the path has
ended with a PATH_END token and re-parse this character in the default
context (it may be ')', ';', or something of that sort) */
POP_STATE();
yyless(0);
yylloc->unstash();
2021-07-29 19:03:07 +03:00
return PATH_END;
}
<INPATH_SLASH>{ANY} |
<INPATH_SLASH><<EOF>> {
throw ParseError(ErrorInfo{
2024-02-04 06:35:19 +02:00
.msg = HintFmt("path has a trailing slash"),
.pos = state->positions[CUR_POS],
});
2021-07-29 19:03:07 +03:00
}
{SPATH} { yylval->path = {yytext, (size_t) yyleng}; return SPATH; }
{URI} { yylval->uri = {yytext, (size_t) yyleng}; return URI; }
[ \t\r\n]+ /* eat up whitespace */
\#[^\r\n]* /* single-line comments */
\/\*([^*]|\*+[^*/])*\*+\/ /* long comments */
{ANY} {
/* Don't return a negative number, as this will cause
Bison to stop parsing without an error. */
return (unsigned char) yytext[0];
}
%%