nix-super/src/libexpr/lexer.l

293 lines
8 KiB
Text
Raw Normal View History

%option reentrant bison-bridge bison-locations
%option align
%option noyywrap
%option never-interactive
%option stack
%option nodefault
%option nounput noyy_top_state
2016-06-14 18:42:46 +03:00
%s DEFAULT
%x STRING
%x IND_STRING
2021-07-29 19:03:07 +03:00
%x INPATH
%x INPATH_SLASH
%x PATH_START
%{
2020-12-02 15:33:20 +02:00
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunneeded-internal-declaration"
2020-12-02 15:33:20 +02:00
#endif
#include "nixexpr.hh"
#include "parser-tab.hh"
using namespace nix;
namespace nix {
#define CUR_POS state->at(*yylloc)
static void initLoc(YYLTYPE * loc)
{
use byte indexed locations for PosIdx we now keep not a table of all positions, but a table of all origins and their sizes. position indices are now direct pointers into the virtual concatenation of all parsed contents. this slightly reduces memory usage and time spent in the parser, at the cost of not being able to report positions if the total input size exceeds 4GiB. this limit is not unique to nix though, rustc and clang also limit their input to 4GiB (although at least clang refuses to process inputs that are larger, we will not). this new 4GiB limit probably will not cause any problems for quite a while, all of nixpkgs together is less than 100MiB in size and already needs over 700MiB of memory and multiple seconds just to parse. 4GiB worth of input will easily take multiple minutes and over 30GiB of memory without even evaluating anything. if problems *do* arise we can probably recover the old table-based system by adding some tracking to Pos::Origin (or increasing the size of PosIdx outright), but for time being this looks like more complexity than it's worth. since we now need to read the entire input again to determine the line/column of a position we'll make unsafeGetAttrPos slightly lazy: mostly the set it returns is only used to determine the file of origin of an attribute, not its exact location. the thunks do not add measurable runtime overhead. notably this change is necessary to allow changing the parser since apparently nothing supports nix's very idiosyncratic line ending choice of "anything goes", making it very hard to calculate line/column positions in the parser (while byte offsets are very easy).
2024-01-29 07:19:23 +02:00
loc->first_line = loc->last_line = 0;
loc->first_column = loc->last_column = 0;
}
static void adjustLoc(YYLTYPE * loc, const char * s, size_t len)
{
loc->stash();
2021-09-30 02:37:51 +03:00
loc->first_column = loc->last_column;
use byte indexed locations for PosIdx we now keep not a table of all positions, but a table of all origins and their sizes. position indices are now direct pointers into the virtual concatenation of all parsed contents. this slightly reduces memory usage and time spent in the parser, at the cost of not being able to report positions if the total input size exceeds 4GiB. this limit is not unique to nix though, rustc and clang also limit their input to 4GiB (although at least clang refuses to process inputs that are larger, we will not). this new 4GiB limit probably will not cause any problems for quite a while, all of nixpkgs together is less than 100MiB in size and already needs over 700MiB of memory and multiple seconds just to parse. 4GiB worth of input will easily take multiple minutes and over 30GiB of memory without even evaluating anything. if problems *do* arise we can probably recover the old table-based system by adding some tracking to Pos::Origin (or increasing the size of PosIdx outright), but for time being this looks like more complexity than it's worth. since we now need to read the entire input again to determine the line/column of a position we'll make unsafeGetAttrPos slightly lazy: mostly the set it returns is only used to determine the file of origin of an attribute, not its exact location. the thunks do not add measurable runtime overhead. notably this change is necessary to allow changing the parser since apparently nothing supports nix's very idiosyncratic line ending choice of "anything goes", making it very hard to calculate line/column positions in the parser (while byte offsets are very easy).
2024-01-29 07:19:23 +02:00
loc->last_column += len;
}
// we make use of the fact that the parser receives a private copy of the input
// string and can munge around in it.
2022-01-19 14:39:42 +02:00
static StringToken unescapeStr(SymbolTable & symbols, char * s, size_t length)
{
char * result = s;
char * t = s;
char c;
// the input string is terminated with *two* NULs, so we can safely take
// *one* character after the one being checked against.
while ((c = *s++)) {
if (c == '\\') {
c = *s++;
if (c == 'n') *t = '\n';
else if (c == 'r') *t = '\r';
else if (c == 't') *t = '\t';
else *t = c;
}
else if (c == '\r') {
/* Normalise CR and CR/LF into LF. */
*t = '\n';
if (*s == '\n') s++; /* cr/lf */
}
else *t = c;
t++;
}
2022-01-19 14:39:42 +02:00
return {result, size_t(t - result)};
}
2013-09-02 17:29:15 +03:00
}
// yacc generates code that uses unannotated fallthrough.
#pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
#define YY_USER_INIT initLoc(yylloc)
#define YY_USER_ACTION adjustLoc(yylloc, yytext, yyleng);
#define PUSH_STATE(state) yy_push_state(state, yyscanner)
#define POP_STATE() yy_pop_state(yyscanner)
%}
ANY .|\n
ID [a-zA-Z\_][a-zA-Z0-9\_\'\-]*
INT [0-9]+
FLOAT (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)?
2021-07-29 19:03:07 +03:00
PATH_CHAR [a-zA-Z0-9\.\_\-\+]
PATH {PATH_CHAR}*(\/{PATH_CHAR}+)+\/?
PATH_SEG {PATH_CHAR}*\/
HPATH \~(\/{PATH_CHAR}+)+\/?
HPATH_START \~\/
SPATH \<{PATH_CHAR}+(\/{PATH_CHAR}+)*\>
URI [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+
%%
if { return IF; }
then { return THEN; }
else { return ELSE; }
assert { return ASSERT; }
with { return WITH; }
let { return LET; }
in { return IN_KW; }
rec { return REC; }
inherit { return INHERIT; }
or { return OR_KW; }
\.\.\. { return ELLIPSIS; }
\=\= { return EQ; }
\!\= { return NEQ; }
\<\= { return LEQ; }
\>\= { return GEQ; }
\&\& { return AND; }
\|\| { return OR; }
\-\> { return IMPL; }
\/\/ { return UPDATE; }
\+\+ { return CONCAT; }
{ID} { yylval->id = {yytext, (size_t) yyleng}; return ID; }
{INT} { errno = 0;
Remove 100s of CPU time (10%) from build times (1465s -> 1302s) Result's from Mic92's framework 13th Gen Intel Core i7-1360P: Before: 3595.92s user 183.01s system 1360% cpu 4:37.74 total After: 3486.07s user 168.93s system 1354% cpu 4:29.79 total I saw that boost/lexical_cast was costing about 100s in CPU time on our compiles. We can fix this trivially by doing explicit template instantiation in exactly one place and eliminating all other includes of it, which is a code improvement anyway by hiding the boost. Before: ``` lix/lix2 » ClangBuildAnalyzer --analyze buildtimeold.bin Analyzing build trace from 'buildtimeold.bin'... **** Time summary: Compilation (551 times): Parsing (frontend): 1465.3 s Codegen & opts (backend): 1110.9 s <snip> **** Expensive headers: 178153 ms: ../src/libcmd/installable-value.hh (included 52 times, avg 3426 ms), included via: 40x: command.hh 5x: command-installable-value.hh 3x: installable-flake.hh 2x: <direct include> 2x: installable-attr-path.hh 176217 ms: ../src/libutil/error.hh (included 246 times, avg 716 ms), included via: 36x: command.hh installable-value.hh installables.hh derived-path.hh config.hh experimental-features.hh 12x: globals.hh config.hh experimental-features.hh 11x: file-system.hh file-descriptor.hh 6x: serialise.hh strings.hh 6x: <direct include> 6x: archive.hh serialise.hh strings.hh ... 173243 ms: ../src/libstore/store-api.hh (included 152 times, avg 1139 ms), included via: 55x: <direct include> 39x: command.hh installable-value.hh installables.hh 7x: libexpr.hh 4x: local-store.hh 4x: command-installable-value.hh installable-value.hh installables.hh 3x: binary-cache-store.hh ... 170482 ms: ../src/libutil/serialise.hh (included 201 times, avg 848 ms), included via: 37x: command.hh installable-value.hh installables.hh built-path.hh realisation.hh hash.hh 14x: store-api.hh nar-info.hh hash.hh 11x: <direct include> 7x: primops.hh eval.hh attr-set.hh nixexpr.hh value.hh source-path.hh archive.hh 7x: libexpr.hh value.hh source-path.hh archive.hh 6x: fetchers.hh hash.hh ... 169397 ms: ../src/libcmd/installables.hh (included 53 times, avg 3196 ms), included via: 40x: command.hh installable-value.hh 5x: command-installable-value.hh installable-value.hh 3x: installable-flake.hh installable-value.hh 2x: <direct include> 1x: installable-derived-path.hh 1x: installable-value.hh ... 159740 ms: ../src/libutil/strings.hh (included 221 times, avg 722 ms), included via: 37x: command.hh installable-value.hh installables.hh built-path.hh realisation.hh hash.hh serialise.hh 19x: <direct include> 14x: store-api.hh nar-info.hh hash.hh serialise.hh 11x: serialise.hh 7x: primops.hh eval.hh attr-set.hh nixexpr.hh value.hh source-path.hh archive.hh serialise.hh 7x: libexpr.hh value.hh source-path.hh archive.hh serialise.hh ... 156796 ms: ../src/libcmd/command.hh (included 51 times, avg 3074 ms), included via: 42x: <direct include> 7x: command-installable-value.hh 2x: installable-attr-path.hh 150392 ms: ../src/libutil/types.hh (included 251 times, avg 599 ms), included via: 36x: command.hh installable-value.hh installables.hh path.hh 11x: file-system.hh 10x: globals.hh 6x: fetchers.hh 6x: serialise.hh strings.hh error.hh 5x: archive.hh ... 133101 ms: /nix/store/644b90j1vms44nr18yw3520pzkrg4dd1-boost-1.81.0-dev/include/boost/lexical_cast.hpp (included 226 times, avg 588 ms), included via : 37x: command.hh installable-value.hh installables.hh built-path.hh realisation.hh hash.hh serialise.hh strings.hh 19x: file-system.hh 11x: store-api.hh nar-info.hh hash.hh serialise.hh strings.hh 7x: primops.hh eval.hh attr-set.hh nixexpr.hh value.hh source-path.hh archive.hh serialise.hh strings.hh 7x: libexpr.hh value.hh source-path.hh archive.hh serialise.hh strings.hh 6x: eval.hh attr-set.hh nixexpr.hh value.hh source-path.hh archive.hh serialise.hh strings.hh ... 132887 ms: /nix/store/h2abv2l8irqj942i5rq9wbrj42kbsh5y-gcc-12.3.0/include/c++/12.3.0/memory (included 262 times, avg 507 ms), included via: 36x: command.hh installable-value.hh installables.hh path.hh types.hh ref.hh 16x: gtest.h 11x: file-system.hh types.hh ref.hh 10x: globals.hh types.hh ref.hh 10x: json.hpp 6x: serialise.hh ... done in 0.6s. ``` After: ``` lix/lix2 » maintainers/buildtime_report.sh build Processing all files and saving to '/home/jade/lix/lix2/maintainers/../buildtime.bin'... done in 0.6s. Run 'ClangBuildAnalyzer --analyze /home/jade/lix/lix2/maintainers/../buildtime.bin' to analyze it. Analyzing build trace from '/home/jade/lix/lix2/maintainers/../buildtime.bin'... **** Time summary: Compilation (551 times): Parsing (frontend): 1302.1 s Codegen & opts (backend): 956.3 s <snip> **** Expensive headers: 178145 ms: ../src/libutil/error.hh (included 246 times, avg 724 ms), included via: 36x: command.hh installable-value.hh installables.hh derived-path.hh config.hh experimental-features.hh 12x: globals.hh config.hh experimental-features.hh 11x: file-system.hh file-descriptor.hh 6x: <direct include> 6x: serialise.hh strings.hh 6x: fetchers.hh hash.hh serialise.hh strings.hh ... 154043 ms: ../src/libcmd/installable-value.hh (included 52 times, avg 2962 ms), included via: 40x: command.hh 5x: command-installable-value.hh 3x: installable-flake.hh 2x: <direct include> 2x: installable-attr-path.hh 153593 ms: ../src/libstore/store-api.hh (included 152 times, avg 1010 ms), included via: 55x: <direct include> 39x: command.hh installable-value.hh installables.hh 7x: libexpr.hh 4x: local-store.hh 4x: command-installable-value.hh installable-value.hh installables.hh 3x: binary-cache-store.hh ... 149948 ms: ../src/libutil/types.hh (included 251 times, avg 597 ms), included via: 36x: command.hh installable-value.hh installables.hh path.hh 11x: file-system.hh 10x: globals.hh 6x: fetchers.hh 6x: serialise.hh strings.hh error.hh 5x: archive.hh ... 144560 ms: ../src/libcmd/installables.hh (included 53 times, avg 2727 ms), included via: 40x: command.hh installable-value.hh 5x: command-installable-value.hh installable-value.hh 3x: installable-flake.hh installable-value.hh 2x: <direct include> 1x: installable-value.hh 1x: installable-derived-path.hh ... 136585 ms: ../src/libcmd/command.hh (included 51 times, avg 2678 ms), included via: 42x: <direct include> 7x: command-installable-value.hh 2x: installable-attr-path.hh 133394 ms: /nix/store/h2abv2l8irqj942i5rq9wbrj42kbsh5y-gcc-12.3.0/include/c++/12.3.0/memory (included 262 times, avg 509 ms), included via: 36x: command.hh installable-value.hh installables.hh path.hh types.hh ref.hh 16x: gtest.h 11x: file-system.hh types.hh ref.hh 10x: globals.hh types.hh ref.hh 10x: json.hpp 6x: serialise.hh ... 89315 ms: ../src/libstore/derived-path.hh (included 178 times, avg 501 ms), included via: 37x: command.hh installable-value.hh installables.hh 25x: store-api.hh realisation.hh 7x: primops.hh eval.hh attr-set.hh nixexpr.hh value.hh context.hh 6x: eval.hh attr-set.hh nixexpr.hh value.hh context.hh 6x: libexpr.hh value.hh context.hh 6x: shared.hh ... 87347 ms: /nix/store/h2abv2l8irqj942i5rq9wbrj42kbsh5y-gcc-12.3.0/include/c++/12.3.0/ostream (included 273 times, avg 319 ms), included via: 35x: command.hh installable-value.hh installables.hh path.hh types.hh ref.hh memory unique_ptr.h 12x: regex sstream istream 10x: file-system.hh types.hh ref.hh memory unique_ptr.h 10x: gtest.h memory unique_ptr.h 10x: globals.hh types.hh ref.hh memory unique_ptr.h 6x: fetchers.hh types.hh ref.hh memory unique_ptr.h ... 85249 ms: ../src/libutil/config.hh (included 213 times, avg 400 ms), included via: 37x: command.hh installable-value.hh installables.hh derived-path.hh 20x: globals.hh 20x: logging.hh 16x: store-api.hh logging.hh 6x: <direct include> 6x: eval.hh attr-set.hh nixexpr.hh value.hh context.hh derived-path.hh ... done in 0.5s. ``` Adapated from https://git.lix.systems/lix-project/lix/commit/18aa3e1d570b4ecbb9962376e5fba5757dad8da9
2024-05-30 07:12:34 +03:00
std::optional<int64_t> numMay = string2Int<int64_t>(yytext);
if (numMay.has_value()) {
yylval->n = *numMay;
} else {
throw ParseError(ErrorInfo{
2024-02-04 06:35:19 +02:00
.msg = HintFmt("invalid integer '%1%'", yytext),
.pos = state->positions[CUR_POS],
});
libexpr: Use int64_t for NixInt Using a 64bit integer on 32bit systems will come with a bit of a performance overhead, but given that Nix doesn't use a lot of integers compared to other types, I think the overhead is negligible also considering that 32bit systems are in decline. The biggest advantage however is that when we use a consistent integer size across all platforms it's less likely that we miss things that we break due to that. One example would be: https://github.com/NixOS/nixpkgs/pull/44233 On Hydra it will evaluate, because the evaluator runs on a 64bit machine, but when evaluating the same on a 32bit machine it will fail, so using 64bit integers should make that consistent. While the change of the type in value.hh is rather easy to do, we have a few more options available for doing the conversion in the lexer: * Via an #ifdef on the architecture and using strtol() or strtoll() accordingly depending on which architecture we are. For the #ifdef we would need another AX_COMPILE_CHECK_SIZEOF in configure.ac. * Using istringstream, which would involve copying the value. * As we're already using boost, lexical_cast might be a good idea. Spoiler: I went for the latter, first of all because lexical_cast does have an overload for const char* and second of all, because it doesn't involve copying around the input string. Also, because istringstream seems to come with a bigger overhead than boost::lexical_cast: https://www.boost.org/doc/libs/release/doc/html/boost_lexical_cast/performance.html The first method (still using strtol/strtoll) also wasn't something I pursued further, because it is also locale-aware which I doubt is what we want, given that the regex for int is [0-9]+. Signed-off-by: aszlig <aszlig@nix.build> Fixes: #2339
2018-08-29 01:23:51 +03:00
}
return INT_LIT;
}
{FLOAT} { errno = 0;
2016-01-05 10:46:37 +02:00
yylval->nf = strtod(yytext, 0);
if (errno != 0)
throw ParseError(ErrorInfo{
2024-02-04 06:35:19 +02:00
.msg = HintFmt("invalid float '%1%'", yytext),
.pos = state->positions[CUR_POS],
});
return FLOAT_LIT;
}
2016-06-14 18:42:46 +03:00
\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
2016-06-14 18:42:46 +03:00
\} { /* State INITIAL only exists at the bottom of the stack and is
used as a marker. DEFAULT replaces it everywhere else.
Popping when in INITIAL state causes an empty stack exception,
so don't */
if (YYSTATE != INITIAL)
POP_STATE();
return '}';
}
\{ { PUSH_STATE(DEFAULT); return '{'; }
2016-06-14 18:42:46 +03:00
\" { PUSH_STATE(STRING); return '"'; }
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})*\$/\" |
<STRING>([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})+ {
/* It is impossible to match strings ending with '$' with one
regex because trailing contexts are only valid at the end
of a rule. (A sane but undocumented limitation.) */
yylval->str = unescapeStr(state->symbols, yytext, yyleng);
return STR;
}
2016-06-14 18:42:46 +03:00
<STRING>\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
<STRING>\" { POP_STATE(); return '"'; }
<STRING>\$|\\|\$\\ {
/* This can only occur when we reach EOF, otherwise the above
(...|\$[^\{\"\\]|\\.|\$\\.)+ would have triggered.
This is technically invalid, but we leave the problem to the
parser who fails with exact location. */
return EOF;
}
2016-06-14 18:42:46 +03:00
\'\'(\ *\n)? { PUSH_STATE(IND_STRING); return IND_STRING_OPEN; }
<IND_STRING>([^\$\']|\$[^\{\']|\'[^\'\$])+ {
2022-01-19 14:39:42 +02:00
yylval->str = {yytext, (size_t) yyleng, true};
return IND_STR;
2007-12-06 12:20:58 +02:00
}
<IND_STRING>\'\'\$ |
<IND_STRING>\$ {
2022-01-19 14:39:42 +02:00
yylval->str = {"$", 1};
2007-12-06 12:20:58 +02:00
return IND_STR;
}
<IND_STRING>\'\'\' {
2022-01-19 14:39:42 +02:00
yylval->str = {"''", 2};
2007-12-06 12:20:58 +02:00
return IND_STR;
}
<IND_STRING>\'\'\\{ANY} {
yylval->str = unescapeStr(state->symbols, yytext + 2, yyleng - 2);
2007-12-06 12:20:58 +02:00
return IND_STR;
}
2016-06-14 18:42:46 +03:00
<IND_STRING>\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; }
<IND_STRING>\'\' { POP_STATE(); return IND_STRING_CLOSE; }
<IND_STRING>\' {
2022-01-19 14:39:42 +02:00
yylval->str = {"'", 1};
return IND_STR;
}
2021-07-29 19:03:07 +03:00
{PATH_SEG}\$\{ |
{HPATH_START}\$\{ {
PUSH_STATE(PATH_START);
yyless(0);
yylloc->unstash();
2021-07-29 19:03:07 +03:00
}
<PATH_START>{PATH_SEG} {
POP_STATE();
PUSH_STATE(INPATH_SLASH);
yylval->path = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return PATH;
}
<PATH_START>{HPATH_START} {
POP_STATE();
PUSH_STATE(INPATH_SLASH);
yylval->path = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return HPATH;
}
{PATH} {
if (yytext[yyleng-1] == '/')
PUSH_STATE(INPATH_SLASH);
else
PUSH_STATE(INPATH);
yylval->path = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return PATH;
}
{HPATH} {
if (yytext[yyleng-1] == '/')
PUSH_STATE(INPATH_SLASH);
else
PUSH_STATE(INPATH);
yylval->path = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return HPATH;
}
<INPATH,INPATH_SLASH>\$\{ {
POP_STATE();
PUSH_STATE(INPATH);
PUSH_STATE(DEFAULT);
return DOLLAR_CURLY;
}
<INPATH,INPATH_SLASH>{PATH}|{PATH_SEG}|{PATH_CHAR}+ {
POP_STATE();
if (yytext[yyleng-1] == '/')
PUSH_STATE(INPATH_SLASH);
else
PUSH_STATE(INPATH);
2022-01-19 14:39:42 +02:00
yylval->str = {yytext, (size_t) yyleng};
2021-07-29 19:03:07 +03:00
return STR;
}
<INPATH>{ANY} |
<INPATH><<EOF>> {
/* if we encounter a non-path character we inform the parser that the path has
ended with a PATH_END token and re-parse this character in the default
context (it may be ')', ';', or something of that sort) */
POP_STATE();
yyless(0);
yylloc->unstash();
2021-07-29 19:03:07 +03:00
return PATH_END;
}
<INPATH_SLASH>{ANY} |
<INPATH_SLASH><<EOF>> {
throw ParseError(ErrorInfo{
2024-02-04 06:35:19 +02:00
.msg = HintFmt("path has a trailing slash"),
.pos = state->positions[CUR_POS],
});
2021-07-29 19:03:07 +03:00
}
{SPATH} { yylval->path = {yytext, (size_t) yyleng}; return SPATH; }
{URI} { yylval->uri = {yytext, (size_t) yyleng}; return URI; }
[ \t\r\n]+ /* eat up whitespace */
\#[^\r\n]* /* single-line comments */
\/\*([^*]|\*+[^*/])*\*+\/ /* long comments */
{ANY} {
/* Don't return a negative number, as this will cause
Bison to stop parsing without an error. */
return (unsigned char) yytext[0];
}
%%