1 /* 2 * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io> 3 * 4 * Permission to use, copy, modify, and/or distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17 #include <stdio.h> 18 19 #include <stdbool.h> 20 #include <stdlib.h> 21 #include <string.h> 22 #include <ctype.h> 23 #include <regex.h> 24 #include <math.h> 25 #include <errno.h> 26 #include <endian.h> 27 28 #include "ucode/vm.h" 29 #include "ucode/lib.h" 30 #include "ucode/lexer.h" 31 32 #define UC_LEX_CONTINUE_PARSING (void *)1 33 34 struct keyword { 35 unsigned type; 36 const char *pat; 37 unsigned plen; 38 }; 39 40 struct token { 41 unsigned type; 42 union { 43 uint32_t patn; 44 char pat[4]; 45 } u; 46 unsigned plen; 47 uc_token_t *(*parse)(uc_lexer_t *); 48 }; 49 50 #define dec(o) \ 51 ((o) - '') 52 53 #define hex(x) \ 54 (((x) >= 'a') ? (10 + (x) - 'a') : \ 55 (((x) >= 'A') ? 
(10 + (x) - 'A') : dec(x))) 56 57 #ifndef NO_COMPILE 58 59 static uc_token_t *parse_comment(uc_lexer_t *); 60 static uc_token_t *parse_string(uc_lexer_t *); 61 static uc_token_t *parse_regexp(uc_lexer_t *); 62 static uc_token_t *parse_number(uc_lexer_t *); 63 static uc_token_t *parse_label(uc_lexer_t *); 64 65 static const struct token tokens[] = { 66 { TK_ASLEFT, { .pat = "<<=" }, 3, NULL }, 67 { TK_ASRIGHT, { .pat = ">>=" }, 3, NULL }, 68 { TK_LEXP, { .pat = "{{-" }, 3, NULL }, 69 { TK_REXP, { .pat = "-}}" }, 3, NULL }, 70 { TK_LSTM, { .pat = "{%+" }, 3, NULL }, 71 { TK_LSTM, { .pat = "{%-" }, 3, NULL }, 72 { TK_RSTM, { .pat = "-%}" }, 3, NULL }, 73 { TK_EQS, { .pat = "===" }, 3, NULL }, 74 { TK_NES, { .pat = "!==" }, 3, NULL }, 75 { TK_ELLIP, { .pat = "..." }, 3, NULL }, 76 { TK_QLBRACK, { .pat = "?.[" }, 3, NULL }, 77 { TK_QLPAREN, { .pat = "?.(" }, 3, NULL }, 78 { TK_ASEXP, { .pat = "**=" }, 3, NULL }, 79 { TK_ASAND, { .pat = "&&=" }, 3, NULL }, 80 { TK_ASOR, { .pat = "||=" }, 3, NULL }, 81 { TK_ASNULLISH, { .pat = "\?\?=" }, 3, NULL }, 82 { TK_AND, { .pat = "&&" }, 2, NULL }, 83 { TK_ASADD, { .pat = "+=" }, 2, NULL }, 84 { TK_ASBAND, { .pat = "&=" }, 2, NULL }, 85 { TK_ASBOR, { .pat = "|=" }, 2, NULL }, 86 { TK_ASBXOR, { .pat = "^=" }, 2, NULL }, 87 //{ TK_ASDIV, { .pat = "/=" }, 2, NULL }, 88 { TK_ASMOD, { .pat = "%=" }, 2, NULL }, 89 { TK_ASMUL, { .pat = "*=" }, 2, NULL }, 90 { TK_ASSUB, { .pat = "-=" }, 2, NULL }, 91 { TK_EXP, { .pat = "**" }, 2, NULL }, 92 { TK_DEC, { .pat = "--" }, 2, NULL }, 93 { TK_INC, { .pat = "++" }, 2, NULL }, 94 { TK_EQ, { .pat = "==" }, 2, NULL }, 95 { TK_NE, { .pat = "!=" }, 2, NULL }, 96 { TK_LE, { .pat = "<=" }, 2, NULL }, 97 { TK_GE, { .pat = ">=" }, 2, NULL }, 98 { TK_LSHIFT, { .pat = "<<" }, 2, NULL }, 99 { TK_RSHIFT, { .pat = ">>" }, 2, NULL }, 100 { 0, { .pat = "//" }, 2, parse_comment }, 101 { 0, { .pat = "/*" }, 2, parse_comment }, 102 { TK_OR, { .pat = "||" }, 2, NULL }, 103 { TK_LEXP, { .pat = "{{" }, 2, NULL }, 104 { 
TK_REXP, { .pat = "}}" }, 2, NULL }, 105 { TK_LSTM, { .pat = "{%" }, 2, NULL }, 106 { TK_RSTM, { .pat = "%}" }, 2, NULL }, 107 { TK_ARROW, { .pat = "=>" }, 2, NULL }, 108 { TK_NULLISH, { .pat = "??" }, 2, NULL }, 109 { TK_QDOT, { .pat = "?." }, 2, NULL }, 110 { TK_PLACEH, { .pat = "${" }, 2, NULL }, 111 { TK_ADD, { .pat = "+" }, 1, NULL }, 112 { TK_ASSIGN, { .pat = "=" }, 1, NULL }, 113 { TK_BAND, { .pat = "&" }, 1, NULL }, 114 { TK_BOR, { .pat = "|" }, 1, NULL }, 115 { TK_LBRACK, { .pat = "[" }, 1, NULL }, 116 { TK_RBRACK, { .pat = "]" }, 1, NULL }, 117 { TK_BXOR, { .pat = "^" }, 1, NULL }, 118 { TK_LBRACE, { .pat = "{" }, 1, NULL }, 119 { TK_RBRACE, { .pat = "}" }, 1, NULL }, 120 { TK_COLON, { .pat = ":" }, 1, NULL }, 121 { TK_COMMA, { .pat = "," }, 1, NULL }, 122 { TK_COMPL, { .pat = "~" }, 1, NULL }, 123 //{ TK_DIV, { .pat = "/" }, 1, NULL }, 124 { TK_GT, { .pat = ">" }, 1, NULL }, 125 { TK_NOT, { .pat = "!" }, 1, NULL }, 126 { TK_LT, { .pat = "<" }, 1, NULL }, 127 { TK_MOD, { .pat = "%" }, 1, NULL }, 128 { TK_MUL, { .pat = "*" }, 1, NULL }, 129 { TK_LPAREN, { .pat = "(" }, 1, NULL }, 130 { TK_RPAREN, { .pat = ")" }, 1, NULL }, 131 { TK_QMARK, { .pat = "?" }, 1, NULL }, 132 { TK_SCOL, { .pat = ";" }, 1, NULL }, 133 { TK_SUB, { .pat = "-" }, 1, NULL }, 134 { TK_DOT, { .pat = "." 
}, 1, NULL }, 135 { TK_STRING, { .pat = "'" }, 1, parse_string }, 136 { TK_STRING, { .pat = "\"" }, 1, parse_string }, 137 { TK_REGEXP, { .pat = "/" }, 1, parse_regexp }, 138 { TK_LABEL, { .pat = "_" }, 1, parse_label }, 139 { TK_LABEL, { .pat = "az" }, 0, parse_label }, 140 { TK_LABEL, { .pat = "AZ" }, 0, parse_label }, 141 { TK_NUMBER, { .pat = "09" }, 0, parse_number }, 142 143 /* NB: this must be last for simple retrieval */ 144 { TK_TEMPLATE, { .pat = "`" }, 1, parse_string } 145 }; 146 147 static const struct keyword reserved_words[] = { 148 { TK_ENDFUNC, "endfunction", 11 }, 149 { TK_CONTINUE, "continue", 8 }, 150 { TK_ENDWHILE, "endwhile", 8 }, 151 { TK_FUNC, "function", 8 }, 152 { TK_DEFAULT, "default", 7 }, 153 { TK_DELETE, "delete", 6 }, 154 { TK_RETURN, "return", 6 }, 155 { TK_ENDFOR, "endfor", 6 }, 156 { TK_SWITCH, "switch", 6 }, 157 { TK_ENDIF, "endif", 5 }, 158 { TK_WHILE, "while", 5 }, 159 { TK_BREAK, "break", 5 }, 160 { TK_CATCH, "catch", 5 }, 161 { TK_CONST, "const", 5 }, 162 { TK_FALSE, "false", 5 }, 163 { TK_TRUE, "true", 4 }, 164 { TK_ELIF, "elif", 4 }, 165 { TK_ELSE, "else", 4 }, 166 { TK_THIS, "this", 4 }, 167 { TK_NULL, "null", 4 }, 168 { TK_CASE, "case", 4 }, 169 { TK_TRY, "try", 3 }, 170 { TK_FOR, "for", 3 }, 171 { TK_LOCAL, "let", 3 }, 172 { TK_IF, "if", 2 }, 173 { TK_IN, "in", 2 }, 174 }; 175 176 177 /* length of the longest token in our lookup table */ 178 #define UC_LEX_MAX_TOKEN_LEN 3 179 180 static uc_token_t * 181 emit_op(uc_lexer_t *lex, uint32_t pos, int type, uc_value_t *uv) 182 { 183 lex->curr.type = type; 184 lex->curr.uv = uv; 185 lex->curr.pos = pos; 186 187 return &lex->curr; 188 } 189 190 static void lookbehind_append(uc_lexer_t *lex, const char *data, size_t len) 191 { 192 if (len) { 193 lex->lookbehind = xrealloc(lex->lookbehind, lex->lookbehindlen + len); 194 memcpy(lex->lookbehind + lex->lookbehindlen, data, len); 195 lex->lookbehindlen += len; 196 } 197 } 198 199 static void lookbehind_reset(uc_lexer_t *lex) { 200 
free(lex->lookbehind); 201 lex->lookbehind = NULL; 202 lex->lookbehindlen = 0; 203 } 204 205 static uc_token_t * 206 lookbehind_to_text(uc_lexer_t *lex, uint32_t pos, int type, const char *strip_trailing_chars) { 207 uc_token_t *rv = NULL; 208 209 if (lex->lookbehind) { 210 if (strip_trailing_chars) { 211 while (lex->lookbehindlen > 0 && strchr(strip_trailing_chars, lex->lookbehind[lex->lookbehindlen-1])) 212 lex->lookbehindlen--; 213 } 214 215 rv = emit_op(lex, pos, type, ucv_string_new_length(lex->lookbehind, lex->lookbehindlen)); 216 217 lookbehind_reset(lex); 218 } 219 220 return rv; 221 } 222 223 static inline size_t 224 buf_remaining(uc_lexer_t *lex) { 225 return (lex->bufend - lex->bufstart); 226 } 227 228 static inline bool 229 _buf_startswith(uc_lexer_t *lex, const char *str, size_t len) { 230 return (buf_remaining(lex) >= len && !strncmp(lex->bufstart, str, len)); 231 } 232 233 #define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1) 234 235 236 static void 237 buf_consume(uc_lexer_t *lex, size_t len) { 238 size_t i, linelen; 239 240 for (i = 0, linelen = 0; i < len; i++) { 241 if (lex->bufstart[i] == '\n') { 242 uc_source_line_update(lex->source, linelen); 243 uc_source_line_next(lex->source); 244 245 linelen = 0; 246 } 247 else { 248 linelen++; 249 } 250 } 251 252 if (linelen) 253 uc_source_line_update(lex->source, linelen); 254 255 lex->bufstart += len; 256 lex->source->off += len; 257 } 258 259 static uc_token_t * 260 parse_comment(uc_lexer_t *lex) 261 { 262 const struct token *tok = lex->tok; 263 const char *ptr, *end; 264 size_t elen; 265 266 if (!strcmp(tok->u.pat, "//")) { 267 end = "\n"; 268 elen = 1; 269 } 270 else { 271 end = "*/"; 272 elen = 2; 273 } 274 275 for (ptr = lex->bufstart; ptr < lex->bufend - elen; ptr++) { 276 if (!strncmp(ptr, end, elen)) { 277 buf_consume(lex, (ptr - lex->bufstart) + elen); 278 279 return UC_LEX_CONTINUE_PARSING; 280 } 281 } 282 283 buf_consume(lex, ptr - lex->bufstart); 284 285 if (lex->eof) { 286 
lex->state = UC_LEX_EOF; 287 288 if (elen == 2) 289 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment")); 290 } 291 292 return NULL; 293 } 294 295 static void 296 append_utf8(uc_lexer_t *lex, int code) { 297 char ustr[8], *up; 298 int rem; 299 300 up = ustr; 301 rem = sizeof(ustr); 302 303 if (utf8enc(&up, &rem, code)) 304 lookbehind_append(lex, ustr, up - ustr); 305 } 306 307 static uc_token_t * 308 parse_string(uc_lexer_t *lex) 309 { 310 const struct token *tok = lex->tok; 311 char q = tok->u.pat[0]; 312 char *ptr, *c; 313 uc_token_t *rv; 314 int code; 315 316 if (!buf_remaining(lex)) 317 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string")); 318 319 for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) { 320 /* continuation of placeholder start */ 321 if (lex->is_placeholder) { 322 if (*ptr == '{') { 323 buf_consume(lex, 1); 324 rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL); 325 326 if (!rv) 327 rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0)); 328 329 return rv; 330 } 331 332 lex->is_placeholder = false; 333 lookbehind_append(lex, "$", 1); 334 } 335 336 /* continuation of escape sequence */ 337 if (lex->is_escape) { 338 if (lex->esclen == 0) { 339 /* non-unicode escape following a lead surrogate, emit replacement... */ 340 if (lex->lead_surrogate && *ptr != 'u') { 341 append_utf8(lex, 0xFFFD); 342 lex->lead_surrogate = 0; 343 } 344 345 switch ((q == '/') ? 
0 : *ptr) { 346 case 'u': 347 case 'x': 348 lex->esc[lex->esclen++] = *ptr; 349 break; 350 351 case '': 352 case '1': 353 case '2': 354 case '3': 355 case '4': 356 case '5': 357 case '6': 358 case '7': 359 lex->esc[lex->esclen++] = 'o'; 360 lex->esc[lex->esclen++] = *ptr; 361 break; 362 363 default: 364 lex->is_escape = false; 365 c = strchr("a\ab\be\033f\fn\nr\rt\tv\v", *ptr); 366 367 if (c && *c >= 'a') { 368 lookbehind_append(lex, c + 1, 1); 369 } 370 else { 371 /* regex mode => retain backslash */ 372 if (q == '/') 373 lookbehind_append(lex, "\\", 1); 374 375 lookbehind_append(lex, ptr, 1); 376 } 377 378 buf_consume(lex, (ptr + 1) - lex->bufstart); 379 380 break; 381 } 382 } 383 else { 384 switch (lex->esc[0]) { 385 case 'u': 386 if (lex->esclen < 5) { 387 if (!isxdigit(*ptr)) 388 return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 389 390 lex->esc[lex->esclen++] = *ptr; 391 } 392 393 if (lex->esclen == 5) { 394 code = hex(lex->esc[1]) * 16 * 16 * 16 + 395 hex(lex->esc[2]) * 16 * 16 + 396 hex(lex->esc[3]) * 16 + 397 hex(lex->esc[4]); 398 399 /* is a leading surrogate value */ 400 if ((code & 0xFC00) == 0xD800) { 401 /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ 402 if (lex->lead_surrogate) 403 append_utf8(lex, 0xFFFD); 404 405 /* store surrogate value and advance to next escape sequence */ 406 lex->lead_surrogate = code; 407 } 408 409 /* is a trailing surrogate value */ 410 else if ((code & 0xFC00) == 0xDC00) { 411 /* found a trailing surrogate following a leading one, combine and encode */ 412 if (lex->lead_surrogate) { 413 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); 414 lex->lead_surrogate = 0; 415 } 416 417 /* trailing surrogate not following a leading one, ignore and use replacement char */ 418 else { 419 code = 0xFFFD; 420 } 421 422 append_utf8(lex, code); 423 } 424 425 /* is a normal codepoint */ 426 else { 427 
append_utf8(lex, code); 428 } 429 430 lex->esclen = 0; 431 lex->is_escape = false; 432 buf_consume(lex, (ptr + 1) - lex->bufstart); 433 } 434 435 break; 436 437 case 'x': 438 if (lex->esclen < 3) { 439 if (!isxdigit(*ptr)) 440 return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 441 442 lex->esc[lex->esclen++] = *ptr; 443 } 444 445 if (lex->esclen == 3) { 446 append_utf8(lex, hex(lex->esc[1]) * 16 + hex(lex->esc[2])); 447 448 lex->esclen = 0; 449 lex->is_escape = false; 450 buf_consume(lex, (ptr + 1) - lex->bufstart); 451 } 452 453 break; 454 455 case 'o': 456 if (lex->esclen < 4) { 457 /* found a non-octal char */ 458 if (*ptr < '' || *ptr > '7') { 459 /* pad sequence to three chars */ 460 switch (lex->esclen) { 461 case 3: 462 lex->esc[3] = lex->esc[2]; 463 lex->esc[2] = lex->esc[1]; 464 lex->esc[1] = ''; 465 break; 466 467 case 2: 468 lex->esc[3] = lex->esc[1]; 469 lex->esc[2] = ''; 470 lex->esc[1] = ''; 471 break; 472 } 473 474 lex->esclen = 4; 475 buf_consume(lex, ptr-- - lex->bufstart); 476 } 477 478 /* append */ 479 else { 480 lex->esc[lex->esclen++] = *ptr; 481 buf_consume(lex, (ptr + 1) - lex->bufstart); 482 } 483 } 484 485 if (lex->esclen == 4) { 486 code = dec(lex->esc[1]) * 8 * 8 + 487 dec(lex->esc[2]) * 8 + 488 dec(lex->esc[3]); 489 490 if (code > 255) 491 return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 492 493 append_utf8(lex, code); 494 495 lex->esclen = 0; 496 lex->is_escape = false; 497 } 498 499 break; 500 } 501 } 502 } 503 504 /* terminating char */ 505 else if (*ptr == q) { 506 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 507 buf_consume(lex, (ptr + 1) - lex->bufstart); 508 509 rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL); 510 511 if (!rv) 512 rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0)); 513 514 return rv; 515 } 516 517 /* escape sequence start */ 518 else if (*ptr == 
'\\') { 519 lex->is_escape = true; 520 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 521 buf_consume(lex, (ptr - lex->bufstart) + 1); 522 } 523 524 /* potential placeholder start */ 525 else if (q == '`' && *ptr == '$') { 526 lex->is_placeholder = true; 527 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 528 buf_consume(lex, (ptr - lex->bufstart) + 1); 529 } 530 } 531 532 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 533 buf_consume(lex, ptr - lex->bufstart); 534 535 return NULL; 536 } 537 538 539 /* 540 * Parses a regexp literal from the given buffer. 541 * 542 * Returns a negative value on error, otherwise the amount of consumed 543 * characters from the given buffer. 544 * 545 * Error values: 546 * -UC_ERROR_UNTERMINATED_STRING Unterminated regexp 547 * -UC_ERROR_INVALID_ESCAPE Invalid escape sequence 548 * -UC_ERROR_OVERLONG_STRING Regexp literal too long 549 * -UC_ERROR_INVALID_REGEXP Could not compile regexp 550 */ 551 552 enum { 553 UC_LEX_PARSE_REGEX_INIT, 554 UC_LEX_PARSE_REGEX_PATTERN, 555 UC_LEX_PARSE_REGEX_FLAGS 556 }; 557 558 static uc_token_t * 559 parse_regexp(uc_lexer_t *lex) 560 { 561 bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false; 562 uc_token_t *rv; 563 size_t len; 564 char *s; 565 566 switch (lex->esc[0]) { 567 case UC_LEX_PARSE_REGEX_INIT: 568 if (lex->no_regexp) { 569 if (buf_startswith(lex, "=")) { 570 buf_consume(lex, 1); 571 572 return emit_op(lex, lex->source->off, TK_ASDIV, NULL); 573 } 574 575 return emit_op(lex, lex->source->off, TK_DIV, NULL); 576 } 577 578 lex->esc[0] = UC_LEX_PARSE_REGEX_PATTERN; 579 break; 580 581 case UC_LEX_PARSE_REGEX_PATTERN: 582 rv = parse_string(lex); 583 584 if (rv && rv->type == TK_ERROR) 585 return rv; 586 587 if (rv != NULL && rv != UC_LEX_CONTINUE_PARSING) { 588 lex->lookbehind = (char *)rv; 589 lex->esc[0] = UC_LEX_PARSE_REGEX_FLAGS; 590 } 591 592 break; 593 594 case UC_LEX_PARSE_REGEX_FLAGS: 595 rv = (uc_token_t *)lex->lookbehind; 596 597 
while (lex->bufstart < lex->bufend || lex->eof) { 598 switch (lex->eof ? EOF : lex->bufstart[0]) { 599 case 'g': 600 buf_consume(lex, 1); 601 is_reg_global = true; 602 break; 603 604 case 'i': 605 buf_consume(lex, 1); 606 is_reg_icase = true; 607 break; 608 609 case 's': 610 buf_consume(lex, 1); 611 is_reg_newline = true; 612 break; 613 614 default: 615 lex->lookbehind = NULL; 616 617 len = xasprintf(&s, "%c%*s", 618 (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2), 619 ucv_string_length(rv->uv), 620 ucv_string_get(rv->uv)); 621 622 ucv_free(rv->uv, false); 623 rv->uv = ucv_string_new_length(s, len); 624 free(s); 625 626 rv->type = TK_REGEXP; 627 628 return rv; 629 } 630 } 631 632 break; 633 } 634 635 return NULL; 636 } 637 638 639 /* 640 * Parses a label from the given buffer. 641 * 642 * Returns a negative value on error, otherwise the amount of consumed 643 * characters from the given buffer. 644 * 645 * Error values: 646 * -UC_ERROR_OVERLONG_STRING Label too long 647 */ 648 649 static uc_token_t * 650 parse_label(uc_lexer_t *lex) 651 { 652 const struct token *tok = lex->tok; 653 const struct keyword *word; 654 char *ptr; 655 size_t i; 656 657 if (!lex->lookbehind && tok->plen) 658 lookbehind_append(lex, tok->u.pat, tok->plen); 659 660 if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) { 661 if (lex->no_keyword == false) { 662 for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { 663 if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) { 664 lookbehind_reset(lex); 665 666 return emit_op(lex, lex->source->off - word->plen, word->type, NULL); 667 } 668 } 669 } 670 671 return lookbehind_to_text(lex, lex->source->off - lex->lookbehindlen, TK_LABEL, NULL); 672 } 673 674 for (ptr = lex->bufstart; ptr < lex->bufend && (*ptr == '_' || isalnum(*ptr)); ptr++) 675 ; 676 677 lookbehind_append(lex, lex->bufstart, ptr - 
lex->bufstart); 678 buf_consume(lex, ptr - lex->bufstart); 679 680 return NULL; 681 } 682 683 684 /* 685 * Parses a number literal from the given buffer. 686 * 687 * Returns a negative value on error, otherwise the amount of consumed 688 * characters from the given buffer. 689 * 690 * Error values: 691 * -UC_ERROR_INVALID_ESCAPE Invalid number character 692 */ 693 694 static inline bool 695 is_numeric_char(uc_lexer_t *lex, char c) 696 { 697 char prev = lex->lookbehindlen ? lex->lookbehind[lex->lookbehindlen-1] : 0; 698 699 switch (c|32) { 700 case '.': 701 case '': 702 case '1': 703 case '2': 704 case '3': 705 case '4': 706 case '5': 707 case '6': 708 case '7': 709 case '8': 710 case '9': 711 return true; 712 713 case 'a': 714 case 'b': 715 case 'c': 716 case 'd': 717 case 'e': 718 case 'f': 719 case 'o': 720 case 'x': 721 /* require previous char, a number literal cannot start with these */ 722 return prev != 0; 723 724 case '+': 725 case '-': 726 /* sign is only allowed after an exponent char */ 727 return (prev|32) == 'e'; 728 } 729 730 return false; 731 } 732 733 static uc_token_t * 734 parse_number(uc_lexer_t *lex) 735 { 736 uc_token_t *rv = NULL; 737 uc_value_t *nv = NULL; 738 const char *ptr; 739 char *e; 740 741 if (!buf_remaining(lex) || !is_numeric_char(lex, lex->bufstart[0])) { 742 lookbehind_append(lex, "\0", 1); 743 744 nv = uc_number_parse_octal(lex->lookbehind, &e); 745 746 switch (ucv_type(nv)) { 747 case UC_DOUBLE: 748 rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_DOUBLE, nv); 749 break; 750 751 case UC_INTEGER: 752 rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_NUMBER, nv); 753 break; 754 755 default: 756 rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, ucv_string_new("Invalid number literal")); 757 } 758 759 lookbehind_reset(lex); 760 761 return rv; 762 } 763 764 for (ptr = lex->bufstart; ptr < lex->bufend && is_numeric_char(lex, *ptr); ptr++) 765 ; 766 767 
lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 768 buf_consume(lex, ptr - lex->bufstart); 769 770 return NULL; 771 } 772 773 static uc_token_t * 774 lex_step(uc_lexer_t *lex, FILE *fp) 775 { 776 uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) }; 777 union { uint32_t n; char str[4]; } search; 778 const struct token *tok; 779 size_t rlen, rem, *nest; 780 char *ptr, c; 781 uc_token_t *rv; 782 size_t i; 783 784 /* only less than UC_LEX_MAX_TOKEN_LEN unread buffer chars remaining, 785 * move the remaining bytes to the beginning and read more data */ 786 if (buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN) { 787 if (!lex->buf) { 788 lex->buflen = 128; 789 lex->buf = xalloc(lex->buflen); 790 } 791 792 rem = lex->bufend - lex->bufstart; 793 794 if (rem) 795 memcpy(lex->buf, lex->bufstart, rem); 796 797 rlen = fread(lex->buf + rem, 1, lex->buflen - rem, fp); 798 799 lex->bufstart = lex->buf; 800 lex->bufend = lex->buf + rlen + rem; 801 802 if (rlen == 0 && (ferror(fp) || feof(fp))) 803 lex->eof = 1; 804 } 805 806 switch (lex->state) { 807 case UC_LEX_IDENTIFY_BLOCK: 808 /* previous block had strip trailing whitespace flag, skip leading whitespace */ 809 if (lex->modifier == MINUS) { 810 while (buf_remaining(lex) && isspace(lex->bufstart[0])) 811 buf_consume(lex, 1); 812 813 lex->modifier = UNSPEC; 814 } 815 816 /* previous block was a statement block and trim_blocks is enabld, skip leading newline */ 817 else if (lex->modifier == NEWLINE) { 818 if (buf_startswith(lex, "\n")) 819 buf_consume(lex, 1); 820 821 lex->modifier = UNSPEC; 822 } 823 824 /* scan forward through buffer to identify start token */ 825 for (ptr = lex->bufstart; ptr < lex->bufend - strlen("{#"); ptr++) { 826 /* found start of comment block */ 827 if (!strncmp(ptr, "{#", 2)) { 828 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 829 buf_consume(lex, (ptr + 2) - lex->bufstart); 830 lex->lastoff = lex->source->off - 2; 831 
lex->state = UC_LEX_BLOCK_COMMENT_START; 832 833 return NULL; 834 } 835 836 /* found start of expression block */ 837 else if (!strncmp(ptr, "{{", 2)) { 838 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 839 buf_consume(lex, (ptr + 2) - lex->bufstart); 840 lex->lastoff = lex->source->off - 2; 841 lex->state = UC_LEX_BLOCK_EXPRESSION_START; 842 843 return NULL; 844 } 845 846 /* found start of statement block */ 847 else if (!strncmp(ptr, "{%", 2)) { 848 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 849 buf_consume(lex, (ptr + 2) - lex->bufstart); 850 lex->lastoff = lex->source->off - 2; 851 lex->state = UC_LEX_BLOCK_STATEMENT_START; 852 853 return NULL; 854 } 855 } 856 857 /* we're at eof */ 858 if (lex->eof) { 859 lookbehind_append(lex, ptr, lex->bufend - ptr); 860 lex->state = UC_LEX_EOF; 861 862 return lookbehind_to_text(lex, lex->lastoff, TK_TEXT, NULL); 863 } 864 865 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart); 866 buf_consume(lex, ptr - lex->bufstart); 867 break; 868 869 870 case UC_LEX_BLOCK_COMMENT_START: 871 case UC_LEX_BLOCK_EXPRESSION_START: 872 case UC_LEX_BLOCK_STATEMENT_START: 873 rv = NULL; 874 lex->modifier = UNSPEC; 875 876 /* strip whitespace before block */ 877 if (buf_startswith(lex, "-")) { 878 rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \n\t\v\f\r"); 879 buf_consume(lex, 1); 880 } 881 882 /* disable lstrip flag (only valid for statement blocks) */ 883 else if (lex->state == UC_LEX_BLOCK_STATEMENT_START) { 884 /* disable lstrip flag */ 885 if (buf_startswith(lex, "+")) { 886 rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL); 887 buf_consume(lex, 1); 888 } 889 890 /* global block lstrip */ 891 else if (lex->config && lex->config->lstrip_blocks) { 892 rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \t\v\f\r"); 893 } 894 } 895 else { 896 rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL); 897 } 898 899 switch (lex->state) { 900 case UC_LEX_BLOCK_COMMENT_START: 
901 lex->state = UC_LEX_BLOCK_COMMENT; 902 lex->block = COMMENT; 903 break; 904 905 case UC_LEX_BLOCK_STATEMENT_START: 906 lex->state = UC_LEX_IDENTIFY_TOKEN; 907 lex->block = STATEMENTS; 908 break; 909 910 case UC_LEX_BLOCK_EXPRESSION_START: 911 lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG; 912 break; 913 914 default: 915 break; 916 } 917 918 return rv; 919 920 921 case UC_LEX_BLOCK_COMMENT: 922 /* scan forward through buffer to identify end token */ 923 while (lex->bufstart < lex->bufend - 2) { 924 if (buf_startswith(lex, "-#}")) { 925 lex->state = UC_LEX_IDENTIFY_BLOCK; 926 lex->modifier = MINUS; 927 buf_consume(lex, 3); 928 lex->lastoff = lex->source->off; 929 break; 930 } 931 else if (buf_startswith(lex, "#}")) { 932 lex->state = UC_LEX_IDENTIFY_BLOCK; 933 buf_consume(lex, 2); 934 lex->lastoff = lex->source->off; 935 break; 936 } 937 938 buf_consume(lex, 1); 939 } 940 941 /* we're at eof */ 942 if (lex->eof) { 943 lex->state = UC_LEX_EOF; 944 945 buf_consume(lex, lex->bufend - lex->bufstart); 946 947 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block")); 948 } 949 950 break; 951 952 953 case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG: 954 lex->state = UC_LEX_IDENTIFY_TOKEN; 955 lex->block = EXPRESSION; 956 957 return emit_op(lex, lex->source->off, TK_LEXP, NULL); 958 959 960 case UC_LEX_IDENTIFY_TOKEN: 961 /* skip leading whitespace */ 962 for (i = 0; i < buf_remaining(lex) && isspace(lex->bufstart[i]); i++) 963 ; 964 965 buf_consume(lex, i); 966 967 if (i > 0 && buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN) 968 return NULL; 969 970 for (i = 0; i < sizeof(search.str); i++) 971 search.str[i] = (i < buf_remaining(lex)) ? lex->bufstart[i] : 0; 972 973 for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) { 974 /* remaining buffer data is shorter than token, skip */ 975 if (tok->plen > buf_remaining(lex)) 976 continue; 977 978 c = buf_remaining(lex) ? lex->bufstart[0] : 0; 979 980 if (tok->plen ? 
((search.n & masks[tok->plen]) == tok->u.patn) 981 : (c >= tok->u.pat[0] && c <= tok->u.pat[1])) { 982 lex->lastoff = lex->source->off; 983 984 /* token has a parse method, switch state */ 985 if (tok->parse) { 986 lex->tok = tok; 987 lex->state = UC_LEX_PARSE_TOKEN; 988 989 buf_consume(lex, tok->plen); 990 991 return NULL; 992 } 993 994 /* in raw code mode, ignore template tag tokens */ 995 if (lex->config && lex->config->raw_mode && 996 (tok->type == TK_LSTM || tok->type == TK_RSTM || 997 tok->type == TK_LEXP || tok->type == TK_REXP)) { 998 continue; 999 } 1000 1001 /* disallow nesting blocks */ 1002 if (tok->type == TK_LSTM || tok->type == TK_LEXP) { 1003 buf_consume(lex, tok->plen); 1004 1005 return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, ucv_string_new("Template blocks may not be nested")); 1006 } 1007 1008 /* found end of block */ 1009 else if ((lex->block == STATEMENTS && tok->type == TK_RSTM) || 1010 (lex->block == EXPRESSION && tok->type == TK_REXP)) { 1011 /* strip whitespace after block */ 1012 if (tok->u.pat[0] == '-') 1013 lex->modifier = MINUS; 1014 1015 /* strip newline after statement block */ 1016 else if (lex->block == STATEMENTS && 1017 lex->config && lex->config->trim_blocks) 1018 lex->modifier = NEWLINE; 1019 1020 lex->state = UC_LEX_IDENTIFY_BLOCK; 1021 lex->block = NONE; 1022 } 1023 1024 /* track opening braces */ 1025 else if (tok->type == TK_LBRACE && lex->templates.count > 0) { 1026 nest = uc_vector_last(&lex->templates); 1027 (*nest)++; 1028 } 1029 1030 /* check end of placeholder expression */ 1031 else if (tok->type == TK_RBRACE && lex->templates.count > 0) { 1032 nest = uc_vector_last(&lex->templates); 1033 1034 if (*nest == 0) { 1035 lex->templates.count--; 1036 lex->state = UC_LEX_PARSE_TOKEN; 1037 lex->tok = &tokens[ARRAY_SIZE(tokens) - 1]; /* NB: TK_TEMPLATE token spec */ 1038 } 1039 else { 1040 (*nest)--; 1041 } 1042 } 1043 1044 /* do not report statement tags to the parser */ 1045 if (tok->type != 0 && tok->type != 
TK_LSTM) 1046 rv = emit_op(lex, lex->source->off, 1047 (tok->type == TK_RSTM) ? TK_SCOL : tok->type, NULL); 1048 else 1049 rv = NULL; 1050 1051 buf_consume(lex, tok->plen); 1052 1053 return rv; 1054 } 1055 } 1056 1057 /* no possible return beyond this point can advance, 1058 mark lex state as eof */ 1059 lex->state = UC_LEX_EOF; 1060 1061 /* no token matched and we do have remaining data, junk */ 1062 if (buf_remaining(lex)) 1063 return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unexpected character")); 1064 1065 /* we're at eof, allow unclosed statement blocks */ 1066 if (lex->block == STATEMENTS) 1067 return NULL; 1068 1069 /* premature EOF */ 1070 return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unterminated template block")); 1071 1072 1073 case UC_LEX_PARSE_TOKEN: 1074 tok = lex->tok; 1075 rv = tok->parse(lex); 1076 1077 if (rv) { 1078 memset(lex->esc, 0, sizeof(lex->esc)); 1079 lex->state = lex->is_placeholder ? UC_LEX_PLACEHOLDER : UC_LEX_IDENTIFY_TOKEN; 1080 lex->is_placeholder = false; 1081 lex->tok = NULL; 1082 1083 if (rv == UC_LEX_CONTINUE_PARSING) 1084 rv = NULL; 1085 1086 return rv; 1087 } 1088 1089 break; 1090 1091 1092 case UC_LEX_PLACEHOLDER: 1093 lex->state = UC_LEX_IDENTIFY_TOKEN; 1094 1095 uc_vector_push(&lex->templates, 0); 1096 1097 return emit_op(lex, lex->source->off, TK_PLACEH, NULL); 1098 1099 1100 case UC_LEX_EOF: 1101 break; 1102 } 1103 1104 return NULL; 1105 } 1106 1107 void 1108 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) 1109 { 1110 lex->state = UC_LEX_IDENTIFY_BLOCK; 1111 1112 lex->config = config; 1113 lex->source = uc_source_get(source); 1114 1115 lex->eof = 0; 1116 lex->is_escape = 0; 1117 1118 lex->block = NONE; 1119 lex->modifier = UNSPEC; 1120 1121 lex->buflen = 0; 1122 lex->buf = NULL; 1123 lex->bufstart = NULL; 1124 lex->bufend = NULL; 1125 1126 lex->lookbehindlen = 0; 1127 lex->lookbehind = NULL; 1128 1129 lex->tok = NULL; 1130 1131 lex->esclen = 0; 1132 
memset(lex->esc, 0, sizeof(lex->esc)); 1133 1134 lex->lead_surrogate = 0; 1135 1136 lex->lastoff = 0; 1137 1138 lex->templates.count = 0; 1139 lex->templates.entries = NULL; 1140 1141 if (config && config->raw_mode) { 1142 lex->state = UC_LEX_IDENTIFY_TOKEN; 1143 lex->block = STATEMENTS; 1144 } 1145 } 1146 1147 void 1148 uc_lexer_free(uc_lexer_t *lex) 1149 { 1150 uc_vector_clear(&lex->templates); 1151 uc_source_put(lex->source); 1152 1153 free(lex->lookbehind); 1154 free(lex->buf); 1155 } 1156 1157 uc_token_t * 1158 uc_lexer_next_token(uc_lexer_t *lex) 1159 { 1160 uc_token_t *rv = NULL; 1161 1162 while (lex->state != UC_LEX_EOF) { 1163 rv = lex_step(lex, lex->source->fp); 1164 1165 if (rv != NULL) 1166 break; 1167 } 1168 1169 if (rv) { 1170 lex->no_keyword = false; 1171 lex->no_regexp = false; 1172 1173 return rv; 1174 } 1175 1176 return emit_op(lex, lex->source->off, TK_EOF, NULL); 1177 } 1178 1179 const char * 1180 uc_tokenname(unsigned type) 1181 { 1182 static char buf[sizeof("'endfunction'")]; 1183 size_t i; 1184 1185 switch (type) { 1186 case 0: return "End of file"; 1187 case TK_TEMPLATE: return "Template"; 1188 case TK_STRING: return "String"; 1189 case TK_LABEL: return "Label"; 1190 case TK_NUMBER: return "Number"; 1191 case TK_DOUBLE: return "Double"; 1192 case TK_REGEXP: return "Regexp"; 1193 } 1194 1195 for (i = 0; i < ARRAY_SIZE(tokens); i++) { 1196 if (tokens[i].type != type) 1197 continue; 1198 1199 snprintf(buf, sizeof(buf), "'%s'", tokens[i].u.pat); 1200 1201 return buf; 1202 } 1203 1204 for (i = 0; i < ARRAY_SIZE(reserved_words); i++) { 1205 if (reserved_words[i].type != type) 1206 continue; 1207 1208 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat); 1209 1210 return buf; 1211 } 1212 1213 return "?"; 1214 } 1215 1216 bool 1217 uc_lexer_is_keyword(uc_value_t *label) 1218 { 1219 size_t i; 1220 1221 if (ucv_type(label) != UC_STRING) 1222 return false; 1223 1224 for (i = 0; i < ARRAY_SIZE(reserved_words); i++) 1225 if 
(!strcmp(reserved_words[i].pat, ucv_string_get(label))) 1226 return true; 1227 1228 return false; 1229 } 1230 1231 #endif /* NO_COMPILE */ 1232 1233 /* 1234 * Stores the given codepoint as a utf8 multibyte sequence into the given 1235 * output buffer and substracts the required amount of bytes from the given 1236 * length pointer. 1237 * 1238 * Returns false if the multibyte sequence would not fit into the buffer, 1239 * otherwise true. 1240 */ 1241 1242 bool 1243 utf8enc(char **out, int *rem, int code) 1244 { 1245 if (code >= 0 && code <= 0x7F) { 1246 if (*rem < 1) 1247 return false; 1248 1249 *(*out)++ = code; (*rem)--; 1250 1251 return true; 1252 } 1253 else if (code > 0 && code <= 0x7FF) { 1254 if (*rem < 2) 1255 return false; 1256 1257 *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--; 1258 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1259 1260 return true; 1261 } 1262 else if (code > 0 && code <= 0xFFFF) { 1263 if (*rem < 3) 1264 return false; 1265 1266 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--; 1267 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 1268 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1269 1270 return true; 1271 } 1272 else if (code > 0 && code <= 0x10FFFF) { 1273 if (*rem < 4) 1274 return false; 1275 1276 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--; 1277 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--; 1278 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 1279 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1280 1281 return true; 1282 } 1283 1284 return true; 1285 } 1286
/* This page was automatically generated by LXR 0.3.1. • OpenWrt */