1 /* 2 * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io> 3 * 4 * Permission to use, copy, modify, and/or distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17 #include <stdio.h> 18 19 #include <stdbool.h> 20 #include <stdlib.h> 21 #include <string.h> 22 #include <ctype.h> 23 #include <regex.h> 24 #include <math.h> 25 #include <errno.h> 26 #include <endian.h> 27 28 #include "ucode/vm.h" 29 #include "ucode/lib.h" 30 #include "ucode/lexer.h" 31 32 struct keyword { 33 unsigned type; 34 const char *pat; 35 unsigned plen; 36 }; 37 38 #define dec(o) \ 39 ((o) - '') 40 41 #define hex(x) \ 42 (((x) >= 'a') ? (10 + (x) - 'a') : \ 43 (((x) >= 'A') ? 
(10 + (x) - 'A') : dec(x))) 44 45 #ifndef NO_COMPILE 46 47 static const struct keyword reserved_words[] = { 48 { TK_ENDFUNC, "endfunction", 11 }, 49 { TK_CONTINUE, "continue", 8 }, 50 { TK_ENDWHILE, "endwhile", 8 }, 51 { TK_FUNC, "function", 8 }, 52 { TK_DEFAULT, "default", 7 }, 53 { TK_DELETE, "delete", 6 }, 54 { TK_RETURN, "return", 6 }, 55 { TK_ENDFOR, "endfor", 6 }, 56 { TK_SWITCH, "switch", 6 }, 57 { TK_IMPORT, "import", 6 }, 58 { TK_EXPORT, "export", 6 }, 59 { TK_ENDIF, "endif", 5 }, 60 { TK_WHILE, "while", 5 }, 61 { TK_BREAK, "break", 5 }, 62 { TK_CATCH, "catch", 5 }, 63 { TK_CONST, "const", 5 }, 64 { TK_FALSE, "false", 5 }, 65 { TK_TRUE, "true", 4 }, 66 { TK_ELIF, "elif", 4 }, 67 { TK_ELSE, "else", 4 }, 68 { TK_THIS, "this", 4 }, 69 { TK_NULL, "null", 4 }, 70 { TK_CASE, "case", 4 }, 71 { TK_FROM, "from", 4 }, 72 { TK_TRY, "try", 3 }, 73 { TK_FOR, "for", 3 }, 74 { TK_LOCAL, "let", 3 }, 75 { TK_IF, "if", 2 }, 76 { TK_IN, "in", 2 }, 77 { TK_AS, "as", 2 }, 78 }; 79 80 81 static int 82 fill_buf(uc_lexer_t *lex) { 83 lex->rbuf = xrealloc(lex->rbuf, 128); 84 lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp); 85 lex->rpos = 0; 86 87 if (!lex->rlen) 88 return EOF; 89 90 lex->rpos++; 91 92 return (int)lex->rbuf[0]; 93 } 94 95 static int 96 update_line(uc_lexer_t *lex, int ch) { 97 if (ch == '\n' || ch == EOF) 98 uc_source_line_next(lex->source); 99 else 100 uc_source_line_update(lex->source, 1); 101 102 lex->source->off++; 103 104 return ch; 105 } 106 107 static int 108 lookahead_char(uc_lexer_t *lex) { 109 int c; 110 111 if (lex->rpos < lex->rlen) 112 return (int)lex->rbuf[lex->rpos]; 113 114 c = fill_buf(lex); 115 lex->rpos = 0; 116 117 return c; 118 } 119 120 static bool 121 check_char(uc_lexer_t *lex, int ch) { 122 if (lookahead_char(lex) != ch) 123 return false; 124 125 lex->rpos++; 126 127 update_line(lex, ch); 128 129 return true; 130 } 131 132 static int 133 next_char(uc_lexer_t *lex) { 134 int ch = (lex->rpos < lex->rlen) ? 
(int)lex->rbuf[lex->rpos++] : fill_buf(lex); 135 136 return update_line(lex, ch); 137 } 138 139 static uc_token_t * 140 emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv) 141 { 142 lex->curr.type = type; 143 lex->curr.uv = uv; 144 145 if (pos < 0) 146 lex->curr.pos = lex->source->off + pos; 147 else 148 lex->curr.pos = (size_t)pos; 149 150 return &lex->curr; 151 } 152 153 static uc_token_t * 154 emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) { 155 uc_token_t *rv = NULL; 156 157 if (lex->buffer.count) { 158 if (strip_trailing_chars) 159 while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer))) 160 lex->buffer.count--; 161 162 rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count)); 163 164 uc_vector_clear(&lex->buffer); 165 } 166 else if (type != TK_TEXT) { 167 rv = emit_op(lex, pos, type, ucv_string_new_length("", 0)); 168 } 169 170 return rv; 171 } 172 173 174 static uc_token_t * 175 parse_comment(uc_lexer_t *lex, int kind) 176 { 177 int ch; 178 179 while (true) { 180 ch = next_char(lex); 181 182 if (kind == '/' && (ch == '\n' || ch == EOF)) 183 break; 184 185 if (kind == '*' && ch == '*' && check_char(lex, '/')) 186 break; 187 188 if (ch == EOF) { 189 lex->state = UC_LEX_EOF; 190 191 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment")); 192 } 193 } 194 195 return NULL; 196 } 197 198 static void 199 append_utf8(uc_lexer_t *lex, int code) { 200 char ustr[8], *up; 201 int rem; 202 203 up = ustr; 204 rem = sizeof(ustr); 205 206 if (utf8enc(&up, &rem, code)) 207 for (up = ustr; rem < (int)sizeof(ustr); rem++) 208 uc_vector_push(&lex->buffer, *up++); 209 } 210 211 static uc_token_t * 212 parse_escape(uc_lexer_t *lex, const char *regex_macros) 213 { 214 int code, ch, i; 215 const char *p; 216 217 /* unicode escape sequence */ 218 if (check_char(lex, 'u')) { 219 for (i = 0, code = 0; i < 4; i++) { 220 ch = 
next_char(lex); 221 222 if (!isxdigit(ch)) 223 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 224 225 code = code * 16 + hex(ch); 226 } 227 228 /* is a leading surrogate value */ 229 if ((code & 0xFC00) == 0xD800) { 230 /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ 231 if (lex->lead_surrogate) 232 append_utf8(lex, 0xFFFD); 233 234 /* store surrogate value and advance to next escape sequence */ 235 lex->lead_surrogate = code; 236 } 237 238 /* is a trailing surrogate value */ 239 else if ((code & 0xFC00) == 0xDC00) { 240 /* found a trailing surrogate following a leading one, combine and encode */ 241 if (lex->lead_surrogate) { 242 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); 243 lex->lead_surrogate = 0; 244 } 245 246 /* trailing surrogate not following a leading one, ignore and use replacement char */ 247 else { 248 code = 0xFFFD; 249 } 250 251 append_utf8(lex, code); 252 } 253 254 /* is a normal codepoint */ 255 else { 256 append_utf8(lex, code); 257 } 258 } 259 260 /* hex escape sequence */ 261 else if (check_char(lex, 'x')) { 262 for (i = 0, code = 0; i < 2; i++) { 263 ch = next_char(lex); 264 265 if (!isxdigit(ch)) 266 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 267 268 code = code * 16 + hex(ch); 269 } 270 271 append_utf8(lex, code); 272 } 273 274 /* octal or letter */ 275 else { 276 /* try to parse octal sequence... */ 277 for (i = 0, code = 0, ch = lookahead_char(lex); 278 i < 3 && ch >= '' && ch <= '7'; 279 i++, next_char(lex), ch = lookahead_char(lex)) { 280 code = code * 8 + dec(ch); 281 } 282 283 if (i) { 284 if (code > 255) 285 return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence")); 286 287 append_utf8(lex, code); 288 } 289 290 /* ... 
no octal sequence, handle potential regex macros */ 291 else if (strchr(regex_macros, ch)) { 292 ch = next_char(lex); 293 294 switch (ch) { 295 case 'd': p = "[[:digit:]]"; break; 296 case 'D': p = "[^[:digit:]]"; break; 297 case 'w': p = "[[:alnum:]_]"; break; 298 case 'W': p = "[^[:alnum:]_]"; break; 299 case 's': p = "[[:space:]]"; break; 300 case 'S': p = "[^[:space:]]"; break; 301 default: p = NULL; 302 } 303 304 if (p) { 305 while (*p) 306 uc_vector_push(&lex->buffer, *p++); 307 } 308 else { 309 uc_vector_push(&lex->buffer, '\\'); 310 uc_vector_push(&lex->buffer, ch); 311 } 312 } 313 314 /* ... handle other escape */ 315 else { 316 ch = next_char(lex); 317 318 switch (ch) { 319 case 'a': uc_vector_push(&lex->buffer, '\a'); break; 320 case 'b': uc_vector_push(&lex->buffer, '\b'); break; 321 case 'e': uc_vector_push(&lex->buffer, '\033'); break; 322 case 'f': uc_vector_push(&lex->buffer, '\f'); break; 323 case 'n': uc_vector_push(&lex->buffer, '\n'); break; 324 case 'r': uc_vector_push(&lex->buffer, '\r'); break; 325 case 't': uc_vector_push(&lex->buffer, '\t'); break; 326 case 'v': uc_vector_push(&lex->buffer, '\v'); break; 327 328 case EOF: 329 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string")); 330 331 default: 332 uc_vector_push(&lex->buffer, ch); 333 } 334 } 335 } 336 337 return NULL; 338 } 339 340 static uc_token_t * 341 parse_string(uc_lexer_t *lex, int kind) 342 { 343 uc_token_t *err; 344 unsigned type; 345 int code, ch; 346 size_t off; 347 348 if (kind == '`') 349 type = TK_TEMPLATE; 350 else if (kind == '/') 351 type = TK_REGEXP; 352 else 353 type = TK_STRING; 354 355 off = lex->source->off - 1; 356 357 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 358 switch (ch) { 359 /* placeholder */ 360 case '$': 361 if (type == TK_TEMPLATE && check_char(lex, '{')) { 362 lex->state = UC_LEX_PLACEHOLDER_START; 363 364 return emit_buffer(lex, off, type, NULL); 365 } 366 367 uc_vector_push(&lex->buffer, '$'); 368 break; 369 370 /* 
regexp bracket expression */ 371 case '[': 372 uc_vector_push(&lex->buffer, '['); 373 374 if (type == TK_REGEXP) { 375 /* skip leading negation (^) */ 376 if (check_char(lex, '^')) 377 uc_vector_push(&lex->buffer, '^'); 378 379 /* skip leading `]` - it is literal and not closing the bracket expr */ 380 if (check_char(lex, ']')) 381 uc_vector_push(&lex->buffer, ']'); 382 383 /* read until closing `]` */ 384 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 385 if (ch == '\\') { 386 err = parse_escape(lex, "^"); 387 388 if (err) 389 return err; 390 391 continue; 392 } 393 394 uc_vector_push(&lex->buffer, ch); 395 396 if (ch == ']') 397 break; 398 399 /* skip nested char classes / equivalence classes / collating chars */ 400 if (ch == '[') { 401 code = lookahead_char(lex); 402 403 if (code == ':' || code == '.' || code == '=') { 404 uc_vector_push(&lex->buffer, code); 405 next_char(lex); 406 407 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 408 if (ch == '\\') { 409 err = parse_escape(lex, ""); 410 411 if (err) 412 return err; 413 414 continue; 415 } 416 417 uc_vector_push(&lex->buffer, ch); 418 419 if (ch == code && check_char(lex, ']')) { 420 uc_vector_push(&lex->buffer, ']'); 421 break; 422 } 423 } 424 } 425 } 426 } 427 } 428 429 break; 430 431 /* escape sequence */ 432 case '\\': 433 err = parse_escape(lex, 434 (type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : ""); 435 436 if (err) 437 return err; 438 439 break; 440 441 /* other character */ 442 default: 443 /* terminating delimitter */ 444 if (ch == kind) 445 return emit_buffer(lex, off, type, NULL); 446 447 uc_vector_push(&lex->buffer, ch); 448 } 449 } 450 451 // FIXME 452 lex->state = UC_LEX_EOF; 453 454 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string")); 455 } 456 457 458 /* 459 * Parses a regexp literal from the given buffer. 460 * 461 * Returns a negative value on error, otherwise the amount of consumed 462 * characters from the given buffer. 
463 * 464 * Error values: 465 * -UC_ERROR_UNTERMINATED_STRING Unterminated regexp 466 * -UC_ERROR_INVALID_ESCAPE Invalid escape sequence 467 * -UC_ERROR_OVERLONG_STRING Regexp literal too long 468 * -UC_ERROR_INVALID_REGEXP Could not compile regexp 469 */ 470 471 enum { 472 UC_LEX_PARSE_REGEX_INIT, 473 UC_LEX_PARSE_REGEX_PATTERN, 474 UC_LEX_PARSE_REGEX_FLAGS 475 }; 476 477 static uc_token_t * 478 parse_regexp(uc_lexer_t *lex) 479 { 480 bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false; 481 uc_token_t *rv; 482 size_t len; 483 char *s; 484 485 rv = parse_string(lex, '/'); 486 487 if (rv->type == TK_REGEXP) { 488 while (true) { 489 if (check_char(lex, 'g')) 490 is_reg_global = true; 491 else if (check_char(lex, 'i')) 492 is_reg_icase = true; 493 else if (check_char(lex, 's')) 494 is_reg_newline = true; 495 else 496 break; 497 } 498 499 len = xasprintf(&s, "%c%*s", 500 (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2), 501 ucv_string_length(rv->uv), 502 ucv_string_get(rv->uv)); 503 504 ucv_free(rv->uv, false); 505 rv->uv = ucv_string_new_length(s, len); 506 free(s); 507 } 508 509 return rv; 510 } 511 512 513 /* 514 * Parses a label from the given buffer. 515 * 516 * Returns a negative value on error, otherwise the amount of consumed 517 * characters from the given buffer. 
518 * 519 * Error values: 520 * -UC_ERROR_OVERLONG_STRING Label too long 521 */ 522 523 static uc_token_t * 524 parse_label(uc_lexer_t *lex, int ch) 525 { 526 const struct keyword *word; 527 size_t i, len; 528 529 while (true) { 530 uc_vector_push(&lex->buffer, ch); 531 ch = lookahead_char(lex); 532 533 if (!isalnum(ch) && ch != '_') 534 break; 535 536 next_char(lex); 537 } 538 539 len = lex->buffer.count; 540 541 if (!lex->no_keyword) { 542 for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { 543 if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) { 544 uc_vector_clear(&lex->buffer); 545 546 return emit_op(lex, -len, word->type, NULL); 547 } 548 } 549 } 550 551 return emit_buffer(lex, -len, TK_LABEL, NULL); 552 } 553 554 555 /* 556 * Parses a number literal from the given buffer. 557 * 558 * Returns a negative value on error, otherwise the amount of consumed 559 * characters from the given buffer. 560 * 561 * Error values: 562 * -UC_ERROR_INVALID_ESCAPE Invalid number character 563 */ 564 565 static inline bool 566 is_numeric_char(uc_lexer_t *lex, char c) 567 { 568 char prev = lex->buffer.count ? 
*uc_vector_last(&lex->buffer) : 0; 569 570 switch (c|32) { 571 case '.': 572 case '': 573 case '1': 574 case '2': 575 case '3': 576 case '4': 577 case '5': 578 case '6': 579 case '7': 580 case '8': 581 case '9': 582 return true; 583 584 case 'a': 585 case 'b': 586 case 'c': 587 case 'd': 588 case 'e': 589 case 'f': 590 case 'o': 591 case 'x': 592 /* require previous char, a number literal cannot start with these */ 593 return prev != 0; 594 595 case '+': 596 case '-': 597 /* sign is only allowed after an exponent char */ 598 return (prev|32) == 'e'; 599 } 600 601 return false; 602 } 603 604 static uc_token_t * 605 parse_number(uc_lexer_t *lex, int ch) 606 { 607 uc_value_t *nv = NULL; 608 size_t len; 609 char *e; 610 611 while (true) { 612 uc_vector_push(&lex->buffer, ch); 613 ch = lookahead_char(lex); 614 615 if (!is_numeric_char(lex, ch)) 616 break; 617 618 next_char(lex); 619 } 620 621 len = lex->buffer.count; 622 623 uc_vector_push(&lex->buffer, '\0'); 624 625 nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e); 626 627 uc_vector_clear(&lex->buffer); 628 629 switch (ucv_type(nv)) { 630 case UC_DOUBLE: 631 return emit_op(lex, -len, TK_DOUBLE, nv); 632 633 case UC_INTEGER: 634 return emit_op(lex, -len, TK_NUMBER, nv); 635 636 default: 637 return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal")); 638 } 639 } 640 641 static uc_token_t * 642 lex_find_token(uc_lexer_t *lex) 643 { 644 bool tpl = !(lex->config && lex->config->raw_mode); 645 int ch = next_char(lex); 646 647 while (isspace(ch)) 648 ch = next_char(lex); 649 650 switch (ch) { 651 case '~': 652 return emit_op(lex, -1, TK_COMPL, NULL); 653 654 case '}': 655 if (tpl && check_char(lex, '}')) 656 return emit_op(lex, -2, TK_REXP, NULL); 657 658 return emit_op(lex, -1, TK_RBRACE, NULL); 659 660 case '|': 661 if (check_char(lex, '|')) { 662 if (check_char(lex, '=')) 663 return emit_op(lex, -3, TK_ASOR, NULL); 664 665 return emit_op(lex, -2, TK_OR, NULL); 666 } 667 668 if 
(check_char(lex, '=')) 669 return emit_op(lex, -2, TK_ASBOR, NULL); 670 671 return emit_op(lex, -1, TK_BOR, NULL); 672 673 case '{': 674 if (tpl && check_char(lex, '{')) 675 return emit_op(lex, -2, TK_LEXP, NULL); 676 677 if (tpl && check_char(lex, '%')) 678 return emit_op(lex, -2, TK_LSTM, NULL); 679 680 return emit_op(lex, -1, TK_LBRACE, NULL); 681 682 case '^': 683 if (check_char(lex, '=')) 684 return emit_op(lex, -2, TK_ASBXOR, NULL); 685 686 return emit_op(lex, -1, TK_BXOR, NULL); 687 688 case '[': 689 return emit_op(lex, -1, TK_LBRACK, NULL); 690 691 case ']': 692 return emit_op(lex, -1, TK_RBRACK, NULL); 693 694 case '?': 695 if (check_char(lex, '?')) { 696 if (check_char(lex, '=')) 697 return emit_op(lex, -3, TK_ASNULLISH, NULL); 698 699 return emit_op(lex, -2, TK_NULLISH, NULL); 700 } 701 702 if (check_char(lex, '.')) { 703 if (check_char(lex, '[')) 704 return emit_op(lex, -3, TK_QLBRACK, NULL); 705 706 if (check_char(lex, '(')) 707 return emit_op(lex, -3, TK_QLPAREN, NULL); 708 709 return emit_op(lex, -2, TK_QDOT, NULL); 710 } 711 712 return emit_op(lex, lex->source->off, TK_QMARK, NULL); 713 714 case '>': 715 if (check_char(lex, '>')) { 716 if (check_char(lex, '=')) 717 return emit_op(lex, -3, TK_ASRIGHT, NULL); 718 719 return emit_op(lex, -2, TK_RSHIFT, NULL); 720 } 721 722 if (check_char(lex, '=')) 723 return emit_op(lex, -2, TK_GE, NULL); 724 725 return emit_op(lex, -1, TK_GT, NULL); 726 727 case '=': 728 if (check_char(lex, '=')) { 729 if (check_char(lex, '=')) 730 return emit_op(lex, -3, TK_EQS, NULL); 731 732 return emit_op(lex, -2, TK_EQ, NULL); 733 } 734 735 if (check_char(lex, '>')) 736 return emit_op(lex, -2, TK_ARROW, NULL); 737 738 return emit_op(lex, -1, TK_ASSIGN, NULL); 739 740 case '<': 741 if (check_char(lex, '<')) { 742 if (check_char(lex, '=')) 743 return emit_op(lex, -3, TK_ASLEFT, NULL); 744 745 return emit_op(lex, -2, TK_LSHIFT, NULL); 746 } 747 748 if (check_char(lex, '=')) 749 return emit_op(lex, -2, TK_LE, NULL); 750 751 return 
emit_op(lex, -1, TK_LT, NULL); 752 753 case ';': 754 return emit_op(lex, -1, TK_SCOL, NULL); 755 756 case ':': 757 return emit_op(lex, -1, TK_COLON, NULL); 758 759 case '/': 760 ch = lookahead_char(lex); 761 lex->lastoff = lex->source->off - 1; 762 763 if (ch == '/' || ch == '*') 764 return parse_comment(lex, ch); 765 766 if (lex->no_regexp) { 767 if (check_char(lex, '=')) 768 return emit_op(lex, -2, TK_ASDIV, NULL); 769 770 return emit_op(lex, -1, TK_DIV, NULL); 771 } 772 773 return parse_regexp(lex); 774 775 case '.': 776 if (check_char(lex, '.')) { 777 if (check_char(lex, '.')) 778 return emit_op(lex, -3, TK_ELLIP, NULL); 779 780 /* The sequence ".." cannot be a valid */ 781 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character")); 782 } 783 784 return emit_op(lex, -1, TK_DOT, NULL); 785 786 case '-': 787 if (tpl && check_char(lex, '}')) { 788 if (check_char(lex, '}')) { 789 lex->modifier = MINUS; 790 791 return emit_op(lex, -3, TK_REXP, NULL); 792 } 793 794 /* The sequence "-}" cannot be a valid */ 795 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 796 } 797 798 if (tpl && check_char(lex, '%')) { 799 if (check_char(lex, '}')) { 800 lex->modifier = MINUS; 801 802 return emit_op(lex, -3, TK_RSTM, NULL); 803 } 804 805 /* The sequence "-%" cannot be a valid */ 806 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 807 } 808 809 if (check_char(lex, '=')) 810 return emit_op(lex, -2, TK_ASSUB, NULL); 811 812 if (check_char(lex, '-')) 813 return emit_op(lex, -2, TK_DEC, NULL); 814 815 return emit_op(lex, -1, TK_SUB, NULL); 816 817 case ',': 818 return emit_op(lex, -1, TK_COMMA, NULL); 819 820 case '+': 821 if (check_char(lex, '=')) 822 return emit_op(lex, -2, TK_ASADD, NULL); 823 824 if (check_char(lex, '+')) 825 return emit_op(lex, -2, TK_INC, NULL); 826 827 return emit_op(lex, -1, TK_ADD, NULL); 828 829 case '*': 830 if (check_char(lex, '*')) { 831 if (check_char(lex, '=')) 832 return emit_op(lex, 
-3, TK_ASEXP, NULL); 833 834 return emit_op(lex, -2, TK_EXP, NULL); 835 } 836 837 if (check_char(lex, '=')) 838 return emit_op(lex, -2, TK_ASMUL, NULL); 839 840 return emit_op(lex, -1, TK_MUL, NULL); 841 842 case '(': 843 return emit_op(lex, -1, TK_LPAREN, NULL); 844 845 case ')': 846 return emit_op(lex, -1, TK_RPAREN, NULL); 847 848 case '\'': 849 case '"': 850 case '`': 851 lex->lastoff = lex->source->off - 1; 852 853 return parse_string(lex, ch); 854 855 case '&': 856 if (check_char(lex, '&')) { 857 if (check_char(lex, '=')) 858 return emit_op(lex, -3, TK_ASAND, NULL); 859 860 return emit_op(lex, -2, TK_AND, NULL); 861 } 862 863 if (check_char(lex, '=')) 864 return emit_op(lex, -2, TK_ASBAND, NULL); 865 866 return emit_op(lex, -1, TK_BAND, NULL); 867 868 case '%': 869 if (tpl && check_char(lex, '}')) 870 return emit_op(lex, -2, TK_RSTM, NULL); 871 872 if (check_char(lex, '=')) 873 return emit_op(lex, -2, TK_ASMOD, NULL); 874 875 return emit_op(lex, -1, TK_MOD, NULL); 876 877 case '!': 878 if (check_char(lex, '=')) { 879 if (check_char(lex, '=')) 880 return emit_op(lex, -3, TK_NES, NULL); 881 882 return emit_op(lex, -2, TK_NE, NULL); 883 } 884 885 return emit_op(lex, -1, TK_NOT, NULL); 886 887 case EOF: 888 return emit_op(lex, -1, TK_EOF, NULL); 889 890 default: 891 if (isalpha(ch) || ch == '_') 892 return parse_label(lex, ch); 893 894 if (isdigit(ch)) 895 return parse_number(lex, ch); 896 897 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 898 } 899 } 900 901 static uc_token_t * 902 lex_step(uc_lexer_t *lex) 903 { 904 const char *strip = NULL; 905 uc_token_t *tok; 906 size_t *nest; 907 int ch; 908 909 while (lex->state != UC_LEX_EOF) { 910 switch (lex->state) { 911 case UC_LEX_IDENTIFY_BLOCK: 912 ch = next_char(lex); 913 914 /* previous block had strip trailing whitespace flag, skip leading whitespace */ 915 if (lex->modifier == MINUS) { 916 while (isspace(ch)) 917 ch = next_char(lex); 918 919 lex->modifier = UNSPEC; 920 } 921 922 /* 
previous block was a statement block and trim_blocks is enabled, skip leading newline */ 923 else if (lex->modifier == NEWLINE) { 924 if (ch == '\n') 925 ch = next_char(lex); 926 927 lex->modifier = UNSPEC; 928 } 929 930 /* scan forward through buffer to identify block start token */ 931 while (ch != EOF) { 932 if (ch == '{') { 933 ch = next_char(lex); 934 935 switch (ch) { 936 /* found start of comment block */ 937 case '#': 938 lex->state = UC_LEX_BLOCK_COMMENT; 939 lex->block = COMMENT; 940 941 if (check_char(lex, '-')) 942 strip = " \n\t\v\f\r"; 943 944 break; 945 946 /* found start of expression block */ 947 case '{': 948 lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG; 949 950 if (check_char(lex, '-')) 951 strip = " \n\t\v\f\r"; 952 953 break; 954 955 /* found start of statement block */ 956 case '%': 957 lex->state = UC_LEX_IDENTIFY_TOKEN; 958 lex->block = STATEMENTS; 959 960 if (check_char(lex, '-')) 961 strip = " \n\t\v\f\r"; 962 else if (check_char(lex, '+')) 963 strip = NULL; 964 else if (lex->config && lex->config->lstrip_blocks) 965 strip = " \t\v\f\r"; 966 967 break; 968 969 default: 970 /* not a start tag, remember char and move on */ 971 uc_vector_push(&lex->buffer, '{'); 972 continue; 973 } 974 975 break; 976 } 977 978 uc_vector_push(&lex->buffer, ch); 979 ch = next_char(lex); 980 } 981 982 if (ch == EOF) 983 lex->state = UC_LEX_EOF; 984 985 /* push out leading text */ 986 tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip); 987 lex->lastoff = lex->source->off - 2; 988 989 if (!tok) 990 continue; 991 992 return tok; 993 994 995 case UC_LEX_BLOCK_COMMENT: 996 ch = next_char(lex); 997 998 /* scan forward through buffer to identify end token */ 999 while (ch != EOF) { 1000 if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) { 1001 lex->modifier = MINUS; 1002 break; 1003 } 1004 1005 if (ch == '#' && check_char(lex, '}')) 1006 break; 1007 1008 ch = next_char(lex); 1009 } 1010 1011 if (ch == EOF) { 1012 lex->state = UC_LEX_EOF; 1013 1014 return 
emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block")); 1015 } 1016 1017 lex->lastoff = lex->source->off; 1018 lex->state = UC_LEX_IDENTIFY_BLOCK; 1019 1020 continue; 1021 1022 1023 case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG: 1024 lex->state = UC_LEX_IDENTIFY_TOKEN; 1025 lex->block = EXPRESSION; 1026 1027 return emit_op(lex, lex->source->off, TK_LEXP, NULL); 1028 1029 1030 case UC_LEX_IDENTIFY_TOKEN: 1031 do { tok = lex_find_token(lex); } while (tok == NULL); 1032 1033 /* disallow nesting blocks */ 1034 if (tok->type == TK_LSTM || tok->type == TK_LEXP) 1035 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested")); 1036 1037 /* found end of statement block */ 1038 if (lex->block == STATEMENTS && tok->type == TK_RSTM) { 1039 /* strip newline after statement block? */ 1040 if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks) 1041 lex->modifier = NEWLINE; 1042 1043 lex->lastoff = lex->source->off; 1044 lex->state = UC_LEX_IDENTIFY_BLOCK; 1045 lex->block = NONE; 1046 1047 tok = emit_op(lex, -2, TK_SCOL, NULL); 1048 } 1049 1050 /* found end of expression block */ 1051 else if (lex->block == EXPRESSION && tok->type == TK_REXP) { 1052 lex->lastoff = lex->source->off; 1053 lex->state = UC_LEX_IDENTIFY_BLOCK; 1054 lex->block = NONE; 1055 } 1056 1057 /* track opening braces */ 1058 else if (tok->type == TK_LBRACE && lex->templates.count > 0) { 1059 nest = uc_vector_last(&lex->templates); 1060 (*nest)++; 1061 } 1062 1063 /* check end of placeholder expression */ 1064 else if (tok->type == TK_RBRACE && lex->templates.count > 0) { 1065 nest = uc_vector_last(&lex->templates); 1066 1067 if (*nest == 0) { 1068 lex->templates.count--; 1069 lex->state = UC_LEX_PLACEHOLDER_END; 1070 } 1071 else { 1072 (*nest)--; 1073 } 1074 } 1075 1076 /* premature EOF? 
*/ 1077 else if (tok->type == TK_EOF && lex->block != STATEMENTS) { 1078 lex->state = UC_LEX_EOF; 1079 1080 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block")); 1081 } 1082 1083 return tok; 1084 1085 1086 case UC_LEX_PLACEHOLDER_START: 1087 lex->state = UC_LEX_IDENTIFY_TOKEN; 1088 1089 uc_vector_push(&lex->templates, 0); 1090 1091 return emit_op(lex, -2, TK_PLACEH, NULL); 1092 1093 1094 case UC_LEX_PLACEHOLDER_END: 1095 lex->state = UC_LEX_IDENTIFY_TOKEN; 1096 1097 return parse_string(lex, '`'); 1098 1099 1100 case UC_LEX_EOF: 1101 break; 1102 } 1103 } 1104 1105 return emit_op(lex, lex->source->off, TK_EOF, NULL); 1106 } 1107 1108 void 1109 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) 1110 { 1111 lex->state = UC_LEX_IDENTIFY_BLOCK; 1112 1113 lex->config = config; 1114 lex->source = uc_source_get(source); 1115 1116 lex->block = NONE; 1117 lex->modifier = UNSPEC; 1118 1119 lex->rlen = 0; 1120 lex->rpos = 0; 1121 lex->rbuf = NULL; 1122 1123 lex->buffer.count = 0; 1124 lex->buffer.entries = NULL; 1125 1126 lex->lead_surrogate = 0; 1127 1128 lex->lastoff = 0; 1129 1130 lex->templates.count = 0; 1131 lex->templates.entries = NULL; 1132 1133 if (config && config->raw_mode) { 1134 lex->state = UC_LEX_IDENTIFY_TOKEN; 1135 lex->block = STATEMENTS; 1136 } 1137 } 1138 1139 void 1140 uc_lexer_free(uc_lexer_t *lex) 1141 { 1142 uc_vector_clear(&lex->buffer); 1143 uc_vector_clear(&lex->templates); 1144 1145 uc_source_put(lex->source); 1146 1147 free(lex->rbuf); 1148 } 1149 1150 uc_token_t * 1151 uc_lexer_next_token(uc_lexer_t *lex) 1152 { 1153 uc_token_t *rv = NULL; 1154 1155 rv = lex_step(lex); 1156 1157 lex->no_keyword = false; 1158 lex->no_regexp = false; 1159 1160 return rv; 1161 } 1162 1163 const char * 1164 uc_tokenname(unsigned type) 1165 { 1166 static char buf[sizeof("'endfunction'")]; 1167 const char *tokennames[] = { 1168 [TK_LEXP] = "'{{'", 1169 [TK_REXP] = "'}}'", 1170 [TK_LSTM] = "'{%'", 1171 [TK_RSTM] = 
"'%}'", 1172 [TK_COMMA] = "','", 1173 [TK_ASSIGN] = "'='", 1174 [TK_ASADD] = "'+='", 1175 [TK_ASSUB] = "'-='", 1176 [TK_ASMUL] = "'*='", 1177 [TK_ASDIV] = "'/='", 1178 [TK_ASMOD] = "'%='", 1179 [TK_ASLEFT] = "'<<='", 1180 [TK_ASRIGHT] = "'>>='", 1181 [TK_ASBAND] = "'&='", 1182 [TK_ASBXOR] = "'^='", 1183 [TK_ASBOR] = "'|='", 1184 [TK_QMARK] = "'?'", 1185 [TK_COLON] = "':'", 1186 [TK_OR] = "'||'", 1187 [TK_AND] = "'&&'", 1188 [TK_BOR] = "'|'", 1189 [TK_BXOR] = "'^'", 1190 [TK_BAND] = "'&'", 1191 [TK_EQS] = "'==='", 1192 [TK_NES] = "'!=='", 1193 [TK_EQ] = "'=='", 1194 [TK_NE] = "'!='", 1195 [TK_LT] = "'<'", 1196 [TK_LE] = "'<='", 1197 [TK_GT] = "'>'", 1198 [TK_GE] = "'>='", 1199 [TK_LSHIFT] = "'<<'", 1200 [TK_RSHIFT] = "'>>'", 1201 [TK_ADD] = "'+'", 1202 [TK_SUB] = "'-'", 1203 [TK_MUL] = "'*'", 1204 [TK_DIV] = "'/'", 1205 [TK_MOD] = "'%'", 1206 [TK_EXP] = "'**'", 1207 [TK_NOT] = "'!'", 1208 [TK_COMPL] = "'~'", 1209 [TK_INC] = "'++'", 1210 [TK_DEC] = "'--'", 1211 [TK_DOT] = "'.'", 1212 [TK_LBRACK] = "'['", 1213 [TK_RBRACK] = "']'", 1214 [TK_LPAREN] = "'('", 1215 [TK_RPAREN] = "')'", 1216 [TK_LBRACE] = "'{'", 1217 [TK_RBRACE] = "'}'", 1218 [TK_SCOL] = "';'", 1219 [TK_ELLIP] = "'...'", 1220 [TK_ARROW] = "'=>'", 1221 [TK_QLBRACK] = "'?.['", 1222 [TK_QLPAREN] = "'?.('", 1223 [TK_QDOT] = "'?.'", 1224 [TK_ASEXP] = "'**='", 1225 [TK_ASAND] = "'&&='", 1226 [TK_ASOR] = "'||='", 1227 [TK_ASNULLISH] = "'\?\?='", 1228 [TK_NULLISH] = "'\?\?'", 1229 [TK_PLACEH] = "'${'", 1230 1231 [TK_TEXT] = "Text", 1232 [TK_LABEL] = "Label", 1233 [TK_NUMBER] = "Number", 1234 [TK_DOUBLE] = "Double", 1235 [TK_STRING] = "String", 1236 [TK_REGEXP] = "Regexp", 1237 [TK_TEMPLATE] = "Template", 1238 [TK_ERROR] = "Error", 1239 [TK_EOF] = "End of file", 1240 }; 1241 1242 size_t i; 1243 1244 for (i = 0; i < ARRAY_SIZE(reserved_words); i++) { 1245 if (reserved_words[i].type != type) 1246 continue; 1247 1248 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat); 1249 1250 return buf; 1251 } 1252 1253 
return tokennames[type] ? tokennames[type] : "?"; 1254 } 1255 1256 bool 1257 uc_lexer_is_keyword(uc_value_t *label) 1258 { 1259 size_t i; 1260 1261 if (ucv_type(label) != UC_STRING) 1262 return false; 1263 1264 for (i = 0; i < ARRAY_SIZE(reserved_words); i++) 1265 if (!strcmp(reserved_words[i].pat, ucv_string_get(label))) 1266 return true; 1267 1268 return false; 1269 } 1270 1271 #endif /* NO_COMPILE */ 1272 1273 /* 1274 * Stores the given codepoint as a utf8 multibyte sequence into the given 1275 * output buffer and substracts the required amount of bytes from the given 1276 * length pointer. 1277 * 1278 * Returns false if the multibyte sequence would not fit into the buffer, 1279 * otherwise true. 1280 */ 1281 1282 bool 1283 utf8enc(char **out, int *rem, int code) 1284 { 1285 if (code >= 0 && code <= 0x7F) { 1286 if (*rem < 1) 1287 return false; 1288 1289 *(*out)++ = code; (*rem)--; 1290 1291 return true; 1292 } 1293 else if (code > 0 && code <= 0x7FF) { 1294 if (*rem < 2) 1295 return false; 1296 1297 *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--; 1298 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1299 1300 return true; 1301 } 1302 else if (code > 0 && code <= 0xFFFF) { 1303 if (*rem < 3) 1304 return false; 1305 1306 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--; 1307 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 1308 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1309 1310 return true; 1311 } 1312 else if (code > 0 && code <= 0x10FFFF) { 1313 if (*rem < 4) 1314 return false; 1315 1316 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--; 1317 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--; 1318 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 1319 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1320 1321 return true; 1322 } 1323 1324 return true; 1325 } 1326
/* This page was automatically generated by LXR 0.3.1. • OpenWrt */