1 /* 2 * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io> 3 * 4 * Permission to use, copy, modify, and/or distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17 #include <stdio.h> 18 19 #include <stdbool.h> 20 #include <stdlib.h> 21 #include <string.h> 22 #include <ctype.h> 23 #include <regex.h> 24 #include <math.h> 25 #include <errno.h> 26 27 #include "ucode/vm.h" 28 #include "ucode/lib.h" 29 #include "ucode/lexer.h" 30 #include "ucode/platform.h" 31 32 struct keyword { 33 unsigned type; 34 const char *pat; 35 unsigned plen; 36 }; 37 38 #define dec(o) \ 39 ((o) - '') 40 41 #define hex(x) \ 42 (((x) >= 'a') ? (10 + (x) - 'a') : \ 43 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x))) 44 45 #ifndef NO_COMPILE 46 47 static const struct keyword reserved_words[] = { 48 { TK_ENDFUNC, "endfunction", 11 }, 49 { TK_CONTINUE, "continue", 8 }, 50 { TK_ENDWHILE, "endwhile", 8 }, 51 { TK_FUNC, "function", 8 }, 52 { TK_DEFAULT, "default", 7 }, 53 { TK_DELETE, "delete", 6 }, 54 { TK_RETURN, "return", 6 }, 55 { TK_ENDFOR, "endfor", 6 }, 56 { TK_SWITCH, "switch", 6 }, 57 { TK_IMPORT, "import", 6 }, 58 { TK_EXPORT, "export", 6 }, 59 { TK_ENDIF, "endif", 5 }, 60 { TK_WHILE, "while", 5 }, 61 { TK_BREAK, "break", 5 }, 62 { TK_CATCH, "catch", 5 }, 63 { TK_CONST, "const", 5 }, 64 { TK_FALSE, "false", 5 }, 65 { TK_TRUE, "true", 4 }, 66 { TK_ELIF, "elif", 4 }, 67 { TK_ELSE, "else", 4 }, 68 { TK_THIS, "this", 4 }, 69 { TK_NULL, "null", 4 }, 70 { TK_CASE, "case", 4 }, 71 { TK_TRY, "try", 3 }, 72 { TK_FOR, "for", 3 }, 73 { TK_LOCAL, "let", 3 }, 74 { TK_IF, "if", 2 }, 75 { TK_IN, "in", 2 }, 76 }; 77 78 79 static int 80 fill_buf(uc_lexer_t *lex) { 81 lex->rbuf = xrealloc(lex->rbuf, 128); 82 lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp); 83 lex->rpos = 0; 84 85 if (!lex->rlen) 86 return EOF; 87 88 lex->rpos++; 89 90 return (int)lex->rbuf[0]; 91 } 92 93 static int 94 update_line(uc_lexer_t *lex, int ch) { 95 if (ch == '\n') 96 uc_source_line_next(lex->source); 97 else if (ch != EOF) 98 uc_source_line_update(lex->source, 1); 99 100 lex->source->off++; 101 102 return ch; 103 } 104 105 static int 106 lookahead_char(uc_lexer_t *lex) { 107 int c; 108 109 if (lex->rpos < lex->rlen) 110 return (int)lex->rbuf[lex->rpos]; 111 112 c = fill_buf(lex); 113 lex->rpos = 0; 114 115 return c; 116 } 117 118 static bool 119 check_char(uc_lexer_t *lex, int ch) { 120 if (lookahead_char(lex) != ch) 121 return false; 122 123 lex->rpos++; 124 125 update_line(lex, ch); 126 127 return true; 128 } 129 130 static int 131 next_char(uc_lexer_t *lex) { 132 int ch = (lex->rpos < lex->rlen) ? (int)lex->rbuf[lex->rpos++] : fill_buf(lex); 133 134 return update_line(lex, ch); 135 } 136 137 static uc_token_t * 138 emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv) 139 { 140 lex->curr.type = type; 141 lex->curr.uv = uv; 142 143 if (pos < 0) 144 lex->curr.pos = lex->source->off + pos; 145 else 146 lex->curr.pos = (size_t)pos; 147 148 return &lex->curr; 149 } 150 151 static uc_token_t * 152 emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) { 153 uc_token_t *rv = NULL; 154 155 if (lex->buffer.count) { 156 if (strip_trailing_chars) 157 while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer))) 158 lex->buffer.count--; 159 160 rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count)); 161 162 uc_vector_clear(&lex->buffer); 163 } 164 else if (type != TK_TEXT) { 165 rv = emit_op(lex, pos, type, ucv_string_new_length("", 0)); 166 } 167 168 return rv; 169 } 170 171 172 static uc_token_t * 173 parse_comment(uc_lexer_t *lex, int kind) 174 { 175 int ch; 176 177 while (true) { 178 ch = next_char(lex); 179 180 if (kind == '/' && (ch == '\n' || ch == EOF)) 181 break; 182 183 if (kind == '*' && ch == '*' && check_char(lex, '/')) 184 break; 185 186 if (ch == EOF) { 187 lex->state = UC_LEX_EOF; 188 189 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment")); 190 } 191 } 192 193 return NULL; 194 } 195 196 static void 197 append_utf8(uc_lexer_t *lex, int code) { 198 char ustr[8], *up; 199 int rem; 200 201 up = ustr; 202 rem = sizeof(ustr); 203 204 if (utf8enc(&up, &rem, code)) 205 for (up = ustr; rem < (int)sizeof(ustr); rem++) 206 uc_vector_push(&lex->buffer, *up++); 207 } 208 209 static uc_token_t * 210 parse_escape(uc_lexer_t *lex, const char *regex_macros) 211 { 212 int code, ch, i; 213 const char *p; 214 215 /* unicode escape sequence */ 216 if (check_char(lex, 'u')) { 217 for (i = 0, code = 0; i < 4; i++) { 218 ch = next_char(lex); 219 220 if (!isxdigit(ch)) 221 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 222 223 code = code * 16 + hex(ch); 224 } 225 226 /* is a leading surrogate value */ 227 if ((code & 0xFC00) == 0xD800) { 228 /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ 229 if (lex->lead_surrogate) 230 append_utf8(lex, 0xFFFD); 231 232 /* store surrogate value and advance to next escape sequence */ 233 lex->lead_surrogate = code; 234 } 235 236 /* is a trailing surrogate value */ 237 else if ((code & 0xFC00) == 0xDC00) { 238 /* found a trailing surrogate following a leading one, combine and encode */ 239 if (lex->lead_surrogate) { 240 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); 241 lex->lead_surrogate = 0; 242 } 243 244 /* trailing surrogate not following a leading one, ignore and use replacement char */ 245 else { 246 code = 0xFFFD; 247 } 248 249 append_utf8(lex, code); 250 } 251 252 /* is a normal codepoint */ 253 else { 254 append_utf8(lex, code); 255 } 256 } 257 258 /* hex escape sequence */ 259 else if (check_char(lex, 'x')) { 260 for (i = 0, code = 0; i < 2; i++) { 261 ch = next_char(lex); 262 263 if (!isxdigit(ch)) 264 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 265 266 code = code * 16 + hex(ch); 267 } 268 269 append_utf8(lex, code); 270 } 271 272 /* octal or letter */ 273 else { 274 /* try to parse octal sequence... */ 275 for (i = 0, code = 0, ch = lookahead_char(lex); 276 i < 3 && ch >= '' && ch <= '7'; 277 i++, next_char(lex), ch = lookahead_char(lex)) { 278 code = code * 8 + dec(ch); 279 } 280 281 if (i) { 282 if (code > 255) 283 return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence")); 284 285 append_utf8(lex, code); 286 } 287 288 /* ... no octal sequence, handle potential regex macros */ 289 else if (strchr(regex_macros, ch)) { 290 ch = next_char(lex); 291 292 switch (ch) { 293 case 'd': p = "[[:digit:]]"; break; 294 case 'D': p = "[^[:digit:]]"; break; 295 case 'w': p = "[[:alnum:]_]"; break; 296 case 'W': p = "[^[:alnum:]_]"; break; 297 case 's': p = "[[:space:]]"; break; 298 case 'S': p = "[^[:space:]]"; break; 299 default: p = NULL; 300 } 301 302 if (p) { 303 while (*p) 304 uc_vector_push(&lex->buffer, *p++); 305 } 306 else { 307 uc_vector_push(&lex->buffer, '\\'); 308 uc_vector_push(&lex->buffer, ch); 309 } 310 } 311 312 /* ... handle other escape */ 313 else { 314 ch = next_char(lex); 315 316 switch (ch) { 317 case 'a': uc_vector_push(&lex->buffer, '\a'); break; 318 case 'b': uc_vector_push(&lex->buffer, '\b'); break; 319 case 'e': uc_vector_push(&lex->buffer, '\033'); break; 320 case 'f': uc_vector_push(&lex->buffer, '\f'); break; 321 case 'n': uc_vector_push(&lex->buffer, '\n'); break; 322 case 'r': uc_vector_push(&lex->buffer, '\r'); break; 323 case 't': uc_vector_push(&lex->buffer, '\t'); break; 324 case 'v': uc_vector_push(&lex->buffer, '\v'); break; 325 326 case EOF: 327 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string")); 328 329 default: 330 uc_vector_push(&lex->buffer, ch); 331 } 332 } 333 } 334 335 return NULL; 336 } 337 338 static uc_token_t * 339 parse_string(uc_lexer_t *lex, int kind) 340 { 341 uc_token_t *err; 342 unsigned type; 343 int code, ch; 344 size_t off; 345 346 if (kind == '`') 347 type = TK_TEMPLATE; 348 else if (kind == '/') 349 type = TK_REGEXP; 350 else 351 type = TK_STRING; 352 353 off = lex->source->off - 1; 354 355 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 356 switch (ch) { 357 /* placeholder */ 358 case '$': 359 if (type == TK_TEMPLATE && check_char(lex, '{')) { 360 lex->state = UC_LEX_PLACEHOLDER_START; 361 362 return emit_buffer(lex, off, type, NULL); 363 } 364 365 uc_vector_push(&lex->buffer, '$'); 366 break; 367 368 /* regexp bracket expression */ 369 case '[': 370 uc_vector_push(&lex->buffer, '['); 371 372 if (type == TK_REGEXP) { 373 /* skip leading negation (^) */ 374 if (check_char(lex, '^')) 375 uc_vector_push(&lex->buffer, '^'); 376 377 /* skip leading `]` - it is literal and not closing the bracket expr */ 378 if (check_char(lex, ']')) 379 uc_vector_push(&lex->buffer, ']'); 380 381 /* read until closing `]` */ 382 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 383 if (ch == '\\') { 384 err = parse_escape(lex, "^"); 385 386 if (err) 387 return err; 388 389 continue; 390 } 391 392 uc_vector_push(&lex->buffer, ch); 393 394 if (ch == ']') 395 break; 396 397 /* skip nested char classes / equivalence classes / collating chars */ 398 if (ch == '[') { 399 code = lookahead_char(lex); 400 401 if (code == ':' || code == '.' || code == '=') { 402 uc_vector_push(&lex->buffer, code); 403 next_char(lex); 404 405 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 406 if (ch == '\\') { 407 err = parse_escape(lex, ""); 408 409 if (err) 410 return err; 411 412 continue; 413 } 414 415 uc_vector_push(&lex->buffer, ch); 416 417 if (ch == code && check_char(lex, ']')) { 418 uc_vector_push(&lex->buffer, ']'); 419 break; 420 } 421 } 422 } 423 } 424 } 425 } 426 427 break; 428 429 /* escape sequence */ 430 case '\\': 431 err = parse_escape(lex, 432 (type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : ""); 433 434 if (err) 435 return err; 436 437 break; 438 439 /* other character */ 440 default: 441 /* terminating delimitter */ 442 if (ch == kind) 443 return emit_buffer(lex, off, type, NULL); 444 445 uc_vector_push(&lex->buffer, ch); 446 } 447 } 448 449 // FIXME 450 lex->state = UC_LEX_EOF; 451 452 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string")); 453 } 454 455 456 /* 457 * Parses a regexp literal from the given buffer. 458 * 459 * Returns a negative value on error, otherwise the amount of consumed 460 * characters from the given buffer. 461 * 462 * Error values: 463 * -UC_ERROR_UNTERMINATED_STRING Unterminated regexp 464 * -UC_ERROR_INVALID_ESCAPE Invalid escape sequence 465 * -UC_ERROR_OVERLONG_STRING Regexp literal too long 466 * -UC_ERROR_INVALID_REGEXP Could not compile regexp 467 */ 468 469 enum { 470 UC_LEX_PARSE_REGEX_INIT, 471 UC_LEX_PARSE_REGEX_PATTERN, 472 UC_LEX_PARSE_REGEX_FLAGS 473 }; 474 475 static uc_token_t * 476 parse_regexp(uc_lexer_t *lex) 477 { 478 bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false; 479 uc_token_t *rv; 480 size_t len; 481 char *s; 482 483 rv = parse_string(lex, '/'); 484 485 if (rv->type == TK_REGEXP) { 486 while (true) { 487 if (check_char(lex, 'g')) 488 is_reg_global = true; 489 else if (check_char(lex, 'i')) 490 is_reg_icase = true; 491 else if (check_char(lex, 's')) 492 is_reg_newline = true; 493 else 494 break; 495 } 496 497 len = xasprintf(&s, "%c%*s", 498 (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2), 499 ucv_string_length(rv->uv), 500 ucv_string_get(rv->uv)); 501 502 ucv_free(rv->uv, false); 503 rv->uv = ucv_string_new_length(s, len); 504 free(s); 505 } 506 507 return rv; 508 } 509 510 511 /* 512 * Parses a label from the given buffer. 513 * 514 * Returns a negative value on error, otherwise the amount of consumed 515 * characters from the given buffer. 516 * 517 * Error values: 518 * -UC_ERROR_OVERLONG_STRING Label too long 519 */ 520 521 static uc_token_t * 522 parse_label(uc_lexer_t *lex, int ch) 523 { 524 const struct keyword *word; 525 size_t i, len; 526 527 while (true) { 528 uc_vector_push(&lex->buffer, ch); 529 ch = lookahead_char(lex); 530 531 if (!isalnum(ch) && ch != '_') 532 break; 533 534 next_char(lex); 535 } 536 537 len = lex->buffer.count; 538 539 if (!lex->no_keyword) { 540 for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { 541 if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) { 542 uc_vector_clear(&lex->buffer); 543 544 return emit_op(lex, -len, word->type, NULL); 545 } 546 } 547 } 548 549 return emit_buffer(lex, -len, TK_LABEL, NULL); 550 } 551 552 553 /* 554 * Parses a number literal from the given buffer. 555 * 556 * Returns a negative value on error, otherwise the amount of consumed 557 * characters from the given buffer. 558 * 559 * Error values: 560 * -UC_ERROR_INVALID_ESCAPE Invalid number character 561 */ 562 563 static inline bool 564 is_numeric_char(uc_lexer_t *lex, char c) 565 { 566 char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0; 567 568 switch (c|32) { 569 case '.': 570 case '': 571 case '1': 572 case '2': 573 case '3': 574 case '4': 575 case '5': 576 case '6': 577 case '7': 578 case '8': 579 case '9': 580 return true; 581 582 case 'a': 583 case 'b': 584 case 'c': 585 case 'd': 586 case 'e': 587 case 'f': 588 case 'o': 589 case 'x': 590 /* require previous char, a number literal cannot start with these */ 591 return prev != 0; 592 593 case '+': 594 case '-': 595 /* sign is only allowed after an exponent char */ 596 return (prev|32) == 'e'; 597 } 598 599 return false; 600 } 601 602 static uc_token_t * 603 parse_number(uc_lexer_t *lex, int ch) 604 { 605 uc_value_t *nv = NULL; 606 size_t len; 607 char *e; 608 609 while (true) { 610 uc_vector_push(&lex->buffer, ch); 611 ch = lookahead_char(lex); 612 613 if (!is_numeric_char(lex, ch)) 614 break; 615 616 next_char(lex); 617 } 618 619 len = lex->buffer.count; 620 621 uc_vector_push(&lex->buffer, '\0'); 622 623 nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e); 624 625 uc_vector_clear(&lex->buffer); 626 627 switch (ucv_type(nv)) { 628 case UC_DOUBLE: 629 return emit_op(lex, -len, TK_DOUBLE, nv); 630 631 case UC_INTEGER: 632 return emit_op(lex, -len, TK_NUMBER, nv); 633 634 default: 635 return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal")); 636 } 637 } 638 639 static uc_token_t * 640 lex_find_token(uc_lexer_t *lex) 641 { 642 bool tpl = !(lex->config && lex->config->raw_mode); 643 int ch = next_char(lex); 644 645 while (isspace(ch)) 646 ch = next_char(lex); 647 648 switch (ch) { 649 case '~': 650 return emit_op(lex, -1, TK_COMPL, NULL); 651 652 case '}': 653 if (tpl && check_char(lex, '}')) 654 return emit_op(lex, -2, TK_REXP, NULL); 655 656 return emit_op(lex, -1, TK_RBRACE, NULL); 657 658 case '|': 659 if (check_char(lex, '|')) { 660 if (check_char(lex, '=')) 661 return emit_op(lex, -3, TK_ASOR, NULL); 662 663 return emit_op(lex, -2, TK_OR, NULL); 664 } 665 666 if (check_char(lex, '=')) 667 return emit_op(lex, -2, TK_ASBOR, NULL); 668 669 return emit_op(lex, -1, TK_BOR, NULL); 670 671 case '{': 672 if (tpl && check_char(lex, '{')) 673 return emit_op(lex, -2, TK_LEXP, NULL); 674 675 if (tpl && check_char(lex, '%')) 676 return emit_op(lex, -2, TK_LSTM, NULL); 677 678 return emit_op(lex, -1, TK_LBRACE, NULL); 679 680 case '^': 681 if (check_char(lex, '=')) 682 return emit_op(lex, -2, TK_ASBXOR, NULL); 683 684 return emit_op(lex, -1, TK_BXOR, NULL); 685 686 case '[': 687 return emit_op(lex, -1, TK_LBRACK, NULL); 688 689 case ']': 690 return emit_op(lex, -1, TK_RBRACK, NULL); 691 692 case '?': 693 if (check_char(lex, '?')) { 694 if (check_char(lex, '=')) 695 return emit_op(lex, -3, TK_ASNULLISH, NULL); 696 697 return emit_op(lex, -2, TK_NULLISH, NULL); 698 } 699 700 if (check_char(lex, '.')) { 701 if (check_char(lex, '[')) 702 return emit_op(lex, -3, TK_QLBRACK, NULL); 703 704 if (check_char(lex, '(')) 705 return emit_op(lex, -3, TK_QLPAREN, NULL); 706 707 return emit_op(lex, -2, TK_QDOT, NULL); 708 } 709 710 return emit_op(lex, lex->source->off, TK_QMARK, NULL); 711 712 case '>': 713 if (check_char(lex, '>')) { 714 if (check_char(lex, '=')) 715 return emit_op(lex, -3, TK_ASRIGHT, NULL); 716 717 return emit_op(lex, -2, TK_RSHIFT, NULL); 718 } 719 720 if (check_char(lex, '=')) 721 return emit_op(lex, -2, TK_GE, NULL); 722 723 return emit_op(lex, -1, TK_GT, NULL); 724 725 case '=': 726 if (check_char(lex, '=')) { 727 if (check_char(lex, '=')) 728 return emit_op(lex, -3, TK_EQS, NULL); 729 730 return emit_op(lex, -2, TK_EQ, NULL); 731 } 732 733 if (check_char(lex, '>')) 734 return emit_op(lex, -2, TK_ARROW, NULL); 735 736 return emit_op(lex, -1, TK_ASSIGN, NULL); 737 738 case '<': 739 if (check_char(lex, '<')) { 740 if (check_char(lex, '=')) 741 return emit_op(lex, -3, TK_ASLEFT, NULL); 742 743 return emit_op(lex, -2, TK_LSHIFT, NULL); 744 } 745 746 if (check_char(lex, '=')) 747 return emit_op(lex, -2, TK_LE, NULL); 748 749 return emit_op(lex, -1, TK_LT, NULL); 750 751 case ';': 752 return emit_op(lex, -1, TK_SCOL, NULL); 753 754 case ':': 755 return emit_op(lex, -1, TK_COLON, NULL); 756 757 case '/': 758 ch = lookahead_char(lex); 759 lex->lastoff = lex->source->off - 1; 760 761 if (ch == '/' || ch == '*') 762 return parse_comment(lex, ch); 763 764 if (lex->no_regexp) { 765 if (check_char(lex, '=')) 766 return emit_op(lex, -2, TK_ASDIV, NULL); 767 768 return emit_op(lex, -1, TK_DIV, NULL); 769 } 770 771 return parse_regexp(lex); 772 773 case '.': 774 if (check_char(lex, '.')) { 775 if (check_char(lex, '.')) 776 return emit_op(lex, -3, TK_ELLIP, NULL); 777 778 /* The sequence ".." cannot be a valid */ 779 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character")); 780 } 781 782 return emit_op(lex, -1, TK_DOT, NULL); 783 784 case '-': 785 if (tpl && check_char(lex, '}')) { 786 if (check_char(lex, '}')) { 787 lex->modifier = MINUS; 788 789 return emit_op(lex, -3, TK_REXP, NULL); 790 } 791 792 /* The sequence "-}" cannot be a valid */ 793 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 794 } 795 796 if (tpl && check_char(lex, '%')) { 797 if (check_char(lex, '}')) { 798 lex->modifier = MINUS; 799 800 return emit_op(lex, -3, TK_RSTM, NULL); 801 } 802 803 /* The sequence "-%" cannot be a valid */ 804 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 805 } 806 807 if (check_char(lex, '=')) 808 return emit_op(lex, -2, TK_ASSUB, NULL); 809 810 if (check_char(lex, '-')) 811 return emit_op(lex, -2, TK_DEC, NULL); 812 813 return emit_op(lex, -1, TK_SUB, NULL); 814 815 case ',': 816 return emit_op(lex, -1, TK_COMMA, NULL); 817 818 case '+': 819 if (check_char(lex, '=')) 820 return emit_op(lex, -2, TK_ASADD, NULL); 821 822 if (check_char(lex, '+')) 823 return emit_op(lex, -2, TK_INC, NULL); 824 825 return emit_op(lex, -1, TK_ADD, NULL); 826 827 case '*': 828 if (check_char(lex, '*')) { 829 if (check_char(lex, '=')) 830 return emit_op(lex, -3, TK_ASEXP, NULL); 831 832 return emit_op(lex, -2, TK_EXP, NULL); 833 } 834 835 if (check_char(lex, '=')) 836 return emit_op(lex, -2, TK_ASMUL, NULL); 837 838 return emit_op(lex, -1, TK_MUL, NULL); 839 840 case '(': 841 return emit_op(lex, -1, TK_LPAREN, NULL); 842 843 case ')': 844 return emit_op(lex, -1, TK_RPAREN, NULL); 845 846 case '\'': 847 case '"': 848 case '`': 849 lex->lastoff = lex->source->off - 1; 850 851 return parse_string(lex, ch); 852 853 case '&': 854 if (check_char(lex, '&')) { 855 if (check_char(lex, '=')) 856 return emit_op(lex, -3, TK_ASAND, NULL); 857 858 return emit_op(lex, -2, TK_AND, NULL); 859 } 860 861 if (check_char(lex, '=')) 862 return emit_op(lex, -2, TK_ASBAND, NULL); 863 864 return emit_op(lex, -1, TK_BAND, NULL); 865 866 case '%': 867 if (tpl && check_char(lex, '}')) 868 return emit_op(lex, -2, TK_RSTM, NULL); 869 870 if (check_char(lex, '=')) 871 return emit_op(lex, -2, TK_ASMOD, NULL); 872 873 return emit_op(lex, -1, TK_MOD, NULL); 874 875 case '!': 876 if (check_char(lex, '=')) { 877 if (check_char(lex, '=')) 878 return emit_op(lex, -3, TK_NES, NULL); 879 880 return emit_op(lex, -2, TK_NE, NULL); 881 } 882 883 return emit_op(lex, -1, TK_NOT, NULL); 884 885 case EOF: 886 return emit_op(lex, -1, TK_EOF, NULL); 887 888 default: 889 if (isalpha(ch) || ch == '_') 890 return parse_label(lex, ch); 891 892 if (isdigit(ch)) 893 return parse_number(lex, ch); 894 895 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 896 } 897 } 898 899 static uc_token_t * 900 lex_step(uc_lexer_t *lex) 901 { 902 const char *strip = NULL; 903 uc_token_t *tok; 904 size_t *nest; 905 int ch; 906 907 while (lex->state != UC_LEX_EOF) { 908 switch (lex->state) { 909 case UC_LEX_IDENTIFY_BLOCK: 910 ch = next_char(lex); 911 912 /* previous block had strip trailing whitespace flag, skip leading whitespace */ 913 if (lex->modifier == MINUS) { 914 while (isspace(ch)) 915 ch = next_char(lex); 916 917 lex->modifier = UNSPEC; 918 } 919 920 /* previous block was a statement block and trim_blocks is enabled, skip leading newline */ 921 else if (lex->modifier == NEWLINE) { 922 if (ch == '\n') 923 ch = next_char(lex); 924 925 lex->modifier = UNSPEC; 926 } 927 928 /* scan forward through buffer to identify block start token */ 929 while (ch != EOF) { 930 if (ch == '{') { 931 ch = next_char(lex); 932 933 switch (ch) { 934 /* found start of comment block */ 935 case '#': 936 lex->state = UC_LEX_BLOCK_COMMENT; 937 lex->block = COMMENT; 938 939 if (check_char(lex, '-')) 940 strip = " \n\t\v\f\r"; 941 942 break; 943 944 /* found start of expression block */ 945 case '{': 946 lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG; 947 948 if (check_char(lex, '-')) 949 strip = " \n\t\v\f\r"; 950 951 break; 952 953 /* found start of statement block */ 954 case '%': 955 lex->state = UC_LEX_IDENTIFY_TOKEN; 956 lex->block = STATEMENTS; 957 958 if (check_char(lex, '-')) 959 strip = " \n\t\v\f\r"; 960 else if (check_char(lex, '+')) 961 strip = NULL; 962 else if (lex->config && lex->config->lstrip_blocks) 963 strip = " \t\v\f\r"; 964 965 break; 966 967 default: 968 /* not a start tag, remember char and move on */ 969 uc_vector_push(&lex->buffer, '{'); 970 continue; 971 } 972 973 break; 974 } 975 976 uc_vector_push(&lex->buffer, ch); 977 ch = next_char(lex); 978 } 979 980 if (ch == EOF) 981 lex->state = UC_LEX_EOF; 982 983 /* push out leading text */ 984 tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip); 985 lex->lastoff = lex->source->off - 2; 986 987 if (!tok) 988 continue; 989 990 return tok; 991 992 993 case UC_LEX_BLOCK_COMMENT: 994 ch = next_char(lex); 995 996 /* scan forward through buffer to identify end token */ 997 while (ch != EOF) { 998 if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) { 999 lex->modifier = MINUS; 1000 break; 1001 } 1002 1003 if (ch == '#' && check_char(lex, '}')) 1004 break; 1005 1006 ch = next_char(lex); 1007 } 1008 1009 if (ch == EOF) { 1010 lex->state = UC_LEX_EOF; 1011 1012 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block")); 1013 } 1014 1015 lex->lastoff = lex->source->off; 1016 lex->state = UC_LEX_IDENTIFY_BLOCK; 1017 1018 continue; 1019 1020 1021 case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG: 1022 lex->state = UC_LEX_IDENTIFY_TOKEN; 1023 lex->block = EXPRESSION; 1024 1025 return emit_op(lex, lex->source->off, TK_LEXP, NULL); 1026 1027 1028 case UC_LEX_IDENTIFY_TOKEN: 1029 do { tok = lex_find_token(lex); } while (tok == NULL); 1030 1031 /* disallow nesting blocks */ 1032 if (tok->type == TK_LSTM || tok->type == TK_LEXP) 1033 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested")); 1034 1035 /* found end of statement block */ 1036 if (lex->block == STATEMENTS && tok->type == TK_RSTM) { 1037 /* strip newline after statement block? */ 1038 if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks) 1039 lex->modifier = NEWLINE; 1040 1041 lex->lastoff = lex->source->off; 1042 lex->state = UC_LEX_IDENTIFY_BLOCK; 1043 lex->block = NONE; 1044 1045 tok = emit_op(lex, -2, TK_SCOL, NULL); 1046 } 1047 1048 /* found end of expression block */ 1049 else if (lex->block == EXPRESSION && tok->type == TK_REXP) { 1050 lex->lastoff = lex->source->off; 1051 lex->state = UC_LEX_IDENTIFY_BLOCK; 1052 lex->block = NONE; 1053 } 1054 1055 /* track opening braces */ 1056 else if (tok->type == TK_LBRACE && lex->templates.count > 0) { 1057 nest = uc_vector_last(&lex->templates); 1058 (*nest)++; 1059 } 1060 1061 /* check end of placeholder expression */ 1062 else if (tok->type == TK_RBRACE && lex->templates.count > 0) { 1063 nest = uc_vector_last(&lex->templates); 1064 1065 if (*nest == 0) { 1066 lex->templates.count--; 1067 lex->state = UC_LEX_PLACEHOLDER_END; 1068 } 1069 else { 1070 (*nest)--; 1071 } 1072 } 1073 1074 /* premature EOF? */ 1075 else if (tok->type == TK_EOF && lex->block != STATEMENTS) { 1076 lex->state = UC_LEX_EOF; 1077 1078 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block")); 1079 } 1080 1081 return tok; 1082 1083 1084 case UC_LEX_PLACEHOLDER_START: 1085 lex->state = UC_LEX_IDENTIFY_TOKEN; 1086 1087 uc_vector_push(&lex->templates, 0); 1088 1089 return emit_op(lex, -2, TK_PLACEH, NULL); 1090 1091 1092 case UC_LEX_PLACEHOLDER_END: 1093 lex->state = UC_LEX_IDENTIFY_TOKEN; 1094 1095 return parse_string(lex, '`'); 1096 1097 1098 case UC_LEX_EOF: 1099 break; 1100 } 1101 } 1102 1103 return emit_op(lex, lex->source->off, TK_EOF, NULL); 1104 } 1105 1106 void 1107 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) 1108 { 1109 lex->state = UC_LEX_IDENTIFY_BLOCK; 1110 1111 lex->config = config; 1112 lex->source = uc_source_get(source); 1113 1114 lex->block = NONE; 1115 lex->modifier = UNSPEC; 1116 1117 lex->rlen = 0; 1118 lex->rpos = 0; 1119 lex->rbuf = NULL; 1120 1121 lex->buffer.count = 0; 1122 lex->buffer.entries = NULL; 1123 1124 lex->lead_surrogate = 0; 1125 1126 lex->lastoff = 0; 1127 1128 lex->templates.count = 0; 1129 lex->templates.entries = NULL; 1130 1131 if (config && config->raw_mode) { 1132 lex->state = UC_LEX_IDENTIFY_TOKEN; 1133 lex->block = STATEMENTS; 1134 } 1135 } 1136 1137 void 1138 uc_lexer_free(uc_lexer_t *lex) 1139 { 1140 uc_vector_clear(&lex->buffer); 1141 uc_vector_clear(&lex->templates); 1142 1143 uc_source_put(lex->source); 1144 1145 free(lex->rbuf); 1146 } 1147 1148 uc_token_t * 1149 uc_lexer_next_token(uc_lexer_t *lex) 1150 { 1151 uc_token_t *rv = NULL; 1152 1153 rv = lex_step(lex); 1154 1155 lex->no_keyword = false; 1156 lex->no_regexp = false; 1157 1158 return rv; 1159 } 1160 1161 const char * 1162 uc_tokenname(unsigned type) 1163 { 1164 static char buf[sizeof("'endfunction'")]; 1165 const char *tokennames[] = { 1166 [TK_LEXP] = "'{{'", 1167 [TK_REXP] = "'}}'", 1168 [TK_LSTM] = "'{%'", 1169 [TK_RSTM] = "'%}'", 1170 [TK_COMMA] = "','", 1171 [TK_ASSIGN] = "'='", 1172 [TK_ASADD] = "'+='", 1173 [TK_ASSUB] = "'-='", 1174 [TK_ASMUL] = "'*='", 1175 [TK_ASDIV] = "'/='", 1176 [TK_ASMOD] = "'%='", 1177 [TK_ASLEFT] = "'<<='", 1178 [TK_ASRIGHT] = "'>>='", 1179 [TK_ASBAND] = "'&='", 1180 [TK_ASBXOR] = "'^='", 1181 [TK_ASBOR] = "'|='", 1182 [TK_QMARK] = "'?'", 1183 [TK_COLON] = "':'", 1184 [TK_OR] = "'||'", 1185 [TK_AND] = "'&&'", 1186 [TK_BOR] = "'|'", 1187 [TK_BXOR] = "'^'", 1188 [TK_BAND] = "'&'", 1189 [TK_EQS] = "'==='", 1190 [TK_NES] = "'!=='", 1191 [TK_EQ] = "'=='", 1192 [TK_NE] = "'!='", 1193 [TK_LT] = "'<'", 1194 [TK_LE] = "'<='", 1195 [TK_GT] = "'>'", 1196 [TK_GE] = "'>='", 1197 [TK_LSHIFT] = "'<<'", 1198 [TK_RSHIFT] = "'>>'", 1199 [TK_ADD] = "'+'", 1200 [TK_SUB] = "'-'", 1201 [TK_MUL] = "'*'", 1202 [TK_DIV] = "'/'", 1203 [TK_MOD] = "'%'", 1204 [TK_EXP] = "'**'", 1205 [TK_NOT] = "'!'", 1206 [TK_COMPL] = "'~'", 1207 [TK_INC] = "'++'", 1208 [TK_DEC] = "'--'", 1209 [TK_DOT] = "'.'", 1210 [TK_LBRACK] = "'['", 1211 [TK_RBRACK] = "']'", 1212 [TK_LPAREN] = "'('", 1213 [TK_RPAREN] = "')'", 1214 [TK_LBRACE] = "'{'", 1215 [TK_RBRACE] = "'}'", 1216 [TK_SCOL] = "';'", 1217 [TK_ELLIP] = "'...'", 1218 [TK_ARROW] = "'=>'", 1219 [TK_QLBRACK] = "'?.['", 1220 [TK_QLPAREN] = "'?.('", 1221 [TK_QDOT] = "'?.'", 1222 [TK_ASEXP] = "'**='", 1223 [TK_ASAND] = "'&&='", 1224 [TK_ASOR] = "'||='", 1225 [TK_ASNULLISH] = "'\?\?='", 1226 [TK_NULLISH] = "'\?\?'", 1227 [TK_PLACEH] = "'${'", 1228 1229 [TK_TEXT] = "Text", 1230 [TK_LABEL] = "Label", 1231 [TK_NUMBER] = "Number", 1232 [TK_DOUBLE] = "Double", 1233 [TK_STRING] = "String", 1234 [TK_REGEXP] = "Regexp", 1235 [TK_TEMPLATE] = "Template", 1236 [TK_ERROR] = "Error", 1237 [TK_EOF] = "End of file", 1238 }; 1239 1240 size_t i; 1241 1242 for (i = 0; i < ARRAY_SIZE(reserved_words); i++) { 1243 if (reserved_words[i].type != type) 1244 continue; 1245 1246 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat); 1247 1248 return buf; 1249 } 1250 1251 return tokennames[type] ? tokennames[type] : "?"; 1252 } 1253 1254 bool 1255 uc_lexer_is_keyword(uc_value_t *label) 1256 { 1257 size_t i; 1258 1259 if (ucv_type(label) != UC_STRING) 1260 return false; 1261 1262 for (i = 0; i < ARRAY_SIZE(reserved_words); i++) 1263 if (!strcmp(reserved_words[i].pat, ucv_string_get(label))) 1264 return true; 1265 1266 return false; 1267 } 1268 1269 #endif /* NO_COMPILE */ 1270 1271 /* 1272 * Stores the given codepoint as a utf8 multibyte sequence into the given 1273 * output buffer and substracts the required amount of bytes from the given 1274 * length pointer. 1275 * 1276 * Returns false if the multibyte sequence would not fit into the buffer, 1277 * otherwise true. 1278 */ 1279 1280 bool 1281 utf8enc(char **out, int *rem, int code) 1282 { 1283 if (code >= 0 && code <= 0x7F) { 1284 if (*rem < 1) 1285 return false; 1286 1287 *(*out)++ = code; (*rem)--; 1288 1289 return true; 1290 } 1291 else if (code > 0 && code <= 0x7FF) { 1292 if (*rem < 2) 1293 return false; 1294 1295 *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--; 1296 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1297 1298 return true; 1299 } 1300 else if (code > 0 && code <= 0xFFFF) { 1301 if (*rem < 3) 1302 return false; 1303 1304 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--; 1305 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 1306 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1307 1308 return true; 1309 } 1310 else if (code > 0 && code <= 0x10FFFF) { 1311 if (*rem < 4) 1312 return false; 1313 1314 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--; 1315 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--; 1316 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 1317 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1318 1319 return true; 1320 } 1321 1322 return true; 1323 } 1324
This page was automatically generated by LXR 0.3.1. • OpenWrt