1 /* 2 * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io> 3 * 4 * Permission to use, copy, modify, and/or distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17 #include <stdio.h> 18 19 #include <stdbool.h> 20 #include <stdlib.h> 21 #include <string.h> 22 #include <ctype.h> 23 #include <regex.h> 24 #include <math.h> 25 #include <errno.h> 26 27 #include "ucode/vm.h" 28 #include "ucode/lib.h" 29 #include "ucode/lexer.h" 30 #include "ucode/platform.h" 31 32 struct keyword { 33 unsigned type; 34 const char *pat; 35 unsigned plen; 36 }; 37 38 #define dec(o) \ 39 ((o) - '') 40 41 #define hex(x) \ 42 (((x) >= 'a') ? (10 + (x) - 'a') : \ 43 (((x) >= 'A') ? 
(10 + (x) - 'A') : dec(x))) 44 45 #ifndef NO_COMPILE 46 47 static const struct keyword reserved_words[] = { 48 { TK_ENDFUNC, "endfunction", 11 }, 49 { TK_CONTINUE, "continue", 8 }, 50 { TK_ENDWHILE, "endwhile", 8 }, 51 { TK_FUNC, "function", 8 }, 52 { TK_DEFAULT, "default", 7 }, 53 { TK_DELETE, "delete", 6 }, 54 { TK_RETURN, "return", 6 }, 55 { TK_ENDFOR, "endfor", 6 }, 56 { TK_SWITCH, "switch", 6 }, 57 { TK_IMPORT, "import", 6 }, 58 { TK_EXPORT, "export", 6 }, 59 { TK_ENDIF, "endif", 5 }, 60 { TK_WHILE, "while", 5 }, 61 { TK_BREAK, "break", 5 }, 62 { TK_CATCH, "catch", 5 }, 63 { TK_CONST, "const", 5 }, 64 { TK_FALSE, "false", 5 }, 65 { TK_TRUE, "true", 4 }, 66 { TK_ELIF, "elif", 4 }, 67 { TK_ELSE, "else", 4 }, 68 { TK_THIS, "this", 4 }, 69 { TK_NULL, "null", 4 }, 70 { TK_CASE, "case", 4 }, 71 { TK_TRY, "try", 3 }, 72 { TK_FOR, "for", 3 }, 73 { TK_LOCAL, "let", 3 }, 74 { TK_IF, "if", 2 }, 75 { TK_IN, "in", 2 }, 76 }; 77 78 79 static int 80 fill_buf(uc_lexer_t *lex) { 81 lex->rbuf = xrealloc(lex->rbuf, 128); 82 lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp); 83 lex->rpos = 0; 84 85 if (!lex->rlen) 86 return EOF; 87 88 lex->rpos++; 89 90 return (int)lex->rbuf[0]; 91 } 92 93 static int 94 update_line(uc_lexer_t *lex, int ch) { 95 if (ch == '\n') 96 uc_source_line_next(lex->source); 97 else if (ch != EOF) 98 uc_source_line_update(lex->source, 1); 99 100 lex->source->off++; 101 102 return ch; 103 } 104 105 static int 106 lookahead_char(uc_lexer_t *lex) { 107 int c; 108 109 if (lex->rpos < lex->rlen) 110 return (int)lex->rbuf[lex->rpos]; 111 112 c = fill_buf(lex); 113 lex->rpos = 0; 114 115 return c; 116 } 117 118 static bool 119 check_char(uc_lexer_t *lex, int ch) { 120 if (lookahead_char(lex) != ch) 121 return false; 122 123 lex->rpos++; 124 125 update_line(lex, ch); 126 127 return true; 128 } 129 130 static int 131 next_char(uc_lexer_t *lex) { 132 int ch = (lex->rpos < lex->rlen) ? 
(int)lex->rbuf[lex->rpos++] : fill_buf(lex); 133 134 return update_line(lex, ch); 135 } 136 137 static uc_token_t * 138 emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv) 139 { 140 lex->curr.type = type; 141 lex->curr.uv = uv; 142 143 if (pos < 0) 144 lex->curr.pos = lex->source->off + pos; 145 else 146 lex->curr.pos = (size_t)pos; 147 148 lex->curr.end = lex->source->off; 149 150 return &lex->curr; 151 } 152 153 static uc_token_t * 154 emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) { 155 uc_token_t *rv = NULL; 156 char *p; 157 158 if (lex->buffer.count) { 159 if (strip_trailing_chars) 160 for (p = uc_vector_last(&lex->buffer); 161 p && strchr(strip_trailing_chars, *p); 162 lex->buffer.count--, p = uc_vector_last(&lex->buffer)); 163 164 rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count)); 165 166 uc_vector_clear(&lex->buffer); 167 } 168 else if (type != TK_TEXT) { 169 rv = emit_op(lex, pos, type, ucv_string_new_length("", 0)); 170 } 171 172 return rv; 173 } 174 175 176 static uc_token_t * 177 parse_comment(uc_lexer_t *lex, int kind) 178 { 179 size_t off = lex->source->off - 1; 180 int ch; 181 182 uc_vector_push(&lex->buffer, '/'); 183 184 while (true) { 185 ch = next_char(lex); 186 187 uc_vector_push(&lex->buffer, ch); 188 189 if (kind == '/' && (ch == '\n' || ch == EOF)) 190 break; 191 192 if (kind == '*' && ch == '*' && check_char(lex, '/')) { 193 uc_vector_push(&lex->buffer, '/'); 194 break; 195 } 196 197 if (ch == EOF) { 198 lex->state = UC_LEX_EOF; 199 200 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment")); 201 } 202 } 203 204 return emit_buffer(lex, off, TK_COMMENT, NULL); 205 } 206 207 static void 208 append_utf8(uc_lexer_t *lex, int code) { 209 char ustr[8], *up; 210 int rem; 211 212 up = ustr; 213 rem = sizeof(ustr); 214 215 if (utf8enc(&up, &rem, code)) 216 for (up = ustr; rem < (int)sizeof(ustr); rem++) 217 
uc_vector_push(&lex->buffer, *up++); 218 } 219 220 static uc_token_t * 221 parse_escape(uc_lexer_t *lex, const char *regex_macros) 222 { 223 int code, ch, i; 224 const char *p; 225 226 /* unicode escape sequence */ 227 if (check_char(lex, 'u')) { 228 for (i = 0, code = 0; i < 4; i++) { 229 ch = next_char(lex); 230 231 if (!isxdigit(ch)) 232 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 233 234 code = code * 16 + hex(ch); 235 } 236 237 /* is a leading surrogate value */ 238 if ((code & 0xFC00) == 0xD800) { 239 /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */ 240 if (lex->lead_surrogate) 241 append_utf8(lex, 0xFFFD); 242 243 /* store surrogate value and advance to next escape sequence */ 244 lex->lead_surrogate = code; 245 } 246 247 /* is a trailing surrogate value */ 248 else if ((code & 0xFC00) == 0xDC00) { 249 /* found a trailing surrogate following a leading one, combine and encode */ 250 if (lex->lead_surrogate) { 251 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF); 252 lex->lead_surrogate = 0; 253 } 254 255 /* trailing surrogate not following a leading one, ignore and use replacement char */ 256 else { 257 code = 0xFFFD; 258 } 259 260 append_utf8(lex, code); 261 } 262 263 /* is a normal codepoint */ 264 else { 265 append_utf8(lex, code); 266 } 267 } 268 269 /* hex escape sequence */ 270 else if (check_char(lex, 'x')) { 271 for (i = 0, code = 0; i < 2; i++) { 272 ch = next_char(lex); 273 274 if (!isxdigit(ch)) 275 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence")); 276 277 code = code * 16 + hex(ch); 278 } 279 280 append_utf8(lex, code); 281 } 282 283 /* octal or letter */ 284 else { 285 /* try to parse octal sequence... 
*/ 286 for (i = 0, code = 0, ch = lookahead_char(lex); 287 i < 3 && ch >= '' && ch <= '7'; 288 i++, next_char(lex), ch = lookahead_char(lex)) { 289 code = code * 8 + dec(ch); 290 } 291 292 if (i) { 293 if (code > 255) 294 return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence")); 295 296 append_utf8(lex, code); 297 } 298 299 /* ... no octal sequence, handle potential regex macros */ 300 else if (strchr(regex_macros, ch)) { 301 ch = next_char(lex); 302 303 switch (ch) { 304 case 'd': p = "[[:digit:]]"; break; 305 case 'D': p = "[^[:digit:]]"; break; 306 case 'w': p = "[[:alnum:]_]"; break; 307 case 'W': p = "[^[:alnum:]_]"; break; 308 case 's': p = "[[:space:]]"; break; 309 case 'S': p = "[^[:space:]]"; break; 310 default: p = NULL; 311 } 312 313 if (p) { 314 while (*p) 315 uc_vector_push(&lex->buffer, *p++); 316 } 317 else { 318 uc_vector_push(&lex->buffer, '\\'); 319 uc_vector_push(&lex->buffer, ch); 320 } 321 } 322 323 /* ... handle other escape */ 324 else { 325 ch = next_char(lex); 326 327 switch (ch) { 328 case 'a': uc_vector_push(&lex->buffer, '\a'); break; 329 case 'b': uc_vector_push(&lex->buffer, '\b'); break; 330 case 'e': uc_vector_push(&lex->buffer, '\033'); break; 331 case 'f': uc_vector_push(&lex->buffer, '\f'); break; 332 case 'n': uc_vector_push(&lex->buffer, '\n'); break; 333 case 'r': uc_vector_push(&lex->buffer, '\r'); break; 334 case 't': uc_vector_push(&lex->buffer, '\t'); break; 335 case 'v': uc_vector_push(&lex->buffer, '\v'); break; 336 337 case EOF: 338 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string")); 339 340 default: 341 uc_vector_push(&lex->buffer, ch); 342 } 343 } 344 } 345 346 return NULL; 347 } 348 349 static uc_token_t * 350 parse_string(uc_lexer_t *lex, int kind) 351 { 352 uc_token_t *err, *tok; 353 unsigned type; 354 int code, ch; 355 size_t off; 356 357 if (kind == '`') 358 type = TK_TEMPLATE; 359 else if (kind == '/') 360 type = TK_REGEXP; 361 else 362 type = TK_STRING; 363 364 off = 
lex->source->off - 1; 365 366 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 367 switch (ch) { 368 /* placeholder */ 369 case '$': 370 if (type == TK_TEMPLATE && check_char(lex, '{')) { 371 lex->state = UC_LEX_PLACEHOLDER_START; 372 373 tok = emit_buffer(lex, off, type, NULL); 374 tok->end -= 2; 375 376 return tok; 377 } 378 379 uc_vector_push(&lex->buffer, '$'); 380 break; 381 382 /* regexp bracket expression */ 383 case '[': 384 uc_vector_push(&lex->buffer, '['); 385 386 if (type == TK_REGEXP) { 387 /* skip leading negation (^) */ 388 if (check_char(lex, '^')) 389 uc_vector_push(&lex->buffer, '^'); 390 391 /* skip leading `]` - it is literal and not closing the bracket expr */ 392 if (check_char(lex, ']')) 393 uc_vector_push(&lex->buffer, ']'); 394 395 /* read until closing `]` */ 396 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 397 if (ch == '\\') { 398 err = parse_escape(lex, "^"); 399 400 if (err) 401 return err; 402 403 continue; 404 } 405 406 uc_vector_push(&lex->buffer, ch); 407 408 if (ch == ']') 409 break; 410 411 /* skip nested char classes / equivalence classes / collating chars */ 412 if (ch == '[') { 413 code = lookahead_char(lex); 414 415 if (code == ':' || code == '.' || code == '=') { 416 uc_vector_push(&lex->buffer, code); 417 next_char(lex); 418 419 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) { 420 if (ch == '\\') { 421 err = parse_escape(lex, ""); 422 423 if (err) 424 return err; 425 426 continue; 427 } 428 429 uc_vector_push(&lex->buffer, ch); 430 431 if (ch == code && check_char(lex, ']')) { 432 uc_vector_push(&lex->buffer, ']'); 433 break; 434 } 435 } 436 } 437 } 438 } 439 } 440 441 break; 442 443 /* escape sequence */ 444 case '\\': 445 err = parse_escape(lex, 446 (type == TK_REGEXP) ? 
"^bBdDsSwW<>.[$()|*+?{\\" : ""); 447 448 if (err) 449 return err; 450 451 break; 452 453 /* other character */ 454 default: 455 /* terminating delimitter */ 456 if (ch == kind) 457 return emit_buffer(lex, off, type, NULL); 458 459 uc_vector_push(&lex->buffer, ch); 460 } 461 } 462 463 // FIXME 464 lex->state = UC_LEX_EOF; 465 466 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string")); 467 } 468 469 470 /* 471 * Parses a regexp literal from the given buffer. 472 * 473 * Returns a negative value on error, otherwise the amount of consumed 474 * characters from the given buffer. 475 * 476 * Error values: 477 * -UC_ERROR_UNTERMINATED_STRING Unterminated regexp 478 * -UC_ERROR_INVALID_ESCAPE Invalid escape sequence 479 * -UC_ERROR_OVERLONG_STRING Regexp literal too long 480 * -UC_ERROR_INVALID_REGEXP Could not compile regexp 481 */ 482 483 enum { 484 UC_LEX_PARSE_REGEX_INIT, 485 UC_LEX_PARSE_REGEX_PATTERN, 486 UC_LEX_PARSE_REGEX_FLAGS 487 }; 488 489 static uc_token_t * 490 parse_regexp(uc_lexer_t *lex) 491 { 492 bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false; 493 uc_token_t *rv; 494 size_t len; 495 char *s; 496 497 rv = parse_string(lex, '/'); 498 499 if (rv->type == TK_REGEXP) { 500 while (true) { 501 if (check_char(lex, 'g')) 502 is_reg_global = true; 503 else if (check_char(lex, 'i')) 504 is_reg_icase = true; 505 else if (check_char(lex, 's')) 506 is_reg_newline = true; 507 else 508 break; 509 } 510 511 len = xasprintf(&s, "%c%*s", 512 (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2), 513 ucv_string_length(rv->uv), 514 ucv_string_get(rv->uv)); 515 516 ucv_free(rv->uv, false); 517 rv->uv = ucv_string_new_length(s, len); 518 free(s); 519 } 520 521 return rv; 522 } 523 524 525 /* 526 * Parses a label from the given buffer. 527 * 528 * Returns a negative value on error, otherwise the amount of consumed 529 * characters from the given buffer. 
530 * 531 * Error values: 532 * -UC_ERROR_OVERLONG_STRING Label too long 533 */ 534 535 static uc_token_t * 536 parse_label(uc_lexer_t *lex, int ch) 537 { 538 const struct keyword *word; 539 size_t i, len; 540 541 while (true) { 542 uc_vector_push(&lex->buffer, ch); 543 ch = lookahead_char(lex); 544 545 if (!isalnum(ch) && ch != '_') 546 break; 547 548 next_char(lex); 549 } 550 551 len = lex->buffer.count; 552 553 if (!lex->no_keyword) { 554 for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) { 555 if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) { 556 uc_vector_clear(&lex->buffer); 557 558 return emit_op(lex, -len, word->type, NULL); 559 } 560 } 561 } 562 563 return emit_buffer(lex, -len, TK_LABEL, NULL); 564 } 565 566 567 /* 568 * Parses a number literal from the given buffer. 569 * 570 * Returns a negative value on error, otherwise the amount of consumed 571 * characters from the given buffer. 572 * 573 * Error values: 574 * -UC_ERROR_INVALID_ESCAPE Invalid number character 575 */ 576 577 static inline bool 578 is_numeric_char(uc_lexer_t *lex, char c) 579 { 580 char prev = lex->buffer.count ? 
*uc_vector_last(&lex->buffer) : 0; 581 582 switch (c|32) { 583 case '.': 584 case '': 585 case '1': 586 case '2': 587 case '3': 588 case '4': 589 case '5': 590 case '6': 591 case '7': 592 case '8': 593 case '9': 594 return true; 595 596 case 'a': 597 case 'b': 598 case 'c': 599 case 'd': 600 case 'e': 601 case 'f': 602 case 'o': 603 case 'x': 604 /* require previous char, a number literal cannot start with these */ 605 return prev != 0; 606 607 case '+': 608 case '-': 609 /* sign is only allowed after an exponent char */ 610 return (prev|32) == 'e'; 611 } 612 613 return false; 614 } 615 616 static uc_token_t * 617 parse_number(uc_lexer_t *lex, int ch) 618 { 619 uc_value_t *nv = NULL; 620 size_t len; 621 char *e; 622 623 while (true) { 624 uc_vector_push(&lex->buffer, ch); 625 ch = lookahead_char(lex); 626 627 if (!is_numeric_char(lex, ch)) 628 break; 629 630 next_char(lex); 631 } 632 633 len = lex->buffer.count; 634 635 uc_vector_push(&lex->buffer, '\0'); 636 637 nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e); 638 639 uc_vector_clear(&lex->buffer); 640 641 switch (ucv_type(nv)) { 642 case UC_DOUBLE: 643 return emit_op(lex, -len, TK_DOUBLE, nv); 644 645 case UC_INTEGER: 646 return emit_op(lex, -len, TK_NUMBER, nv); 647 648 default: 649 return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal")); 650 } 651 } 652 653 static uc_token_t * 654 lex_find_token(uc_lexer_t *lex) 655 { 656 bool tpl = !(lex->config && lex->config->raw_mode); 657 int ch = next_char(lex); 658 659 while (isspace(ch)) 660 ch = next_char(lex); 661 662 switch (ch) { 663 case '~': 664 return emit_op(lex, -1, TK_COMPL, NULL); 665 666 case '}': 667 if (tpl && check_char(lex, '}')) 668 return emit_op(lex, -2, TK_REXP, NULL); 669 670 return emit_op(lex, -1, TK_RBRACE, NULL); 671 672 case '|': 673 if (check_char(lex, '|')) { 674 if (check_char(lex, '=')) 675 return emit_op(lex, -3, TK_ASOR, NULL); 676 677 return emit_op(lex, -2, TK_OR, NULL); 678 } 679 680 if 
(check_char(lex, '=')) 681 return emit_op(lex, -2, TK_ASBOR, NULL); 682 683 return emit_op(lex, -1, TK_BOR, NULL); 684 685 case '{': 686 if (tpl && check_char(lex, '{')) 687 return emit_op(lex, -2, TK_LEXP, NULL); 688 689 if (tpl && check_char(lex, '%')) 690 return emit_op(lex, -2, TK_LSTM, NULL); 691 692 return emit_op(lex, -1, TK_LBRACE, NULL); 693 694 case '^': 695 if (check_char(lex, '=')) 696 return emit_op(lex, -2, TK_ASBXOR, NULL); 697 698 return emit_op(lex, -1, TK_BXOR, NULL); 699 700 case '[': 701 return emit_op(lex, -1, TK_LBRACK, NULL); 702 703 case ']': 704 return emit_op(lex, -1, TK_RBRACK, NULL); 705 706 case '?': 707 if (check_char(lex, '?')) { 708 if (check_char(lex, '=')) 709 return emit_op(lex, -3, TK_ASNULLISH, NULL); 710 711 return emit_op(lex, -2, TK_NULLISH, NULL); 712 } 713 714 if (check_char(lex, '.')) { 715 if (check_char(lex, '[')) 716 return emit_op(lex, -3, TK_QLBRACK, NULL); 717 718 if (check_char(lex, '(')) 719 return emit_op(lex, -3, TK_QLPAREN, NULL); 720 721 return emit_op(lex, -2, TK_QDOT, NULL); 722 } 723 724 return emit_op(lex, lex->source->off, TK_QMARK, NULL); 725 726 case '>': 727 if (check_char(lex, '>')) { 728 if (check_char(lex, '=')) 729 return emit_op(lex, -3, TK_ASRIGHT, NULL); 730 731 return emit_op(lex, -2, TK_RSHIFT, NULL); 732 } 733 734 if (check_char(lex, '=')) 735 return emit_op(lex, -2, TK_GE, NULL); 736 737 return emit_op(lex, -1, TK_GT, NULL); 738 739 case '=': 740 if (check_char(lex, '=')) { 741 if (check_char(lex, '=')) 742 return emit_op(lex, -3, TK_EQS, NULL); 743 744 return emit_op(lex, -2, TK_EQ, NULL); 745 } 746 747 if (check_char(lex, '>')) 748 return emit_op(lex, -2, TK_ARROW, NULL); 749 750 return emit_op(lex, -1, TK_ASSIGN, NULL); 751 752 case '<': 753 if (check_char(lex, '<')) { 754 if (check_char(lex, '=')) 755 return emit_op(lex, -3, TK_ASLEFT, NULL); 756 757 return emit_op(lex, -2, TK_LSHIFT, NULL); 758 } 759 760 if (check_char(lex, '=')) 761 return emit_op(lex, -2, TK_LE, NULL); 762 763 return 
emit_op(lex, -1, TK_LT, NULL); 764 765 case ';': 766 return emit_op(lex, -1, TK_SCOL, NULL); 767 768 case ':': 769 return emit_op(lex, -1, TK_COLON, NULL); 770 771 case '/': 772 ch = lookahead_char(lex); 773 lex->lastoff = lex->source->off - 1; 774 775 if (ch == '/' || ch == '*') 776 return parse_comment(lex, ch); 777 778 if (lex->no_regexp) { 779 if (check_char(lex, '=')) 780 return emit_op(lex, -2, TK_ASDIV, NULL); 781 782 return emit_op(lex, -1, TK_DIV, NULL); 783 } 784 785 return parse_regexp(lex); 786 787 case '.': 788 if (check_char(lex, '.')) { 789 if (check_char(lex, '.')) 790 return emit_op(lex, -3, TK_ELLIP, NULL); 791 792 /* The sequence ".." cannot be a valid */ 793 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character")); 794 } 795 796 return emit_op(lex, -1, TK_DOT, NULL); 797 798 case '-': 799 if (tpl && check_char(lex, '}')) { 800 if (check_char(lex, '}')) { 801 lex->modifier = MINUS; 802 803 return emit_op(lex, -3, TK_REXP, NULL); 804 } 805 806 /* The sequence "-}" cannot be a valid */ 807 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 808 } 809 810 if (tpl && check_char(lex, '%')) { 811 if (check_char(lex, '}')) { 812 lex->modifier = MINUS; 813 814 return emit_op(lex, -3, TK_RSTM, NULL); 815 } 816 817 /* The sequence "-%" cannot be a valid */ 818 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 819 } 820 821 if (check_char(lex, '=')) 822 return emit_op(lex, -2, TK_ASSUB, NULL); 823 824 if (check_char(lex, '-')) 825 return emit_op(lex, -2, TK_DEC, NULL); 826 827 return emit_op(lex, -1, TK_SUB, NULL); 828 829 case ',': 830 return emit_op(lex, -1, TK_COMMA, NULL); 831 832 case '+': 833 if (check_char(lex, '=')) 834 return emit_op(lex, -2, TK_ASADD, NULL); 835 836 if (check_char(lex, '+')) 837 return emit_op(lex, -2, TK_INC, NULL); 838 839 return emit_op(lex, -1, TK_ADD, NULL); 840 841 case '*': 842 if (check_char(lex, '*')) { 843 if (check_char(lex, '=')) 844 return emit_op(lex, 
-3, TK_ASEXP, NULL); 845 846 return emit_op(lex, -2, TK_EXP, NULL); 847 } 848 849 if (check_char(lex, '=')) 850 return emit_op(lex, -2, TK_ASMUL, NULL); 851 852 return emit_op(lex, -1, TK_MUL, NULL); 853 854 case '(': 855 return emit_op(lex, -1, TK_LPAREN, NULL); 856 857 case ')': 858 return emit_op(lex, -1, TK_RPAREN, NULL); 859 860 case '\'': 861 case '"': 862 case '`': 863 lex->lastoff = lex->source->off - 1; 864 865 return parse_string(lex, ch); 866 867 case '&': 868 if (check_char(lex, '&')) { 869 if (check_char(lex, '=')) 870 return emit_op(lex, -3, TK_ASAND, NULL); 871 872 return emit_op(lex, -2, TK_AND, NULL); 873 } 874 875 if (check_char(lex, '=')) 876 return emit_op(lex, -2, TK_ASBAND, NULL); 877 878 return emit_op(lex, -1, TK_BAND, NULL); 879 880 case '%': 881 if (tpl && check_char(lex, '}')) 882 return emit_op(lex, -2, TK_RSTM, NULL); 883 884 if (check_char(lex, '=')) 885 return emit_op(lex, -2, TK_ASMOD, NULL); 886 887 return emit_op(lex, -1, TK_MOD, NULL); 888 889 case '!': 890 if (check_char(lex, '=')) { 891 if (check_char(lex, '=')) 892 return emit_op(lex, -3, TK_NES, NULL); 893 894 return emit_op(lex, -2, TK_NE, NULL); 895 } 896 897 return emit_op(lex, -1, TK_NOT, NULL); 898 899 case EOF: 900 return emit_op(lex, -1, TK_EOF, NULL); 901 902 default: 903 if (isalpha(ch) || ch == '_') 904 return parse_label(lex, ch); 905 906 if (isdigit(ch)) 907 return parse_number(lex, ch); 908 909 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character")); 910 } 911 } 912 913 static uc_token_t * 914 lex_step(uc_lexer_t *lex) 915 { 916 const char *strip = NULL; 917 uc_token_t *tok; 918 size_t *nest; 919 int ch; 920 921 while (lex->state != UC_LEX_EOF) { 922 switch (lex->state) { 923 case UC_LEX_IDENTIFY_BLOCK: 924 ch = next_char(lex); 925 926 /* previous block had strip trailing whitespace flag, skip leading whitespace */ 927 if (lex->modifier == MINUS) { 928 while (isspace(ch)) 929 ch = next_char(lex); 930 931 lex->modifier = UNSPEC; 932 } 933 934 /* 
previous block was a statement block and trim_blocks is enabled, skip leading newline */ 935 else if (lex->modifier == NEWLINE) { 936 if (ch == '\n') 937 ch = next_char(lex); 938 939 lex->modifier = UNSPEC; 940 } 941 942 /* scan forward through buffer to identify block start token */ 943 while (ch != EOF) { 944 if (ch == '{') { 945 ch = next_char(lex); 946 947 switch (ch) { 948 /* found start of comment block */ 949 case '#': 950 lex->state = UC_LEX_BLOCK_COMMENT; 951 lex->block = COMMENT; 952 953 if (check_char(lex, '-')) 954 strip = " \n\t\v\f\r"; 955 956 break; 957 958 /* found start of expression block */ 959 case '{': 960 lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG; 961 962 if (check_char(lex, '-')) 963 strip = " \n\t\v\f\r"; 964 965 break; 966 967 /* found start of statement block */ 968 case '%': 969 lex->state = UC_LEX_BLOCK_STATEMENT_EMIT_TAG; 970 971 if (check_char(lex, '-')) 972 strip = " \n\t\v\f\r"; 973 else if (check_char(lex, '+')) 974 strip = NULL; 975 else if (lex->config && lex->config->lstrip_blocks) 976 strip = " \t\v\f\r"; 977 978 break; 979 980 default: 981 /* not a start tag, remember char and move on */ 982 uc_vector_push(&lex->buffer, '{'); 983 continue; 984 } 985 986 break; 987 } 988 989 uc_vector_push(&lex->buffer, ch); 990 ch = next_char(lex); 991 } 992 993 if (ch == EOF) 994 lex->state = UC_LEX_EOF; 995 996 /* push out leading text */ 997 tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip); 998 lex->lastoff = lex->source->off - 2; 999 1000 if (!tok) 1001 continue; 1002 1003 tok->end -= 2; 1004 1005 return tok; 1006 1007 1008 case UC_LEX_BLOCK_COMMENT: 1009 ch = next_char(lex); 1010 1011 /* scan forward through buffer to identify end token */ 1012 while (ch != EOF) { 1013 if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) { 1014 lex->modifier = MINUS; 1015 break; 1016 } 1017 1018 if (ch == '#' && check_char(lex, '}')) 1019 break; 1020 1021 ch = next_char(lex); 1022 } 1023 1024 if (ch == EOF) { 1025 lex->state = UC_LEX_EOF; 
1026 1027 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block")); 1028 } 1029 1030 tok = emit_op(lex, lex->lastoff, TK_COMMENT, NULL); 1031 1032 lex->lastoff = lex->source->off; 1033 lex->state = UC_LEX_IDENTIFY_BLOCK; 1034 1035 return tok; 1036 1037 case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG: 1038 lex->state = UC_LEX_IDENTIFY_TOKEN; 1039 lex->block = EXPRESSION; 1040 1041 return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL); 1042 1043 case UC_LEX_BLOCK_STATEMENT_EMIT_TAG: 1044 lex->state = UC_LEX_IDENTIFY_TOKEN; 1045 lex->block = STATEMENTS; 1046 1047 return emit_op(lex, lex->source->off - 2, TK_LSTM, NULL); 1048 1049 case UC_LEX_IDENTIFY_TOKEN: 1050 do { tok = lex_find_token(lex); } while (tok == NULL); 1051 1052 /* disallow nesting blocks */ 1053 if (tok->type == TK_LSTM || tok->type == TK_LEXP) 1054 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested")); 1055 1056 /* found end of statement block */ 1057 if (lex->block == STATEMENTS && tok->type == TK_RSTM) { 1058 /* strip newline after statement block? 
*/ 1059 if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks) 1060 lex->modifier = NEWLINE; 1061 1062 lex->lastoff = lex->source->off; 1063 lex->state = UC_LEX_IDENTIFY_BLOCK; 1064 lex->block = NONE; 1065 1066 tok = emit_op(lex, -2, TK_RSTM, NULL); 1067 } 1068 1069 /* found end of expression block */ 1070 else if (lex->block == EXPRESSION && tok->type == TK_REXP) { 1071 lex->lastoff = lex->source->off; 1072 lex->state = UC_LEX_IDENTIFY_BLOCK; 1073 lex->block = NONE; 1074 } 1075 1076 /* track opening braces */ 1077 else if (tok->type == TK_LBRACE && lex->templates.count > 0) { 1078 nest = uc_vector_last(&lex->templates); 1079 (*nest)++; 1080 } 1081 1082 /* check end of placeholder expression */ 1083 else if (tok->type == TK_RBRACE && lex->templates.count > 0) { 1084 nest = uc_vector_last(&lex->templates); 1085 1086 if (*nest == 0) { 1087 lex->templates.count--; 1088 lex->state = UC_LEX_PLACEHOLDER_END; 1089 } 1090 else { 1091 (*nest)--; 1092 } 1093 } 1094 1095 /* premature EOF? 
*/ 1096 else if (tok->type == TK_EOF && lex->block != STATEMENTS) { 1097 lex->state = UC_LEX_EOF; 1098 1099 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block")); 1100 } 1101 1102 return tok; 1103 1104 1105 case UC_LEX_PLACEHOLDER_START: 1106 lex->state = UC_LEX_IDENTIFY_TOKEN; 1107 1108 uc_vector_push(&lex->templates, 0); 1109 1110 return emit_op(lex, -2, TK_PLACEH, NULL); 1111 1112 1113 case UC_LEX_PLACEHOLDER_END: 1114 lex->state = UC_LEX_IDENTIFY_TOKEN; 1115 1116 tok = parse_string(lex, '`'); 1117 tok->pos++; 1118 1119 return tok; 1120 1121 1122 case UC_LEX_EOF: 1123 break; 1124 } 1125 } 1126 1127 return emit_op(lex, lex->source->off, TK_EOF, NULL); 1128 } 1129 1130 void 1131 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source) 1132 { 1133 lex->state = UC_LEX_IDENTIFY_BLOCK; 1134 1135 lex->config = config; 1136 lex->source = uc_source_get(source); 1137 1138 lex->block = NONE; 1139 lex->modifier = UNSPEC; 1140 1141 lex->rlen = 0; 1142 lex->rpos = 0; 1143 lex->rbuf = NULL; 1144 1145 lex->buffer.count = 0; 1146 lex->buffer.entries = NULL; 1147 1148 lex->lead_surrogate = 0; 1149 1150 lex->lastoff = 0; 1151 1152 lex->templates.count = 0; 1153 lex->templates.entries = NULL; 1154 1155 if (config && config->raw_mode) { 1156 lex->state = UC_LEX_IDENTIFY_TOKEN; 1157 lex->block = STATEMENTS; 1158 } 1159 } 1160 1161 void 1162 uc_lexer_free(uc_lexer_t *lex) 1163 { 1164 uc_vector_clear(&lex->buffer); 1165 uc_vector_clear(&lex->templates); 1166 1167 uc_source_put(lex->source); 1168 1169 free(lex->rbuf); 1170 } 1171 1172 uc_token_t * 1173 uc_lexer_next_token(uc_lexer_t *lex) 1174 { 1175 uc_token_t *rv = NULL; 1176 1177 rv = lex_step(lex); 1178 1179 lex->no_keyword = false; 1180 lex->no_regexp = false; 1181 1182 return rv; 1183 } 1184 1185 const char * 1186 uc_tokenname(unsigned type) 1187 { 1188 static char buf[sizeof("'endfunction'")]; 1189 const char *tokennames[] = { 1190 [TK_LEXP] = "'{{'", 1191 [TK_REXP] = "'}}'", 1192 
[TK_LSTM] = "'{%'", 1193 [TK_RSTM] = "'%}'", 1194 [TK_COMMA] = "','", 1195 [TK_ASSIGN] = "'='", 1196 [TK_ASADD] = "'+='", 1197 [TK_ASSUB] = "'-='", 1198 [TK_ASMUL] = "'*='", 1199 [TK_ASDIV] = "'/='", 1200 [TK_ASMOD] = "'%='", 1201 [TK_ASLEFT] = "'<<='", 1202 [TK_ASRIGHT] = "'>>='", 1203 [TK_ASBAND] = "'&='", 1204 [TK_ASBXOR] = "'^='", 1205 [TK_ASBOR] = "'|='", 1206 [TK_QMARK] = "'?'", 1207 [TK_COLON] = "':'", 1208 [TK_OR] = "'||'", 1209 [TK_AND] = "'&&'", 1210 [TK_BOR] = "'|'", 1211 [TK_BXOR] = "'^'", 1212 [TK_BAND] = "'&'", 1213 [TK_EQS] = "'==='", 1214 [TK_NES] = "'!=='", 1215 [TK_EQ] = "'=='", 1216 [TK_NE] = "'!='", 1217 [TK_LT] = "'<'", 1218 [TK_LE] = "'<='", 1219 [TK_GT] = "'>'", 1220 [TK_GE] = "'>='", 1221 [TK_LSHIFT] = "'<<'", 1222 [TK_RSHIFT] = "'>>'", 1223 [TK_ADD] = "'+'", 1224 [TK_SUB] = "'-'", 1225 [TK_MUL] = "'*'", 1226 [TK_DIV] = "'/'", 1227 [TK_MOD] = "'%'", 1228 [TK_EXP] = "'**'", 1229 [TK_NOT] = "'!'", 1230 [TK_COMPL] = "'~'", 1231 [TK_INC] = "'++'", 1232 [TK_DEC] = "'--'", 1233 [TK_DOT] = "'.'", 1234 [TK_LBRACK] = "'['", 1235 [TK_RBRACK] = "']'", 1236 [TK_LPAREN] = "'('", 1237 [TK_RPAREN] = "')'", 1238 [TK_LBRACE] = "'{'", 1239 [TK_RBRACE] = "'}'", 1240 [TK_SCOL] = "';'", 1241 [TK_ELLIP] = "'...'", 1242 [TK_ARROW] = "'=>'", 1243 [TK_QLBRACK] = "'?.['", 1244 [TK_QLPAREN] = "'?.('", 1245 [TK_QDOT] = "'?.'", 1246 [TK_ASEXP] = "'**='", 1247 [TK_ASAND] = "'&&='", 1248 [TK_ASOR] = "'||='", 1249 [TK_ASNULLISH] = "'\?\?='", 1250 [TK_NULLISH] = "'\?\?'", 1251 [TK_PLACEH] = "'${'", 1252 1253 [TK_TEXT] = "Text", 1254 [TK_LABEL] = "Label", 1255 [TK_NUMBER] = "Number", 1256 [TK_DOUBLE] = "Double", 1257 [TK_STRING] = "String", 1258 [TK_REGEXP] = "Regexp", 1259 [TK_TEMPLATE] = "Template", 1260 [TK_ERROR] = "Error", 1261 [TK_EOF] = "End of file", 1262 }; 1263 1264 size_t i; 1265 1266 for (i = 0; i < ARRAY_SIZE(reserved_words); i++) { 1267 if (reserved_words[i].type != type) 1268 continue; 1269 1270 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat); 1271 
1272 return buf; 1273 } 1274 1275 return tokennames[type] ? tokennames[type] : "?"; 1276 } 1277 1278 bool 1279 uc_lexer_is_keyword(uc_value_t *label) 1280 { 1281 size_t i; 1282 1283 if (ucv_type(label) != UC_STRING) 1284 return false; 1285 1286 for (i = 0; i < ARRAY_SIZE(reserved_words); i++) 1287 if (!strcmp(reserved_words[i].pat, ucv_string_get(label))) 1288 return true; 1289 1290 return false; 1291 } 1292 1293 #endif /* NO_COMPILE */ 1294 1295 /* 1296 * Stores the given codepoint as a utf8 multibyte sequence into the given 1297 * output buffer and substracts the required amount of bytes from the given 1298 * length pointer. 1299 * 1300 * Returns false if the multibyte sequence would not fit into the buffer, 1301 * otherwise true. 1302 */ 1303 1304 bool 1305 utf8enc(char **out, int *rem, int code) 1306 { 1307 if (code >= 0 && code <= 0x7F) { 1308 if (*rem < 1) 1309 return false; 1310 1311 *(*out)++ = code; (*rem)--; 1312 1313 return true; 1314 } 1315 else if (code > 0 && code <= 0x7FF) { 1316 if (*rem < 2) 1317 return false; 1318 1319 *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--; 1320 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1321 1322 return true; 1323 } 1324 else if (code > 0 && code <= 0xFFFF) { 1325 if (*rem < 3) 1326 return false; 1327 1328 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--; 1329 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 1330 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1331 1332 return true; 1333 } 1334 else if (code > 0 && code <= 0x10FFFF) { 1335 if (*rem < 4) 1336 return false; 1337 1338 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--; 1339 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--; 1340 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 1341 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 1342 1343 return true; 1344 } 1345 1346 return true; 1347 } 1348
/* This page was automatically generated by LXR 0.3.1. • OpenWrt */