lexer.c

  1 /*
  2  * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io>
  3  *
  4  * Permission to use, copy, modify, and/or distribute this software for any
  5  * purpose with or without fee is hereby granted, provided that the above
  6  * copyright notice and this permission notice appear in all copies.
  7  *
  8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15  */
 16 
 17 #include <stdio.h>
 18 
 19 #include <stdbool.h>
 20 #include <stdlib.h>
 21 #include <string.h>
 22 #include <ctype.h>
 23 #include <regex.h>
 24 #include <math.h>
 25 #include <errno.h>
 26 
 27 #include "ucode/vm.h"
 28 #include "ucode/lib.h"
 29 #include "ucode/lexer.h"
 30 #include "ucode/platform.h"
 31 
 32 struct keyword {
 33         unsigned type;
 34         const char *pat;
 35         unsigned plen;
 36 };
 37 
 38 #define dec(o) \
 39         ((o) - '')
 40 
 41 #define hex(x) \
 42         (((x) >= 'a') ? (10 + (x) - 'a') : \
 43                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 44 
 45 #ifndef NO_COMPILE
 46 
 47 static const struct keyword reserved_words[] = {
 48         { TK_ENDFUNC,   "endfunction", 11 },
 49         { TK_CONTINUE,  "continue", 8 },
 50         { TK_ENDWHILE,  "endwhile", 8 },
 51         { TK_FUNC,              "function", 8 },
 52         { TK_DEFAULT,   "default", 7 },
 53         { TK_DELETE,    "delete", 6 },
 54         { TK_RETURN,    "return", 6 },
 55         { TK_ENDFOR,    "endfor", 6 },
 56         { TK_SWITCH,    "switch", 6 },
 57         { TK_IMPORT,    "import", 6 },
 58         { TK_EXPORT,    "export", 6 },
 59         { TK_ENDIF,             "endif", 5 },
 60         { TK_WHILE,             "while", 5 },
 61         { TK_BREAK,             "break", 5 },
 62         { TK_CATCH,             "catch", 5 },
 63         { TK_CONST,             "const", 5 },
 64         { TK_FALSE,             "false", 5 },
 65         { TK_TRUE,              "true",  4 },
 66         { TK_ELIF,              "elif",  4 },
 67         { TK_ELSE,              "else",  4 },
 68         { TK_THIS,              "this",  4 },
 69         { TK_NULL,              "null",  4 },
 70         { TK_CASE,              "case",  4 },
 71         { TK_TRY,               "try",   3 },
 72         { TK_FOR,               "for",   3 },
 73         { TK_LOCAL,             "let",   3 },
 74         { TK_IF,                "if",    2 },
 75         { TK_IN,                "in",    2 },
 76 };
 77 
 78 
 79 static int
 80 fill_buf(uc_lexer_t *lex) {
 81         lex->rbuf = xrealloc(lex->rbuf, 128);
 82         lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp);
 83         lex->rpos = 0;
 84 
 85         if (!lex->rlen)
 86                 return EOF;
 87 
 88         lex->rpos++;
 89 
 90         return (int)lex->rbuf[0];
 91 }
 92 
 93 static int
 94 update_line(uc_lexer_t *lex, int ch) {
 95         if (ch == '\n')
 96                 uc_source_line_next(lex->source);
 97         else if (ch != EOF)
 98                 uc_source_line_update(lex->source, 1);
 99 
100         lex->source->off++;
101 
102         return ch;
103 }
104 
105 static int
106 lookahead_char(uc_lexer_t *lex) {
107         int c;
108 
109         if (lex->rpos < lex->rlen)
110                 return (int)lex->rbuf[lex->rpos];
111 
112         c = fill_buf(lex);
113         lex->rpos = 0;
114 
115         return c;
116 }
117 
118 static bool
119 check_char(uc_lexer_t *lex, int ch) {
120         if (lookahead_char(lex) != ch)
121                 return false;
122 
123         lex->rpos++;
124 
125         update_line(lex, ch);
126 
127         return true;
128 }
129 
130 static int
131 next_char(uc_lexer_t *lex) {
132         int ch = (lex->rpos < lex->rlen) ? (int)lex->rbuf[lex->rpos++] : fill_buf(lex);
133 
134         return update_line(lex, ch);
135 }
136 
137 static uc_token_t *
138 emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
139 {
140         lex->curr.type = type;
141         lex->curr.uv = uv;
142 
143         if (pos < 0)
144                 lex->curr.pos = lex->source->off + pos;
145         else
146                 lex->curr.pos = (size_t)pos;
147 
148         lex->curr.end = lex->source->off;
149 
150         return &lex->curr;
151 }
152 
153 static uc_token_t *
154 emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) {
155         uc_token_t *rv = NULL;
156         char *p;
157 
158         if (lex->buffer.count) {
159                 if (strip_trailing_chars)
160                         for (p = uc_vector_last(&lex->buffer);
161                              p && strchr(strip_trailing_chars, *p);
162                              lex->buffer.count--, p = uc_vector_last(&lex->buffer));
163 
164                 rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count));
165 
166                 uc_vector_clear(&lex->buffer);
167         }
168         else if (type != TK_TEXT) {
169                 rv = emit_op(lex, pos, type, ucv_string_new_length("", 0));
170         }
171 
172         return rv;
173 }
174 
175 
176 static uc_token_t *
177 parse_comment(uc_lexer_t *lex, int kind)
178 {
179         size_t off = lex->source->off - 1;
180         int ch;
181 
182         uc_vector_push(&lex->buffer, '/');
183 
184         while (true) {
185                 ch = next_char(lex);
186 
187                 uc_vector_push(&lex->buffer, ch);
188 
189                 if (kind == '/' && (ch == '\n' || ch == EOF))
190                         break;
191 
192                 if (kind == '*' && ch == '*' && check_char(lex, '/')) {
193                         uc_vector_push(&lex->buffer, '/');
194                         break;
195                 }
196 
197                 if (ch == EOF) {
198                         lex->state = UC_LEX_EOF;
199 
200                         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
201                 }
202         }
203 
204         return emit_buffer(lex, off, TK_COMMENT, NULL);
205 }
206 
207 static void
208 append_utf8(uc_lexer_t *lex, int code) {
209         char ustr[8], *up;
210         int rem;
211 
212         up = ustr;
213         rem = sizeof(ustr);
214 
215         if (utf8enc(&up, &rem, code))
216                 for (up = ustr; rem < (int)sizeof(ustr); rem++)
217                         uc_vector_push(&lex->buffer, *up++);
218 }
219 
220 static uc_token_t *
221 parse_escape(uc_lexer_t *lex, const char *regex_macros)
222 {
223         int code, ch, i;
224         const char *p;
225 
226         /* unicode escape sequence */
227         if (check_char(lex, 'u')) {
228                 for (i = 0, code = 0; i < 4; i++) {
229                         ch = next_char(lex);
230 
231                         if (!isxdigit(ch))
232                                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
233 
234                         code = code * 16 + hex(ch);
235                 }
236 
237                 /* is a leading surrogate value */
238                 if ((code & 0xFC00) == 0xD800) {
239                         /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
240                         if (lex->lead_surrogate)
241                                 append_utf8(lex, 0xFFFD);
242 
243                         /* store surrogate value and advance to next escape sequence */
244                         lex->lead_surrogate = code;
245                 }
246 
247                 /* is a trailing surrogate value */
248                 else if ((code & 0xFC00) == 0xDC00) {
249                         /* found a trailing surrogate following a leading one, combine and encode */
250                         if (lex->lead_surrogate) {
251                                 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
252                                 lex->lead_surrogate = 0;
253                         }
254 
255                         /* trailing surrogate not following a leading one, ignore and use replacement char */
256                         else {
257                                 code = 0xFFFD;
258                         }
259 
260                         append_utf8(lex, code);
261                 }
262 
263                 /* is a normal codepoint */
264                 else {
265                         append_utf8(lex, code);
266                 }
267         }
268 
269         /* hex escape sequence */
270         else if (check_char(lex, 'x')) {
271                 for (i = 0, code = 0; i < 2; i++) {
272                         ch = next_char(lex);
273 
274                         if (!isxdigit(ch))
275                                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
276 
277                         code = code * 16 + hex(ch);
278                 }
279 
280                 append_utf8(lex, code);
281         }
282 
283         /* octal or letter */
284         else {
285                 /* try to parse octal sequence... */
286                 for (i = 0, code = 0, ch = lookahead_char(lex);
287                      i < 3 && ch >= '' && ch <= '7';
288                      i++, next_char(lex), ch = lookahead_char(lex)) {
289                         code = code * 8 + dec(ch);
290                 }
291 
292                 if (i) {
293                         if (code > 255)
294                                 return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
295 
296                         append_utf8(lex, code);
297                 }
298 
299                 /* ... no octal sequence, handle potential regex macros */
300                 else if (strchr(regex_macros, ch)) {
301                         ch = next_char(lex);
302 
303                         switch (ch) {
304                         case 'd': p = "[[:digit:]]";   break;
305                         case 'D': p = "[^[:digit:]]";  break;
306                         case 'w': p = "[[:alnum:]_]";  break;
307                         case 'W': p = "[^[:alnum:]_]"; break;
308                         case 's': p = "[[:space:]]";   break;
309                         case 'S': p = "[^[:space:]]";  break;
310                         default:  p = NULL;
311                         }
312 
313                         if (p) {
314                                 while (*p)
315                                         uc_vector_push(&lex->buffer, *p++);
316                         }
317                         else {
318                                 uc_vector_push(&lex->buffer, '\\');
319                                 uc_vector_push(&lex->buffer, ch);
320                         }
321                 }
322 
323                 /* ... handle other escape */
324                 else {
325                         ch = next_char(lex);
326 
327                         switch (ch) {
328                         case 'a': uc_vector_push(&lex->buffer, '\a'); break;
329                         case 'b': uc_vector_push(&lex->buffer, '\b'); break;
330                         case 'e': uc_vector_push(&lex->buffer, '\033'); break;
331                         case 'f': uc_vector_push(&lex->buffer, '\f'); break;
332                         case 'n': uc_vector_push(&lex->buffer, '\n'); break;
333                         case 'r': uc_vector_push(&lex->buffer, '\r'); break;
334                         case 't': uc_vector_push(&lex->buffer, '\t'); break;
335                         case 'v': uc_vector_push(&lex->buffer, '\v'); break;
336 
337                         case EOF:
338                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
339 
340                         default:
341                                 uc_vector_push(&lex->buffer, ch);
342                         }
343                 }
344         }
345 
346         return NULL;
347 }
348 
349 static uc_token_t *
350 parse_string(uc_lexer_t *lex, int kind)
351 {
352         uc_token_t *err, *tok;
353         unsigned type;
354         int code, ch;
355         size_t off;
356 
357         if (kind == '`')
358                 type = TK_TEMPLATE;
359         else if (kind == '/')
360                 type = TK_REGEXP;
361         else
362                 type = TK_STRING;
363 
364         off = lex->source->off - 1;
365 
366         for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
367                 switch (ch) {
368                 /* placeholder */
369                 case '$':
370                         if (type == TK_TEMPLATE && check_char(lex, '{')) {
371                                 lex->state = UC_LEX_PLACEHOLDER_START;
372 
373                                 tok = emit_buffer(lex, off, type, NULL);
374                                 tok->end -= 2;
375 
376                                 return tok;
377                         }
378 
379                         uc_vector_push(&lex->buffer, '$');
380                         break;
381 
382                 /* regexp bracket expression */
383                 case '[':
384                         uc_vector_push(&lex->buffer, '[');
385 
386                         if (type == TK_REGEXP) {
387                                 /* skip leading negation (^) */
388                                 if (check_char(lex, '^'))
389                                         uc_vector_push(&lex->buffer, '^');
390 
391                                 /* skip leading `]` - it is literal and not closing the bracket expr */
392                                 if (check_char(lex, ']'))
393                                         uc_vector_push(&lex->buffer, ']');
394 
395                                 /* read until closing `]` */
396                                 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
397                                         if (ch == '\\') {
398                                                 err = parse_escape(lex, "^");
399 
400                                                 if (err)
401                                                         return err;
402 
403                                                 continue;
404                                         }
405 
406                                         uc_vector_push(&lex->buffer, ch);
407 
408                                         if (ch == ']')
409                                                 break;
410 
411                                         /* skip nested char classes / equivalence classes / collating chars */
412                                         if (ch == '[') {
413                                                 code = lookahead_char(lex);
414 
415                                                 if (code == ':' || code == '.' || code == '=') {
416                                                         uc_vector_push(&lex->buffer, code);
417                                                         next_char(lex);
418 
419                                                         for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
420                                                                 if (ch == '\\') {
421                                                                         err = parse_escape(lex, "");
422 
423                                                                         if (err)
424                                                                                 return err;
425 
426                                                                         continue;
427                                                                 }
428 
429                                                                 uc_vector_push(&lex->buffer, ch);
430 
431                                                                 if (ch == code && check_char(lex, ']')) {
432                                                                         uc_vector_push(&lex->buffer, ']');
433                                                                         break;
434                                                                 }
435                                                         }
436                                                 }
437                                         }
438                                 }
439                         }
440 
441                         break;
442 
443                 /* escape sequence */
444                 case '\\':
445                         err = parse_escape(lex,
446                                 (type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : "");
447 
448                         if (err)
449                                 return err;
450 
451                         break;
452 
453                 /* other character */
454                 default:
455                         /* terminating delimitter */
456                         if (ch == kind)
457                                 return emit_buffer(lex, off, type, NULL);
458 
459                         uc_vector_push(&lex->buffer, ch);
460                 }
461         }
462 
463         // FIXME
464         lex->state = UC_LEX_EOF;
465 
466         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
467 }
468 
469 
/*
 * Parses a regexp literal from the lexer input.
 *
 * Returns a pointer to the emitted token: TK_REGEXP on success, or a
 * TK_ERROR token carrying a message string when the literal is
 * unterminated or contains an invalid escape sequence.
 */
482 
/*
 * NOTE(review): not referenced in the visible part of this file; judging by
 * the names these are stage markers for regexp parsing - confirm before
 * relying on them.
 */
enum {
        UC_LEX_PARSE_REGEX_INIT    = 0,
        UC_LEX_PARSE_REGEX_PATTERN = 1,
        UC_LEX_PARSE_REGEX_FLAGS   = 2
};
488 
489 static uc_token_t *
490 parse_regexp(uc_lexer_t *lex)
491 {
492         bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
493         uc_token_t *rv;
494         size_t len;
495         char *s;
496 
497         rv = parse_string(lex, '/');
498 
499         if (rv->type == TK_REGEXP) {
500                 while (true) {
501                         if (check_char(lex, 'g'))
502                                 is_reg_global = true;
503                         else if (check_char(lex, 'i'))
504                                 is_reg_icase = true;
505                         else if (check_char(lex, 's'))
506                                 is_reg_newline = true;
507                         else
508                                 break;
509                 }
510 
511                 len = xasprintf(&s, "%c%*s",
512                         (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
513                         ucv_string_length(rv->uv),
514                         ucv_string_get(rv->uv));
515 
516                 ucv_free(rv->uv, false);
517                 rv->uv = ucv_string_new_length(s, len);
518                 free(s);
519         }
520 
521         return rv;
522 }
523 
524 
/*
 * Parses a label (identifier or reserved word) from the lexer input.
 *
 * Returns a pointer to the emitted token: the token of the matching
 * reserved word, or a TK_LABEL token carrying the label text.
 */
534 
535 static uc_token_t *
536 parse_label(uc_lexer_t *lex, int ch)
537 {
538         const struct keyword *word;
539         size_t i, len;
540 
541         while (true) {
542                 uc_vector_push(&lex->buffer, ch);
543                 ch = lookahead_char(lex);
544 
545                 if (!isalnum(ch) && ch != '_')
546                         break;
547 
548                 next_char(lex);
549         }
550 
551         len = lex->buffer.count;
552 
553         if (!lex->no_keyword) {
554                 for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
555                         if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) {
556                                 uc_vector_clear(&lex->buffer);
557 
558                                 return emit_op(lex, -len, word->type, NULL);
559                         }
560                 }
561         }
562 
563         return emit_buffer(lex, -len, TK_LABEL, NULL);
564 }
565 
566 
/*
 * Parses a number literal from the lexer input.
 *
 * Returns a pointer to the emitted token: TK_NUMBER or TK_DOUBLE on
 * success, or a TK_ERROR token carrying "Invalid number literal".
 */
576 
577 static inline bool
578 is_numeric_char(uc_lexer_t *lex, char c)
579 {
580         char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0;
581 
582         switch (c|32) {
583         case '.':
584         case '':
585         case '1':
586         case '2':
587         case '3':
588         case '4':
589         case '5':
590         case '6':
591         case '7':
592         case '8':
593         case '9':
594                 return true;
595 
596         case 'a':
597         case 'b':
598         case 'c':
599         case 'd':
600         case 'e':
601         case 'f':
602         case 'o':
603         case 'x':
604                 /* require previous char, a number literal cannot start with these */
605                 return prev != 0;
606 
607         case '+':
608         case '-':
609                 /* sign is only allowed after an exponent char */
610                 return (prev|32) == 'e';
611         }
612 
613         return false;
614 }
615 
616 static uc_token_t *
617 parse_number(uc_lexer_t *lex, int ch)
618 {
619         uc_value_t *nv = NULL;
620         size_t len;
621         char *e;
622 
623         while (true) {
624                 uc_vector_push(&lex->buffer, ch);
625                 ch = lookahead_char(lex);
626 
627                 if (!is_numeric_char(lex, ch))
628                         break;
629 
630                 next_char(lex);
631         }
632 
633         len = lex->buffer.count;
634 
635         uc_vector_push(&lex->buffer, '\0');
636 
637         nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e);
638 
639         uc_vector_clear(&lex->buffer);
640 
641         switch (ucv_type(nv)) {
642         case UC_DOUBLE:
643                 return emit_op(lex, -len, TK_DOUBLE, nv);
644 
645         case UC_INTEGER:
646                 return emit_op(lex, -len, TK_NUMBER, nv);
647 
648         default:
649                 return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal"));
650         }
651 }
652 
653 static uc_token_t *
654 lex_find_token(uc_lexer_t *lex)
655 {
656         bool tpl = !(lex->config && lex->config->raw_mode);
657         int ch = next_char(lex);
658 
659         while (isspace(ch))
660                 ch = next_char(lex);
661 
662         switch (ch) {
663         case '~':
664                 return emit_op(lex, -1, TK_COMPL, NULL);
665 
666         case '}':
667                 if (tpl && check_char(lex, '}'))
668                         return emit_op(lex, -2, TK_REXP, NULL);
669 
670                 return emit_op(lex, -1, TK_RBRACE, NULL);
671 
672         case '|':
673                 if (check_char(lex, '|')) {
674                         if (check_char(lex, '='))
675                                 return emit_op(lex, -3, TK_ASOR, NULL);
676 
677                         return emit_op(lex, -2, TK_OR, NULL);
678                 }
679 
680                 if (check_char(lex, '='))
681                         return emit_op(lex, -2, TK_ASBOR, NULL);
682 
683                 return emit_op(lex, -1, TK_BOR, NULL);
684 
685         case '{':
686                 if (tpl && check_char(lex, '{'))
687                         return emit_op(lex, -2, TK_LEXP, NULL);
688 
689                 if (tpl && check_char(lex, '%'))
690                         return emit_op(lex, -2, TK_LSTM, NULL);
691 
692                 return emit_op(lex, -1, TK_LBRACE, NULL);
693 
694         case '^':
695                 if (check_char(lex, '='))
696                         return emit_op(lex, -2, TK_ASBXOR, NULL);
697 
698                 return emit_op(lex, -1, TK_BXOR, NULL);
699 
700         case '[':
701                 return emit_op(lex, -1, TK_LBRACK, NULL);
702 
703         case ']':
704                 return emit_op(lex, -1, TK_RBRACK, NULL);
705 
706         case '?':
707                 if (check_char(lex, '?')) {
708                         if (check_char(lex, '='))
709                                 return emit_op(lex, -3, TK_ASNULLISH, NULL);
710 
711                         return emit_op(lex, -2, TK_NULLISH, NULL);
712                 }
713 
714                 if (check_char(lex, '.')) {
715                         if (check_char(lex, '['))
716                                 return emit_op(lex, -3, TK_QLBRACK, NULL);
717 
718                         if (check_char(lex, '('))
719                                 return emit_op(lex, -3, TK_QLPAREN, NULL);
720 
721                         return emit_op(lex, -2, TK_QDOT, NULL);
722                 }
723 
724                 return emit_op(lex, lex->source->off, TK_QMARK, NULL);
725 
726         case '>':
727                 if (check_char(lex, '>')) {
728                         if (check_char(lex, '='))
729                                 return emit_op(lex, -3, TK_ASRIGHT, NULL);
730 
731                         return emit_op(lex, -2, TK_RSHIFT, NULL);
732                 }
733 
734                 if (check_char(lex, '='))
735                         return emit_op(lex, -2, TK_GE, NULL);
736 
737                 return emit_op(lex, -1, TK_GT, NULL);
738 
739         case '=':
740                 if (check_char(lex, '=')) {
741                         if (check_char(lex, '='))
742                                 return emit_op(lex, -3, TK_EQS, NULL);
743 
744                         return emit_op(lex, -2, TK_EQ, NULL);
745                 }
746 
747                 if (check_char(lex, '>'))
748                         return emit_op(lex, -2, TK_ARROW, NULL);
749 
750                 return emit_op(lex, -1, TK_ASSIGN, NULL);
751 
752         case '<':
753                 if (check_char(lex, '<')) {
754                         if (check_char(lex, '='))
755                                 return emit_op(lex, -3, TK_ASLEFT, NULL);
756 
757                         return emit_op(lex, -2, TK_LSHIFT, NULL);
758                 }
759 
760                 if (check_char(lex, '='))
761                         return emit_op(lex, -2, TK_LE, NULL);
762 
763                 return emit_op(lex, -1, TK_LT, NULL);
764 
765         case ';':
766                 return emit_op(lex, -1, TK_SCOL, NULL);
767 
768         case ':':
769                 return emit_op(lex, -1, TK_COLON, NULL);
770 
771         case '/':
772                 ch = lookahead_char(lex);
773                 lex->lastoff = lex->source->off - 1;
774 
775                 if (ch == '/' || ch == '*')
776                         return parse_comment(lex, ch);
777 
778                 if (lex->no_regexp) {
779                         if (check_char(lex, '='))
780                                 return emit_op(lex, -2, TK_ASDIV, NULL);
781 
782                         return emit_op(lex, -1, TK_DIV, NULL);
783                 }
784 
785                 return parse_regexp(lex);
786 
787         case '.':
788                 if (check_char(lex, '.')) {
789                         if (check_char(lex, '.'))
790                                 return emit_op(lex, -3, TK_ELLIP, NULL);
791 
792                         /* The sequence ".." cannot be a valid */
793                         return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character"));
794                 }
795 
796                 return emit_op(lex, -1, TK_DOT, NULL);
797 
798         case '-':
799                 if (tpl && check_char(lex, '}')) {
800                         if (check_char(lex, '}')) {
801                                 lex->modifier = MINUS;
802 
803                                 return emit_op(lex, -3, TK_REXP, NULL);
804                         }
805 
806                         /* The sequence "-}" cannot be a valid */
807                         return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
808                 }
809 
810                 if (tpl && check_char(lex, '%')) {
811                         if (check_char(lex, '}')) {
812                                 lex->modifier = MINUS;
813 
814                                 return emit_op(lex, -3, TK_RSTM, NULL);
815                         }
816 
817                         /* The sequence "-%" cannot be a valid */
818                         return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
819                 }
820 
821                 if (check_char(lex, '='))
822                         return emit_op(lex, -2, TK_ASSUB, NULL);
823 
824                 if (check_char(lex, '-'))
825                         return emit_op(lex, -2, TK_DEC, NULL);
826 
827                 return emit_op(lex, -1, TK_SUB, NULL);
828 
829         case ',':
830                 return emit_op(lex, -1, TK_COMMA, NULL);
831 
832         case '+':
833                 if (check_char(lex, '='))
834                         return emit_op(lex, -2, TK_ASADD, NULL);
835 
836                 if (check_char(lex, '+'))
837                         return emit_op(lex, -2, TK_INC, NULL);
838 
839                 return emit_op(lex, -1, TK_ADD, NULL);
840 
841         case '*':
842                 if (check_char(lex, '*')) {
843                         if (check_char(lex, '='))
844                                 return emit_op(lex, -3, TK_ASEXP, NULL);
845 
846                         return emit_op(lex, -2, TK_EXP, NULL);
847                 }
848 
849                 if (check_char(lex, '='))
850                         return emit_op(lex, -2, TK_ASMUL, NULL);
851 
852                 return emit_op(lex, -1, TK_MUL, NULL);
853 
854         case '(':
855                 return emit_op(lex, -1, TK_LPAREN, NULL);
856 
857         case ')':
858                 return emit_op(lex, -1, TK_RPAREN, NULL);
859 
860         case '\'':
861         case '"':
862         case '`':
863                 lex->lastoff = lex->source->off - 1;
864 
865                 return parse_string(lex, ch);
866 
867         case '&':
868                 if (check_char(lex, '&')) {
869                         if (check_char(lex, '='))
870                                 return emit_op(lex, -3, TK_ASAND, NULL);
871 
872                         return emit_op(lex, -2, TK_AND, NULL);
873                 }
874 
875                 if (check_char(lex, '='))
876                         return emit_op(lex, -2, TK_ASBAND, NULL);
877 
878                 return emit_op(lex, -1, TK_BAND, NULL);
879 
880         case '%':
881                 if (tpl && check_char(lex, '}'))
882                         return emit_op(lex, -2, TK_RSTM, NULL);
883 
884                 if (check_char(lex, '='))
885                         return emit_op(lex, -2, TK_ASMOD, NULL);
886 
887                 return emit_op(lex, -1, TK_MOD, NULL);
888 
889         case '!':
890                 if (check_char(lex, '=')) {
891                         if (check_char(lex, '='))
892                                 return emit_op(lex, -3, TK_NES, NULL);
893 
894                         return emit_op(lex, -2, TK_NE, NULL);
895                 }
896 
897                 return emit_op(lex, -1, TK_NOT, NULL);
898 
899         case EOF:
900                 return emit_op(lex, -1, TK_EOF, NULL);
901 
902         default:
903                 if (isalpha(ch) || ch == '_')
904                         return parse_label(lex, ch);
905 
906                 if (isdigit(ch))
907                         return parse_number(lex, ch);
908 
909                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
910         }
911 }
912 
/*
 * Run the lexer state machine until one token can be handed to the caller.
 *
 * Each pass of the loop dispatches on lex->state. A state handler either
 * returns a token directly, or changes lex->state and lets the loop run
 * again. Once the state is UC_LEX_EOF the loop ends and a TK_EOF token
 * located at the current source offset is emitted.
 */
913 static uc_token_t *
914 lex_step(uc_lexer_t *lex)
915 {
    /* strip: character set handed to emit_buffer() together with the text
     * that precedes a block start tag; only assigned while identifying a block */
916         const char *strip = NULL;
917         uc_token_t *tok;
    /* nest: points at the brace counter of the innermost open placeholder */
918         size_t *nest;
919         int ch;
920 
921         while (lex->state != UC_LEX_EOF) {
922                 switch (lex->state) {
            /* collect plain text up to the next block start tag ({#, {{ or {%) */
923                 case UC_LEX_IDENTIFY_BLOCK:
924                         ch = next_char(lex);
925 
926                         /* previous block had strip trailing whitespace flag, skip leading whitespace */
927                         if (lex->modifier == MINUS) {
928                                 while (isspace(ch))
929                                         ch = next_char(lex);
930 
931                                 lex->modifier = UNSPEC;
932                         }
933 
934                         /* previous block was a statement block and trim_blocks is enabled, skip leading newline */
935                         else if (lex->modifier == NEWLINE) {
936                                 if (ch == '\n')
937                                         ch = next_char(lex);
938 
939                                 lex->modifier = UNSPEC;
940                         }
941 
942                         /* scan forward through buffer to identify block start token */
943                         while (ch != EOF) {
944                                 if (ch == '{') {
945                                         ch = next_char(lex);
946 
947                                         switch (ch) {
948                                         /* found start of comment block */
949                                         case '#':
950                                                 lex->state = UC_LEX_BLOCK_COMMENT;
951                                                 lex->block = COMMENT;
952 
953                                                 if (check_char(lex, '-'))
954                                                         strip = " \n\t\v\f\r";
955 
956                                                 break;
957 
958                                         /* found start of expression block */
959                                         case '{':
960                                                 lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
961 
962                                                 if (check_char(lex, '-'))
963                                                         strip = " \n\t\v\f\r";
964 
965                                                 break;
966 
967                                         /* found start of statement block */
968                                         case '%':
969                                                 lex->state = UC_LEX_BLOCK_STATEMENT_EMIT_TAG;
970 
                                            /* '-' selects the full whitespace set, '+' selects none,
                                             * otherwise lstrip_blocks selects a set without '\n' */
971                                                 if (check_char(lex, '-'))
972                                                         strip = " \n\t\v\f\r";
973                                                 else if (check_char(lex, '+'))
974                                                         strip = NULL;
975                                                 else if (lex->config && lex->config->lstrip_blocks)
976                                                         strip = " \t\v\f\r";
977 
978                                                 break;
979 
980                                         default:
981                                                 /* not a start tag, remember char and move on */
                                            /* the continue restarts the scan loop; ch already holds
                                             * the character that followed the '{' */
982                                                 uc_vector_push(&lex->buffer, '{');
983                                                 continue;
984                                         }
985 
                                    /* a start tag was recognised, leave the scan loop */
986                                         break;
987                                 }
988 
989                                 uc_vector_push(&lex->buffer, ch);
990                                 ch = next_char(lex);
991                         }
992 
993                         if (ch == EOF)
994                                 lex->state = UC_LEX_EOF;
995 
996                         /* push out leading text */
997                         tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip);
                    /* NOTE(review): the 2 presumably accounts for the two-character
                     * start tag that was just consumed - confirm */
998                         lex->lastoff = lex->source->off - 2;
999 
                     /* no text token was produced, let the loop handle the new state */
1000                         if (!tok)
1001                                 continue;
1002 
1003                         tok->end -= 2;
1004 
1005                         return tok;
1006 
1007 
             /* skip the comment body up to the closing #} or -#} */
1008                 case UC_LEX_BLOCK_COMMENT:
1009                         ch = next_char(lex);
1010 
1011                         /* scan forward through buffer to identify end token */
1012                         while (ch != EOF) {
                             /* -#} additionally requests whitespace skipping before the following text */
1013                                 if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) {
1014                                         lex->modifier = MINUS;
1015                                         break;
1016                                 }
1017 
1018                                 if (ch == '#' && check_char(lex, '}'))
1019                                         break;
1020 
1021                                 ch = next_char(lex);
1022                         }
1023 
                     /* input ended before the comment was closed */
1024                         if (ch == EOF) {
1025                                 lex->state = UC_LEX_EOF;
1026 
1027                                 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
1028                         }
1029 
1030                         tok = emit_op(lex, lex->lastoff, TK_COMMENT, NULL);
1031 
1032                         lex->lastoff = lex->source->off;
1033                         lex->state = UC_LEX_IDENTIFY_BLOCK;
1034 
1035                         return tok;
1036 
             /* emit the '{{' tag token, then continue with tokens inside the block */
1037                 case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
1038                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1039                         lex->block = EXPRESSION;
1040 
1041                         return emit_op(lex, lex->source->off - 2, TK_LEXP, NULL);
1042 
             /* emit the '{%' tag token, then continue with tokens inside the block */
1043                 case UC_LEX_BLOCK_STATEMENT_EMIT_TAG:
1044                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1045                         lex->block = STATEMENTS;
1046 
1047                         return emit_op(lex, lex->source->off - 2, TK_LSTM, NULL);
1048 
1049                 case UC_LEX_IDENTIFY_TOKEN:
                     /* lex_find_token() may return NULL; retry until it yields a token */
1050                         do { tok = lex_find_token(lex); } while (tok == NULL);
1051 
1052                         /* disallow nesting blocks */
1053                         if (tok->type == TK_LSTM || tok->type == TK_LEXP)
1054                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
1055 
1056                         /* found end of statement block */
1057                         if (lex->block == STATEMENTS && tok->type == TK_RSTM) {
1058                                 /* strip newline after statement block? */
1059                                 if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks)
1060                                         lex->modifier = NEWLINE;
1061 
1062                                 lex->lastoff = lex->source->off;
1063                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
1064                                 lex->block = NONE;
1065 
1066                                 tok = emit_op(lex, -2, TK_RSTM, NULL);
1067                         }
1068 
1069                         /* found end of expression block */
1070                         else if (lex->block == EXPRESSION && tok->type == TK_REXP) {
1071                                 lex->lastoff = lex->source->off;
1072                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
1073                                 lex->block = NONE;
1074                         }
1075 
1076                         /* track opening braces */
1077                         else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
1078                                 nest = uc_vector_last(&lex->templates);
1079                                 (*nest)++;
1080                         }
1081 
1082                         /* check end of placeholder expression */
1083                         else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
1084                                 nest = uc_vector_last(&lex->templates);
1085 
                             /* no inner brace is open, so this '}' ends the placeholder:
                              * drop its counter and switch to the placeholder end state */
1086                                 if (*nest == 0) {
1087                                         lex->templates.count--;
1088                                         lex->state = UC_LEX_PLACEHOLDER_END;
1089                                 }
1090                                 else {
1091                                         (*nest)--;
1092                                 }
1093                         }
1094 
1095                         /* premature EOF? */
1096                         else if (tok->type == TK_EOF && lex->block != STATEMENTS) {
1097                                 lex->state = UC_LEX_EOF;
1098 
1099                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block"));
1100                         }
1101 
1102                         return tok;
1103 
1104 
             /* a placeholder opens: push a zeroed brace counter and emit TK_PLACEH */
1105                 case UC_LEX_PLACEHOLDER_START:
1106                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1107 
1108                         uc_vector_push(&lex->templates, 0);
1109 
1110                         return emit_op(lex, -2, TK_PLACEH, NULL);
1111 
1112 
             /* a placeholder closed: resume parsing the backtick quoted string */
1113                 case UC_LEX_PLACEHOLDER_END:
1114                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1115 
1116                         tok = parse_string(lex, '`');
                     /* NOTE(review): position is bumped by one, presumably to skip
                      * the closing '}' - confirm */
1117                         tok->pos++;
1118 
1119                         return tok;
1120 
1121 
1122                 case UC_LEX_EOF:
1123                         break;
1124                 }
1125         }
1126 
     /* end state reached: report end of file at the current source offset */
1127         return emit_op(lex, lex->source->off, TK_EOF, NULL);
1128 }
1129 
1130 void
1131 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
1132 {
1133         lex->state = UC_LEX_IDENTIFY_BLOCK;
1134 
1135         lex->config = config;
1136         lex->source = uc_source_get(source);
1137 
1138         lex->block = NONE;
1139         lex->modifier = UNSPEC;
1140 
1141         lex->rlen = 0;
1142         lex->rpos = 0;
1143         lex->rbuf = NULL;
1144 
1145         lex->buffer.count = 0;
1146         lex->buffer.entries = NULL;
1147 
1148         lex->lead_surrogate = 0;
1149 
1150         lex->lastoff = 0;
1151 
1152         lex->templates.count = 0;
1153         lex->templates.entries = NULL;
1154 
1155         if (config && config->raw_mode) {
1156                 lex->state = UC_LEX_IDENTIFY_TOKEN;
1157                 lex->block = STATEMENTS;
1158         }
1159 }
1160 
1161 void
1162 uc_lexer_free(uc_lexer_t *lex)
1163 {
1164         uc_vector_clear(&lex->buffer);
1165         uc_vector_clear(&lex->templates);
1166 
1167         uc_source_put(lex->source);
1168 
1169         free(lex->rbuf);
1170 }
1171 
1172 uc_token_t *
1173 uc_lexer_next_token(uc_lexer_t *lex)
1174 {
1175         uc_token_t *rv = NULL;
1176 
1177         rv = lex_step(lex);
1178 
1179         if (rv && rv->type != TK_COMMENT) {
1180                 lex->no_keyword = false;
1181                 lex->no_regexp = false;
1182         }
1183 
1184         return rv;
1185 }
1186 
1187 const char *
1188 uc_tokenname(unsigned type)
1189 {
1190         static char buf[sizeof("'endfunction'")];
1191         const char *tokennames[] = {
1192                 [TK_LEXP] = "'{{'",
1193                 [TK_REXP] = "'}}'",
1194                 [TK_LSTM] = "'{%'",
1195                 [TK_RSTM] = "'%}'",
1196                 [TK_COMMA] = "','",
1197                 [TK_ASSIGN] = "'='",
1198                 [TK_ASADD] = "'+='",
1199                 [TK_ASSUB] = "'-='",
1200                 [TK_ASMUL] = "'*='",
1201                 [TK_ASDIV] = "'/='",
1202                 [TK_ASMOD] = "'%='",
1203                 [TK_ASLEFT] = "'<<='",
1204                 [TK_ASRIGHT] = "'>>='",
1205                 [TK_ASBAND] = "'&='",
1206                 [TK_ASBXOR] = "'^='",
1207                 [TK_ASBOR] = "'|='",
1208                 [TK_QMARK] = "'?'",
1209                 [TK_COLON] = "':'",
1210                 [TK_OR] = "'||'",
1211                 [TK_AND] = "'&&'",
1212                 [TK_BOR] = "'|'",
1213                 [TK_BXOR] = "'^'",
1214                 [TK_BAND] = "'&'",
1215                 [TK_EQS] = "'==='",
1216                 [TK_NES] = "'!=='",
1217                 [TK_EQ] = "'=='",
1218                 [TK_NE] = "'!='",
1219                 [TK_LT] = "'<'",
1220                 [TK_LE] = "'<='",
1221                 [TK_GT] = "'>'",
1222                 [TK_GE] = "'>='",
1223                 [TK_LSHIFT] = "'<<'",
1224                 [TK_RSHIFT] = "'>>'",
1225                 [TK_ADD] = "'+'",
1226                 [TK_SUB] = "'-'",
1227                 [TK_MUL] = "'*'",
1228                 [TK_DIV] = "'/'",
1229                 [TK_MOD] = "'%'",
1230                 [TK_EXP] = "'**'",
1231                 [TK_NOT] = "'!'",
1232                 [TK_COMPL] = "'~'",
1233                 [TK_INC] = "'++'",
1234                 [TK_DEC] = "'--'",
1235                 [TK_DOT] = "'.'",
1236                 [TK_LBRACK] = "'['",
1237                 [TK_RBRACK] = "']'",
1238                 [TK_LPAREN] = "'('",
1239                 [TK_RPAREN] = "')'",
1240                 [TK_LBRACE] = "'{'",
1241                 [TK_RBRACE] = "'}'",
1242                 [TK_SCOL] = "';'",
1243                 [TK_ELLIP] = "'...'",
1244                 [TK_ARROW] = "'=>'",
1245                 [TK_QLBRACK] = "'?.['",
1246                 [TK_QLPAREN] = "'?.('",
1247                 [TK_QDOT] = "'?.'",
1248                 [TK_ASEXP] = "'**='",
1249                 [TK_ASAND] = "'&&='",
1250                 [TK_ASOR] = "'||='",
1251                 [TK_ASNULLISH] = "'\?\?='",
1252                 [TK_NULLISH] = "'\?\?'",
1253                 [TK_PLACEH] = "'${'",
1254 
1255                 [TK_TEXT] = "Text",
1256                 [TK_LABEL] = "Label",
1257                 [TK_NUMBER] = "Number",
1258                 [TK_DOUBLE] = "Double",
1259                 [TK_STRING] = "String",
1260                 [TK_REGEXP] = "Regexp",
1261                 [TK_TEMPLATE] = "Template",
1262                 [TK_ERROR] = "Error",
1263                 [TK_EOF] = "End of file",
1264         };
1265 
1266         size_t i;
1267 
1268         for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
1269                 if (reserved_words[i].type != type)
1270                         continue;
1271 
1272                 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat);
1273 
1274                 return buf;
1275         }
1276 
1277         return tokennames[type] ? tokennames[type] : "?";
1278 }
1279 
1280 bool
1281 uc_lexer_is_keyword(uc_value_t *label)
1282 {
1283         size_t i;
1284 
1285         if (ucv_type(label) != UC_STRING)
1286                 return false;
1287 
1288         for (i = 0; i < ARRAY_SIZE(reserved_words); i++)
1289                 if (!strcmp(reserved_words[i].pat, ucv_string_get(label)))
1290                         return true;
1291 
1292         return false;
1293 }
1294 
1295 #endif /* NO_COMPILE */
1296 
/*
 * Encode the given codepoint as a UTF-8 multibyte sequence.
 *
 * The bytes are written to *out, which is advanced past the written
 * sequence, and the number of bytes written is subtracted from *rem.
 *
 * Returns false, leaving *out and *rem untouched, if the sequence does not
 * fit into the remaining *rem bytes. Returns true otherwise. Codepoints
 * outside of the range 0..0x10FFFF are not encoded at all: nothing is
 * written and true is returned.
 */

bool
utf8enc(char **out, int *rem, int code)
{
	int need, lead, shift;

	/* not encodable, silently produce no output */
	if (code < 0 || code > 0x10FFFF)
		return true;

	/* pick the sequence length and the marker bits of the first byte */
	if (code <= 0x7F) {
		need = 1;
		lead = 0x00;
	}
	else if (code <= 0x7FF) {
		need = 2;
		lead = 0xC0;
	}
	else if (code <= 0xFFFF) {
		need = 3;
		lead = 0xE0;
	}
	else {
		need = 4;
		lead = 0xF0;
	}

	if (*rem < need)
		return false;

	/* the first byte carries the topmost payload bits... */
	shift = 6 * (need - 1);
	*(*out)++ = lead | (code >> shift);

	/* ...followed by continuation bytes of six payload bits each */
	while (shift > 0) {
		shift -= 6;
		*(*out)++ = ((code >> shift) & 0x3F) | 0x80;
	}

	*rem -= need;

	return true;
}
1350
This page was automatically generated by LXR 0.3.1. • OpenWrt
OpenWrt.org Cross Reference

Sources/ucode/lexer.c