• source navigation  • diff markup  • identifier search  • freetext search  • 

Sources/ucode/lexer.c

  1 /*
  2  * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io>
  3  *
  4  * Permission to use, copy, modify, and/or distribute this software for any
  5  * purpose with or without fee is hereby granted, provided that the above
  6  * copyright notice and this permission notice appear in all copies.
  7  *
  8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15  */
 16 
 17 #include <stdio.h>
 18 
 19 #include <stdbool.h>
 20 #include <stdlib.h>
 21 #include <string.h>
 22 #include <ctype.h>
 23 #include <regex.h>
 24 #include <math.h>
 25 #include <errno.h>
 26 #include <endian.h>
 27 
 28 #include "ucode/vm.h"
 29 #include "ucode/lib.h"
 30 #include "ucode/lexer.h"
 31 
 32 struct keyword {
 33         unsigned type;
 34         const char *pat;
 35         unsigned plen;
 36 };
 37 
 38 #define dec(o) \
 39         ((o) - '')
 40 
 41 #define hex(x) \
 42         (((x) >= 'a') ? (10 + (x) - 'a') : \
 43                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 44 
 45 #ifndef NO_COMPILE
 46 
 47 static const struct keyword reserved_words[] = {
 48         { TK_ENDFUNC,   "endfunction", 11 },
 49         { TK_CONTINUE,  "continue", 8 },
 50         { TK_ENDWHILE,  "endwhile", 8 },
 51         { TK_FUNC,              "function", 8 },
 52         { TK_DEFAULT,   "default", 7 },
 53         { TK_DELETE,    "delete", 6 },
 54         { TK_RETURN,    "return", 6 },
 55         { TK_ENDFOR,    "endfor", 6 },
 56         { TK_SWITCH,    "switch", 6 },
 57         { TK_IMPORT,    "import", 6 },
 58         { TK_EXPORT,    "export", 6 },
 59         { TK_ENDIF,             "endif", 5 },
 60         { TK_WHILE,             "while", 5 },
 61         { TK_BREAK,             "break", 5 },
 62         { TK_CATCH,             "catch", 5 },
 63         { TK_CONST,             "const", 5 },
 64         { TK_FALSE,             "false", 5 },
 65         { TK_TRUE,              "true",  4 },
 66         { TK_ELIF,              "elif",  4 },
 67         { TK_ELSE,              "else",  4 },
 68         { TK_THIS,              "this",  4 },
 69         { TK_NULL,              "null",  4 },
 70         { TK_CASE,              "case",  4 },
 71         { TK_FROM,              "from",  4 },
 72         { TK_TRY,               "try",   3 },
 73         { TK_FOR,               "for",   3 },
 74         { TK_LOCAL,             "let",   3 },
 75         { TK_IF,                "if",    2 },
 76         { TK_IN,                "in",    2 },
 77         { TK_AS,                "as",    2 },
 78 };
 79 
 80 
 81 static int
 82 fill_buf(uc_lexer_t *lex) {
 83         lex->rbuf = xrealloc(lex->rbuf, 128);
 84         lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp);
 85         lex->rpos = 0;
 86 
 87         if (!lex->rlen)
 88                 return EOF;
 89 
 90         lex->rpos++;
 91 
 92         return (int)lex->rbuf[0];
 93 }
 94 
 95 static int
 96 update_line(uc_lexer_t *lex, int ch) {
 97         if (ch == '\n' || ch == EOF)
 98                 uc_source_line_next(lex->source);
 99         else
100                 uc_source_line_update(lex->source, 1);
101 
102         lex->source->off++;
103 
104         return ch;
105 }
106 
107 static int
108 lookahead_char(uc_lexer_t *lex) {
109         int c;
110 
111         if (lex->rpos < lex->rlen)
112                 return (int)lex->rbuf[lex->rpos];
113 
114         c = fill_buf(lex);
115         lex->rpos = 0;
116 
117         return c;
118 }
119 
120 static bool
121 check_char(uc_lexer_t *lex, int ch) {
122         if (lookahead_char(lex) != ch)
123                 return false;
124 
125         lex->rpos++;
126 
127         update_line(lex, ch);
128 
129         return true;
130 }
131 
132 static int
133 next_char(uc_lexer_t *lex) {
134         int ch = (lex->rpos < lex->rlen) ? (int)lex->rbuf[lex->rpos++] : fill_buf(lex);
135 
136         return update_line(lex, ch);
137 }
138 
139 static uc_token_t *
140 emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
141 {
142         lex->curr.type = type;
143         lex->curr.uv = uv;
144 
145         if (pos < 0)
146                 lex->curr.pos = lex->source->off + pos;
147         else
148                 lex->curr.pos = (size_t)pos;
149 
150         return &lex->curr;
151 }
152 
153 static uc_token_t *
154 emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) {
155         uc_token_t *rv = NULL;
156 
157         if (lex->buffer.count) {
158                 if (strip_trailing_chars)
159                         while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer)))
160                                 lex->buffer.count--;
161 
162                 rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count));
163 
164                 uc_vector_clear(&lex->buffer);
165         }
166         else if (type != TK_TEXT) {
167                 rv = emit_op(lex, pos, type, ucv_string_new_length("", 0));
168         }
169 
170         return rv;
171 }
172 
173 
174 static uc_token_t *
175 parse_comment(uc_lexer_t *lex, int kind)
176 {
177         int ch;
178 
179         while (true) {
180                 ch = next_char(lex);
181 
182                 if (kind == '/' && (ch == '\n' || ch == EOF))
183                         break;
184 
185                 if (kind == '*' && ch == '*' && check_char(lex, '/'))
186                         break;
187 
188                 if (ch == EOF) {
189                         lex->state = UC_LEX_EOF;
190 
191                         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
192                 }
193         }
194 
195         return NULL;
196 }
197 
198 static void
199 append_utf8(uc_lexer_t *lex, int code) {
200         char ustr[8], *up;
201         int rem;
202 
203         up = ustr;
204         rem = sizeof(ustr);
205 
206         if (utf8enc(&up, &rem, code))
207                 for (up = ustr; rem < (int)sizeof(ustr); rem++)
208                         uc_vector_push(&lex->buffer, *up++);
209 }
210 
211 static uc_token_t *
212 parse_string(uc_lexer_t *lex, int kind)
213 {
214         int code, ch, i;
215         unsigned type;
216         size_t off;
217 
218         if (kind == '`')
219                 type = TK_TEMPLATE;
220         else if (kind == '/')
221                 type = TK_REGEXP;
222         else
223                 type = TK_STRING;
224 
225         off = lex->source->off - 1;
226 
227         for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
228                 switch (ch) {
229                 /* placeholder */
230                 case '$':
231                         if (type == TK_TEMPLATE && check_char(lex, '{')) {
232                                 lex->state = UC_LEX_PLACEHOLDER_START;
233 
234                                 return emit_buffer(lex, off, type, NULL);
235                         }
236 
237                         uc_vector_push(&lex->buffer, '$');
238                         break;
239 
240                 /* escape sequence */
241                 case '\\':
242                         /* unicode escape sequence */
243                         if (type != TK_REGEXP && check_char(lex, 'u')) {
244                                 for (i = 0, code = 0; i < 4; i++) {
245                                         ch = next_char(lex);
246 
247                                         if (!isxdigit(ch))
248                                                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
249 
250                                         code = code * 16 + hex(ch);
251                                 }
252 
253                                 /* is a leading surrogate value */
254                                 if ((code & 0xFC00) == 0xD800) {
255                                         /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
256                                         if (lex->lead_surrogate)
257                                                 append_utf8(lex, 0xFFFD);
258 
259                                         /* store surrogate value and advance to next escape sequence */
260                                         lex->lead_surrogate = code;
261                                 }
262 
263                                 /* is a trailing surrogate value */
264                                 else if ((code & 0xFC00) == 0xDC00) {
265                                         /* found a trailing surrogate following a leading one, combine and encode */
266                                         if (lex->lead_surrogate) {
267                                                 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
268                                                 lex->lead_surrogate = 0;
269                                         }
270 
271                                         /* trailing surrogate not following a leading one, ignore and use replacement char */
272                                         else {
273                                                 code = 0xFFFD;
274                                         }
275 
276                                         append_utf8(lex, code);
277                                 }
278 
279                                 /* is a normal codepoint */
280                                 else {
281                                         append_utf8(lex, code);
282                                 }
283                         }
284 
285                         /* hex escape sequence */
286                         else if (type != TK_REGEXP && check_char(lex, 'x')) {
287                                 for (i = 0, code = 0; i < 2; i++) {
288                                         ch = next_char(lex);
289 
290                                         if (!isxdigit(ch))
291                                                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
292 
293                                         code = code * 16 + hex(ch);
294                                 }
295 
296                                 append_utf8(lex, code);
297                         }
298 
299                         /* octal or letter */
300                         else {
301                                 /* try to parse octal sequence... */
302                                 for (i = 0, code = 0, ch = lookahead_char(lex);
303                                      kind != '/' && i < 3 && ch >= '' && ch <= '7';
304                                      i++, next_char(lex), ch = lookahead_char(lex)) {
305                                         code = code * 8 + dec(ch);
306                                 }
307 
308                                 if (i) {
309                                         if (code > 255)
310                                                 return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
311 
312                                         append_utf8(lex, code);
313                                 }
314 
315                                 /* ... no octal sequence, handle other escape */
316                                 else {
317                                         ch = next_char(lex);
318 
319                                         switch (ch) {
320                                         case 'a': uc_vector_push(&lex->buffer, '\a'); break;
321                                         case 'b': uc_vector_push(&lex->buffer, '\b'); break;
322                                         case 'e': uc_vector_push(&lex->buffer, '\033'); break;
323                                         case 'f': uc_vector_push(&lex->buffer, '\f'); break;
324                                         case 'n': uc_vector_push(&lex->buffer, '\n'); break;
325                                         case 'r': uc_vector_push(&lex->buffer, '\r'); break;
326                                         case 't': uc_vector_push(&lex->buffer, '\t'); break;
327                                         case 'v': uc_vector_push(&lex->buffer, '\v'); break;
328 
329                                         case EOF:
330                                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
331 
332                                         default:
333                                                 /* regex mode => retain backslash */
334                                                 if (type == TK_REGEXP)
335                                                         uc_vector_push(&lex->buffer, '\\');
336 
337                                                 uc_vector_push(&lex->buffer, ch);
338                                         }
339                                 }
340                         }
341 
342                         break;
343 
344                 /* other character */
345                 default:
346                         /* terminating delimitter */
347                         if (ch == kind)
348                                 return emit_buffer(lex, off, type, NULL);
349 
350                         uc_vector_push(&lex->buffer, ch);
351                 }
352         }
353 
354         // FIXME
355         lex->state = UC_LEX_EOF;
356 
357         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
358 }
359 
360 
/*
 * Parses a regexp literal from the lexer input.
 *
 * Returns the token produced for the literal. On success this is a
 * TK_REGEXP token whose string value consists of one flag byte followed
 * by the pattern source; otherwise it is the TK_ERROR token reported
 * while reading the pattern.
 *
 * Error tokens:
 *  "Unterminated string"       Input ended before the closing '/'
 */
373 
/* Phase constants for regexp literal parsing, numbered consecutively from zero. */
enum {
	UC_LEX_PARSE_REGEX_INIT = 0,
	UC_LEX_PARSE_REGEX_PATTERN = 1,
	UC_LEX_PARSE_REGEX_FLAGS = 2
};
379 
380 static uc_token_t *
381 parse_regexp(uc_lexer_t *lex)
382 {
383         bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
384         uc_token_t *rv;
385         size_t len;
386         char *s;
387 
388         rv = parse_string(lex, '/');
389 
390         if (rv->type == TK_REGEXP) {
391                 while (true) {
392                         if (check_char(lex, 'g'))
393                                 is_reg_global = true;
394                         else if (check_char(lex, 'i'))
395                                 is_reg_icase = true;
396                         else if (check_char(lex, 's'))
397                                 is_reg_newline = true;
398                         else
399                                 break;
400                 }
401 
402                 len = xasprintf(&s, "%c%*s",
403                         (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
404                         ucv_string_length(rv->uv),
405                         ucv_string_get(rv->uv));
406 
407                 ucv_free(rv->uv, false);
408                 rv->uv = ucv_string_new_length(s, len);
409                 free(s);
410         }
411 
412         return rv;
413 }
414 
415 
/*
 * Parses a label from the lexer input.
 *
 * Returns the token produced for the label: the matching keyword token
 * when the label is a reserved word and keyword recognition is enabled,
 * otherwise a TK_LABEL token carrying the label text.
 */
425 
426 static uc_token_t *
427 parse_label(uc_lexer_t *lex, int ch)
428 {
429         const struct keyword *word;
430         size_t i, len;
431 
432         while (true) {
433                 uc_vector_push(&lex->buffer, ch);
434                 ch = lookahead_char(lex);
435 
436                 if (!isalnum(ch) && ch != '_')
437                         break;
438 
439                 next_char(lex);
440         }
441 
442         len = lex->buffer.count;
443 
444         if (!lex->no_keyword) {
445                 for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
446                         if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) {
447                                 uc_vector_clear(&lex->buffer);
448 
449                                 return emit_op(lex, -len, word->type, NULL);
450                         }
451                 }
452         }
453 
454         return emit_buffer(lex, -len, TK_LABEL, NULL);
455 }
456 
457 
/*
 * Parses a number literal from the lexer input.
 *
 * Returns the token produced for the literal: TK_NUMBER for integer
 * values and TK_DOUBLE for floating point values.
 *
 * Error tokens:
 *  "Invalid number literal"    The collected characters do not form a number
 */
467 
468 static inline bool
469 is_numeric_char(uc_lexer_t *lex, char c)
470 {
471         char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0;
472 
473         switch (c|32) {
474         case '.':
475         case '':
476         case '1':
477         case '2':
478         case '3':
479         case '4':
480         case '5':
481         case '6':
482         case '7':
483         case '8':
484         case '9':
485                 return true;
486 
487         case 'a':
488         case 'b':
489         case 'c':
490         case 'd':
491         case 'e':
492         case 'f':
493         case 'o':
494         case 'x':
495                 /* require previous char, a number literal cannot start with these */
496                 return prev != 0;
497 
498         case '+':
499         case '-':
500                 /* sign is only allowed after an exponent char */
501                 return (prev|32) == 'e';
502         }
503 
504         return false;
505 }
506 
507 static uc_token_t *
508 parse_number(uc_lexer_t *lex, int ch)
509 {
510         uc_value_t *nv = NULL;
511         size_t len;
512         char *e;
513 
514         while (true) {
515                 uc_vector_push(&lex->buffer, ch);
516                 ch = lookahead_char(lex);
517 
518                 if (!is_numeric_char(lex, ch))
519                         break;
520 
521                 next_char(lex);
522         }
523 
524         len = lex->buffer.count;
525 
526         uc_vector_push(&lex->buffer, '\0');
527 
528         nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e);
529 
530         uc_vector_clear(&lex->buffer);
531 
532         switch (ucv_type(nv)) {
533         case UC_DOUBLE:
534                 return emit_op(lex, -len, TK_DOUBLE, nv);
535 
536         case UC_INTEGER:
537                 return emit_op(lex, -len, TK_NUMBER, nv);
538 
539         default:
540                 return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal"));
541         }
542 }
543 
544 static uc_token_t *
545 lex_find_token(uc_lexer_t *lex)
546 {
547         bool tpl = !(lex->config && lex->config->raw_mode);
548         int ch = next_char(lex);
549 
550         while (isspace(ch))
551                 ch = next_char(lex);
552 
553         switch (ch) {
554         case '~':
555                 return emit_op(lex, -1, TK_COMPL, NULL);
556 
557         case '}':
558                 if (tpl && check_char(lex, '}'))
559                         return emit_op(lex, -2, TK_REXP, NULL);
560 
561                 return emit_op(lex, -1, TK_RBRACE, NULL);
562 
563         case '|':
564                 if (check_char(lex, '|')) {
565                         if (check_char(lex, '='))
566                                 return emit_op(lex, -3, TK_ASOR, NULL);
567 
568                         return emit_op(lex, -2, TK_OR, NULL);
569                 }
570 
571                 if (check_char(lex, '='))
572                         return emit_op(lex, -2, TK_ASBOR, NULL);
573 
574                 return emit_op(lex, -1, TK_BOR, NULL);
575 
576         case '{':
577                 if (tpl && check_char(lex, '{'))
578                         return emit_op(lex, -2, TK_LEXP, NULL);
579 
580                 if (tpl && check_char(lex, '%'))
581                         return emit_op(lex, -2, TK_LSTM, NULL);
582 
583                 return emit_op(lex, -1, TK_LBRACE, NULL);
584 
585         case '^':
586                 if (check_char(lex, '='))
587                         return emit_op(lex, -2, TK_ASBXOR, NULL);
588 
589                 return emit_op(lex, -1, TK_BXOR, NULL);
590 
591         case '[':
592                 return emit_op(lex, -1, TK_LBRACK, NULL);
593 
594         case ']':
595                 return emit_op(lex, -1, TK_RBRACK, NULL);
596 
597         case '?':
598                 if (check_char(lex, '?')) {
599                         if (check_char(lex, '='))
600                                 return emit_op(lex, -3, TK_ASNULLISH, NULL);
601 
602                         return emit_op(lex, -2, TK_NULLISH, NULL);
603                 }
604 
605                 if (check_char(lex, '.')) {
606                         if (check_char(lex, '['))
607                                 return emit_op(lex, -3, TK_QLBRACK, NULL);
608 
609                         if (check_char(lex, '('))
610                                 return emit_op(lex, -3, TK_QLPAREN, NULL);
611 
612                         return emit_op(lex, -2, TK_QDOT, NULL);
613                 }
614 
615                 return emit_op(lex, lex->source->off, TK_QMARK, NULL);
616 
617         case '>':
618                 if (check_char(lex, '>')) {
619                         if (check_char(lex, '='))
620                                 return emit_op(lex, -3, TK_ASRIGHT, NULL);
621 
622                         return emit_op(lex, -2, TK_RSHIFT, NULL);
623                 }
624 
625                 if (check_char(lex, '='))
626                         return emit_op(lex, -2, TK_GE, NULL);
627 
628                 return emit_op(lex, -1, TK_GT, NULL);
629 
630         case '=':
631                 if (check_char(lex, '=')) {
632                         if (check_char(lex, '='))
633                                 return emit_op(lex, -3, TK_EQS, NULL);
634 
635                         return emit_op(lex, -2, TK_EQ, NULL);
636                 }
637 
638                 if (check_char(lex, '>'))
639                         return emit_op(lex, -2, TK_ARROW, NULL);
640 
641                 return emit_op(lex, -1, TK_ASSIGN, NULL);
642 
643         case '<':
644                 if (check_char(lex, '<')) {
645                         if (check_char(lex, '='))
646                                 return emit_op(lex, -3, TK_ASLEFT, NULL);
647 
648                         return emit_op(lex, -2, TK_LSHIFT, NULL);
649                 }
650 
651                 if (check_char(lex, '='))
652                         return emit_op(lex, -2, TK_LE, NULL);
653 
654                 return emit_op(lex, -1, TK_LT, NULL);
655 
656         case ';':
657                 return emit_op(lex, -1, TK_SCOL, NULL);
658 
659         case ':':
660                 return emit_op(lex, -1, TK_COLON, NULL);
661 
662         case '/':
663                 ch = lookahead_char(lex);
664                 lex->lastoff = lex->source->off - 1;
665 
666                 if (ch == '/' || ch == '*')
667                         return parse_comment(lex, ch);
668 
669                 if (lex->no_regexp) {
670                         if (check_char(lex, '='))
671                                 return emit_op(lex, -2, TK_ASDIV, NULL);
672 
673                         return emit_op(lex, -1, TK_DIV, NULL);
674                 }
675 
676                 return parse_regexp(lex);
677 
678         case '.':
679                 if (check_char(lex, '.')) {
680                         if (check_char(lex, '.'))
681                                 return emit_op(lex, -3, TK_ELLIP, NULL);
682 
683                         /* The sequence ".." cannot be a valid */
684                         return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character"));
685                 }
686 
687                 return emit_op(lex, -1, TK_DOT, NULL);
688 
689         case '-':
690                 if (tpl && check_char(lex, '}')) {
691                         if (check_char(lex, '}')) {
692                                 lex->modifier = MINUS;
693 
694                                 return emit_op(lex, -3, TK_REXP, NULL);
695                         }
696 
697                         /* The sequence "-}" cannot be a valid */
698                         return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
699                 }
700 
701                 if (tpl && check_char(lex, '%')) {
702                         if (check_char(lex, '}')) {
703                                 lex->modifier = MINUS;
704 
705                                 return emit_op(lex, -3, TK_RSTM, NULL);
706                         }
707 
708                         /* The sequence "-%" cannot be a valid */
709                         return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
710                 }
711 
712                 if (check_char(lex, '='))
713                         return emit_op(lex, -2, TK_ASSUB, NULL);
714 
715                 if (check_char(lex, '-'))
716                         return emit_op(lex, -2, TK_DEC, NULL);
717 
718                 return emit_op(lex, -1, TK_SUB, NULL);
719 
720         case ',':
721                 return emit_op(lex, -1, TK_COMMA, NULL);
722 
723         case '+':
724                 if (check_char(lex, '='))
725                         return emit_op(lex, -2, TK_ASADD, NULL);
726 
727                 if (check_char(lex, '+'))
728                         return emit_op(lex, -2, TK_INC, NULL);
729 
730                 return emit_op(lex, -1, TK_ADD, NULL);
731 
732         case '*':
733                 if (check_char(lex, '*')) {
734                         if (check_char(lex, '='))
735                                 return emit_op(lex, -3, TK_ASEXP, NULL);
736 
737                         return emit_op(lex, -2, TK_EXP, NULL);
738                 }
739 
740                 if (check_char(lex, '='))
741                         return emit_op(lex, -2, TK_ASMUL, NULL);
742 
743                 return emit_op(lex, -1, TK_MUL, NULL);
744 
745         case '(':
746                 return emit_op(lex, -1, TK_LPAREN, NULL);
747 
748         case ')':
749                 return emit_op(lex, -1, TK_RPAREN, NULL);
750 
751         case '\'':
752         case '"':
753         case '`':
754                 lex->lastoff = lex->source->off - 1;
755 
756                 return parse_string(lex, ch);
757 
758         case '&':
759                 if (check_char(lex, '&')) {
760                         if (check_char(lex, '='))
761                                 return emit_op(lex, -3, TK_ASAND, NULL);
762 
763                         return emit_op(lex, -2, TK_AND, NULL);
764                 }
765 
766                 if (check_char(lex, '='))
767                         return emit_op(lex, -2, TK_ASBAND, NULL);
768 
769                 return emit_op(lex, -1, TK_BAND, NULL);
770 
771         case '%':
772                 if (tpl && check_char(lex, '}'))
773                         return emit_op(lex, -2, TK_RSTM, NULL);
774 
775                 if (check_char(lex, '='))
776                         return emit_op(lex, -2, TK_ASMOD, NULL);
777 
778                 return emit_op(lex, -1, TK_MOD, NULL);
779 
780         case '!':
781                 if (check_char(lex, '=')) {
782                         if (check_char(lex, '='))
783                                 return emit_op(lex, -3, TK_NES, NULL);
784 
785                         return emit_op(lex, -2, TK_NE, NULL);
786                 }
787 
788                 return emit_op(lex, -1, TK_NOT, NULL);
789 
790         case EOF:
791                 return emit_op(lex, -1, TK_EOF, NULL);
792 
793         default:
794                 if (isalpha(ch) || ch == '_')
795                         return parse_label(lex, ch);
796 
797                 if (isdigit(ch))
798                         return parse_number(lex, ch);
799 
800                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
801         }
802 }
803 
804 static uc_token_t *
805 lex_step(uc_lexer_t *lex)
806 {
807         const char *strip = NULL;
808         uc_token_t *tok;
809         size_t *nest;
810         int ch;
811 
812         while (lex->state != UC_LEX_EOF) {
813                 switch (lex->state) {
814                 case UC_LEX_IDENTIFY_BLOCK:
815                         ch = next_char(lex);
816 
817                         /* previous block had strip trailing whitespace flag, skip leading whitespace */
818                         if (lex->modifier == MINUS) {
819                                 while (isspace(ch))
820                                         ch = next_char(lex);
821 
822                                 lex->modifier = UNSPEC;
823                         }
824 
825                         /* previous block was a statement block and trim_blocks is enabled, skip leading newline */
826                         else if (lex->modifier == NEWLINE) {
827                                 if (ch == '\n')
828                                         ch = next_char(lex);
829 
830                                 lex->modifier = UNSPEC;
831                         }
832 
833                         /* scan forward through buffer to identify block start token */
834                         while (ch != EOF) {
835                                 if (ch == '{') {
836                                         ch = next_char(lex);
837 
838                                         switch (ch) {
839                                         /* found start of comment block */
840                                         case '#':
841                                                 lex->state = UC_LEX_BLOCK_COMMENT;
842                                                 lex->block = COMMENT;
843 
844                                                 if (check_char(lex, '-'))
845                                                         strip = " \n\t\v\f\r";
846 
847                                                 break;
848 
849                                         /* found start of expression block */
850                                         case '{':
851                                                 lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
852 
853                                                 if (check_char(lex, '-'))
854                                                         strip = " \n\t\v\f\r";
855 
856                                                 break;
857 
858                                         /* found start of statement block */
859                                         case '%':
860                                                 lex->state = UC_LEX_IDENTIFY_TOKEN;
861                                                 lex->block = STATEMENTS;
862 
863                                                 if (check_char(lex, '-'))
864                                                         strip = " \n\t\v\f\r";
865                                                 else if (check_char(lex, '+'))
866                                                         strip = NULL;
867                                                 else if (lex->config && lex->config->lstrip_blocks)
868                                                         strip = " \t\v\f\r";
869 
870                                                 break;
871 
872                                         default:
873                                                 /* not a start tag, remember char and move on */
874                                                 uc_vector_push(&lex->buffer, '{');
875                                                 continue;
876                                         }
877 
878                                         break;
879                                 }
880 
881                                 uc_vector_push(&lex->buffer, ch);
882                                 ch = next_char(lex);
883                         }
884 
885                         if (ch == EOF)
886                                 lex->state = UC_LEX_EOF;
887 
888                         /* push out leading text */
889                         tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip);
890                         lex->lastoff = lex->source->off - 2;
891 
892                         if (!tok)
893                                 continue;
894 
895                         return tok;
896 
897 
898                 case UC_LEX_BLOCK_COMMENT:
899                         ch = next_char(lex);
900 
901                         /* scan forward through buffer to identify end token */
902                         while (ch != EOF) {
903                                 if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) {
904                                         lex->modifier = MINUS;
905                                         break;
906                                 }
907 
908                                 if (ch == '#' && check_char(lex, '}'))
909                                         break;
910 
911                                 ch = next_char(lex);
912                         }
913 
914                         if (ch == EOF) {
915                                 lex->state = UC_LEX_EOF;
916 
917                                 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
918                         }
919 
920                         lex->lastoff = lex->source->off;
921                         lex->state = UC_LEX_IDENTIFY_BLOCK;
922 
923                         continue;
924 
925 
926                 case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
927                         lex->state = UC_LEX_IDENTIFY_TOKEN;
928                         lex->block = EXPRESSION;
929 
930                         return emit_op(lex, lex->source->off, TK_LEXP, NULL);
931 
932 
933                 case UC_LEX_IDENTIFY_TOKEN:
934                         do { tok = lex_find_token(lex); } while (tok == NULL);
935 
936                         /* disallow nesting blocks */
937                         if (tok->type == TK_LSTM || tok->type == TK_LEXP)
938                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
939 
940                         /* found end of statement block */
941                         if (lex->block == STATEMENTS && tok->type == TK_RSTM) {
942                                 /* strip newline after statement block? */
943                                 if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks)
944                                         lex->modifier = NEWLINE;
945 
946                                 lex->lastoff = lex->source->off;
947                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
948                                 lex->block = NONE;
949 
950                                 tok = emit_op(lex, -2, TK_SCOL, NULL);
951                         }
952 
953                         /* found end of expression block */
954                         else if (lex->block == EXPRESSION && tok->type == TK_REXP) {
955                                 lex->lastoff = lex->source->off;
956                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
957                                 lex->block = NONE;
958                         }
959 
960                         /* track opening braces */
961                         else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
962                                 nest = uc_vector_last(&lex->templates);
963                                 (*nest)++;
964                         }
965 
966                         /* check end of placeholder expression */
967                         else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
968                                 nest = uc_vector_last(&lex->templates);
969 
970                                 if (*nest == 0) {
971                                         lex->templates.count--;
972                                         lex->state = UC_LEX_PLACEHOLDER_END;
973                                 }
974                                 else {
975                                         (*nest)--;
976                                 }
977                         }
978 
979                         /* premature EOF? */
980                         else if (tok->type == TK_EOF && lex->block != STATEMENTS) {
981                                 lex->state = UC_LEX_EOF;
982 
983                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block"));
984                         }
985 
986                         return tok;
987 
988 
989                 case UC_LEX_PLACEHOLDER_START:
990                         lex->state = UC_LEX_IDENTIFY_TOKEN;
991 
992                         uc_vector_push(&lex->templates, 0);
993 
994                         return emit_op(lex, -2, TK_PLACEH, NULL);
995 
996 
997                 case UC_LEX_PLACEHOLDER_END:
998                         lex->state = UC_LEX_IDENTIFY_TOKEN;
999 
1000                         return parse_string(lex, '`');
1001 
1002 
1003                 case UC_LEX_EOF:
1004                         break;
1005                 }
1006         }
1007 
1008         return emit_op(lex, lex->source->off, TK_EOF, NULL);
1009 }
1010 
1011 void
1012 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
1013 {
1014         lex->state = UC_LEX_IDENTIFY_BLOCK;
1015 
1016         lex->config = config;
1017         lex->source = uc_source_get(source);
1018 
1019         lex->block = NONE;
1020         lex->modifier = UNSPEC;
1021 
1022         lex->rlen = 0;
1023         lex->rpos = 0;
1024         lex->rbuf = NULL;
1025 
1026         lex->buffer.count = 0;
1027         lex->buffer.entries = NULL;
1028 
1029         lex->lead_surrogate = 0;
1030 
1031         lex->lastoff = 0;
1032 
1033         lex->templates.count = 0;
1034         lex->templates.entries = NULL;
1035 
1036         if (config && config->raw_mode) {
1037                 lex->state = UC_LEX_IDENTIFY_TOKEN;
1038                 lex->block = STATEMENTS;
1039         }
1040 }
1041 
1042 void
1043 uc_lexer_free(uc_lexer_t *lex)
1044 {
1045         uc_vector_clear(&lex->buffer);
1046         uc_vector_clear(&lex->templates);
1047 
1048         uc_source_put(lex->source);
1049 
1050         free(lex->rbuf);
1051 }
1052 
1053 uc_token_t *
1054 uc_lexer_next_token(uc_lexer_t *lex)
1055 {
1056         uc_token_t *rv = NULL;
1057 
1058         rv = lex_step(lex);
1059 
1060         lex->no_keyword = false;
1061         lex->no_regexp = false;
1062 
1063         return rv;
1064 }
1065 
1066 const char *
1067 uc_tokenname(unsigned type)
1068 {
1069         static char buf[sizeof("'endfunction'")];
1070         const char *tokennames[] = {
1071                 [TK_LEXP] = "'{{'",
1072                 [TK_REXP] = "'}}'",
1073                 [TK_LSTM] = "'{%'",
1074                 [TK_RSTM] = "'%}'",
1075                 [TK_COMMA] = "','",
1076                 [TK_ASSIGN] = "'='",
1077                 [TK_ASADD] = "'+='",
1078                 [TK_ASSUB] = "'-='",
1079                 [TK_ASMUL] = "'*='",
1080                 [TK_ASDIV] = "'/='",
1081                 [TK_ASMOD] = "'%='",
1082                 [TK_ASLEFT] = "'<<='",
1083                 [TK_ASRIGHT] = "'>>='",
1084                 [TK_ASBAND] = "'&='",
1085                 [TK_ASBXOR] = "'^='",
1086                 [TK_ASBOR] = "'|='",
1087                 [TK_QMARK] = "'?'",
1088                 [TK_COLON] = "':'",
1089                 [TK_OR] = "'||'",
1090                 [TK_AND] = "'&&'",
1091                 [TK_BOR] = "'|'",
1092                 [TK_BXOR] = "'^'",
1093                 [TK_BAND] = "'&'",
1094                 [TK_EQS] = "'==='",
1095                 [TK_NES] = "'!=='",
1096                 [TK_EQ] = "'=='",
1097                 [TK_NE] = "'!='",
1098                 [TK_LT] = "'<'",
1099                 [TK_LE] = "'<='",
1100                 [TK_GT] = "'>'",
1101                 [TK_GE] = "'>='",
1102                 [TK_LSHIFT] = "'<<'",
1103                 [TK_RSHIFT] = "'>>'",
1104                 [TK_ADD] = "'+'",
1105                 [TK_SUB] = "'-'",
1106                 [TK_MUL] = "'*'",
1107                 [TK_DIV] = "'/'",
1108                 [TK_MOD] = "'%'",
1109                 [TK_EXP] = "'**'",
1110                 [TK_NOT] = "'!'",
1111                 [TK_COMPL] = "'~'",
1112                 [TK_INC] = "'++'",
1113                 [TK_DEC] = "'--'",
1114                 [TK_DOT] = "'.'",
1115                 [TK_LBRACK] = "'['",
1116                 [TK_RBRACK] = "']'",
1117                 [TK_LPAREN] = "'('",
1118                 [TK_RPAREN] = "')'",
1119                 [TK_LBRACE] = "'{'",
1120                 [TK_RBRACE] = "'}'",
1121                 [TK_SCOL] = "';'",
1122                 [TK_ELLIP] = "'...'",
1123                 [TK_ARROW] = "'=>'",
1124                 [TK_QLBRACK] = "'?.['",
1125                 [TK_QLPAREN] = "'?.('",
1126                 [TK_QDOT] = "'?.'",
1127                 [TK_ASEXP] = "'**='",
1128                 [TK_ASAND] = "'&&='",
1129                 [TK_ASOR] = "'||='",
1130                 [TK_ASNULLISH] = "'\?\?='",
1131                 [TK_NULLISH] = "'\?\?'",
1132                 [TK_PLACEH] = "'${'",
1133 
1134                 [TK_TEXT] = "Text",
1135                 [TK_LABEL] = "Label",
1136                 [TK_NUMBER] = "Number",
1137                 [TK_DOUBLE] = "Double",
1138                 [TK_STRING] = "String",
1139                 [TK_REGEXP] = "Regexp",
1140                 [TK_TEMPLATE] = "Template",
1141                 [TK_ERROR] = "Error",
1142                 [TK_EOF] = "End of file",
1143         };
1144 
1145         size_t i;
1146 
1147         for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
1148                 if (reserved_words[i].type != type)
1149                         continue;
1150 
1151                 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat);
1152 
1153                 return buf;
1154         }
1155 
1156         return tokennames[type] ? tokennames[type] : "?";
1157 }
1158 
1159 bool
1160 uc_lexer_is_keyword(uc_value_t *label)
1161 {
1162         size_t i;
1163 
1164         if (ucv_type(label) != UC_STRING)
1165                 return false;
1166 
1167         for (i = 0; i < ARRAY_SIZE(reserved_words); i++)
1168                 if (!strcmp(reserved_words[i].pat, ucv_string_get(label)))
1169                         return true;
1170 
1171         return false;
1172 }
1173 
1174 #endif /* NO_COMPILE */
1175 
1176 /*
1177  * Stores the given codepoint as a utf8 multibyte sequence into the given
1178  * output buffer and subtracts the required amount of bytes from the given
1179  * length pointer.
1180  *
1181  * Returns false if the multibyte sequence would not fit into the buffer,
1182  * otherwise true.
1183  */
1184 
bool
utf8enc(char **out, int *rem, int code)
{
        /* one row per sequence length, ordered by ascending codepoint limit */
        static const struct {
                int limit;      /* highest codepoint encoded with this length */
                int length;     /* number of bytes in the sequence */
                int lead_mask;  /* payload bits carried by the first byte */
                int lead_mark;  /* length marker bits of the first byte */
        } classes[] = {
                { 0x7F,     1, 0x7F, 0x00 },
                { 0x7FF,    2, 0x1F, 0xC0 },
                { 0xFFFF,   3, 0x0F, 0xE0 },
                { 0x10FFFF, 4, 0x07, 0xF0 },
        };

        size_t k;
        int shift;

        /* negative values are not encoded; nothing is written, success is reported */
        if (code < 0)
                return true;

        for (k = 0; k < sizeof(classes) / sizeof(classes[0]); k++) {
                if (code > classes[k].limit)
                        continue;

                /* leave buffer and counter untouched when the sequence does not fit */
                if (*rem < classes[k].length)
                        return false;

                /* first byte: marker bits plus the topmost payload bits */
                shift = 6 * (classes[k].length - 1);
                *(*out)++ = ((code >> shift) & classes[k].lead_mask) | classes[k].lead_mark;

                /* continuation bytes: six payload bits each, most significant first */
                while (shift > 0) {
                        shift -= 6;
                        *(*out)++ = ((code >> shift) & 0x3F) | 0x80;
                }

                *rem -= classes[k].length;

                return true;
        }

        /* values above 0x10FFFF: nothing is written, success is reported */
        return true;
}
1229 

This page was automatically generated by LXR 0.3.1.  •  OpenWrt