• source navigation  • diff markup  • identifier search  • freetext search  • 

Sources/ucode/lexer.c

  1 /*
  2  * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io>
  3  *
  4  * Permission to use, copy, modify, and/or distribute this software for any
  5  * purpose with or without fee is hereby granted, provided that the above
  6  * copyright notice and this permission notice appear in all copies.
  7  *
  8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15  */
 16 
 17 #include <stdio.h>
 18 
 19 #include <stdbool.h>
 20 #include <stdlib.h>
 21 #include <string.h>
 22 #include <ctype.h>
 23 #include <regex.h>
 24 #include <math.h>
 25 #include <errno.h>
 26 #include <endian.h>
 27 
 28 #include "ucode/vm.h"
 29 #include "ucode/lib.h"
 30 #include "ucode/lexer.h"
 31 
 32 struct keyword {
 33         unsigned type;
 34         const char *pat;
 35         unsigned plen;
 36 };
 37 
 38 #define dec(o) \
 39         ((o) - '')
 40 
 41 #define hex(x) \
 42         (((x) >= 'a') ? (10 + (x) - 'a') : \
 43                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 44 
 45 #ifndef NO_COMPILE
 46 
 47 static const struct keyword reserved_words[] = {
 48         { TK_ENDFUNC,   "endfunction", 11 },
 49         { TK_CONTINUE,  "continue", 8 },
 50         { TK_ENDWHILE,  "endwhile", 8 },
 51         { TK_FUNC,              "function", 8 },
 52         { TK_DEFAULT,   "default", 7 },
 53         { TK_DELETE,    "delete", 6 },
 54         { TK_RETURN,    "return", 6 },
 55         { TK_ENDFOR,    "endfor", 6 },
 56         { TK_SWITCH,    "switch", 6 },
 57         { TK_IMPORT,    "import", 6 },
 58         { TK_EXPORT,    "export", 6 },
 59         { TK_ENDIF,             "endif", 5 },
 60         { TK_WHILE,             "while", 5 },
 61         { TK_BREAK,             "break", 5 },
 62         { TK_CATCH,             "catch", 5 },
 63         { TK_CONST,             "const", 5 },
 64         { TK_FALSE,             "false", 5 },
 65         { TK_TRUE,              "true",  4 },
 66         { TK_ELIF,              "elif",  4 },
 67         { TK_ELSE,              "else",  4 },
 68         { TK_THIS,              "this",  4 },
 69         { TK_NULL,              "null",  4 },
 70         { TK_CASE,              "case",  4 },
 71         { TK_FROM,              "from",  4 },
 72         { TK_TRY,               "try",   3 },
 73         { TK_FOR,               "for",   3 },
 74         { TK_LOCAL,             "let",   3 },
 75         { TK_IF,                "if",    2 },
 76         { TK_IN,                "in",    2 },
 77         { TK_AS,                "as",    2 },
 78 };
 79 
 80 
 81 static int
 82 fill_buf(uc_lexer_t *lex) {
 83         lex->rbuf = xrealloc(lex->rbuf, 128);
 84         lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp);
 85         lex->rpos = 0;
 86 
 87         if (!lex->rlen)
 88                 return EOF;
 89 
 90         lex->rpos++;
 91 
 92         return (int)lex->rbuf[0];
 93 }
 94 
 95 static int
 96 update_line(uc_lexer_t *lex, int ch) {
 97         if (ch == '\n' || ch == EOF)
 98                 uc_source_line_next(lex->source);
 99         else
100                 uc_source_line_update(lex->source, 1);
101 
102         lex->source->off++;
103 
104         return ch;
105 }
106 
107 static int
108 lookahead_char(uc_lexer_t *lex) {
109         int c;
110 
111         if (lex->rpos < lex->rlen)
112                 return (int)lex->rbuf[lex->rpos];
113 
114         c = fill_buf(lex);
115         lex->rpos = 0;
116 
117         return c;
118 }
119 
120 static bool
121 check_char(uc_lexer_t *lex, int ch) {
122         if (lookahead_char(lex) != ch)
123                 return false;
124 
125         lex->rpos++;
126 
127         update_line(lex, ch);
128 
129         return true;
130 }
131 
132 static int
133 next_char(uc_lexer_t *lex) {
134         int ch = (lex->rpos < lex->rlen) ? (int)lex->rbuf[lex->rpos++] : fill_buf(lex);
135 
136         return update_line(lex, ch);
137 }
138 
139 static uc_token_t *
140 emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
141 {
142         lex->curr.type = type;
143         lex->curr.uv = uv;
144 
145         if (pos < 0)
146                 lex->curr.pos = lex->source->off + pos;
147         else
148                 lex->curr.pos = (size_t)pos;
149 
150         return &lex->curr;
151 }
152 
153 static uc_token_t *
154 emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) {
155         uc_token_t *rv = NULL;
156 
157         if (lex->buffer.count) {
158                 if (strip_trailing_chars)
159                         while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer)))
160                                 lex->buffer.count--;
161 
162                 rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count));
163 
164                 uc_vector_clear(&lex->buffer);
165         }
166         else if (type != TK_TEXT) {
167                 rv = emit_op(lex, pos, type, ucv_string_new_length("", 0));
168         }
169 
170         return rv;
171 }
172 
173 
174 static uc_token_t *
175 parse_comment(uc_lexer_t *lex, int kind)
176 {
177         int ch;
178 
179         while (true) {
180                 ch = next_char(lex);
181 
182                 if (kind == '/' && (ch == '\n' || ch == EOF))
183                         break;
184 
185                 if (kind == '*' && ch == '*' && check_char(lex, '/'))
186                         break;
187 
188                 if (ch == EOF) {
189                         lex->state = UC_LEX_EOF;
190 
191                         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
192                 }
193         }
194 
195         return NULL;
196 }
197 
198 static void
199 append_utf8(uc_lexer_t *lex, int code) {
200         char ustr[8], *up;
201         int rem;
202 
203         up = ustr;
204         rem = sizeof(ustr);
205 
206         if (utf8enc(&up, &rem, code))
207                 for (up = ustr; rem < (int)sizeof(ustr); rem++)
208                         uc_vector_push(&lex->buffer, *up++);
209 }
210 
211 static uc_token_t *
212 parse_escape(uc_lexer_t *lex, const char *regex_macros)
213 {
214         int code, ch, i;
215         const char *p;
216 
217         /* unicode escape sequence */
218         if (check_char(lex, 'u')) {
219                 for (i = 0, code = 0; i < 4; i++) {
220                         ch = next_char(lex);
221 
222                         if (!isxdigit(ch))
223                                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
224 
225                         code = code * 16 + hex(ch);
226                 }
227 
228                 /* is a leading surrogate value */
229                 if ((code & 0xFC00) == 0xD800) {
230                         /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
231                         if (lex->lead_surrogate)
232                                 append_utf8(lex, 0xFFFD);
233 
234                         /* store surrogate value and advance to next escape sequence */
235                         lex->lead_surrogate = code;
236                 }
237 
238                 /* is a trailing surrogate value */
239                 else if ((code & 0xFC00) == 0xDC00) {
240                         /* found a trailing surrogate following a leading one, combine and encode */
241                         if (lex->lead_surrogate) {
242                                 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
243                                 lex->lead_surrogate = 0;
244                         }
245 
246                         /* trailing surrogate not following a leading one, ignore and use replacement char */
247                         else {
248                                 code = 0xFFFD;
249                         }
250 
251                         append_utf8(lex, code);
252                 }
253 
254                 /* is a normal codepoint */
255                 else {
256                         append_utf8(lex, code);
257                 }
258         }
259 
260         /* hex escape sequence */
261         else if (check_char(lex, 'x')) {
262                 for (i = 0, code = 0; i < 2; i++) {
263                         ch = next_char(lex);
264 
265                         if (!isxdigit(ch))
266                                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
267 
268                         code = code * 16 + hex(ch);
269                 }
270 
271                 append_utf8(lex, code);
272         }
273 
274         /* octal or letter */
275         else {
276                 /* try to parse octal sequence... */
277                 for (i = 0, code = 0, ch = lookahead_char(lex);
278                      i < 3 && ch >= '' && ch <= '7';
279                      i++, next_char(lex), ch = lookahead_char(lex)) {
280                         code = code * 8 + dec(ch);
281                 }
282 
283                 if (i) {
284                         if (code > 255)
285                                 return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
286 
287                         append_utf8(lex, code);
288                 }
289 
290                 /* ... no octal sequence, handle potential regex macros */
291                 else if (strchr(regex_macros, ch)) {
292                         ch = next_char(lex);
293 
294                         switch (ch) {
295                         case 'd': p = "[[:digit:]]";   break;
296                         case 'D': p = "[^[:digit:]]";  break;
297                         case 'w': p = "[[:alnum:]_]";  break;
298                         case 'W': p = "[^[:alnum:]_]"; break;
299                         case 's': p = "[[:space:]]";   break;
300                         case 'S': p = "[^[:space:]]";  break;
301                         default:  p = NULL;
302                         }
303 
304                         if (p) {
305                                 while (*p)
306                                         uc_vector_push(&lex->buffer, *p++);
307                         }
308                         else {
309                                 uc_vector_push(&lex->buffer, '\\');
310                                 uc_vector_push(&lex->buffer, ch);
311                         }
312                 }
313 
314                 /* ... handle other escape */
315                 else {
316                         ch = next_char(lex);
317 
318                         switch (ch) {
319                         case 'a': uc_vector_push(&lex->buffer, '\a'); break;
320                         case 'b': uc_vector_push(&lex->buffer, '\b'); break;
321                         case 'e': uc_vector_push(&lex->buffer, '\033'); break;
322                         case 'f': uc_vector_push(&lex->buffer, '\f'); break;
323                         case 'n': uc_vector_push(&lex->buffer, '\n'); break;
324                         case 'r': uc_vector_push(&lex->buffer, '\r'); break;
325                         case 't': uc_vector_push(&lex->buffer, '\t'); break;
326                         case 'v': uc_vector_push(&lex->buffer, '\v'); break;
327 
328                         case EOF:
329                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
330 
331                         default:
332                                 uc_vector_push(&lex->buffer, ch);
333                         }
334                 }
335         }
336 
337         return NULL;
338 }
339 
340 static uc_token_t *
341 parse_string(uc_lexer_t *lex, int kind)
342 {
343         uc_token_t *err;
344         unsigned type;
345         int code, ch;
346         size_t off;
347 
348         if (kind == '`')
349                 type = TK_TEMPLATE;
350         else if (kind == '/')
351                 type = TK_REGEXP;
352         else
353                 type = TK_STRING;
354 
355         off = lex->source->off - 1;
356 
357         for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
358                 switch (ch) {
359                 /* placeholder */
360                 case '$':
361                         if (type == TK_TEMPLATE && check_char(lex, '{')) {
362                                 lex->state = UC_LEX_PLACEHOLDER_START;
363 
364                                 return emit_buffer(lex, off, type, NULL);
365                         }
366 
367                         uc_vector_push(&lex->buffer, '$');
368                         break;
369 
370                 /* regexp bracket expression */
371                 case '[':
372                         uc_vector_push(&lex->buffer, '[');
373 
374                         if (type == TK_REGEXP) {
375                                 /* skip leading negation (^) */
376                                 if (check_char(lex, '^'))
377                                         uc_vector_push(&lex->buffer, '^');
378 
379                                 /* skip leading `]` - it is literal and not closing the bracket expr */
380                                 if (check_char(lex, ']'))
381                                         uc_vector_push(&lex->buffer, ']');
382 
383                                 /* read until closing `]` */
384                                 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
385                                         if (ch == '\\') {
386                                                 err = parse_escape(lex, "^");
387 
388                                                 if (err)
389                                                         return err;
390 
391                                                 continue;
392                                         }
393 
394                                         uc_vector_push(&lex->buffer, ch);
395 
396                                         if (ch == ']')
397                                                 break;
398 
399                                         /* skip nested char classes / equivalence classes / collating chars */
400                                         if (ch == '[') {
401                                                 code = lookahead_char(lex);
402 
403                                                 if (code == ':' || code == '.' || code == '=') {
404                                                         uc_vector_push(&lex->buffer, code);
405                                                         next_char(lex);
406 
407                                                         for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
408                                                                 if (ch == '\\') {
409                                                                         err = parse_escape(lex, "");
410 
411                                                                         if (err)
412                                                                                 return err;
413 
414                                                                         continue;
415                                                                 }
416 
417                                                                 uc_vector_push(&lex->buffer, ch);
418 
419                                                                 if (ch == code && check_char(lex, ']')) {
420                                                                         uc_vector_push(&lex->buffer, ']');
421                                                                         break;
422                                                                 }
423                                                         }
424                                                 }
425                                         }
426                                 }
427                         }
428 
429                         break;
430 
431                 /* escape sequence */
432                 case '\\':
433                         err = parse_escape(lex,
434                                 (type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : "");
435 
436                         if (err)
437                                 return err;
438 
439                         break;
440 
441                 /* other character */
442                 default:
443                         /* terminating delimitter */
444                         if (ch == kind)
445                                 return emit_buffer(lex, off, type, NULL);
446 
447                         uc_vector_push(&lex->buffer, ch);
448                 }
449         }
450 
451         // FIXME
452         lex->state = UC_LEX_EOF;
453 
454         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
455 }
456 
457 
/*
 * Parses a regexp literal from the lexer input.
 *
 * The pattern body is read by parse_string() using '/' as delimiter and
 * may be followed by any run of the flag characters 'g', 'i' and 's'.
 *
 * Returns the resulting token. For a TK_REGEXP token the string value
 * consists of one leading flag byte (bit 0 = 'g', bit 1 = 'i',
 * bit 2 = 's') followed by the pattern text. Any other token produced
 * by parse_string(), such as TK_ERROR, is returned unchanged.
 */
470 
/* Phases of regexp literal parsing. */
enum {
	UC_LEX_PARSE_REGEX_INIT    = 0,
	UC_LEX_PARSE_REGEX_PATTERN = 1,
	UC_LEX_PARSE_REGEX_FLAGS   = 2
};
476 
477 static uc_token_t *
478 parse_regexp(uc_lexer_t *lex)
479 {
480         bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
481         uc_token_t *rv;
482         size_t len;
483         char *s;
484 
485         rv = parse_string(lex, '/');
486 
487         if (rv->type == TK_REGEXP) {
488                 while (true) {
489                         if (check_char(lex, 'g'))
490                                 is_reg_global = true;
491                         else if (check_char(lex, 'i'))
492                                 is_reg_icase = true;
493                         else if (check_char(lex, 's'))
494                                 is_reg_newline = true;
495                         else
496                                 break;
497                 }
498 
499                 len = xasprintf(&s, "%c%*s",
500                         (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
501                         ucv_string_length(rv->uv),
502                         ucv_string_get(rv->uv));
503 
504                 ucv_free(rv->uv, false);
505                 rv->uv = ucv_string_new_length(s, len);
506                 free(s);
507         }
508 
509         return rv;
510 }
511 
512 
513 /*
514  * Parses a label from the given buffer.
515  *
516  * Returns a negative value on error, otherwise the amount of consumed
517  * characters from the given buffer.
518  *
519  * Error values:
520  *  -UC_ERROR_OVERLONG_STRING   Label too long
521  */
522 
523 static uc_token_t *
524 parse_label(uc_lexer_t *lex, int ch)
525 {
526         const struct keyword *word;
527         size_t i, len;
528 
529         while (true) {
530                 uc_vector_push(&lex->buffer, ch);
531                 ch = lookahead_char(lex);
532 
533                 if (!isalnum(ch) && ch != '_')
534                         break;
535 
536                 next_char(lex);
537         }
538 
539         len = lex->buffer.count;
540 
541         if (!lex->no_keyword) {
542                 for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
543                         if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) {
544                                 uc_vector_clear(&lex->buffer);
545 
546                                 return emit_op(lex, -len, word->type, NULL);
547                         }
548                 }
549         }
550 
551         return emit_buffer(lex, -len, TK_LABEL, NULL);
552 }
553 
554 
/*
 * Parses a number literal from the lexer input.
 *
 * Characters are collected for as long as is_numeric_char() accepts them;
 * the collected text is then converted with uc_number_parse_octal().
 *
 * Returns a TK_NUMBER or TK_DOUBLE token holding the parsed value, or a
 * TK_ERROR token ("Invalid number literal") if the text is not a number.
 */
564 
565 static inline bool
566 is_numeric_char(uc_lexer_t *lex, char c)
567 {
568         char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0;
569 
570         switch (c|32) {
571         case '.':
572         case '':
573         case '1':
574         case '2':
575         case '3':
576         case '4':
577         case '5':
578         case '6':
579         case '7':
580         case '8':
581         case '9':
582                 return true;
583 
584         case 'a':
585         case 'b':
586         case 'c':
587         case 'd':
588         case 'e':
589         case 'f':
590         case 'o':
591         case 'x':
592                 /* require previous char, a number literal cannot start with these */
593                 return prev != 0;
594 
595         case '+':
596         case '-':
597                 /* sign is only allowed after an exponent char */
598                 return (prev|32) == 'e';
599         }
600 
601         return false;
602 }
603 
604 static uc_token_t *
605 parse_number(uc_lexer_t *lex, int ch)
606 {
607         uc_value_t *nv = NULL;
608         size_t len;
609         char *e;
610 
611         while (true) {
612                 uc_vector_push(&lex->buffer, ch);
613                 ch = lookahead_char(lex);
614 
615                 if (!is_numeric_char(lex, ch))
616                         break;
617 
618                 next_char(lex);
619         }
620 
621         len = lex->buffer.count;
622 
623         uc_vector_push(&lex->buffer, '\0');
624 
625         nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e);
626 
627         uc_vector_clear(&lex->buffer);
628 
629         switch (ucv_type(nv)) {
630         case UC_DOUBLE:
631                 return emit_op(lex, -len, TK_DOUBLE, nv);
632 
633         case UC_INTEGER:
634                 return emit_op(lex, -len, TK_NUMBER, nv);
635 
636         default:
637                 return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal"));
638         }
639 }
640 
641 static uc_token_t *
642 lex_find_token(uc_lexer_t *lex)
643 {
644         bool tpl = !(lex->config && lex->config->raw_mode);
645         int ch = next_char(lex);
646 
647         while (isspace(ch))
648                 ch = next_char(lex);
649 
650         switch (ch) {
651         case '~':
652                 return emit_op(lex, -1, TK_COMPL, NULL);
653 
654         case '}':
655                 if (tpl && check_char(lex, '}'))
656                         return emit_op(lex, -2, TK_REXP, NULL);
657 
658                 return emit_op(lex, -1, TK_RBRACE, NULL);
659 
660         case '|':
661                 if (check_char(lex, '|')) {
662                         if (check_char(lex, '='))
663                                 return emit_op(lex, -3, TK_ASOR, NULL);
664 
665                         return emit_op(lex, -2, TK_OR, NULL);
666                 }
667 
668                 if (check_char(lex, '='))
669                         return emit_op(lex, -2, TK_ASBOR, NULL);
670 
671                 return emit_op(lex, -1, TK_BOR, NULL);
672 
673         case '{':
674                 if (tpl && check_char(lex, '{'))
675                         return emit_op(lex, -2, TK_LEXP, NULL);
676 
677                 if (tpl && check_char(lex, '%'))
678                         return emit_op(lex, -2, TK_LSTM, NULL);
679 
680                 return emit_op(lex, -1, TK_LBRACE, NULL);
681 
682         case '^':
683                 if (check_char(lex, '='))
684                         return emit_op(lex, -2, TK_ASBXOR, NULL);
685 
686                 return emit_op(lex, -1, TK_BXOR, NULL);
687 
688         case '[':
689                 return emit_op(lex, -1, TK_LBRACK, NULL);
690 
691         case ']':
692                 return emit_op(lex, -1, TK_RBRACK, NULL);
693 
694         case '?':
695                 if (check_char(lex, '?')) {
696                         if (check_char(lex, '='))
697                                 return emit_op(lex, -3, TK_ASNULLISH, NULL);
698 
699                         return emit_op(lex, -2, TK_NULLISH, NULL);
700                 }
701 
702                 if (check_char(lex, '.')) {
703                         if (check_char(lex, '['))
704                                 return emit_op(lex, -3, TK_QLBRACK, NULL);
705 
706                         if (check_char(lex, '('))
707                                 return emit_op(lex, -3, TK_QLPAREN, NULL);
708 
709                         return emit_op(lex, -2, TK_QDOT, NULL);
710                 }
711 
712                 return emit_op(lex, lex->source->off, TK_QMARK, NULL);
713 
714         case '>':
715                 if (check_char(lex, '>')) {
716                         if (check_char(lex, '='))
717                                 return emit_op(lex, -3, TK_ASRIGHT, NULL);
718 
719                         return emit_op(lex, -2, TK_RSHIFT, NULL);
720                 }
721 
722                 if (check_char(lex, '='))
723                         return emit_op(lex, -2, TK_GE, NULL);
724 
725                 return emit_op(lex, -1, TK_GT, NULL);
726 
727         case '=':
728                 if (check_char(lex, '=')) {
729                         if (check_char(lex, '='))
730                                 return emit_op(lex, -3, TK_EQS, NULL);
731 
732                         return emit_op(lex, -2, TK_EQ, NULL);
733                 }
734 
735                 if (check_char(lex, '>'))
736                         return emit_op(lex, -2, TK_ARROW, NULL);
737 
738                 return emit_op(lex, -1, TK_ASSIGN, NULL);
739 
740         case '<':
741                 if (check_char(lex, '<')) {
742                         if (check_char(lex, '='))
743                                 return emit_op(lex, -3, TK_ASLEFT, NULL);
744 
745                         return emit_op(lex, -2, TK_LSHIFT, NULL);
746                 }
747 
748                 if (check_char(lex, '='))
749                         return emit_op(lex, -2, TK_LE, NULL);
750 
751                 return emit_op(lex, -1, TK_LT, NULL);
752 
753         case ';':
754                 return emit_op(lex, -1, TK_SCOL, NULL);
755 
756         case ':':
757                 return emit_op(lex, -1, TK_COLON, NULL);
758 
759         case '/':
760                 ch = lookahead_char(lex);
761                 lex->lastoff = lex->source->off - 1;
762 
763                 if (ch == '/' || ch == '*')
764                         return parse_comment(lex, ch);
765 
766                 if (lex->no_regexp) {
767                         if (check_char(lex, '='))
768                                 return emit_op(lex, -2, TK_ASDIV, NULL);
769 
770                         return emit_op(lex, -1, TK_DIV, NULL);
771                 }
772 
773                 return parse_regexp(lex);
774 
775         case '.':
776                 if (check_char(lex, '.')) {
777                         if (check_char(lex, '.'))
778                                 return emit_op(lex, -3, TK_ELLIP, NULL);
779 
780                         /* The sequence ".." cannot be a valid */
781                         return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character"));
782                 }
783 
784                 return emit_op(lex, -1, TK_DOT, NULL);
785 
786         case '-':
787                 if (tpl && check_char(lex, '}')) {
788                         if (check_char(lex, '}')) {
789                                 lex->modifier = MINUS;
790 
791                                 return emit_op(lex, -3, TK_REXP, NULL);
792                         }
793 
794                         /* The sequence "-}" cannot be a valid */
795                         return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
796                 }
797 
798                 if (tpl && check_char(lex, '%')) {
799                         if (check_char(lex, '}')) {
800                                 lex->modifier = MINUS;
801 
802                                 return emit_op(lex, -3, TK_RSTM, NULL);
803                         }
804 
805                         /* The sequence "-%" cannot be a valid */
806                         return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
807                 }
808 
809                 if (check_char(lex, '='))
810                         return emit_op(lex, -2, TK_ASSUB, NULL);
811 
812                 if (check_char(lex, '-'))
813                         return emit_op(lex, -2, TK_DEC, NULL);
814 
815                 return emit_op(lex, -1, TK_SUB, NULL);
816 
817         case ',':
818                 return emit_op(lex, -1, TK_COMMA, NULL);
819 
820         case '+':
821                 if (check_char(lex, '='))
822                         return emit_op(lex, -2, TK_ASADD, NULL);
823 
824                 if (check_char(lex, '+'))
825                         return emit_op(lex, -2, TK_INC, NULL);
826 
827                 return emit_op(lex, -1, TK_ADD, NULL);
828 
829         case '*':
830                 if (check_char(lex, '*')) {
831                         if (check_char(lex, '='))
832                                 return emit_op(lex, -3, TK_ASEXP, NULL);
833 
834                         return emit_op(lex, -2, TK_EXP, NULL);
835                 }
836 
837                 if (check_char(lex, '='))
838                         return emit_op(lex, -2, TK_ASMUL, NULL);
839 
840                 return emit_op(lex, -1, TK_MUL, NULL);
841 
842         case '(':
843                 return emit_op(lex, -1, TK_LPAREN, NULL);
844 
845         case ')':
846                 return emit_op(lex, -1, TK_RPAREN, NULL);
847 
848         case '\'':
849         case '"':
850         case '`':
851                 lex->lastoff = lex->source->off - 1;
852 
853                 return parse_string(lex, ch);
854 
855         case '&':
856                 if (check_char(lex, '&')) {
857                         if (check_char(lex, '='))
858                                 return emit_op(lex, -3, TK_ASAND, NULL);
859 
860                         return emit_op(lex, -2, TK_AND, NULL);
861                 }
862 
863                 if (check_char(lex, '='))
864                         return emit_op(lex, -2, TK_ASBAND, NULL);
865 
866                 return emit_op(lex, -1, TK_BAND, NULL);
867 
868         case '%':
869                 if (tpl && check_char(lex, '}'))
870                         return emit_op(lex, -2, TK_RSTM, NULL);
871 
872                 if (check_char(lex, '='))
873                         return emit_op(lex, -2, TK_ASMOD, NULL);
874 
875                 return emit_op(lex, -1, TK_MOD, NULL);
876 
877         case '!':
878                 if (check_char(lex, '=')) {
879                         if (check_char(lex, '='))
880                                 return emit_op(lex, -3, TK_NES, NULL);
881 
882                         return emit_op(lex, -2, TK_NE, NULL);
883                 }
884 
885                 return emit_op(lex, -1, TK_NOT, NULL);
886 
887         case EOF:
888                 return emit_op(lex, -1, TK_EOF, NULL);
889 
890         default:
891                 if (isalpha(ch) || ch == '_')
892                         return parse_label(lex, ch);
893 
894                 if (isdigit(ch))
895                         return parse_number(lex, ch);
896 
897                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
898         }
899 }
900 
901 static uc_token_t *
902 lex_step(uc_lexer_t *lex)
903 {
904         const char *strip = NULL;
905         uc_token_t *tok;
906         size_t *nest;
907         int ch;
908 
909         while (lex->state != UC_LEX_EOF) {
910                 switch (lex->state) {
911                 case UC_LEX_IDENTIFY_BLOCK:
912                         ch = next_char(lex);
913 
914                         /* previous block had strip trailing whitespace flag, skip leading whitespace */
915                         if (lex->modifier == MINUS) {
916                                 while (isspace(ch))
917                                         ch = next_char(lex);
918 
919                                 lex->modifier = UNSPEC;
920                         }
921 
922                         /* previous block was a statement block and trim_blocks is enabled, skip leading newline */
923                         else if (lex->modifier == NEWLINE) {
924                                 if (ch == '\n')
925                                         ch = next_char(lex);
926 
927                                 lex->modifier = UNSPEC;
928                         }
929 
930                         /* scan forward through buffer to identify block start token */
931                         while (ch != EOF) {
932                                 if (ch == '{') {
933                                         ch = next_char(lex);
934 
935                                         switch (ch) {
936                                         /* found start of comment block */
937                                         case '#':
938                                                 lex->state = UC_LEX_BLOCK_COMMENT;
939                                                 lex->block = COMMENT;
940 
941                                                 if (check_char(lex, '-'))
942                                                         strip = " \n\t\v\f\r";
943 
944                                                 break;
945 
946                                         /* found start of expression block */
947                                         case '{':
948                                                 lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
949 
950                                                 if (check_char(lex, '-'))
951                                                         strip = " \n\t\v\f\r";
952 
953                                                 break;
954 
955                                         /* found start of statement block */
956                                         case '%':
957                                                 lex->state = UC_LEX_IDENTIFY_TOKEN;
958                                                 lex->block = STATEMENTS;
959 
960                                                 if (check_char(lex, '-'))
961                                                         strip = " \n\t\v\f\r";
962                                                 else if (check_char(lex, '+'))
963                                                         strip = NULL;
964                                                 else if (lex->config && lex->config->lstrip_blocks)
965                                                         strip = " \t\v\f\r";
966 
967                                                 break;
968 
969                                         default:
970                                                 /* not a start tag, remember char and move on */
971                                                 uc_vector_push(&lex->buffer, '{');
972                                                 continue;
973                                         }
974 
975                                         break;
976                                 }
977 
978                                 uc_vector_push(&lex->buffer, ch);
979                                 ch = next_char(lex);
980                         }
981 
982                         if (ch == EOF)
983                                 lex->state = UC_LEX_EOF;
984 
985                         /* push out leading text */
986                         tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip);
987                         lex->lastoff = lex->source->off - 2;
988 
989                         if (!tok)
990                                 continue;
991 
992                         return tok;
993 
994 
995                 case UC_LEX_BLOCK_COMMENT:
996                         ch = next_char(lex);
997 
998                         /* scan forward through buffer to identify end token */
999                         while (ch != EOF) {
1000                                 if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) {
1001                                         lex->modifier = MINUS;
1002                                         break;
1003                                 }
1004 
1005                                 if (ch == '#' && check_char(lex, '}'))
1006                                         break;
1007 
1008                                 ch = next_char(lex);
1009                         }
1010 
1011                         if (ch == EOF) {
1012                                 lex->state = UC_LEX_EOF;
1013 
1014                                 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
1015                         }
1016 
1017                         lex->lastoff = lex->source->off;
1018                         lex->state = UC_LEX_IDENTIFY_BLOCK;
1019 
1020                         continue;
1021 
1022 
1023                 case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
1024                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1025                         lex->block = EXPRESSION;
1026 
1027                         return emit_op(lex, lex->source->off, TK_LEXP, NULL);
1028 
1029 
1030                 case UC_LEX_IDENTIFY_TOKEN:
1031                         do { tok = lex_find_token(lex); } while (tok == NULL);
1032 
1033                         /* disallow nesting blocks */
1034                         if (tok->type == TK_LSTM || tok->type == TK_LEXP)
1035                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
1036 
1037                         /* found end of statement block */
1038                         if (lex->block == STATEMENTS && tok->type == TK_RSTM) {
1039                                 /* strip newline after statement block? */
1040                                 if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks)
1041                                         lex->modifier = NEWLINE;
1042 
1043                                 lex->lastoff = lex->source->off;
1044                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
1045                                 lex->block = NONE;
1046 
1047                                 tok = emit_op(lex, -2, TK_SCOL, NULL);
1048                         }
1049 
1050                         /* found end of expression block */
1051                         else if (lex->block == EXPRESSION && tok->type == TK_REXP) {
1052                                 lex->lastoff = lex->source->off;
1053                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
1054                                 lex->block = NONE;
1055                         }
1056 
1057                         /* track opening braces */
1058                         else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
1059                                 nest = uc_vector_last(&lex->templates);
1060                                 (*nest)++;
1061                         }
1062 
1063                         /* check end of placeholder expression */
1064                         else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
1065                                 nest = uc_vector_last(&lex->templates);
1066 
1067                                 if (*nest == 0) {
1068                                         lex->templates.count--;
1069                                         lex->state = UC_LEX_PLACEHOLDER_END;
1070                                 }
1071                                 else {
1072                                         (*nest)--;
1073                                 }
1074                         }
1075 
1076                         /* premature EOF? */
1077                         else if (tok->type == TK_EOF && lex->block != STATEMENTS) {
1078                                 lex->state = UC_LEX_EOF;
1079 
1080                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block"));
1081                         }
1082 
1083                         return tok;
1084 
1085 
1086                 case UC_LEX_PLACEHOLDER_START:
1087                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1088 
1089                         uc_vector_push(&lex->templates, 0);
1090 
1091                         return emit_op(lex, -2, TK_PLACEH, NULL);
1092 
1093 
1094                 case UC_LEX_PLACEHOLDER_END:
1095                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1096 
1097                         return parse_string(lex, '`');
1098 
1099 
1100                 case UC_LEX_EOF:
1101                         break;
1102                 }
1103         }
1104 
1105         return emit_op(lex, lex->source->off, TK_EOF, NULL);
1106 }
1107 
1108 void
1109 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
1110 {
1111         lex->state = UC_LEX_IDENTIFY_BLOCK;
1112 
1113         lex->config = config;
1114         lex->source = uc_source_get(source);
1115 
1116         lex->block = NONE;
1117         lex->modifier = UNSPEC;
1118 
1119         lex->rlen = 0;
1120         lex->rpos = 0;
1121         lex->rbuf = NULL;
1122 
1123         lex->buffer.count = 0;
1124         lex->buffer.entries = NULL;
1125 
1126         lex->lead_surrogate = 0;
1127 
1128         lex->lastoff = 0;
1129 
1130         lex->templates.count = 0;
1131         lex->templates.entries = NULL;
1132 
1133         if (config && config->raw_mode) {
1134                 lex->state = UC_LEX_IDENTIFY_TOKEN;
1135                 lex->block = STATEMENTS;
1136         }
1137 }
1138 
1139 void
1140 uc_lexer_free(uc_lexer_t *lex)
1141 {
1142         uc_vector_clear(&lex->buffer);
1143         uc_vector_clear(&lex->templates);
1144 
1145         uc_source_put(lex->source);
1146 
1147         free(lex->rbuf);
1148 }
1149 
1150 uc_token_t *
1151 uc_lexer_next_token(uc_lexer_t *lex)
1152 {
1153         uc_token_t *rv = NULL;
1154 
1155         rv = lex_step(lex);
1156 
1157         lex->no_keyword = false;
1158         lex->no_regexp = false;
1159 
1160         return rv;
1161 }
1162 
1163 const char *
1164 uc_tokenname(unsigned type)
1165 {
1166         static char buf[sizeof("'endfunction'")];
1167         const char *tokennames[] = {
1168                 [TK_LEXP] = "'{{'",
1169                 [TK_REXP] = "'}}'",
1170                 [TK_LSTM] = "'{%'",
1171                 [TK_RSTM] = "'%}'",
1172                 [TK_COMMA] = "','",
1173                 [TK_ASSIGN] = "'='",
1174                 [TK_ASADD] = "'+='",
1175                 [TK_ASSUB] = "'-='",
1176                 [TK_ASMUL] = "'*='",
1177                 [TK_ASDIV] = "'/='",
1178                 [TK_ASMOD] = "'%='",
1179                 [TK_ASLEFT] = "'<<='",
1180                 [TK_ASRIGHT] = "'>>='",
1181                 [TK_ASBAND] = "'&='",
1182                 [TK_ASBXOR] = "'^='",
1183                 [TK_ASBOR] = "'|='",
1184                 [TK_QMARK] = "'?'",
1185                 [TK_COLON] = "':'",
1186                 [TK_OR] = "'||'",
1187                 [TK_AND] = "'&&'",
1188                 [TK_BOR] = "'|'",
1189                 [TK_BXOR] = "'^'",
1190                 [TK_BAND] = "'&'",
1191                 [TK_EQS] = "'==='",
1192                 [TK_NES] = "'!=='",
1193                 [TK_EQ] = "'=='",
1194                 [TK_NE] = "'!='",
1195                 [TK_LT] = "'<'",
1196                 [TK_LE] = "'<='",
1197                 [TK_GT] = "'>'",
1198                 [TK_GE] = "'>='",
1199                 [TK_LSHIFT] = "'<<'",
1200                 [TK_RSHIFT] = "'>>'",
1201                 [TK_ADD] = "'+'",
1202                 [TK_SUB] = "'-'",
1203                 [TK_MUL] = "'*'",
1204                 [TK_DIV] = "'/'",
1205                 [TK_MOD] = "'%'",
1206                 [TK_EXP] = "'**'",
1207                 [TK_NOT] = "'!'",
1208                 [TK_COMPL] = "'~'",
1209                 [TK_INC] = "'++'",
1210                 [TK_DEC] = "'--'",
1211                 [TK_DOT] = "'.'",
1212                 [TK_LBRACK] = "'['",
1213                 [TK_RBRACK] = "']'",
1214                 [TK_LPAREN] = "'('",
1215                 [TK_RPAREN] = "')'",
1216                 [TK_LBRACE] = "'{'",
1217                 [TK_RBRACE] = "'}'",
1218                 [TK_SCOL] = "';'",
1219                 [TK_ELLIP] = "'...'",
1220                 [TK_ARROW] = "'=>'",
1221                 [TK_QLBRACK] = "'?.['",
1222                 [TK_QLPAREN] = "'?.('",
1223                 [TK_QDOT] = "'?.'",
1224                 [TK_ASEXP] = "'**='",
1225                 [TK_ASAND] = "'&&='",
1226                 [TK_ASOR] = "'||='",
1227                 [TK_ASNULLISH] = "'\?\?='",
1228                 [TK_NULLISH] = "'\?\?'",
1229                 [TK_PLACEH] = "'${'",
1230 
1231                 [TK_TEXT] = "Text",
1232                 [TK_LABEL] = "Label",
1233                 [TK_NUMBER] = "Number",
1234                 [TK_DOUBLE] = "Double",
1235                 [TK_STRING] = "String",
1236                 [TK_REGEXP] = "Regexp",
1237                 [TK_TEMPLATE] = "Template",
1238                 [TK_ERROR] = "Error",
1239                 [TK_EOF] = "End of file",
1240         };
1241 
1242         size_t i;
1243 
1244         for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
1245                 if (reserved_words[i].type != type)
1246                         continue;
1247 
1248                 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat);
1249 
1250                 return buf;
1251         }
1252 
1253         return tokennames[type] ? tokennames[type] : "?";
1254 }
1255 
1256 bool
1257 uc_lexer_is_keyword(uc_value_t *label)
1258 {
1259         size_t i;
1260 
1261         if (ucv_type(label) != UC_STRING)
1262                 return false;
1263 
1264         for (i = 0; i < ARRAY_SIZE(reserved_words); i++)
1265                 if (!strcmp(reserved_words[i].pat, ucv_string_get(label)))
1266                         return true;
1267 
1268         return false;
1269 }
1270 
1271 #endif /* NO_COMPILE */
1272 
/*
 * Encodes the given code point as a UTF-8 multibyte sequence, appends it at
 * *out and advances *out past the written bytes while subtracting their
 * number from *rem.
 *
 * Returns false, leaving *out and *rem untouched, if the sequence does not
 * fit into the remaining *rem bytes; otherwise returns true. Code points
 * outside of 0..0x10FFFF are not encoded: nothing is written and true is
 * returned.
 */

bool
utf8enc(char **out, int *rem, int code)
{
        unsigned char seq[4];
        int len, i;

        /* not representable: report success without emitting anything */
        if (code < 0 || code > 0x10FFFF)
                return true;

        if (code <= 0x7F) {
                seq[0] = code;
                len = 1;
        }
        else if (code <= 0x7FF) {
                seq[0] = 0xC0 | ((code >> 6) & 0x1F);
                seq[1] = 0x80 | (code & 0x3F);
                len = 2;
        }
        else if (code <= 0xFFFF) {
                seq[0] = 0xE0 | ((code >> 12) & 0x0F);
                seq[1] = 0x80 | ((code >> 6) & 0x3F);
                seq[2] = 0x80 | (code & 0x3F);
                len = 3;
        }
        else {
                seq[0] = 0xF0 | ((code >> 18) & 0x07);
                seq[1] = 0x80 | ((code >> 12) & 0x3F);
                seq[2] = 0x80 | ((code >> 6) & 0x3F);
                seq[3] = 0x80 | (code & 0x3F);
                len = 4;
        }

        if (*rem < len)
                return false;

        for (i = 0; i < len; i++)
                *(*out)++ = (char)seq[i];

        *rem -= len;

        return true;
}
1326 

This page was automatically generated by LXR 0.3.1.  •  OpenWrt