• source navigation  • diff markup  • identifier search  • freetext search  • 

Sources/ucode/lexer.c

  1 /*
  2  * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io>
  3  *
  4  * Permission to use, copy, modify, and/or distribute this software for any
  5  * purpose with or without fee is hereby granted, provided that the above
  6  * copyright notice and this permission notice appear in all copies.
  7  *
  8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15  */
 16 
 17 #include <stdio.h>
 18 
 19 #include <stdbool.h>
 20 #include <stdlib.h>
 21 #include <string.h>
 22 #include <ctype.h>
 23 #include <regex.h>
 24 #include <math.h>
 25 #include <errno.h>
 26 
 27 #include "ucode/vm.h"
 28 #include "ucode/lib.h"
 29 #include "ucode/lexer.h"
 30 #include "ucode/platform.h"
 31 
 32 struct keyword {
 33         unsigned type;
 34         const char *pat;
 35         unsigned plen;
 36 };
 37 
 38 #define dec(o) \
 39         ((o) - '')
 40 
 41 #define hex(x) \
 42         (((x) >= 'a') ? (10 + (x) - 'a') : \
 43                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 44 
 45 #ifndef NO_COMPILE
 46 
 47 static const struct keyword reserved_words[] = {
 48         { TK_ENDFUNC,   "endfunction", 11 },
 49         { TK_CONTINUE,  "continue", 8 },
 50         { TK_ENDWHILE,  "endwhile", 8 },
 51         { TK_FUNC,              "function", 8 },
 52         { TK_DEFAULT,   "default", 7 },
 53         { TK_DELETE,    "delete", 6 },
 54         { TK_RETURN,    "return", 6 },
 55         { TK_ENDFOR,    "endfor", 6 },
 56         { TK_SWITCH,    "switch", 6 },
 57         { TK_IMPORT,    "import", 6 },
 58         { TK_EXPORT,    "export", 6 },
 59         { TK_ENDIF,             "endif", 5 },
 60         { TK_WHILE,             "while", 5 },
 61         { TK_BREAK,             "break", 5 },
 62         { TK_CATCH,             "catch", 5 },
 63         { TK_CONST,             "const", 5 },
 64         { TK_FALSE,             "false", 5 },
 65         { TK_TRUE,              "true",  4 },
 66         { TK_ELIF,              "elif",  4 },
 67         { TK_ELSE,              "else",  4 },
 68         { TK_THIS,              "this",  4 },
 69         { TK_NULL,              "null",  4 },
 70         { TK_CASE,              "case",  4 },
 71         { TK_TRY,               "try",   3 },
 72         { TK_FOR,               "for",   3 },
 73         { TK_LOCAL,             "let",   3 },
 74         { TK_IF,                "if",    2 },
 75         { TK_IN,                "in",    2 },
 76 };
 77 
 78 
 79 static int
 80 fill_buf(uc_lexer_t *lex) {
 81         lex->rbuf = xrealloc(lex->rbuf, 128);
 82         lex->rlen = fread(lex->rbuf, 1, 128, lex->source->fp);
 83         lex->rpos = 0;
 84 
 85         if (!lex->rlen)
 86                 return EOF;
 87 
 88         lex->rpos++;
 89 
 90         return (int)lex->rbuf[0];
 91 }
 92 
 93 static int
 94 update_line(uc_lexer_t *lex, int ch) {
 95         if (ch == '\n')
 96                 uc_source_line_next(lex->source);
 97         else if (ch != EOF)
 98                 uc_source_line_update(lex->source, 1);
 99 
100         lex->source->off++;
101 
102         return ch;
103 }
104 
105 static int
106 lookahead_char(uc_lexer_t *lex) {
107         int c;
108 
109         if (lex->rpos < lex->rlen)
110                 return (int)lex->rbuf[lex->rpos];
111 
112         c = fill_buf(lex);
113         lex->rpos = 0;
114 
115         return c;
116 }
117 
118 static bool
119 check_char(uc_lexer_t *lex, int ch) {
120         if (lookahead_char(lex) != ch)
121                 return false;
122 
123         lex->rpos++;
124 
125         update_line(lex, ch);
126 
127         return true;
128 }
129 
130 static int
131 next_char(uc_lexer_t *lex) {
132         int ch = (lex->rpos < lex->rlen) ? (int)lex->rbuf[lex->rpos++] : fill_buf(lex);
133 
134         return update_line(lex, ch);
135 }
136 
137 static uc_token_t *
138 emit_op(uc_lexer_t *lex, ssize_t pos, int type, uc_value_t *uv)
139 {
140         lex->curr.type = type;
141         lex->curr.uv = uv;
142 
143         if (pos < 0)
144                 lex->curr.pos = lex->source->off + pos;
145         else
146                 lex->curr.pos = (size_t)pos;
147 
148         return &lex->curr;
149 }
150 
151 static uc_token_t *
152 emit_buffer(uc_lexer_t *lex, ssize_t pos, int type, const char *strip_trailing_chars) {
153         uc_token_t *rv = NULL;
154 
155         if (lex->buffer.count) {
156                 if (strip_trailing_chars)
157                         while (lex->buffer.count > 0 && strchr(strip_trailing_chars, *uc_vector_last(&lex->buffer)))
158                                 lex->buffer.count--;
159 
160                 rv = emit_op(lex, pos, type, ucv_string_new_length(uc_vector_first(&lex->buffer), lex->buffer.count));
161 
162                 uc_vector_clear(&lex->buffer);
163         }
164         else if (type != TK_TEXT) {
165                 rv = emit_op(lex, pos, type, ucv_string_new_length("", 0));
166         }
167 
168         return rv;
169 }
170 
171 
172 static uc_token_t *
173 parse_comment(uc_lexer_t *lex, int kind)
174 {
175         int ch;
176 
177         while (true) {
178                 ch = next_char(lex);
179 
180                 if (kind == '/' && (ch == '\n' || ch == EOF))
181                         break;
182 
183                 if (kind == '*' && ch == '*' && check_char(lex, '/'))
184                         break;
185 
186                 if (ch == EOF) {
187                         lex->state = UC_LEX_EOF;
188 
189                         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
190                 }
191         }
192 
193         return NULL;
194 }
195 
196 static void
197 append_utf8(uc_lexer_t *lex, int code) {
198         char ustr[8], *up;
199         int rem;
200 
201         up = ustr;
202         rem = sizeof(ustr);
203 
204         if (utf8enc(&up, &rem, code))
205                 for (up = ustr; rem < (int)sizeof(ustr); rem++)
206                         uc_vector_push(&lex->buffer, *up++);
207 }
208 
209 static uc_token_t *
210 parse_escape(uc_lexer_t *lex, const char *regex_macros)
211 {
212         int code, ch, i;
213         const char *p;
214 
215         /* unicode escape sequence */
216         if (check_char(lex, 'u')) {
217                 for (i = 0, code = 0; i < 4; i++) {
218                         ch = next_char(lex);
219 
220                         if (!isxdigit(ch))
221                                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
222 
223                         code = code * 16 + hex(ch);
224                 }
225 
226                 /* is a leading surrogate value */
227                 if ((code & 0xFC00) == 0xD800) {
228                         /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
229                         if (lex->lead_surrogate)
230                                 append_utf8(lex, 0xFFFD);
231 
232                         /* store surrogate value and advance to next escape sequence */
233                         lex->lead_surrogate = code;
234                 }
235 
236                 /* is a trailing surrogate value */
237                 else if ((code & 0xFC00) == 0xDC00) {
238                         /* found a trailing surrogate following a leading one, combine and encode */
239                         if (lex->lead_surrogate) {
240                                 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
241                                 lex->lead_surrogate = 0;
242                         }
243 
244                         /* trailing surrogate not following a leading one, ignore and use replacement char */
245                         else {
246                                 code = 0xFFFD;
247                         }
248 
249                         append_utf8(lex, code);
250                 }
251 
252                 /* is a normal codepoint */
253                 else {
254                         append_utf8(lex, code);
255                 }
256         }
257 
258         /* hex escape sequence */
259         else if (check_char(lex, 'x')) {
260                 for (i = 0, code = 0; i < 2; i++) {
261                         ch = next_char(lex);
262 
263                         if (!isxdigit(ch))
264                                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
265 
266                         code = code * 16 + hex(ch);
267                 }
268 
269                 append_utf8(lex, code);
270         }
271 
272         /* octal or letter */
273         else {
274                 /* try to parse octal sequence... */
275                 for (i = 0, code = 0, ch = lookahead_char(lex);
276                      i < 3 && ch >= '' && ch <= '7';
277                      i++, next_char(lex), ch = lookahead_char(lex)) {
278                         code = code * 8 + dec(ch);
279                 }
280 
281                 if (i) {
282                         if (code > 255)
283                                 return emit_op(lex, -3, TK_ERROR, ucv_string_new("Invalid escape sequence"));
284 
285                         append_utf8(lex, code);
286                 }
287 
288                 /* ... no octal sequence, handle potential regex macros */
289                 else if (strchr(regex_macros, ch)) {
290                         ch = next_char(lex);
291 
292                         switch (ch) {
293                         case 'd': p = "[[:digit:]]";   break;
294                         case 'D': p = "[^[:digit:]]";  break;
295                         case 'w': p = "[[:alnum:]_]";  break;
296                         case 'W': p = "[^[:alnum:]_]"; break;
297                         case 's': p = "[[:space:]]";   break;
298                         case 'S': p = "[^[:space:]]";  break;
299                         default:  p = NULL;
300                         }
301 
302                         if (p) {
303                                 while (*p)
304                                         uc_vector_push(&lex->buffer, *p++);
305                         }
306                         else {
307                                 uc_vector_push(&lex->buffer, '\\');
308                                 uc_vector_push(&lex->buffer, ch);
309                         }
310                 }
311 
312                 /* ... handle other escape */
313                 else {
314                         ch = next_char(lex);
315 
316                         switch (ch) {
317                         case 'a': uc_vector_push(&lex->buffer, '\a'); break;
318                         case 'b': uc_vector_push(&lex->buffer, '\b'); break;
319                         case 'e': uc_vector_push(&lex->buffer, '\033'); break;
320                         case 'f': uc_vector_push(&lex->buffer, '\f'); break;
321                         case 'n': uc_vector_push(&lex->buffer, '\n'); break;
322                         case 'r': uc_vector_push(&lex->buffer, '\r'); break;
323                         case 't': uc_vector_push(&lex->buffer, '\t'); break;
324                         case 'v': uc_vector_push(&lex->buffer, '\v'); break;
325 
326                         case EOF:
327                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated string"));
328 
329                         default:
330                                 uc_vector_push(&lex->buffer, ch);
331                         }
332                 }
333         }
334 
335         return NULL;
336 }
337 
338 static uc_token_t *
339 parse_string(uc_lexer_t *lex, int kind)
340 {
341         uc_token_t *err;
342         unsigned type;
343         int code, ch;
344         size_t off;
345 
346         if (kind == '`')
347                 type = TK_TEMPLATE;
348         else if (kind == '/')
349                 type = TK_REGEXP;
350         else
351                 type = TK_STRING;
352 
353         off = lex->source->off - 1;
354 
355         for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
356                 switch (ch) {
357                 /* placeholder */
358                 case '$':
359                         if (type == TK_TEMPLATE && check_char(lex, '{')) {
360                                 lex->state = UC_LEX_PLACEHOLDER_START;
361 
362                                 return emit_buffer(lex, off, type, NULL);
363                         }
364 
365                         uc_vector_push(&lex->buffer, '$');
366                         break;
367 
368                 /* regexp bracket expression */
369                 case '[':
370                         uc_vector_push(&lex->buffer, '[');
371 
372                         if (type == TK_REGEXP) {
373                                 /* skip leading negation (^) */
374                                 if (check_char(lex, '^'))
375                                         uc_vector_push(&lex->buffer, '^');
376 
377                                 /* skip leading `]` - it is literal and not closing the bracket expr */
378                                 if (check_char(lex, ']'))
379                                         uc_vector_push(&lex->buffer, ']');
380 
381                                 /* read until closing `]` */
382                                 for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
383                                         if (ch == '\\') {
384                                                 err = parse_escape(lex, "^");
385 
386                                                 if (err)
387                                                         return err;
388 
389                                                 continue;
390                                         }
391 
392                                         uc_vector_push(&lex->buffer, ch);
393 
394                                         if (ch == ']')
395                                                 break;
396 
397                                         /* skip nested char classes / equivalence classes / collating chars */
398                                         if (ch == '[') {
399                                                 code = lookahead_char(lex);
400 
401                                                 if (code == ':' || code == '.' || code == '=') {
402                                                         uc_vector_push(&lex->buffer, code);
403                                                         next_char(lex);
404 
405                                                         for (ch = next_char(lex); ch != EOF; ch = next_char(lex)) {
406                                                                 if (ch == '\\') {
407                                                                         err = parse_escape(lex, "");
408 
409                                                                         if (err)
410                                                                                 return err;
411 
412                                                                         continue;
413                                                                 }
414 
415                                                                 uc_vector_push(&lex->buffer, ch);
416 
417                                                                 if (ch == code && check_char(lex, ']')) {
418                                                                         uc_vector_push(&lex->buffer, ']');
419                                                                         break;
420                                                                 }
421                                                         }
422                                                 }
423                                         }
424                                 }
425                         }
426 
427                         break;
428 
429                 /* escape sequence */
430                 case '\\':
431                         err = parse_escape(lex,
432                                 (type == TK_REGEXP) ? "^bBdDsSwW<>.[$()|*+?{\\" : "");
433 
434                         if (err)
435                                 return err;
436 
437                         break;
438 
439                 /* other character */
440                 default:
441                         /* terminating delimitter */
442                         if (ch == kind)
443                                 return emit_buffer(lex, off, type, NULL);
444 
445                         uc_vector_push(&lex->buffer, ch);
446                 }
447         }
448 
449         // FIXME
450         lex->state = UC_LEX_EOF;
451 
452         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
453 }
454 
455 
/*
 * Regexp literal lexing.
 *
 * parse_regexp() reads the pattern through parse_string() and then
 * consumes any trailing flag characters. The result is returned as a
 * token; failures are reported as TK_ERROR tokens carrying a message
 * string, not as negative return values.
 */
468 
/*
 * Named stages for regexp literal parsing (judging by the identifiers);
 * none of them is referenced in the part of the file reviewed here.
 */
enum {
	UC_LEX_PARSE_REGEX_INIT    = 0,
	UC_LEX_PARSE_REGEX_PATTERN = 1,
	UC_LEX_PARSE_REGEX_FLAGS   = 2
};
474 
475 static uc_token_t *
476 parse_regexp(uc_lexer_t *lex)
477 {
478         bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
479         uc_token_t *rv;
480         size_t len;
481         char *s;
482 
483         rv = parse_string(lex, '/');
484 
485         if (rv->type == TK_REGEXP) {
486                 while (true) {
487                         if (check_char(lex, 'g'))
488                                 is_reg_global = true;
489                         else if (check_char(lex, 'i'))
490                                 is_reg_icase = true;
491                         else if (check_char(lex, 's'))
492                                 is_reg_newline = true;
493                         else
494                                 break;
495                 }
496 
497                 len = xasprintf(&s, "%c%*s",
498                         (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
499                         ucv_string_length(rv->uv),
500                         ucv_string_get(rv->uv));
501 
502                 ucv_free(rv->uv, false);
503                 rv->uv = ucv_string_new_length(s, len);
504                 free(s);
505         }
506 
507         return rv;
508 }
509 
510 
511 /*
512  * Parses a label from the given buffer.
513  *
514  * Returns a negative value on error, otherwise the amount of consumed
515  * characters from the given buffer.
516  *
517  * Error values:
518  *  -UC_ERROR_OVERLONG_STRING   Label too long
519  */
520 
521 static uc_token_t *
522 parse_label(uc_lexer_t *lex, int ch)
523 {
524         const struct keyword *word;
525         size_t i, len;
526 
527         while (true) {
528                 uc_vector_push(&lex->buffer, ch);
529                 ch = lookahead_char(lex);
530 
531                 if (!isalnum(ch) && ch != '_')
532                         break;
533 
534                 next_char(lex);
535         }
536 
537         len = lex->buffer.count;
538 
539         if (!lex->no_keyword) {
540                 for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
541                         if (lex->buffer.count == word->plen && !strncmp(uc_vector_first(&lex->buffer), word->pat, word->plen)) {
542                                 uc_vector_clear(&lex->buffer);
543 
544                                 return emit_op(lex, -len, word->type, NULL);
545                         }
546                 }
547         }
548 
549         return emit_buffer(lex, -len, TK_LABEL, NULL);
550 }
551 
552 
/*
 * Number literal lexing.
 *
 * is_numeric_char() decides whether a character may continue the literal
 * collected so far; parse_number() gathers those characters and emits a
 * TK_NUMBER or TK_DOUBLE token, or a TK_ERROR token carrying the message
 * "Invalid number literal" when the text cannot be converted.
 */
562 
563 static inline bool
564 is_numeric_char(uc_lexer_t *lex, char c)
565 {
566         char prev = lex->buffer.count ? *uc_vector_last(&lex->buffer) : 0;
567 
568         switch (c|32) {
569         case '.':
570         case '':
571         case '1':
572         case '2':
573         case '3':
574         case '4':
575         case '5':
576         case '6':
577         case '7':
578         case '8':
579         case '9':
580                 return true;
581 
582         case 'a':
583         case 'b':
584         case 'c':
585         case 'd':
586         case 'e':
587         case 'f':
588         case 'o':
589         case 'x':
590                 /* require previous char, a number literal cannot start with these */
591                 return prev != 0;
592 
593         case '+':
594         case '-':
595                 /* sign is only allowed after an exponent char */
596                 return (prev|32) == 'e';
597         }
598 
599         return false;
600 }
601 
602 static uc_token_t *
603 parse_number(uc_lexer_t *lex, int ch)
604 {
605         uc_value_t *nv = NULL;
606         size_t len;
607         char *e;
608 
609         while (true) {
610                 uc_vector_push(&lex->buffer, ch);
611                 ch = lookahead_char(lex);
612 
613                 if (!is_numeric_char(lex, ch))
614                         break;
615 
616                 next_char(lex);
617         }
618 
619         len = lex->buffer.count;
620 
621         uc_vector_push(&lex->buffer, '\0');
622 
623         nv = uc_number_parse_octal(uc_vector_first(&lex->buffer), &e);
624 
625         uc_vector_clear(&lex->buffer);
626 
627         switch (ucv_type(nv)) {
628         case UC_DOUBLE:
629                 return emit_op(lex, -len, TK_DOUBLE, nv);
630 
631         case UC_INTEGER:
632                 return emit_op(lex, -len, TK_NUMBER, nv);
633 
634         default:
635                 return emit_op(lex, -len, TK_ERROR, ucv_string_new("Invalid number literal"));
636         }
637 }
638 
639 static uc_token_t *
640 lex_find_token(uc_lexer_t *lex)
641 {
642         bool tpl = !(lex->config && lex->config->raw_mode);
643         int ch = next_char(lex);
644 
645         while (isspace(ch))
646                 ch = next_char(lex);
647 
648         switch (ch) {
649         case '~':
650                 return emit_op(lex, -1, TK_COMPL, NULL);
651 
652         case '}':
653                 if (tpl && check_char(lex, '}'))
654                         return emit_op(lex, -2, TK_REXP, NULL);
655 
656                 return emit_op(lex, -1, TK_RBRACE, NULL);
657 
658         case '|':
659                 if (check_char(lex, '|')) {
660                         if (check_char(lex, '='))
661                                 return emit_op(lex, -3, TK_ASOR, NULL);
662 
663                         return emit_op(lex, -2, TK_OR, NULL);
664                 }
665 
666                 if (check_char(lex, '='))
667                         return emit_op(lex, -2, TK_ASBOR, NULL);
668 
669                 return emit_op(lex, -1, TK_BOR, NULL);
670 
671         case '{':
672                 if (tpl && check_char(lex, '{'))
673                         return emit_op(lex, -2, TK_LEXP, NULL);
674 
675                 if (tpl && check_char(lex, '%'))
676                         return emit_op(lex, -2, TK_LSTM, NULL);
677 
678                 return emit_op(lex, -1, TK_LBRACE, NULL);
679 
680         case '^':
681                 if (check_char(lex, '='))
682                         return emit_op(lex, -2, TK_ASBXOR, NULL);
683 
684                 return emit_op(lex, -1, TK_BXOR, NULL);
685 
686         case '[':
687                 return emit_op(lex, -1, TK_LBRACK, NULL);
688 
689         case ']':
690                 return emit_op(lex, -1, TK_RBRACK, NULL);
691 
692         case '?':
693                 if (check_char(lex, '?')) {
694                         if (check_char(lex, '='))
695                                 return emit_op(lex, -3, TK_ASNULLISH, NULL);
696 
697                         return emit_op(lex, -2, TK_NULLISH, NULL);
698                 }
699 
700                 if (check_char(lex, '.')) {
701                         if (check_char(lex, '['))
702                                 return emit_op(lex, -3, TK_QLBRACK, NULL);
703 
704                         if (check_char(lex, '('))
705                                 return emit_op(lex, -3, TK_QLPAREN, NULL);
706 
707                         return emit_op(lex, -2, TK_QDOT, NULL);
708                 }
709 
710                 return emit_op(lex, lex->source->off, TK_QMARK, NULL);
711 
712         case '>':
713                 if (check_char(lex, '>')) {
714                         if (check_char(lex, '='))
715                                 return emit_op(lex, -3, TK_ASRIGHT, NULL);
716 
717                         return emit_op(lex, -2, TK_RSHIFT, NULL);
718                 }
719 
720                 if (check_char(lex, '='))
721                         return emit_op(lex, -2, TK_GE, NULL);
722 
723                 return emit_op(lex, -1, TK_GT, NULL);
724 
725         case '=':
726                 if (check_char(lex, '=')) {
727                         if (check_char(lex, '='))
728                                 return emit_op(lex, -3, TK_EQS, NULL);
729 
730                         return emit_op(lex, -2, TK_EQ, NULL);
731                 }
732 
733                 if (check_char(lex, '>'))
734                         return emit_op(lex, -2, TK_ARROW, NULL);
735 
736                 return emit_op(lex, -1, TK_ASSIGN, NULL);
737 
738         case '<':
739                 if (check_char(lex, '<')) {
740                         if (check_char(lex, '='))
741                                 return emit_op(lex, -3, TK_ASLEFT, NULL);
742 
743                         return emit_op(lex, -2, TK_LSHIFT, NULL);
744                 }
745 
746                 if (check_char(lex, '='))
747                         return emit_op(lex, -2, TK_LE, NULL);
748 
749                 return emit_op(lex, -1, TK_LT, NULL);
750 
751         case ';':
752                 return emit_op(lex, -1, TK_SCOL, NULL);
753 
754         case ':':
755                 return emit_op(lex, -1, TK_COLON, NULL);
756 
757         case '/':
758                 ch = lookahead_char(lex);
759                 lex->lastoff = lex->source->off - 1;
760 
761                 if (ch == '/' || ch == '*')
762                         return parse_comment(lex, ch);
763 
764                 if (lex->no_regexp) {
765                         if (check_char(lex, '='))
766                                 return emit_op(lex, -2, TK_ASDIV, NULL);
767 
768                         return emit_op(lex, -1, TK_DIV, NULL);
769                 }
770 
771                 return parse_regexp(lex);
772 
773         case '.':
774                 if (check_char(lex, '.')) {
775                         if (check_char(lex, '.'))
776                                 return emit_op(lex, -3, TK_ELLIP, NULL);
777 
778                         /* The sequence ".." cannot be a valid */
779                         return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unexpected character"));
780                 }
781 
782                 return emit_op(lex, -1, TK_DOT, NULL);
783 
784         case '-':
785                 if (tpl && check_char(lex, '}')) {
786                         if (check_char(lex, '}')) {
787                                 lex->modifier = MINUS;
788 
789                                 return emit_op(lex, -3, TK_REXP, NULL);
790                         }
791 
792                         /* The sequence "-}" cannot be a valid */
793                         return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
794                 }
795 
796                 if (tpl && check_char(lex, '%')) {
797                         if (check_char(lex, '}')) {
798                                 lex->modifier = MINUS;
799 
800                                 return emit_op(lex, -3, TK_RSTM, NULL);
801                         }
802 
803                         /* The sequence "-%" cannot be a valid */
804                         return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
805                 }
806 
807                 if (check_char(lex, '='))
808                         return emit_op(lex, -2, TK_ASSUB, NULL);
809 
810                 if (check_char(lex, '-'))
811                         return emit_op(lex, -2, TK_DEC, NULL);
812 
813                 return emit_op(lex, -1, TK_SUB, NULL);
814 
815         case ',':
816                 return emit_op(lex, -1, TK_COMMA, NULL);
817 
818         case '+':
819                 if (check_char(lex, '='))
820                         return emit_op(lex, -2, TK_ASADD, NULL);
821 
822                 if (check_char(lex, '+'))
823                         return emit_op(lex, -2, TK_INC, NULL);
824 
825                 return emit_op(lex, -1, TK_ADD, NULL);
826 
827         case '*':
828                 if (check_char(lex, '*')) {
829                         if (check_char(lex, '='))
830                                 return emit_op(lex, -3, TK_ASEXP, NULL);
831 
832                         return emit_op(lex, -2, TK_EXP, NULL);
833                 }
834 
835                 if (check_char(lex, '='))
836                         return emit_op(lex, -2, TK_ASMUL, NULL);
837 
838                 return emit_op(lex, -1, TK_MUL, NULL);
839 
840         case '(':
841                 return emit_op(lex, -1, TK_LPAREN, NULL);
842 
843         case ')':
844                 return emit_op(lex, -1, TK_RPAREN, NULL);
845 
846         case '\'':
847         case '"':
848         case '`':
849                 lex->lastoff = lex->source->off - 1;
850 
851                 return parse_string(lex, ch);
852 
853         case '&':
854                 if (check_char(lex, '&')) {
855                         if (check_char(lex, '='))
856                                 return emit_op(lex, -3, TK_ASAND, NULL);
857 
858                         return emit_op(lex, -2, TK_AND, NULL);
859                 }
860 
861                 if (check_char(lex, '='))
862                         return emit_op(lex, -2, TK_ASBAND, NULL);
863 
864                 return emit_op(lex, -1, TK_BAND, NULL);
865 
866         case '%':
867                 if (tpl && check_char(lex, '}'))
868                         return emit_op(lex, -2, TK_RSTM, NULL);
869 
870                 if (check_char(lex, '='))
871                         return emit_op(lex, -2, TK_ASMOD, NULL);
872 
873                 return emit_op(lex, -1, TK_MOD, NULL);
874 
875         case '!':
876                 if (check_char(lex, '=')) {
877                         if (check_char(lex, '='))
878                                 return emit_op(lex, -3, TK_NES, NULL);
879 
880                         return emit_op(lex, -2, TK_NE, NULL);
881                 }
882 
883                 return emit_op(lex, -1, TK_NOT, NULL);
884 
885         case EOF:
886                 return emit_op(lex, -1, TK_EOF, NULL);
887 
888         default:
889                 if (isalpha(ch) || ch == '_')
890                         return parse_label(lex, ch);
891 
892                 if (isdigit(ch))
893                         return parse_number(lex, ch);
894 
895                 return emit_op(lex, -1, TK_ERROR, ucv_string_new("Unexpected character"));
896         }
897 }
898 
/*
 * Drives the lexer state machine until one token can be handed to the caller.
 *
 * Each iteration dispatches on lex->state:
 *  - UC_LEX_IDENTIFY_BLOCK collects literal template text into lex->buffer
 *    until a `{#`, `{{` or `{%` start tag (or EOF) is seen and emits the
 *    collected text as TK_TEXT.
 *  - UC_LEX_BLOCK_COMMENT discards everything up to the closing `#}`.
 *  - UC_LEX_BLOCK_EXPRESSION_EMIT_TAG emits the TK_LEXP token for `{{`.
 *  - UC_LEX_IDENTIFY_TOKEN fetches tokens through lex_find_token() and
 *    handles block end tags, placeholder brace tracking and premature EOF.
 *  - UC_LEX_PLACEHOLDER_START / UC_LEX_PLACEHOLDER_END bracket a `${`
 *    placeholder expression.
 *
 * Once the state is UC_LEX_EOF the loop ends and TK_EOF is returned.
 */
899 static uc_token_t *
900 lex_step(uc_lexer_t *lex)
901 {
            /* passed to emit_buffer(); NOTE(review): presumably the set of
             * characters to trim from the end of the pending text - confirm
             * against emit_buffer() */
902         const char *strip = NULL;
903         uc_token_t *tok;
904         size_t *nest;
905         int ch;
906 
907         while (lex->state != UC_LEX_EOF) {
908                 switch (lex->state) {
909                 case UC_LEX_IDENTIFY_BLOCK:
910                         ch = next_char(lex);
911 
912                         /* previous block had strip trailing whitespace flag, skip leading whitespace */
913                         if (lex->modifier == MINUS) {
914                                 while (isspace(ch))
915                                         ch = next_char(lex);
916 
917                                 lex->modifier = UNSPEC;
918                         }
919 
920                         /* previous block was a statement block and trim_blocks is enabled, skip leading newline */
921                         else if (lex->modifier == NEWLINE) {
922                                 if (ch == '\n')
923                                         ch = next_char(lex);
924 
925                                 lex->modifier = UNSPEC;
926                         }
927 
928                         /* scan forward through buffer to identify block start token */
929                         while (ch != EOF) {
930                                 if (ch == '{') {
931                                         ch = next_char(lex);
932 
933                                         switch (ch) {
934                                         /* found start of comment block */
935                                         case '#':
936                                                 lex->state = UC_LEX_BLOCK_COMMENT;
937                                                 lex->block = COMMENT;
938 
                                                    /* `{#-` requests stripping of the text before the tag */
939                                                 if (check_char(lex, '-'))
940                                                         strip = " \n\t\v\f\r";
941 
942                                                 break;
943 
944                                         /* found start of expression block */
945                                         case '{':
946                                                 lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
947 
948                                                 if (check_char(lex, '-'))
949                                                         strip = " \n\t\v\f\r";
950 
951                                                 break;
952 
953                                         /* found start of statement block */
954                                         case '%':
955                                                 lex->state = UC_LEX_IDENTIFY_TOKEN;
956                                                 lex->block = STATEMENTS;
957 
                                                    /* `{%-` strips all whitespace, `{%+` disables
                                                     * stripping, otherwise lstrip_blocks selects a
                                                     * strip set that does not include '\n' */
958                                                 if (check_char(lex, '-'))
959                                                         strip = " \n\t\v\f\r";
960                                                 else if (check_char(lex, '+'))
961                                                         strip = NULL;
962                                                 else if (lex->config && lex->config->lstrip_blocks)
963                                                         strip = " \t\v\f\r";
964 
965                                                 break;
966 
967                                         default:
968                                                 /* not a start tag, remember char and move on */
                                                    /* ch already holds the character after '{', so the
                                                     * scan loop re-examines it on the next iteration */
969                                                 uc_vector_push(&lex->buffer, '{');
970                                                 continue;
971                                         }
972 
                                            /* a start tag was recognized, stop scanning */
973                                         break;
974                                 }
975 
976                                 uc_vector_push(&lex->buffer, ch);
977                                 ch = next_char(lex);
978                         }
979 
                            /* input exhausted without finding another start tag */
980                         if (ch == EOF)
981                                 lex->state = UC_LEX_EOF;
982 
983                         /* push out leading text */
984                         tok = emit_buffer(lex, lex->lastoff, TK_TEXT, strip);
                            /* NOTE(review): presumably rewinds lastoff to the two-character
                             * start tag just consumed - confirm how a '-'/'+' modifier
                             * affects source->off */
985                         lex->lastoff = lex->source->off - 2;
986 
                            /* emit_buffer() produced no token; keep stepping in the state
                             * selected above */
987                         if (!tok)
988                                 continue;
989 
990                         return tok;
991 
992 
993                 case UC_LEX_BLOCK_COMMENT:
994                         ch = next_char(lex);
995 
996                         /* scan forward through buffer to identify end token */
                            /* nothing is pushed to lex->buffer here, comment text is dropped */
997                         while (ch != EOF) {
                                    /* `-#}` additionally makes the next text block skip its
                                     * leading whitespace (see the MINUS handling above) */
998                                 if (ch == '-' && check_char(lex, '#') && check_char(lex, '}')) {
999                                         lex->modifier = MINUS;
1000                                         break;
1001                                 }
1002 
1003                                 if (ch == '#' && check_char(lex, '}'))
1004                                         break;
1005 
1006                                 ch = next_char(lex);
1007                         }
1008 
                             /* comment never closed: report an error and stop lexing */
1009                         if (ch == EOF) {
1010                                 lex->state = UC_LEX_EOF;
1011 
1012                                 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
1013                         }
1014 
1015                         lex->lastoff = lex->source->off;
1016                         lex->state = UC_LEX_IDENTIFY_BLOCK;
1017 
1018                         continue;
1019 
1020 
1021                 case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
1022                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1023                         lex->block = EXPRESSION;
1024 
1025                         return emit_op(lex, lex->source->off, TK_LEXP, NULL);
1026 
1027 
1028                 case UC_LEX_IDENTIFY_TOKEN:
                             /* lex_find_token() may return NULL (NOTE(review): presumably for
                              * input that yields no token, e.g. whitespace or comments);
                              * retry until a token is available */
1029                         do { tok = lex_find_token(lex); } while (tok == NULL);
1030 
1031                         /* disallow nesting blocks */
1032                         if (tok->type == TK_LSTM || tok->type == TK_LEXP)
1033                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
1034 
1035                         /* found end of statement block */
1036                         if (lex->block == STATEMENTS && tok->type == TK_RSTM) {
1037                                 /* strip newline after statement block? */
1038                                 if (lex->modifier == UNSPEC && lex->config && lex->config->trim_blocks)
1039                                         lex->modifier = NEWLINE;
1040 
1041                                 lex->lastoff = lex->source->off;
1042                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
1043                                 lex->block = NONE;
1044 
                                     /* the closing `%}` is handed to the caller as a TK_SCOL token */
1045                                 tok = emit_op(lex, -2, TK_SCOL, NULL);
1046                         }
1047 
1048                         /* found end of expression block */
1049                         else if (lex->block == EXPRESSION && tok->type == TK_REXP) {
1050                                 lex->lastoff = lex->source->off;
1051                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
1052                                 lex->block = NONE;
1053                         }
1054 
1055                         /* track opening braces */
                             /* the last entry of lex->templates counts the braces opened
                              * inside the innermost pending placeholder */
1056                         else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
1057                                 nest = uc_vector_last(&lex->templates);
1058                                 (*nest)++;
1059                         }
1060 
1061                         /* check end of placeholder expression */
1062                         else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
1063                                 nest = uc_vector_last(&lex->templates);
1064 
                                     /* a '}' at depth zero closes the placeholder itself: drop its
                                      * counter and switch to UC_LEX_PLACEHOLDER_END for the next step */
1065                                 if (*nest == 0) {
1066                                         lex->templates.count--;
1067                                         lex->state = UC_LEX_PLACEHOLDER_END;
1068                                 }
1069                                 else {
1070                                         (*nest)--;
1071                                 }
1072                         }
1073 
1074                         /* premature EOF? */
                             /* EOF is only accepted while inside a statement block */
1075                         else if (tok->type == TK_EOF && lex->block != STATEMENTS) {
1076                                 lex->state = UC_LEX_EOF;
1077 
1078                                 return emit_op(lex, -2, TK_ERROR, ucv_string_new("Unterminated template block"));
1079                         }
1080 
1081                         return tok;
1082 
1083 
1084                 case UC_LEX_PLACEHOLDER_START:
1085                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1086 
                             /* start a fresh brace depth counter for this placeholder */
1087                         uc_vector_push(&lex->templates, 0);
1088 
1089                         return emit_op(lex, -2, TK_PLACEH, NULL);
1090 
1091 
1092                 case UC_LEX_PLACEHOLDER_END:
1093                         lex->state = UC_LEX_IDENTIFY_TOKEN;
1094 
                             /* NOTE(review): presumably resumes lexing the remainder of the
                              * enclosing backtick literal - confirm against parse_string() */
1095                         return parse_string(lex, '`');
1096 
1097 
1098                 case UC_LEX_EOF:
1099                         break;
1100                 }
1101         }
1102 
             /* only reached once the state machine has entered UC_LEX_EOF */
1103         return emit_op(lex, lex->source->off, TK_EOF, NULL);
1104 }
1105 
1106 void
1107 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
1108 {
1109         lex->state = UC_LEX_IDENTIFY_BLOCK;
1110 
1111         lex->config = config;
1112         lex->source = uc_source_get(source);
1113 
1114         lex->block = NONE;
1115         lex->modifier = UNSPEC;
1116 
1117         lex->rlen = 0;
1118         lex->rpos = 0;
1119         lex->rbuf = NULL;
1120 
1121         lex->buffer.count = 0;
1122         lex->buffer.entries = NULL;
1123 
1124         lex->lead_surrogate = 0;
1125 
1126         lex->lastoff = 0;
1127 
1128         lex->templates.count = 0;
1129         lex->templates.entries = NULL;
1130 
1131         if (config && config->raw_mode) {
1132                 lex->state = UC_LEX_IDENTIFY_TOKEN;
1133                 lex->block = STATEMENTS;
1134         }
1135 }
1136 
1137 void
1138 uc_lexer_free(uc_lexer_t *lex)
1139 {
1140         uc_vector_clear(&lex->buffer);
1141         uc_vector_clear(&lex->templates);
1142 
1143         uc_source_put(lex->source);
1144 
1145         free(lex->rbuf);
1146 }
1147 
1148 uc_token_t *
1149 uc_lexer_next_token(uc_lexer_t *lex)
1150 {
1151         uc_token_t *rv = NULL;
1152 
1153         rv = lex_step(lex);
1154 
1155         lex->no_keyword = false;
1156         lex->no_regexp = false;
1157 
1158         return rv;
1159 }
1160 
1161 const char *
1162 uc_tokenname(unsigned type)
1163 {
1164         static char buf[sizeof("'endfunction'")];
1165         const char *tokennames[] = {
1166                 [TK_LEXP] = "'{{'",
1167                 [TK_REXP] = "'}}'",
1168                 [TK_LSTM] = "'{%'",
1169                 [TK_RSTM] = "'%}'",
1170                 [TK_COMMA] = "','",
1171                 [TK_ASSIGN] = "'='",
1172                 [TK_ASADD] = "'+='",
1173                 [TK_ASSUB] = "'-='",
1174                 [TK_ASMUL] = "'*='",
1175                 [TK_ASDIV] = "'/='",
1176                 [TK_ASMOD] = "'%='",
1177                 [TK_ASLEFT] = "'<<='",
1178                 [TK_ASRIGHT] = "'>>='",
1179                 [TK_ASBAND] = "'&='",
1180                 [TK_ASBXOR] = "'^='",
1181                 [TK_ASBOR] = "'|='",
1182                 [TK_QMARK] = "'?'",
1183                 [TK_COLON] = "':'",
1184                 [TK_OR] = "'||'",
1185                 [TK_AND] = "'&&'",
1186                 [TK_BOR] = "'|'",
1187                 [TK_BXOR] = "'^'",
1188                 [TK_BAND] = "'&'",
1189                 [TK_EQS] = "'==='",
1190                 [TK_NES] = "'!=='",
1191                 [TK_EQ] = "'=='",
1192                 [TK_NE] = "'!='",
1193                 [TK_LT] = "'<'",
1194                 [TK_LE] = "'<='",
1195                 [TK_GT] = "'>'",
1196                 [TK_GE] = "'>='",
1197                 [TK_LSHIFT] = "'<<'",
1198                 [TK_RSHIFT] = "'>>'",
1199                 [TK_ADD] = "'+'",
1200                 [TK_SUB] = "'-'",
1201                 [TK_MUL] = "'*'",
1202                 [TK_DIV] = "'/'",
1203                 [TK_MOD] = "'%'",
1204                 [TK_EXP] = "'**'",
1205                 [TK_NOT] = "'!'",
1206                 [TK_COMPL] = "'~'",
1207                 [TK_INC] = "'++'",
1208                 [TK_DEC] = "'--'",
1209                 [TK_DOT] = "'.'",
1210                 [TK_LBRACK] = "'['",
1211                 [TK_RBRACK] = "']'",
1212                 [TK_LPAREN] = "'('",
1213                 [TK_RPAREN] = "')'",
1214                 [TK_LBRACE] = "'{'",
1215                 [TK_RBRACE] = "'}'",
1216                 [TK_SCOL] = "';'",
1217                 [TK_ELLIP] = "'...'",
1218                 [TK_ARROW] = "'=>'",
1219                 [TK_QLBRACK] = "'?.['",
1220                 [TK_QLPAREN] = "'?.('",
1221                 [TK_QDOT] = "'?.'",
1222                 [TK_ASEXP] = "'**='",
1223                 [TK_ASAND] = "'&&='",
1224                 [TK_ASOR] = "'||='",
1225                 [TK_ASNULLISH] = "'\?\?='",
1226                 [TK_NULLISH] = "'\?\?'",
1227                 [TK_PLACEH] = "'${'",
1228 
1229                 [TK_TEXT] = "Text",
1230                 [TK_LABEL] = "Label",
1231                 [TK_NUMBER] = "Number",
1232                 [TK_DOUBLE] = "Double",
1233                 [TK_STRING] = "String",
1234                 [TK_REGEXP] = "Regexp",
1235                 [TK_TEMPLATE] = "Template",
1236                 [TK_ERROR] = "Error",
1237                 [TK_EOF] = "End of file",
1238         };
1239 
1240         size_t i;
1241 
1242         for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
1243                 if (reserved_words[i].type != type)
1244                         continue;
1245 
1246                 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat);
1247 
1248                 return buf;
1249         }
1250 
1251         return tokennames[type] ? tokennames[type] : "?";
1252 }
1253 
1254 bool
1255 uc_lexer_is_keyword(uc_value_t *label)
1256 {
1257         size_t i;
1258 
1259         if (ucv_type(label) != UC_STRING)
1260                 return false;
1261 
1262         for (i = 0; i < ARRAY_SIZE(reserved_words); i++)
1263                 if (!strcmp(reserved_words[i].pat, ucv_string_get(label)))
1264                         return true;
1265 
1266         return false;
1267 }
1268 
1269 #endif /* NO_COMPILE */
1270 
/*
 * Encodes the given code point as UTF-8 multibyte sequence.
 *
 * The encoded bytes are written to the position *out points to; on success
 * *out is advanced past the written bytes and the number of bytes written
 * is subtracted from *rem, the space remaining in the output buffer.
 *
 * Returns false, leaving *out and *rem untouched, if the sequence does not
 * fit into the remaining space, otherwise true. Code points that are
 * negative or larger than 0x10FFFF are not encoded at all; true is
 * returned for them without writing anything.
 */

bool
utf8enc(char **out, int *rem, int code)
{
        char *dst = *out;
        int need;

        /* not encodable: nothing is emitted, yet this is not a failure */
        if (code < 0 || code > 0x10FFFF)
                return true;

        /* number of bytes the sequence occupies */
        if (code <= 0x7F)
                need = 1;
        else if (code <= 0x7FF)
                need = 2;
        else if (code <= 0xFFFF)
                need = 3;
        else
                need = 4;

        if (*rem < need)
                return false;

        switch (need) {
        case 1:
                dst[0] = code;
                break;

        case 2:
                dst[0] = ((code >>  6) & 0x1F) | 0xC0;
                dst[1] = ( code        & 0x3F) | 0x80;
                break;

        case 3:
                dst[0] = ((code >> 12) & 0x0F) | 0xE0;
                dst[1] = ((code >>  6) & 0x3F) | 0x80;
                dst[2] = ( code        & 0x3F) | 0x80;
                break;

        default:
                dst[0] = ((code >> 18) & 0x07) | 0xF0;
                dst[1] = ((code >> 12) & 0x3F) | 0x80;
                dst[2] = ((code >>  6) & 0x3F) | 0x80;
                dst[3] = ( code        & 0x3F) | 0x80;
                break;
        }

        *out = dst + need;
        *rem -= need;

        return true;
}
1324 

This page was automatically generated by LXR 0.3.1.  •  OpenWrt