• source navigation  • diff markup  • identifier search  • freetext search  • 

Sources/ucode/lexer.c

  1 /*
  2  * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io>
  3  *
  4  * Permission to use, copy, modify, and/or distribute this software for any
  5  * purpose with or without fee is hereby granted, provided that the above
  6  * copyright notice and this permission notice appear in all copies.
  7  *
  8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15  */
 16 
 17 #include <stdio.h>
 18 
 19 #include <stdbool.h>
 20 #include <stdlib.h>
 21 #include <string.h>
 22 #include <ctype.h>
 23 #include <regex.h>
 24 #include <math.h>
 25 #include <errno.h>
 26 #include <endian.h>
 27 
 28 #include "ucode/vm.h"
 29 #include "ucode/lib.h"
 30 #include "ucode/lexer.h"
 31 
 32 #define UC_LEX_CONTINUE_PARSING (void *)1
 33 
 34 struct keyword {
 35         unsigned type;
 36         const char *pat;
 37         unsigned plen;
 38 };
 39 
 40 struct token {
 41         unsigned type;
 42         union {
 43                 uint32_t patn;
 44                 char pat[4];
 45         } u;
 46         unsigned plen;
 47         uc_token_t *(*parse)(uc_lexer_t *);
 48 };
 49 
 50 #define dec(o) \
 51         ((o) - '')
 52 
 53 #define hex(x) \
 54         (((x) >= 'a') ? (10 + (x) - 'a') : \
 55                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 56 
 57 #ifndef NO_COMPILE
 58 
 59 static uc_token_t *parse_comment(uc_lexer_t *);
 60 static uc_token_t *parse_string(uc_lexer_t *);
 61 static uc_token_t *parse_regexp(uc_lexer_t *);
 62 static uc_token_t *parse_number(uc_lexer_t *);
 63 static uc_token_t *parse_label(uc_lexer_t *);
 64 
 65 static const struct token tokens[] = {
 66         { TK_ASLEFT,    { .pat = "<<=" },   3, NULL },
 67         { TK_ASRIGHT,   { .pat = ">>=" },   3, NULL },
 68         { TK_LEXP,              { .pat = "{{-" },   3, NULL },
 69         { TK_REXP,              { .pat = "-}}" },   3, NULL },
 70         { TK_LSTM,              { .pat = "{%+" },   3, NULL },
 71         { TK_LSTM,              { .pat = "{%-" },   3, NULL },
 72         { TK_RSTM,              { .pat = "-%}" },   3, NULL },
 73         { TK_EQS,               { .pat = "===" },   3, NULL },
 74         { TK_NES,               { .pat = "!==" },   3, NULL },
 75         { TK_ELLIP,             { .pat = "..." },   3, NULL },
 76         { TK_QLBRACK,   { .pat = "?.[" },   3, NULL },
 77         { TK_QLPAREN,   { .pat = "?.(" },   3, NULL },
 78         { TK_AND,               { .pat = "&&" },    2, NULL },
 79         { TK_ASADD,             { .pat = "+=" },    2, NULL },
 80         { TK_ASBAND,    { .pat = "&=" },    2, NULL },
 81         { TK_ASBOR,             { .pat = "|=" },    2, NULL },
 82         { TK_ASBXOR,    { .pat = "^=" },    2, NULL },
 83         //{ TK_ASDIV,   { .pat = "/=" },    2, NULL },
 84         { TK_ASMOD,             { .pat = "%=" },    2, NULL },
 85         { TK_ASMUL,             { .pat = "*=" },    2, NULL },
 86         { TK_ASSUB,             { .pat = "-=" },    2, NULL },
 87         { TK_DEC,               { .pat = "--" },    2, NULL },
 88         { TK_INC,               { .pat = "++" },    2, NULL },
 89         { TK_EQ,                { .pat = "==" },    2, NULL },
 90         { TK_NE,                { .pat = "!=" },    2, NULL },
 91         { TK_LE,                { .pat = "<=" },    2, NULL },
 92         { TK_GE,                { .pat = ">=" },    2, NULL },
 93         { TK_LSHIFT,    { .pat = "<<" },    2, NULL },
 94         { TK_RSHIFT,    { .pat = ">>" },    2, NULL },
 95         { 0,                    { .pat = "//" },    2, parse_comment },
 96         { 0,                    { .pat = "/*" },    2, parse_comment },
 97         { TK_OR,                { .pat = "||" },    2, NULL },
 98         { TK_LEXP,              { .pat = "{{" },    2, NULL },
 99         { TK_REXP,              { .pat = "}}" },    2, NULL },
100         { TK_LSTM,              { .pat = "{%" },    2, NULL },
101         { TK_RSTM,              { .pat = "%}" },    2, NULL },
102         { TK_ARROW,             { .pat = "=>" },    2, NULL },
103         { TK_QDOT,              { .pat = "?." },    2, NULL },
104         { TK_ADD,               { .pat = "+" },     1, NULL },
105         { TK_ASSIGN,    { .pat = "=" },     1, NULL },
106         { TK_BAND,              { .pat = "&" },     1, NULL },
107         { TK_BOR,               { .pat = "|" },     1, NULL },
108         { TK_LBRACK,    { .pat = "[" },     1, NULL },
109         { TK_RBRACK,    { .pat = "]" },     1, NULL },
110         { TK_BXOR,              { .pat = "^" },     1, NULL },
111         { TK_LBRACE,    { .pat = "{" },     1, NULL },
112         { TK_RBRACE,    { .pat = "}" },     1, NULL },
113         { TK_COLON,             { .pat = ":" },     1, NULL },
114         { TK_COMMA,             { .pat = "," },     1, NULL },
115         { TK_COMPL,             { .pat = "~" },     1, NULL },
116         //{ TK_DIV,             { .pat = "/" },     1, NULL },
117         { TK_GT,                { .pat = ">" },     1, NULL },
118         { TK_NOT,               { .pat = "!" },     1, NULL },
119         { TK_LT,                { .pat = "<" },     1, NULL },
120         { TK_MOD,               { .pat = "%" },     1, NULL },
121         { TK_MUL,               { .pat = "*" },     1, NULL },
122         { TK_LPAREN,    { .pat = "(" },     1, NULL },
123         { TK_RPAREN,    { .pat = ")" },     1, NULL },
124         { TK_QMARK,             { .pat = "?" },     1, NULL },
125         { TK_SCOL,              { .pat = ";" },     1, NULL },
126         { TK_SUB,               { .pat = "-" },     1, NULL },
127         { TK_DOT,               { .pat = "." },     1, NULL },
128         { TK_STRING,    { .pat = "'" },     1, parse_string },
129         { TK_STRING,    { .pat = "\"" },    1, parse_string },
130         { TK_REGEXP,    { .pat = "/" },     1, parse_regexp },
131         { TK_LABEL,             { .pat = "_" },     1, parse_label },
132         { TK_LABEL,             { .pat = "az" },    0, parse_label },
133         { TK_LABEL,             { .pat = "AZ" },    0, parse_label },
134         { TK_NUMBER,    { .pat = "09" },    0, parse_number },
135 };
136 
137 static const struct keyword reserved_words[] = {
138         { TK_ENDFUNC,   "endfunction", 11 },
139         { TK_CONTINUE,  "continue", 8 },
140         { TK_ENDWHILE,  "endwhile", 8 },
141         { TK_FUNC,              "function", 8 },
142         { TK_DEFAULT,   "default", 7 },
143         { TK_DELETE,    "delete", 6 },
144         { TK_RETURN,    "return", 6 },
145         { TK_ENDFOR,    "endfor", 6 },
146         { TK_SWITCH,    "switch", 6 },
147         { TK_ENDIF,             "endif", 5 },
148         { TK_WHILE,             "while", 5 },
149         { TK_BREAK,             "break", 5 },
150         { TK_CATCH,             "catch", 5 },
151         { TK_CONST,             "const", 5 },
152         { TK_FALSE,             "false", 5 },
153         { TK_TRUE,              "true",  4 },
154         { TK_ELIF,              "elif",  4 },
155         { TK_ELSE,              "else",  4 },
156         { TK_THIS,              "this",  4 },
157         { TK_NULL,              "null",  4 },
158         { TK_CASE,              "case",  4 },
159         { TK_TRY,               "try",   3 },
160         { TK_FOR,               "for",   3 },
161         { TK_LOCAL,             "let",   3 },
162         { TK_IF,                "if",    2 },
163         { TK_IN,                "in",    2 },
164 };
165 
166 
167 /* length of the longest token in our lookup table */
168 #define UC_LEX_MAX_TOKEN_LEN 3
169 
170 static uc_token_t *
171 emit_op(uc_lexer_t *lex, uint32_t pos, int type, uc_value_t *uv)
172 {
173         lex->curr.type = type;
174         lex->curr.uv = uv;
175         lex->curr.pos = pos;
176 
177         return &lex->curr;
178 }
179 
180 static void lookbehind_append(uc_lexer_t *lex, const char *data, size_t len)
181 {
182         if (len) {
183                 lex->lookbehind = xrealloc(lex->lookbehind, lex->lookbehindlen + len);
184                 memcpy(lex->lookbehind + lex->lookbehindlen, data, len);
185                 lex->lookbehindlen += len;
186         }
187 }
188 
189 static void lookbehind_reset(uc_lexer_t *lex) {
190         free(lex->lookbehind);
191         lex->lookbehind = NULL;
192         lex->lookbehindlen = 0;
193 }
194 
195 static uc_token_t *
196 lookbehind_to_text(uc_lexer_t *lex, uint32_t pos, int type, const char *strip_trailing_chars) {
197         uc_token_t *rv = NULL;
198 
199         if (lex->lookbehind) {
200                 if (strip_trailing_chars) {
201                         while (lex->lookbehindlen > 0 && strchr(strip_trailing_chars, lex->lookbehind[lex->lookbehindlen-1]))
202                                 lex->lookbehindlen--;
203                 }
204 
205                 rv = emit_op(lex, pos, type, ucv_string_new_length(lex->lookbehind, lex->lookbehindlen));
206 
207                 lookbehind_reset(lex);
208         }
209 
210         return rv;
211 }
212 
213 static inline size_t
214 buf_remaining(uc_lexer_t *lex) {
215         return (lex->bufend - lex->bufstart);
216 }
217 
218 static inline bool
219 _buf_startswith(uc_lexer_t *lex, const char *str, size_t len) {
220         return (buf_remaining(lex) >= len && !strncmp(lex->bufstart, str, len));
221 }
222 
223 #define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1)
224 
225 
226 static void
227 buf_consume(uc_lexer_t *lex, size_t len) {
228         size_t i, linelen;
229 
230         for (i = 0, linelen = 0; i < len; i++) {
231                 if (lex->bufstart[i] == '\n') {
232                         uc_source_line_update(lex->source, linelen);
233                         uc_source_line_next(lex->source);
234 
235                         linelen = 0;
236                 }
237                 else {
238                         linelen++;
239                 }
240         }
241 
242         if (linelen)
243                 uc_source_line_update(lex->source, linelen);
244 
245         lex->bufstart += len;
246         lex->source->off += len;
247 }
248 
249 static uc_token_t *
250 parse_comment(uc_lexer_t *lex)
251 {
252         const struct token *tok = lex->tok;
253         const char *ptr, *end;
254         size_t elen;
255 
256         if (!strcmp(tok->u.pat, "//")) {
257                 end = "\n";
258                 elen = 1;
259         }
260         else {
261                 end = "*/";
262                 elen = 2;
263         }
264 
265         for (ptr = lex->bufstart; ptr < lex->bufend - elen; ptr++) {
266                 if (!strncmp(ptr, end, elen)) {
267                         buf_consume(lex, (ptr - lex->bufstart) + elen);
268 
269                         return UC_LEX_CONTINUE_PARSING;
270                 }
271         }
272 
273         buf_consume(lex, ptr - lex->bufstart);
274 
275         if (lex->eof) {
276                 lex->state = UC_LEX_EOF;
277 
278                 if (elen == 2)
279                         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
280         }
281 
282         return NULL;
283 }
284 
285 static void
286 append_utf8(uc_lexer_t *lex, int code) {
287         char ustr[8], *up;
288         int rem;
289 
290         up = ustr;
291         rem = sizeof(ustr);
292 
293         if (utf8enc(&up, &rem, code))
294                 lookbehind_append(lex, ustr, up - ustr);
295 }
296 
297 static uc_token_t *
298 parse_string(uc_lexer_t *lex)
299 {
300         const struct token *tok = lex->tok;
301         char q = tok->u.pat[0];
302         char *ptr, *c;
303         uc_token_t *rv;
304         int code;
305 
306         if (!buf_remaining(lex))
307                 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
308 
309         for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) {
310                 /* continuation of escape sequence */
311                 if (lex->is_escape) {
312                         if (lex->esclen == 0) {
313                                 /* non-unicode escape following a lead surrogate, emit replacement... */
314                                 if (lex->lead_surrogate && *ptr != 'u') {
315                                         append_utf8(lex, 0xFFFD);
316                                         lex->lead_surrogate = 0;
317                                 }
318 
319                                 switch ((q == '/') ? 0 : *ptr) {
320                                 case 'u':
321                                 case 'x':
322                                         lex->esc[lex->esclen++] = *ptr;
323                                         break;
324 
325                                 case '':
326                                 case '1':
327                                 case '2':
328                                 case '3':
329                                 case '4':
330                                 case '5':
331                                 case '6':
332                                 case '7':
333                                         lex->esc[lex->esclen++] = 'o';
334                                         lex->esc[lex->esclen++] = *ptr;
335                                         break;
336 
337                                 default:
338                                         lex->is_escape = false;
339                                         c = strchr("a\ab\be\033f\fn\nr\rt\tv\v", *ptr);
340 
341                                         if (c && *c >= 'a') {
342                                                 lookbehind_append(lex, c + 1, 1);
343                                         }
344                                         else {
345                                                 /* regex mode => retain backslash */
346                                                 if (q == '/')
347                                                         lookbehind_append(lex, "\\", 1);
348 
349                                                 lookbehind_append(lex, ptr, 1);
350                                         }
351 
352                                         buf_consume(lex, (ptr + 1) - lex->bufstart);
353 
354                                         break;
355                                 }
356                         }
357                         else {
358                                 switch (lex->esc[0]) {
359                                 case 'u':
360                                         if (lex->esclen < 5) {
361                                                 if (!isxdigit(*ptr))
362                                                         return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
363 
364                                                 lex->esc[lex->esclen++] = *ptr;
365                                         }
366 
367                                         if (lex->esclen == 5) {
368                                                 code = hex(lex->esc[1]) * 16 * 16 * 16 +
369                                                        hex(lex->esc[2]) * 16 * 16 +
370                                                        hex(lex->esc[3]) * 16 +
371                                                        hex(lex->esc[4]);
372 
373                                                 /* is a leading surrogate value */
374                                                 if ((code & 0xFC00) == 0xD800) {
375                                                         /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
376                                                         if (lex->lead_surrogate)
377                                                                 append_utf8(lex, 0xFFFD);
378 
379                                                         /* store surrogate value and advance to next escape sequence */
380                                                         lex->lead_surrogate = code;
381                                                 }
382 
383                                                 /* is a trailing surrogate value */
384                                                 else if ((code & 0xFC00) == 0xDC00) {
385                                                         /* found a trailing surrogate following a leading one, combine and encode */
386                                                         if (lex->lead_surrogate) {
387                                                                 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
388                                                                 lex->lead_surrogate = 0;
389                                                         }
390 
391                                                         /* trailing surrogate not following a leading one, ignore and use replacement char */
392                                                         else {
393                                                                 code = 0xFFFD;
394                                                         }
395 
396                                                         append_utf8(lex, code);
397                                                 }
398 
399                                                 /* is a normal codepoint */
400                                                 else {
401                                                         append_utf8(lex, code);
402                                                 }
403 
404                                                 lex->esclen = 0;
405                                                 lex->is_escape = false;
406                                                 buf_consume(lex, (ptr + 1) - lex->bufstart);
407                                         }
408 
409                                         break;
410 
411                                 case 'x':
412                                         if (lex->esclen < 3) {
413                                                 if (!isxdigit(*ptr))
414                                                         return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
415 
416                                                 lex->esc[lex->esclen++] = *ptr;
417                                         }
418 
419                                         if (lex->esclen == 3) {
420                                                 append_utf8(lex, hex(lex->esc[1]) * 16 + hex(lex->esc[2]));
421 
422                                                 lex->esclen = 0;
423                                                 lex->is_escape = false;
424                                                 buf_consume(lex, (ptr + 1) - lex->bufstart);
425                                         }
426 
427                                         break;
428 
429                                 case 'o':
430                                         if (lex->esclen < 4) {
431                                                 /* found a non-octal char */
432                                                 if (*ptr < '' || *ptr > '7') {
433                                                         /* pad sequence to three chars */
434                                                         switch (lex->esclen) {
435                                                         case 3:
436                                                                 lex->esc[3] = lex->esc[2];
437                                                                 lex->esc[2] = lex->esc[1];
438                                                                 lex->esc[1] = '';
439                                                                 break;
440 
441                                                         case 2:
442                                                                 lex->esc[3] = lex->esc[1];
443                                                                 lex->esc[2] = '';
444                                                                 lex->esc[1] = '';
445                                                                 break;
446                                                         }
447 
448                                                         lex->esclen = 4;
449                                                         buf_consume(lex, ptr-- - lex->bufstart);
450                                                 }
451 
452                                                 /* append */
453                                                 else {
454                                                         lex->esc[lex->esclen++] = *ptr;
455                                                         buf_consume(lex, (ptr + 1) - lex->bufstart);
456                                                 }
457                                         }
458 
459                                         if (lex->esclen == 4) {
460                                                 code = dec(lex->esc[1]) * 8 * 8 +
461                                                        dec(lex->esc[2]) * 8 +
462                                                        dec(lex->esc[3]);
463 
464                                                 if (code > 255)
465                                                         return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
466 
467                                                 append_utf8(lex, code);
468 
469                                                 lex->esclen = 0;
470                                                 lex->is_escape = false;
471                                         }
472 
473                                         break;
474                                 }
475                         }
476                 }
477 
478                 /* terminating char */
479                 else if (*ptr == q) {
480                         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
481                         buf_consume(lex, (ptr + 1) - lex->bufstart);
482 
483                         rv = lookbehind_to_text(lex, lex->lastoff, TK_STRING, NULL);
484 
485                         if (!rv)
486                                 rv = emit_op(lex, lex->lastoff, TK_STRING, ucv_string_new_length("", 0));
487 
488                         return rv;
489                 }
490 
491                 /* escape sequence start */
492                 else if (*ptr == '\\') {
493                         lex->is_escape = true;
494                         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
495                         buf_consume(lex, (ptr - lex->bufstart) + 1);
496                 }
497         }
498 
499         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
500         buf_consume(lex, ptr - lex->bufstart);
501 
502         return NULL;
503 }
504 
505 
/*
 * Parses a regexp literal from the lexer's input buffer.
 *
 * Returns NULL while more input is needed, a TK_ERROR token when parsing
 * the pattern fails, a TK_DIV or TK_ASDIV token when a regexp literal is
 * not allowed at this position, and otherwise the finished TK_REGEXP
 * token.
 */
518 
/* States of the regexp literal parser; parse_regexp() keeps the current one in lex->esc[0]. */
enum {
        UC_LEX_PARSE_REGEX_INIT    = 0, /* nothing parsed yet */
        UC_LEX_PARSE_REGEX_PATTERN = 1, /* reading the pattern body */
        UC_LEX_PARSE_REGEX_FLAGS   = 2  /* reading the trailing flag letters */
};
524 
525 static uc_token_t *
526 parse_regexp(uc_lexer_t *lex)
527 {
528         bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
529         uc_token_t *rv;
530         size_t len;
531         char *s;
532 
533         switch (lex->esc[0]) {
534         case UC_LEX_PARSE_REGEX_INIT:
535                 if (lex->no_regexp) {
536                         if (buf_startswith(lex, "=")) {
537                                 buf_consume(lex, 1);
538 
539                                 return emit_op(lex, lex->source->off, TK_ASDIV, NULL);
540                         }
541 
542                         return emit_op(lex, lex->source->off, TK_DIV, NULL);
543                 }
544 
545                 lex->esc[0] = UC_LEX_PARSE_REGEX_PATTERN;
546                 break;
547 
548         case UC_LEX_PARSE_REGEX_PATTERN:
549                 rv = parse_string(lex);
550 
551                 if (rv && rv->type == TK_ERROR)
552                         return rv;
553 
554                 if (rv != NULL && rv != UC_LEX_CONTINUE_PARSING) {
555                         lex->lookbehind = (char *)rv;
556                         lex->esc[0] = UC_LEX_PARSE_REGEX_FLAGS;
557                 }
558 
559                 break;
560 
561         case UC_LEX_PARSE_REGEX_FLAGS:
562                 rv = (uc_token_t *)lex->lookbehind;
563 
564                 while (lex->bufstart < lex->bufend || lex->eof) {
565                         switch (lex->eof ? EOF : lex->bufstart[0]) {
566                         case 'g':
567                                 buf_consume(lex, 1);
568                                 is_reg_global = true;
569                                 break;
570 
571                         case 'i':
572                                 buf_consume(lex, 1);
573                                 is_reg_icase = true;
574                                 break;
575 
576                         case 's':
577                                 buf_consume(lex, 1);
578                                 is_reg_newline = true;
579                                 break;
580 
581                         default:
582                                 lex->lookbehind = NULL;
583 
584                                 len = xasprintf(&s, "%c%*s",
585                                         (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
586                                         ucv_string_length(rv->uv),
587                                         ucv_string_get(rv->uv));
588 
589                                 ucv_free(rv->uv, false);
590                                 rv->uv = ucv_string_new_length(s, len);
591                                 free(s);
592 
593                                 rv->type = TK_REGEXP;
594 
595                                 return rv;
596                         }
597                 }
598 
599                 break;
600         }
601 
602         return NULL;
603 }
604 
605 
/*
 * Parses a label (identifier) from the lexer's input buffer.
 *
 * Returns NULL while label characters are still being collected. Once the
 * label has ended, returns the token of the matching reserved word or,
 * when no reserved word matches, a TK_LABEL token carrying the label text.
 */
615 
616 static uc_token_t *
617 parse_label(uc_lexer_t *lex)
618 {
619         const struct token *tok = lex->tok;
620         const struct keyword *word;
621         char *ptr;
622         size_t i;
623 
624         if (!lex->lookbehind && tok->plen)
625                 lookbehind_append(lex, tok->u.pat, tok->plen);
626 
627         if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) {
628                 if (lex->no_keyword == false) {
629                         for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
630                                 if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) {
631                                         lookbehind_reset(lex);
632 
633                                         return emit_op(lex, lex->source->off - word->plen, word->type, NULL);
634                                 }
635                         }
636                 }
637 
638                 return lookbehind_to_text(lex, lex->source->off - lex->lookbehindlen, TK_LABEL, NULL);
639         }
640 
641         for (ptr = lex->bufstart; ptr < lex->bufend && (*ptr == '_' || isalnum(*ptr)); ptr++)
642                 ;
643 
644         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
645         buf_consume(lex, ptr - lex->bufstart);
646 
647         return NULL;
648 }
649 
650 
/*
 * Parses a number literal from the lexer's input buffer.
 *
 * Returns NULL while numeric characters are still being collected. Once
 * the literal has ended, returns a TK_NUMBER or TK_DOUBLE token carrying
 * the parsed value, or a TK_ERROR token ("Invalid number literal") when
 * the collected text is not a valid number.
 */
660 
661 static inline bool
662 is_numeric_char(uc_lexer_t *lex, char c)
663 {
664         char prev = lex->lookbehindlen ? lex->lookbehind[lex->lookbehindlen-1] : 0;
665 
666         if ((prev == 'e' || prev == 'E') && (c == '-' || c == '+'))
667                 return true;
668 
669         return prev ? (isxdigit(c) || c == 'x' || c == 'X' || c == '.') : (isdigit(c) || c == '.');
670 }
671 
672 static uc_token_t *
673 parse_number(uc_lexer_t *lex)
674 {
675         uc_token_t *rv = NULL;
676         uc_value_t *nv = NULL;
677         const char *ptr;
678         char *e;
679 
680         if (!buf_remaining(lex) || !is_numeric_char(lex, lex->bufstart[0])) {
681                 lookbehind_append(lex, "\0", 1);
682 
683                 nv = uc_number_parse(lex->lookbehind, &e);
684 
685                 switch (ucv_type(nv)) {
686                 case UC_DOUBLE:
687                         rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_DOUBLE, nv);
688                         break;
689 
690                 case UC_INTEGER:
691                         rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_NUMBER, nv);
692                         break;
693 
694                 default:
695                         rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, ucv_string_new("Invalid number literal"));
696                 }
697 
698                 lookbehind_reset(lex);
699 
700                 return rv;
701         }
702 
703         for (ptr = lex->bufstart; ptr < lex->bufend && is_numeric_char(lex, *ptr); ptr++)
704                 ;
705 
706         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
707         buf_consume(lex, ptr - lex->bufstart);
708 
709         return NULL;
710 }
711 
712 static uc_token_t *
713 lex_step(uc_lexer_t *lex, FILE *fp)
714 {
715         uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) };
716         union { uint32_t n; char str[4]; } search;
717         const struct token *tok;
718         size_t rlen, rem;
719         char *ptr, c;
720         uc_token_t *rv;
721         size_t i;
722 
723         /* only less than UC_LEX_MAX_TOKEN_LEN unread buffer chars remaining,
724          * move the remaining bytes to the beginning and read more data */
725         if (buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN) {
726                 if (!lex->buf) {
727                         lex->buflen = 128;
728                         lex->buf = xalloc(lex->buflen);
729                 }
730 
731                 rem = lex->bufend - lex->bufstart;
732 
733                 if (rem)
734                         memcpy(lex->buf, lex->bufstart, rem);
735 
736                 rlen = fread(lex->buf + rem, 1, lex->buflen - rem, fp);
737 
738                 lex->bufstart = lex->buf;
739                 lex->bufend   = lex->buf + rlen + rem;
740 
741                 if (rlen == 0 && (ferror(fp) || feof(fp)))
742                         lex->eof = 1;
743         }
744 
745         switch (lex->state) {
746         case UC_LEX_IDENTIFY_BLOCK:
747                 /* previous block had strip trailing whitespace flag, skip leading whitespace */
748                 if (lex->modifier == MINUS) {
749                         while (buf_remaining(lex) && isspace(lex->bufstart[0]))
750                                 buf_consume(lex, 1);
751 
752                         lex->modifier = UNSPEC;
753                 }
754 
755                 /* previous block was a statement block and trim_blocks is enabld, skip leading newline */
756                 else if (lex->modifier == NEWLINE) {
757                         if (buf_startswith(lex, "\n"))
758                                 buf_consume(lex, 1);
759 
760                         lex->modifier = UNSPEC;
761                 }
762 
763                 /* scan forward through buffer to identify start token */
764                 for (ptr = lex->bufstart; ptr < lex->bufend - strlen("{#"); ptr++) {
765                         /* found start of comment block */
766                         if (!strncmp(ptr, "{#", 2)) {
767                                 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
768                                 buf_consume(lex, (ptr + 2) - lex->bufstart);
769                                 lex->lastoff = lex->source->off - 2;
770                                 lex->state = UC_LEX_BLOCK_COMMENT_START;
771 
772                                 return NULL;
773                         }
774 
775                         /* found start of expression block */
776                         else if (!strncmp(ptr, "{{", 2)) {
777                                 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
778                                 buf_consume(lex, (ptr + 2) - lex->bufstart);
779                                 lex->lastoff = lex->source->off - 2;
780                                 lex->state = UC_LEX_BLOCK_EXPRESSION_START;
781 
782                                 return NULL;
783                         }
784 
785                         /* found start of statement block */
786                         else if (!strncmp(ptr, "{%", 2)) {
787                                 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
788                                 buf_consume(lex, (ptr + 2) - lex->bufstart);
789                                 lex->lastoff = lex->source->off - 2;
790                                 lex->state = UC_LEX_BLOCK_STATEMENT_START;
791 
792                                 return NULL;
793                         }
794                 }
795 
796                 /* we're at eof */
797                 if (lex->eof) {
798                         lookbehind_append(lex, ptr, lex->bufend - ptr);
799                         lex->state = UC_LEX_EOF;
800 
801                         return lookbehind_to_text(lex, lex->lastoff, TK_TEXT, NULL);
802                 }
803 
804                 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
805                 buf_consume(lex, ptr - lex->bufstart);
806                 break;
807 
808 
809         case UC_LEX_BLOCK_COMMENT_START:
810         case UC_LEX_BLOCK_EXPRESSION_START:
811         case UC_LEX_BLOCK_STATEMENT_START:
812                 rv = NULL;
813                 lex->modifier = UNSPEC;
814 
815                 /* strip whitespace before block */
816                 if (buf_startswith(lex, "-")) {
817                         rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \n\t\v\f\r");
818                         buf_consume(lex, 1);
819                 }
820 
821                 /* disable lstrip flag (only valid for statement blocks) */
822                 else if (lex->state == UC_LEX_BLOCK_STATEMENT_START) {
823                         /* disable lstrip flag */
824                         if (buf_startswith(lex, "+")) {
825                                 rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
826                                 buf_consume(lex, 1);
827                         }
828 
829                         /* global block lstrip */
830                         else if (lex->config && lex->config->lstrip_blocks) {
831                                 rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \t\v\f\r");
832                         }
833                 }
834                 else {
835                         rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
836                 }
837 
838                 switch (lex->state) {
839                 case UC_LEX_BLOCK_COMMENT_START:
840                         lex->state = UC_LEX_BLOCK_COMMENT;
841                         lex->block = COMMENT;
842                         break;
843 
844                 case UC_LEX_BLOCK_STATEMENT_START:
845                         lex->state = UC_LEX_IDENTIFY_TOKEN;
846                         lex->block = STATEMENTS;
847                         break;
848 
849                 case UC_LEX_BLOCK_EXPRESSION_START:
850                         lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
851                         break;
852 
853                 default:
854                         break;
855                 }
856 
857                 return rv;
858 
859 
860         case UC_LEX_BLOCK_COMMENT:
861                 /* scan forward through buffer to identify end token */
862                 while (lex->bufstart < lex->bufend - 2) {
863                         if (buf_startswith(lex, "-#}")) {
864                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
865                                 lex->modifier = MINUS;
866                                 buf_consume(lex, 3);
867                                 lex->lastoff = lex->source->off;
868                                 break;
869                         }
870                         else if (buf_startswith(lex, "#}")) {
871                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
872                                 buf_consume(lex, 2);
873                                 lex->lastoff = lex->source->off;
874                                 break;
875                         }
876 
877                         buf_consume(lex, 1);
878                 }
879 
880                 /* we're at eof */
881                 if (lex->eof) {
882                         lex->state = UC_LEX_EOF;
883 
884                         buf_consume(lex, lex->bufend - lex->bufstart);
885 
886                         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
887                 }
888 
889                 break;
890 
891 
892         case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
893                 lex->state = UC_LEX_IDENTIFY_TOKEN;
894                 lex->block = EXPRESSION;
895 
896                 return emit_op(lex, lex->source->off, TK_LEXP, NULL);
897 
898 
899         case UC_LEX_IDENTIFY_TOKEN:
900                 /* skip leading whitespace */
901                 for (i = 0; i < buf_remaining(lex) && isspace(lex->bufstart[i]); i++)
902                         ;
903 
904                 buf_consume(lex, i);
905 
906                 if (i > 0 && buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN)
907                         return NULL;
908 
909                 for (i = 0; i < sizeof(search.str); i++)
910                         search.str[i] = (i < buf_remaining(lex)) ? lex->bufstart[i] : 0;
911 
912                 for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) {
913                         /* remaining buffer data is shorter than token, skip */
914                         if (tok->plen > buf_remaining(lex))
915                                 continue;
916 
917                         c = buf_remaining(lex) ? lex->bufstart[0] : 0;
918 
919                         if (tok->plen ? ((search.n & masks[tok->plen]) == tok->u.patn)
920                                       : (c >= tok->u.pat[0] && c <= tok->u.pat[1])) {
921                                 lex->lastoff = lex->source->off;
922 
923                                 /* token has a parse method, switch state */
924                                 if (tok->parse) {
925                                         lex->tok = tok;
926                                         lex->state = UC_LEX_PARSE_TOKEN;
927 
928                                         buf_consume(lex, tok->plen);
929 
930                                         return NULL;
931                                 }
932 
933                                 /* in raw code mode, ignore template tag tokens */
934                                 if (lex->config && lex->config->raw_mode &&
935                                     (tok->type == TK_LSTM || tok->type == TK_RSTM ||
936                                      tok->type == TK_LEXP || tok->type == TK_REXP)) {
937                                         continue;
938                                 }
939 
940                                 /* disallow nesting blocks */
941                                 if (tok->type == TK_LSTM || tok->type == TK_LEXP) {
942                                         buf_consume(lex, tok->plen);
943 
944                                         return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
945                                 }
946 
947                                 /* found end of block */
948                                 else if ((lex->block == STATEMENTS && tok->type == TK_RSTM) ||
949                                          (lex->block == EXPRESSION && tok->type == TK_REXP)) {
950                                         /* strip whitespace after block */
951                                         if (tok->u.pat[0] == '-')
952                                                 lex->modifier = MINUS;
953 
954                                         /* strip newline after statement block */
955                                         else if (lex->block == STATEMENTS &&
956                                                  lex->config && lex->config->trim_blocks)
957                                                 lex->modifier = NEWLINE;
958 
959                                         lex->state = UC_LEX_IDENTIFY_BLOCK;
960                                         lex->block = NONE;
961                                 }
962 
963                                 /* do not report statement tags to the parser */
964                                 if (tok->type != 0 && tok->type != TK_LSTM)
965                                         rv = emit_op(lex, lex->source->off,
966                                                 (tok->type == TK_RSTM) ? TK_SCOL : tok->type, NULL);
967                                 else
968                                         rv = NULL;
969 
970                                 buf_consume(lex, tok->plen);
971 
972                                 return rv;
973                         }
974                 }
975 
976                 /* no possible return beyond this point can advance,
977                    mark lex state as eof */
978                 lex->state = UC_LEX_EOF;
979 
980                 /* no token matched and we do have remaining data, junk */
981                 if (buf_remaining(lex))
982                         return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unexpected character"));
983 
984                 /* we're at eof, allow unclosed statement blocks */
985                 if (lex->block == STATEMENTS)
986                         return NULL;
987 
988                 /* premature EOF */
989                 return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unterminated template block"));
990 
991 
992         case UC_LEX_PARSE_TOKEN:
993                 tok = lex->tok;
994                 rv = tok->parse(lex);
995 
996                 if (rv) {
997                         memset(lex->esc, 0, sizeof(lex->esc));
998                         lex->state = UC_LEX_IDENTIFY_TOKEN;
999                         lex->tok = NULL;
1000 
1001                         if (rv == UC_LEX_CONTINUE_PARSING)
1002                                 rv = NULL;
1003 
1004                         return rv;
1005                 }
1006 
1007                 break;
1008 
1009 
1010         case UC_LEX_EOF:
1011                 break;
1012         }
1013 
1014         return NULL;
1015 }
1016 
1017 void
1018 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
1019 {
1020         lex->state = UC_LEX_IDENTIFY_BLOCK;
1021 
1022         lex->config = config;
1023         lex->source = uc_source_get(source);
1024 
1025         lex->eof = 0;
1026         lex->is_escape = 0;
1027 
1028         lex->block = NONE;
1029         lex->modifier = UNSPEC;
1030 
1031         lex->buflen = 0;
1032         lex->buf = NULL;
1033         lex->bufstart = NULL;
1034         lex->bufend = NULL;
1035 
1036         lex->lookbehindlen = 0;
1037         lex->lookbehind = NULL;
1038 
1039         lex->tok = NULL;
1040 
1041         lex->esclen = 0;
1042         memset(lex->esc, 0, sizeof(lex->esc));
1043 
1044         lex->lead_surrogate = 0;
1045 
1046         lex->lastoff = 0;
1047 
1048         if (config && config->raw_mode) {
1049                 lex->state = UC_LEX_IDENTIFY_TOKEN;
1050                 lex->block = STATEMENTS;
1051         }
1052 }
1053 
1054 void
1055 uc_lexer_free(uc_lexer_t *lex)
1056 {
1057         uc_source_put(lex->source);
1058 
1059         free(lex->lookbehind);
1060         free(lex->buf);
1061 }
1062 
1063 uc_token_t *
1064 uc_lexer_next_token(uc_lexer_t *lex)
1065 {
1066         uc_token_t *rv = NULL;
1067 
1068         while (lex->state != UC_LEX_EOF) {
1069                 rv = lex_step(lex, lex->source->fp);
1070 
1071                 if (rv != NULL)
1072                         break;
1073         }
1074 
1075         if (rv) {
1076                 lex->no_keyword = false;
1077                 lex->no_regexp = false;
1078 
1079                 return rv;
1080         }
1081 
1082         return emit_op(lex, lex->source->off, TK_EOF, NULL);
1083 }
1084 
1085 const char *
1086 uc_tokenname(unsigned type)
1087 {
1088         static char buf[sizeof("'endfunction'")];
1089         size_t i;
1090 
1091         switch (type) {
1092         case 0:        return "End of file";
1093         case TK_STRING: return "String";
1094         case TK_LABEL:  return "Label";
1095         case TK_NUMBER: return "Number";
1096         case TK_DOUBLE: return "Double";
1097         case TK_REGEXP: return "Regexp";
1098         }
1099 
1100         for (i = 0; i < ARRAY_SIZE(tokens); i++) {
1101                 if (tokens[i].type != type)
1102                         continue;
1103 
1104                 snprintf(buf, sizeof(buf), "'%s'", tokens[i].u.pat);
1105 
1106                 return buf;
1107         }
1108 
1109         for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
1110                 if (reserved_words[i].type != type)
1111                         continue;
1112 
1113                 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat);
1114 
1115                 return buf;
1116         }
1117 
1118         return "?";
1119 }
1120 
1121 bool
1122 uc_lexer_is_keyword(uc_value_t *label)
1123 {
1124         size_t i;
1125 
1126         if (ucv_type(label) != UC_STRING)
1127                 return false;
1128 
1129         for (i = 0; i < ARRAY_SIZE(reserved_words); i++)
1130                 if (!strcmp(reserved_words[i].pat, ucv_string_get(label)))
1131                         return true;
1132 
1133         return false;
1134 }
1135 
1136 #endif /* NO_COMPILE */
1137 
/*
 * Stores the given codepoint as a UTF-8 multibyte sequence into the given
 * output buffer, advances the output pointer past the written bytes and
 * subtracts the number of written bytes from the given length pointer.
 *
 * Returns false if the multibyte sequence would not fit into the buffer, in
 * which case nothing is written. Otherwise returns true; this includes
 * codepoints that are negative or larger than 0x10FFFF, for which nothing is
 * written either.
 */

bool
utf8enc(char **out, int *rem, int code)
{
	int len, shift;
	int lead;

	/* values outside of the encodable range are silently skipped */
	if (code < 0 || code > 0x10FFFF)
		return true;

	/* pick the sequence length and the marker bits of the leading byte */
	if (code <= 0x7F) {
		len = 1;
		lead = 0x00;
	}
	else if (code <= 0x7FF) {
		len = 2;
		lead = 0xC0;
	}
	else if (code <= 0xFFFF) {
		len = 3;
		lead = 0xE0;
	}
	else {
		len = 4;
		lead = 0xF0;
	}

	if (*rem < len)
		return false;

	/* leading byte carries the topmost payload bits */
	shift = 6 * (len - 1);
	*(*out)++ = (code >> shift) | lead;

	/* each continuation byte carries the next six payload bits */
	while (shift > 0) {
		shift -= 6;
		*(*out)++ = ((code >> shift) & 0x3F) | 0x80;
	}

	*rem -= len;

	return true;
}
1191 

This page was automatically generated by LXR 0.3.1.  •  OpenWrt