• source navigation  • diff markup  • identifier search  • freetext search  • 

Sources/ucode/lexer.c

  1 /*
  2  * Copyright (C) 2020-2021 Jo-Philipp Wich <jo@mein.io>
  3  *
  4  * Permission to use, copy, modify, and/or distribute this software for any
  5  * purpose with or without fee is hereby granted, provided that the above
  6  * copyright notice and this permission notice appear in all copies.
  7  *
  8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15  */
 16 
 17 #include <stdio.h>
 18 
 19 #include <stdbool.h>
 20 #include <stdlib.h>
 21 #include <string.h>
 22 #include <ctype.h>
 23 #include <regex.h>
 24 #include <math.h>
 25 #include <errno.h>
 26 #include <endian.h>
 27 
 28 #include "ucode/vm.h"
 29 #include "ucode/lib.h"
 30 #include "ucode/lexer.h"
 31 
 32 #define UC_LEX_CONTINUE_PARSING (void *)1
 33 
 34 struct keyword {
 35         unsigned type;
 36         const char *pat;
 37         unsigned plen;
 38 };
 39 
 40 struct token {
 41         unsigned type;
 42         union {
 43                 uint32_t patn;
 44                 char pat[4];
 45         } u;
 46         unsigned plen;
 47         uc_token_t *(*parse)(uc_lexer_t *);
 48 };
 49 
 50 #define dec(o) \
 51         ((o) - '')
 52 
 53 #define hex(x) \
 54         (((x) >= 'a') ? (10 + (x) - 'a') : \
 55                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 56 
 57 #ifndef NO_COMPILE
 58 
 59 static uc_token_t *parse_comment(uc_lexer_t *);
 60 static uc_token_t *parse_string(uc_lexer_t *);
 61 static uc_token_t *parse_regexp(uc_lexer_t *);
 62 static uc_token_t *parse_number(uc_lexer_t *);
 63 static uc_token_t *parse_label(uc_lexer_t *);
 64 
 65 static const struct token tokens[] = {
 66         { TK_ASLEFT,    { .pat = "<<=" },   3, NULL },
 67         { TK_ASRIGHT,   { .pat = ">>=" },   3, NULL },
 68         { TK_LEXP,              { .pat = "{{-" },   3, NULL },
 69         { TK_REXP,              { .pat = "-}}" },   3, NULL },
 70         { TK_LSTM,              { .pat = "{%+" },   3, NULL },
 71         { TK_LSTM,              { .pat = "{%-" },   3, NULL },
 72         { TK_RSTM,              { .pat = "-%}" },   3, NULL },
 73         { TK_EQS,               { .pat = "===" },   3, NULL },
 74         { TK_NES,               { .pat = "!==" },   3, NULL },
 75         { TK_ELLIP,             { .pat = "..." },   3, NULL },
 76         { TK_QLBRACK,   { .pat = "?.[" },   3, NULL },
 77         { TK_QLPAREN,   { .pat = "?.(" },   3, NULL },
 78         { TK_ASEXP,             { .pat = "**=" },   3, NULL },
 79         { TK_ASAND,             { .pat = "&&=" },   3, NULL },
 80         { TK_ASOR,              { .pat = "||=" },   3, NULL },
 81         { TK_ASNULLISH, { .pat = "\?\?=" }, 3, NULL },
 82         { TK_AND,               { .pat = "&&" },    2, NULL },
 83         { TK_ASADD,             { .pat = "+=" },    2, NULL },
 84         { TK_ASBAND,    { .pat = "&=" },    2, NULL },
 85         { TK_ASBOR,             { .pat = "|=" },    2, NULL },
 86         { TK_ASBXOR,    { .pat = "^=" },    2, NULL },
 87         //{ TK_ASDIV,   { .pat = "/=" },    2, NULL },
 88         { TK_ASMOD,             { .pat = "%=" },    2, NULL },
 89         { TK_ASMUL,             { .pat = "*=" },    2, NULL },
 90         { TK_ASSUB,             { .pat = "-=" },    2, NULL },
 91         { TK_EXP,               { .pat = "**" },    2, NULL },
 92         { TK_DEC,               { .pat = "--" },    2, NULL },
 93         { TK_INC,               { .pat = "++" },    2, NULL },
 94         { TK_EQ,                { .pat = "==" },    2, NULL },
 95         { TK_NE,                { .pat = "!=" },    2, NULL },
 96         { TK_LE,                { .pat = "<=" },    2, NULL },
 97         { TK_GE,                { .pat = ">=" },    2, NULL },
 98         { TK_LSHIFT,    { .pat = "<<" },    2, NULL },
 99         { TK_RSHIFT,    { .pat = ">>" },    2, NULL },
100         { 0,                    { .pat = "//" },    2, parse_comment },
101         { 0,                    { .pat = "/*" },    2, parse_comment },
102         { TK_OR,                { .pat = "||" },    2, NULL },
103         { TK_LEXP,              { .pat = "{{" },    2, NULL },
104         { TK_REXP,              { .pat = "}}" },    2, NULL },
105         { TK_LSTM,              { .pat = "{%" },    2, NULL },
106         { TK_RSTM,              { .pat = "%}" },    2, NULL },
107         { TK_ARROW,             { .pat = "=>" },    2, NULL },
108         { TK_NULLISH,   { .pat = "??" },    2, NULL },
109         { TK_QDOT,              { .pat = "?." },    2, NULL },
110         { TK_PLACEH,    { .pat = "${" },    2, NULL },
111         { TK_ADD,               { .pat = "+" },     1, NULL },
112         { TK_ASSIGN,    { .pat = "=" },     1, NULL },
113         { TK_BAND,              { .pat = "&" },     1, NULL },
114         { TK_BOR,               { .pat = "|" },     1, NULL },
115         { TK_LBRACK,    { .pat = "[" },     1, NULL },
116         { TK_RBRACK,    { .pat = "]" },     1, NULL },
117         { TK_BXOR,              { .pat = "^" },     1, NULL },
118         { TK_LBRACE,    { .pat = "{" },     1, NULL },
119         { TK_RBRACE,    { .pat = "}" },     1, NULL },
120         { TK_COLON,             { .pat = ":" },     1, NULL },
121         { TK_COMMA,             { .pat = "," },     1, NULL },
122         { TK_COMPL,             { .pat = "~" },     1, NULL },
123         //{ TK_DIV,             { .pat = "/" },     1, NULL },
124         { TK_GT,                { .pat = ">" },     1, NULL },
125         { TK_NOT,               { .pat = "!" },     1, NULL },
126         { TK_LT,                { .pat = "<" },     1, NULL },
127         { TK_MOD,               { .pat = "%" },     1, NULL },
128         { TK_MUL,               { .pat = "*" },     1, NULL },
129         { TK_LPAREN,    { .pat = "(" },     1, NULL },
130         { TK_RPAREN,    { .pat = ")" },     1, NULL },
131         { TK_QMARK,             { .pat = "?" },     1, NULL },
132         { TK_SCOL,              { .pat = ";" },     1, NULL },
133         { TK_SUB,               { .pat = "-" },     1, NULL },
134         { TK_DOT,               { .pat = "." },     1, NULL },
135         { TK_STRING,    { .pat = "'" },     1, parse_string },
136         { TK_STRING,    { .pat = "\"" },    1, parse_string },
137         { TK_REGEXP,    { .pat = "/" },     1, parse_regexp },
138         { TK_LABEL,             { .pat = "_" },     1, parse_label },
139         { TK_LABEL,             { .pat = "az" },    0, parse_label },
140         { TK_LABEL,             { .pat = "AZ" },    0, parse_label },
141         { TK_NUMBER,    { .pat = "09" },    0, parse_number },
142 
143         /* NB: this must be last for simple retrieval */
144         { TK_TEMPLATE,  { .pat = "`" },     1, parse_string }
145 };
146 
147 static const struct keyword reserved_words[] = {
148         { TK_ENDFUNC,   "endfunction", 11 },
149         { TK_CONTINUE,  "continue", 8 },
150         { TK_ENDWHILE,  "endwhile", 8 },
151         { TK_FUNC,              "function", 8 },
152         { TK_DEFAULT,   "default", 7 },
153         { TK_DELETE,    "delete", 6 },
154         { TK_RETURN,    "return", 6 },
155         { TK_ENDFOR,    "endfor", 6 },
156         { TK_SWITCH,    "switch", 6 },
157         { TK_ENDIF,             "endif", 5 },
158         { TK_WHILE,             "while", 5 },
159         { TK_BREAK,             "break", 5 },
160         { TK_CATCH,             "catch", 5 },
161         { TK_CONST,             "const", 5 },
162         { TK_FALSE,             "false", 5 },
163         { TK_TRUE,              "true",  4 },
164         { TK_ELIF,              "elif",  4 },
165         { TK_ELSE,              "else",  4 },
166         { TK_THIS,              "this",  4 },
167         { TK_NULL,              "null",  4 },
168         { TK_CASE,              "case",  4 },
169         { TK_TRY,               "try",   3 },
170         { TK_FOR,               "for",   3 },
171         { TK_LOCAL,             "let",   3 },
172         { TK_IF,                "if",    2 },
173         { TK_IN,                "in",    2 },
174 };
175 
176 
/* Byte length of the longest fixed pattern in the tokens[] lookup table. */
#define UC_LEX_MAX_TOKEN_LEN 3
179 
180 static uc_token_t *
181 emit_op(uc_lexer_t *lex, uint32_t pos, int type, uc_value_t *uv)
182 {
183         lex->curr.type = type;
184         lex->curr.uv = uv;
185         lex->curr.pos = pos;
186 
187         return &lex->curr;
188 }
189 
190 static void lookbehind_append(uc_lexer_t *lex, const char *data, size_t len)
191 {
192         if (len) {
193                 lex->lookbehind = xrealloc(lex->lookbehind, lex->lookbehindlen + len);
194                 memcpy(lex->lookbehind + lex->lookbehindlen, data, len);
195                 lex->lookbehindlen += len;
196         }
197 }
198 
199 static void lookbehind_reset(uc_lexer_t *lex) {
200         free(lex->lookbehind);
201         lex->lookbehind = NULL;
202         lex->lookbehindlen = 0;
203 }
204 
205 static uc_token_t *
206 lookbehind_to_text(uc_lexer_t *lex, uint32_t pos, int type, const char *strip_trailing_chars) {
207         uc_token_t *rv = NULL;
208 
209         if (lex->lookbehind) {
210                 if (strip_trailing_chars) {
211                         while (lex->lookbehindlen > 0 && strchr(strip_trailing_chars, lex->lookbehind[lex->lookbehindlen-1]))
212                                 lex->lookbehindlen--;
213                 }
214 
215                 rv = emit_op(lex, pos, type, ucv_string_new_length(lex->lookbehind, lex->lookbehindlen));
216 
217                 lookbehind_reset(lex);
218         }
219 
220         return rv;
221 }
222 
223 static inline size_t
224 buf_remaining(uc_lexer_t *lex) {
225         return (lex->bufend - lex->bufstart);
226 }
227 
228 static inline bool
229 _buf_startswith(uc_lexer_t *lex, const char *str, size_t len) {
230         return (buf_remaining(lex) >= len && !strncmp(lex->bufstart, str, len));
231 }
232 
233 #define buf_startswith(s, str) _buf_startswith(s, str, sizeof(str) - 1)
234 
235 
236 static void
237 buf_consume(uc_lexer_t *lex, size_t len) {
238         size_t i, linelen;
239 
240         for (i = 0, linelen = 0; i < len; i++) {
241                 if (lex->bufstart[i] == '\n') {
242                         uc_source_line_update(lex->source, linelen);
243                         uc_source_line_next(lex->source);
244 
245                         linelen = 0;
246                 }
247                 else {
248                         linelen++;
249                 }
250         }
251 
252         if (linelen)
253                 uc_source_line_update(lex->source, linelen);
254 
255         lex->bufstart += len;
256         lex->source->off += len;
257 }
258 
259 static uc_token_t *
260 parse_comment(uc_lexer_t *lex)
261 {
262         const struct token *tok = lex->tok;
263         const char *ptr, *end;
264         size_t elen;
265 
266         if (!strcmp(tok->u.pat, "//")) {
267                 end = "\n";
268                 elen = 1;
269         }
270         else {
271                 end = "*/";
272                 elen = 2;
273         }
274 
275         for (ptr = lex->bufstart; ptr < lex->bufend - elen; ptr++) {
276                 if (!strncmp(ptr, end, elen)) {
277                         buf_consume(lex, (ptr - lex->bufstart) + elen);
278 
279                         return UC_LEX_CONTINUE_PARSING;
280                 }
281         }
282 
283         buf_consume(lex, ptr - lex->bufstart);
284 
285         if (lex->eof) {
286                 lex->state = UC_LEX_EOF;
287 
288                 if (elen == 2)
289                         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated comment"));
290         }
291 
292         return NULL;
293 }
294 
295 static void
296 append_utf8(uc_lexer_t *lex, int code) {
297         char ustr[8], *up;
298         int rem;
299 
300         up = ustr;
301         rem = sizeof(ustr);
302 
303         if (utf8enc(&up, &rem, code))
304                 lookbehind_append(lex, ustr, up - ustr);
305 }
306 
307 static uc_token_t *
308 parse_string(uc_lexer_t *lex)
309 {
310         const struct token *tok = lex->tok;
311         char q = tok->u.pat[0];
312         char *ptr, *c;
313         uc_token_t *rv;
314         int code;
315 
316         if (!buf_remaining(lex))
317                 return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated string"));
318 
319         for (ptr = lex->bufstart; ptr < lex->bufend; ptr++) {
320                 /* continuation of placeholder start */
321                 if (lex->is_placeholder) {
322                         if (*ptr == '{') {
323                                 buf_consume(lex, 1);
324                                 rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL);
325 
326                                 if (!rv)
327                                         rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0));
328 
329                                 return rv;
330                         }
331 
332                         lex->is_placeholder = false;
333                         lookbehind_append(lex, "$", 1);
334                 }
335 
336                 /* continuation of escape sequence */
337                 if (lex->is_escape) {
338                         if (lex->esclen == 0) {
339                                 /* non-unicode escape following a lead surrogate, emit replacement... */
340                                 if (lex->lead_surrogate && *ptr != 'u') {
341                                         append_utf8(lex, 0xFFFD);
342                                         lex->lead_surrogate = 0;
343                                 }
344 
345                                 switch ((q == '/') ? 0 : *ptr) {
346                                 case 'u':
347                                 case 'x':
348                                         lex->esc[lex->esclen++] = *ptr;
349                                         break;
350 
351                                 case '':
352                                 case '1':
353                                 case '2':
354                                 case '3':
355                                 case '4':
356                                 case '5':
357                                 case '6':
358                                 case '7':
359                                         lex->esc[lex->esclen++] = 'o';
360                                         lex->esc[lex->esclen++] = *ptr;
361                                         break;
362 
363                                 default:
364                                         lex->is_escape = false;
365                                         c = strchr("a\ab\be\033f\fn\nr\rt\tv\v", *ptr);
366 
367                                         if (c && *c >= 'a') {
368                                                 lookbehind_append(lex, c + 1, 1);
369                                         }
370                                         else {
371                                                 /* regex mode => retain backslash */
372                                                 if (q == '/')
373                                                         lookbehind_append(lex, "\\", 1);
374 
375                                                 lookbehind_append(lex, ptr, 1);
376                                         }
377 
378                                         buf_consume(lex, (ptr + 1) - lex->bufstart);
379 
380                                         break;
381                                 }
382                         }
383                         else {
384                                 switch (lex->esc[0]) {
385                                 case 'u':
386                                         if (lex->esclen < 5) {
387                                                 if (!isxdigit(*ptr))
388                                                         return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
389 
390                                                 lex->esc[lex->esclen++] = *ptr;
391                                         }
392 
393                                         if (lex->esclen == 5) {
394                                                 code = hex(lex->esc[1]) * 16 * 16 * 16 +
395                                                        hex(lex->esc[2]) * 16 * 16 +
396                                                        hex(lex->esc[3]) * 16 +
397                                                        hex(lex->esc[4]);
398 
399                                                 /* is a leading surrogate value */
400                                                 if ((code & 0xFC00) == 0xD800) {
401                                                         /* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
402                                                         if (lex->lead_surrogate)
403                                                                 append_utf8(lex, 0xFFFD);
404 
405                                                         /* store surrogate value and advance to next escape sequence */
406                                                         lex->lead_surrogate = code;
407                                                 }
408 
409                                                 /* is a trailing surrogate value */
410                                                 else if ((code & 0xFC00) == 0xDC00) {
411                                                         /* found a trailing surrogate following a leading one, combine and encode */
412                                                         if (lex->lead_surrogate) {
413                                                                 code = 0x10000 + ((lex->lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
414                                                                 lex->lead_surrogate = 0;
415                                                         }
416 
417                                                         /* trailing surrogate not following a leading one, ignore and use replacement char */
418                                                         else {
419                                                                 code = 0xFFFD;
420                                                         }
421 
422                                                         append_utf8(lex, code);
423                                                 }
424 
425                                                 /* is a normal codepoint */
426                                                 else {
427                                                         append_utf8(lex, code);
428                                                 }
429 
430                                                 lex->esclen = 0;
431                                                 lex->is_escape = false;
432                                                 buf_consume(lex, (ptr + 1) - lex->bufstart);
433                                         }
434 
435                                         break;
436 
437                                 case 'x':
438                                         if (lex->esclen < 3) {
439                                                 if (!isxdigit(*ptr))
440                                                         return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
441 
442                                                 lex->esc[lex->esclen++] = *ptr;
443                                         }
444 
445                                         if (lex->esclen == 3) {
446                                                 append_utf8(lex, hex(lex->esc[1]) * 16 + hex(lex->esc[2]));
447 
448                                                 lex->esclen = 0;
449                                                 lex->is_escape = false;
450                                                 buf_consume(lex, (ptr + 1) - lex->bufstart);
451                                         }
452 
453                                         break;
454 
455                                 case 'o':
456                                         if (lex->esclen < 4) {
457                                                 /* found a non-octal char */
458                                                 if (*ptr < '' || *ptr > '7') {
459                                                         /* pad sequence to three chars */
460                                                         switch (lex->esclen) {
461                                                         case 3:
462                                                                 lex->esc[3] = lex->esc[2];
463                                                                 lex->esc[2] = lex->esc[1];
464                                                                 lex->esc[1] = '';
465                                                                 break;
466 
467                                                         case 2:
468                                                                 lex->esc[3] = lex->esc[1];
469                                                                 lex->esc[2] = '';
470                                                                 lex->esc[1] = '';
471                                                                 break;
472                                                         }
473 
474                                                         lex->esclen = 4;
475                                                         buf_consume(lex, ptr-- - lex->bufstart);
476                                                 }
477 
478                                                 /* append */
479                                                 else {
480                                                         lex->esc[lex->esclen++] = *ptr;
481                                                         buf_consume(lex, (ptr + 1) - lex->bufstart);
482                                                 }
483                                         }
484 
485                                         if (lex->esclen == 4) {
486                                                 code = dec(lex->esc[1]) * 8 * 8 +
487                                                        dec(lex->esc[2]) * 8 +
488                                                        dec(lex->esc[3]);
489 
490                                                 if (code > 255)
491                                                         return emit_op(lex, lex->source->off + lex->esclen + 1, TK_ERROR, ucv_string_new("Invalid escape sequence"));
492 
493                                                 append_utf8(lex, code);
494 
495                                                 lex->esclen = 0;
496                                                 lex->is_escape = false;
497                                         }
498 
499                                         break;
500                                 }
501                         }
502                 }
503 
504                 /* terminating char */
505                 else if (*ptr == q) {
506                         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
507                         buf_consume(lex, (ptr + 1) - lex->bufstart);
508 
509                         rv = lookbehind_to_text(lex, lex->lastoff, tok->type, NULL);
510 
511                         if (!rv)
512                                 rv = emit_op(lex, lex->lastoff, tok->type, ucv_string_new_length("", 0));
513 
514                         return rv;
515                 }
516 
517                 /* escape sequence start */
518                 else if (*ptr == '\\') {
519                         lex->is_escape = true;
520                         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
521                         buf_consume(lex, (ptr - lex->bufstart) + 1);
522                 }
523 
524                 /* potential placeholder start */
525                 else if (q == '`' && *ptr == '$') {
526                         lex->is_placeholder = true;
527                         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
528                         buf_consume(lex, (ptr - lex->bufstart) + 1);
529                 }
530         }
531 
532         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
533         buf_consume(lex, ptr - lex->bufstart);
534 
535         return NULL;
536 }
537 
538 
/*
 * Parses a regexp literal, or a division operator where no regexp is
 * allowed, from the lexer's input buffer.
 *
 * Returns the resulting token once it is complete, or NULL when more input
 * is required. Errors are reported as TK_ERROR tokens:
 *  "Unterminated string"       Input ended inside the regexp pattern
 *  "Invalid escape sequence"   Malformed escape within the pattern
 */
551 
/* Progress of parse_regexp(), kept in lex->esc[0] between invocations. */
enum {
	UC_LEX_PARSE_REGEX_INIT    = 0,	/* nothing read yet */
	UC_LEX_PARSE_REGEX_PATTERN = 1,	/* reading the pattern */
	UC_LEX_PARSE_REGEX_FLAGS   = 2	/* reading trailing flag characters */
};
557 
558 static uc_token_t *
559 parse_regexp(uc_lexer_t *lex)
560 {
561         bool is_reg_global = false, is_reg_icase = false, is_reg_newline = false;
562         uc_token_t *rv;
563         size_t len;
564         char *s;
565 
566         switch (lex->esc[0]) {
567         case UC_LEX_PARSE_REGEX_INIT:
568                 if (lex->no_regexp) {
569                         if (buf_startswith(lex, "=")) {
570                                 buf_consume(lex, 1);
571 
572                                 return emit_op(lex, lex->source->off, TK_ASDIV, NULL);
573                         }
574 
575                         return emit_op(lex, lex->source->off, TK_DIV, NULL);
576                 }
577 
578                 lex->esc[0] = UC_LEX_PARSE_REGEX_PATTERN;
579                 break;
580 
581         case UC_LEX_PARSE_REGEX_PATTERN:
582                 rv = parse_string(lex);
583 
584                 if (rv && rv->type == TK_ERROR)
585                         return rv;
586 
587                 if (rv != NULL && rv != UC_LEX_CONTINUE_PARSING) {
588                         lex->lookbehind = (char *)rv;
589                         lex->esc[0] = UC_LEX_PARSE_REGEX_FLAGS;
590                 }
591 
592                 break;
593 
594         case UC_LEX_PARSE_REGEX_FLAGS:
595                 rv = (uc_token_t *)lex->lookbehind;
596 
597                 while (lex->bufstart < lex->bufend || lex->eof) {
598                         switch (lex->eof ? EOF : lex->bufstart[0]) {
599                         case 'g':
600                                 buf_consume(lex, 1);
601                                 is_reg_global = true;
602                                 break;
603 
604                         case 'i':
605                                 buf_consume(lex, 1);
606                                 is_reg_icase = true;
607                                 break;
608 
609                         case 's':
610                                 buf_consume(lex, 1);
611                                 is_reg_newline = true;
612                                 break;
613 
614                         default:
615                                 lex->lookbehind = NULL;
616 
617                                 len = xasprintf(&s, "%c%*s",
618                                         (is_reg_global << 0) | (is_reg_icase << 1) | (is_reg_newline << 2),
619                                         ucv_string_length(rv->uv),
620                                         ucv_string_get(rv->uv));
621 
622                                 ucv_free(rv->uv, false);
623                                 rv->uv = ucv_string_new_length(s, len);
624                                 free(s);
625 
626                                 rv->type = TK_REGEXP;
627 
628                                 return rv;
629                         }
630                 }
631 
632                 break;
633         }
634 
635         return NULL;
636 }
637 
638 
/*
 * Parses a label (identifier or reserved word) from the lexer's input
 * buffer.
 *
 * Returns the keyword or TK_LABEL token once the label is complete, or NULL
 * when more input is required.
 */
648 
649 static uc_token_t *
650 parse_label(uc_lexer_t *lex)
651 {
652         const struct token *tok = lex->tok;
653         const struct keyword *word;
654         char *ptr;
655         size_t i;
656 
657         if (!lex->lookbehind && tok->plen)
658                 lookbehind_append(lex, tok->u.pat, tok->plen);
659 
660         if (!buf_remaining(lex) || (lex->bufstart[0] != '_' && !isalnum(lex->bufstart[0]))) {
661                 if (lex->no_keyword == false) {
662                         for (i = 0, word = &reserved_words[0]; i < ARRAY_SIZE(reserved_words); i++, word = &reserved_words[i]) {
663                                 if (lex->lookbehind && lex->lookbehindlen == word->plen && !strncmp(lex->lookbehind, word->pat, word->plen)) {
664                                         lookbehind_reset(lex);
665 
666                                         return emit_op(lex, lex->source->off - word->plen, word->type, NULL);
667                                 }
668                         }
669                 }
670 
671                 return lookbehind_to_text(lex, lex->source->off - lex->lookbehindlen, TK_LABEL, NULL);
672         }
673 
674         for (ptr = lex->bufstart; ptr < lex->bufend && (*ptr == '_' || isalnum(*ptr)); ptr++)
675                 ;
676 
677         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
678         buf_consume(lex, ptr - lex->bufstart);
679 
680         return NULL;
681 }
682 
683 
/*
 * Parses a number literal from the lexer's input buffer.
 *
 * Returns a TK_NUMBER or TK_DOUBLE token once the literal is complete, a
 * TK_ERROR token ("Invalid number literal") when it cannot be converted, or
 * NULL when more input is required.
 */
693 
694 static inline bool
695 is_numeric_char(uc_lexer_t *lex, char c)
696 {
697         char prev = lex->lookbehindlen ? lex->lookbehind[lex->lookbehindlen-1] : 0;
698 
699         switch (c|32) {
700         case '.':
701         case '':
702         case '1':
703         case '2':
704         case '3':
705         case '4':
706         case '5':
707         case '6':
708         case '7':
709         case '8':
710         case '9':
711                 return true;
712 
713         case 'a':
714         case 'b':
715         case 'c':
716         case 'd':
717         case 'e':
718         case 'f':
719         case 'o':
720         case 'x':
721                 /* require previous char, a number literal cannot start with these */
722                 return prev != 0;
723 
724         case '+':
725         case '-':
726                 /* sign is only allowed after an exponent char */
727                 return (prev|32) == 'e';
728         }
729 
730         return false;
731 }
732 
733 static uc_token_t *
734 parse_number(uc_lexer_t *lex)
735 {
736         uc_token_t *rv = NULL;
737         uc_value_t *nv = NULL;
738         const char *ptr;
739         char *e;
740 
741         if (!buf_remaining(lex) || !is_numeric_char(lex, lex->bufstart[0])) {
742                 lookbehind_append(lex, "\0", 1);
743 
744                 nv = uc_number_parse_octal(lex->lookbehind, &e);
745 
746                 switch (ucv_type(nv)) {
747                 case UC_DOUBLE:
748                         rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_DOUBLE, nv);
749                         break;
750 
751                 case UC_INTEGER:
752                         rv = emit_op(lex, lex->source->off - (e - lex->lookbehind), TK_NUMBER, nv);
753                         break;
754 
755                 default:
756                         rv = emit_op(lex, lex->source->off - (lex->lookbehindlen - (e - lex->lookbehind) - 1), TK_ERROR, ucv_string_new("Invalid number literal"));
757                 }
758 
759                 lookbehind_reset(lex);
760 
761                 return rv;
762         }
763 
764         for (ptr = lex->bufstart; ptr < lex->bufend && is_numeric_char(lex, *ptr); ptr++)
765                 ;
766 
767         lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
768         buf_consume(lex, ptr - lex->bufstart);
769 
770         return NULL;
771 }
772 
773 static uc_token_t *
774 lex_step(uc_lexer_t *lex, FILE *fp)
775 {
776         uint32_t masks[] = { 0, le32toh(0x000000ff), le32toh(0x0000ffff), le32toh(0x00ffffff), le32toh(0xffffffff) };
777         union { uint32_t n; char str[4]; } search;
778         const struct token *tok;
779         size_t rlen, rem, *nest;
780         char *ptr, c;
781         uc_token_t *rv;
782         size_t i;
783 
784         /* only less than UC_LEX_MAX_TOKEN_LEN unread buffer chars remaining,
785          * move the remaining bytes to the beginning and read more data */
786         if (buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN) {
787                 if (!lex->buf) {
788                         lex->buflen = 128;
789                         lex->buf = xalloc(lex->buflen);
790                 }
791 
792                 rem = lex->bufend - lex->bufstart;
793 
794                 if (rem)
795                         memcpy(lex->buf, lex->bufstart, rem);
796 
797                 rlen = fread(lex->buf + rem, 1, lex->buflen - rem, fp);
798 
799                 lex->bufstart = lex->buf;
800                 lex->bufend   = lex->buf + rlen + rem;
801 
802                 if (rlen == 0 && (ferror(fp) || feof(fp)))
803                         lex->eof = 1;
804         }
805 
806         switch (lex->state) {
807         case UC_LEX_IDENTIFY_BLOCK:
808                 /* previous block had strip trailing whitespace flag, skip leading whitespace */
809                 if (lex->modifier == MINUS) {
810                         while (buf_remaining(lex) && isspace(lex->bufstart[0]))
811                                 buf_consume(lex, 1);
812 
813                         lex->modifier = UNSPEC;
814                 }
815 
816                 /* previous block was a statement block and trim_blocks is enabld, skip leading newline */
817                 else if (lex->modifier == NEWLINE) {
818                         if (buf_startswith(lex, "\n"))
819                                 buf_consume(lex, 1);
820 
821                         lex->modifier = UNSPEC;
822                 }
823 
824                 /* scan forward through buffer to identify start token */
825                 for (ptr = lex->bufstart; ptr < lex->bufend - strlen("{#"); ptr++) {
826                         /* found start of comment block */
827                         if (!strncmp(ptr, "{#", 2)) {
828                                 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
829                                 buf_consume(lex, (ptr + 2) - lex->bufstart);
830                                 lex->lastoff = lex->source->off - 2;
831                                 lex->state = UC_LEX_BLOCK_COMMENT_START;
832 
833                                 return NULL;
834                         }
835 
836                         /* found start of expression block */
837                         else if (!strncmp(ptr, "{{", 2)) {
838                                 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
839                                 buf_consume(lex, (ptr + 2) - lex->bufstart);
840                                 lex->lastoff = lex->source->off - 2;
841                                 lex->state = UC_LEX_BLOCK_EXPRESSION_START;
842 
843                                 return NULL;
844                         }
845 
846                         /* found start of statement block */
847                         else if (!strncmp(ptr, "{%", 2)) {
848                                 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
849                                 buf_consume(lex, (ptr + 2) - lex->bufstart);
850                                 lex->lastoff = lex->source->off - 2;
851                                 lex->state = UC_LEX_BLOCK_STATEMENT_START;
852 
853                                 return NULL;
854                         }
855                 }
856 
857                 /* we're at eof */
858                 if (lex->eof) {
859                         lookbehind_append(lex, ptr, lex->bufend - ptr);
860                         lex->state = UC_LEX_EOF;
861 
862                         return lookbehind_to_text(lex, lex->lastoff, TK_TEXT, NULL);
863                 }
864 
865                 lookbehind_append(lex, lex->bufstart, ptr - lex->bufstart);
866                 buf_consume(lex, ptr - lex->bufstart);
867                 break;
868 
869 
870         case UC_LEX_BLOCK_COMMENT_START:
871         case UC_LEX_BLOCK_EXPRESSION_START:
872         case UC_LEX_BLOCK_STATEMENT_START:
873                 rv = NULL;
874                 lex->modifier = UNSPEC;
875 
876                 /* strip whitespace before block */
877                 if (buf_startswith(lex, "-")) {
878                         rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \n\t\v\f\r");
879                         buf_consume(lex, 1);
880                 }
881 
882                 /* disable lstrip flag (only valid for statement blocks) */
883                 else if (lex->state == UC_LEX_BLOCK_STATEMENT_START) {
884                         /* disable lstrip flag */
885                         if (buf_startswith(lex, "+")) {
886                                 rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
887                                 buf_consume(lex, 1);
888                         }
889 
890                         /* global block lstrip */
891                         else if (lex->config && lex->config->lstrip_blocks) {
892                                 rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, " \t\v\f\r");
893                         }
894                 }
895                 else {
896                         rv = lookbehind_to_text(lex, lex->source->off, TK_TEXT, NULL);
897                 }
898 
899                 switch (lex->state) {
900                 case UC_LEX_BLOCK_COMMENT_START:
901                         lex->state = UC_LEX_BLOCK_COMMENT;
902                         lex->block = COMMENT;
903                         break;
904 
905                 case UC_LEX_BLOCK_STATEMENT_START:
906                         lex->state = UC_LEX_IDENTIFY_TOKEN;
907                         lex->block = STATEMENTS;
908                         break;
909 
910                 case UC_LEX_BLOCK_EXPRESSION_START:
911                         lex->state = UC_LEX_BLOCK_EXPRESSION_EMIT_TAG;
912                         break;
913 
914                 default:
915                         break;
916                 }
917 
918                 return rv;
919 
920 
921         case UC_LEX_BLOCK_COMMENT:
922                 /* scan forward through buffer to identify end token */
923                 while (lex->bufstart < lex->bufend - 2) {
924                         if (buf_startswith(lex, "-#}")) {
925                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
926                                 lex->modifier = MINUS;
927                                 buf_consume(lex, 3);
928                                 lex->lastoff = lex->source->off;
929                                 break;
930                         }
931                         else if (buf_startswith(lex, "#}")) {
932                                 lex->state = UC_LEX_IDENTIFY_BLOCK;
933                                 buf_consume(lex, 2);
934                                 lex->lastoff = lex->source->off;
935                                 break;
936                         }
937 
938                         buf_consume(lex, 1);
939                 }
940 
941                 /* we're at eof */
942                 if (lex->eof) {
943                         lex->state = UC_LEX_EOF;
944 
945                         buf_consume(lex, lex->bufend - lex->bufstart);
946 
947                         return emit_op(lex, lex->lastoff, TK_ERROR, ucv_string_new("Unterminated template block"));
948                 }
949 
950                 break;
951 
952 
953         case UC_LEX_BLOCK_EXPRESSION_EMIT_TAG:
954                 lex->state = UC_LEX_IDENTIFY_TOKEN;
955                 lex->block = EXPRESSION;
956 
957                 return emit_op(lex, lex->source->off, TK_LEXP, NULL);
958 
959 
960         case UC_LEX_IDENTIFY_TOKEN:
961                 /* skip leading whitespace */
962                 for (i = 0; i < buf_remaining(lex) && isspace(lex->bufstart[i]); i++)
963                         ;
964 
965                 buf_consume(lex, i);
966 
967                 if (i > 0 && buf_remaining(lex) < UC_LEX_MAX_TOKEN_LEN)
968                         return NULL;
969 
970                 for (i = 0; i < sizeof(search.str); i++)
971                         search.str[i] = (i < buf_remaining(lex)) ? lex->bufstart[i] : 0;
972 
973                 for (i = 0, tok = tokens; i < ARRAY_SIZE(tokens); tok = &tokens[++i]) {
974                         /* remaining buffer data is shorter than token, skip */
975                         if (tok->plen > buf_remaining(lex))
976                                 continue;
977 
978                         c = buf_remaining(lex) ? lex->bufstart[0] : 0;
979 
980                         if (tok->plen ? ((search.n & masks[tok->plen]) == tok->u.patn)
981                                       : (c >= tok->u.pat[0] && c <= tok->u.pat[1])) {
982                                 lex->lastoff = lex->source->off;
983 
984                                 /* token has a parse method, switch state */
985                                 if (tok->parse) {
986                                         lex->tok = tok;
987                                         lex->state = UC_LEX_PARSE_TOKEN;
988 
989                                         buf_consume(lex, tok->plen);
990 
991                                         return NULL;
992                                 }
993 
994                                 /* in raw code mode, ignore template tag tokens */
995                                 if (lex->config && lex->config->raw_mode &&
996                                     (tok->type == TK_LSTM || tok->type == TK_RSTM ||
997                                      tok->type == TK_LEXP || tok->type == TK_REXP)) {
998                                         continue;
999                                 }
1000 
1001                                 /* disallow nesting blocks */
1002                                 if (tok->type == TK_LSTM || tok->type == TK_LEXP) {
1003                                         buf_consume(lex, tok->plen);
1004 
1005                                         return emit_op(lex, lex->source->off - tok->plen, TK_ERROR, ucv_string_new("Template blocks may not be nested"));
1006                                 }
1007 
1008                                 /* found end of block */
1009                                 else if ((lex->block == STATEMENTS && tok->type == TK_RSTM) ||
1010                                          (lex->block == EXPRESSION && tok->type == TK_REXP)) {
1011                                         /* strip whitespace after block */
1012                                         if (tok->u.pat[0] == '-')
1013                                                 lex->modifier = MINUS;
1014 
1015                                         /* strip newline after statement block */
1016                                         else if (lex->block == STATEMENTS &&
1017                                                  lex->config && lex->config->trim_blocks)
1018                                                 lex->modifier = NEWLINE;
1019 
1020                                         lex->state = UC_LEX_IDENTIFY_BLOCK;
1021                                         lex->block = NONE;
1022                                 }
1023 
1024                                 /* track opening braces */
1025                                 else if (tok->type == TK_LBRACE && lex->templates.count > 0) {
1026                                         nest = uc_vector_last(&lex->templates);
1027                                         (*nest)++;
1028                                 }
1029 
1030                                 /* check end of placeholder expression */
1031                                 else if (tok->type == TK_RBRACE && lex->templates.count > 0) {
1032                                         nest = uc_vector_last(&lex->templates);
1033 
1034                                         if (*nest == 0) {
1035                                                 lex->templates.count--;
1036                                                 lex->state = UC_LEX_PARSE_TOKEN;
1037                                                 lex->tok = &tokens[ARRAY_SIZE(tokens) - 1]; /* NB: TK_TEMPLATE token spec */
1038                                         }
1039                                         else {
1040                                                 (*nest)--;
1041                                         }
1042                                 }
1043 
1044                                 /* do not report statement tags to the parser */
1045                                 if (tok->type != 0 && tok->type != TK_LSTM)
1046                                         rv = emit_op(lex, lex->source->off,
1047                                                 (tok->type == TK_RSTM) ? TK_SCOL : tok->type, NULL);
1048                                 else
1049                                         rv = NULL;
1050 
1051                                 buf_consume(lex, tok->plen);
1052 
1053                                 return rv;
1054                         }
1055                 }
1056 
1057                 /* no possible return beyond this point can advance,
1058                    mark lex state as eof */
1059                 lex->state = UC_LEX_EOF;
1060 
1061                 /* no token matched and we do have remaining data, junk */
1062                 if (buf_remaining(lex))
1063                         return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unexpected character"));
1064 
1065                 /* we're at eof, allow unclosed statement blocks */
1066                 if (lex->block == STATEMENTS)
1067                         return NULL;
1068 
1069                 /* premature EOF */
1070                 return emit_op(lex, lex->source->off, TK_ERROR, ucv_string_new("Unterminated template block"));
1071 
1072 
1073         case UC_LEX_PARSE_TOKEN:
1074                 tok = lex->tok;
1075                 rv = tok->parse(lex);
1076 
1077                 if (rv) {
1078                         memset(lex->esc, 0, sizeof(lex->esc));
1079                         lex->state = lex->is_placeholder ? UC_LEX_PLACEHOLDER : UC_LEX_IDENTIFY_TOKEN;
1080                         lex->is_placeholder = false;
1081                         lex->tok = NULL;
1082 
1083                         if (rv == UC_LEX_CONTINUE_PARSING)
1084                                 rv = NULL;
1085 
1086                         return rv;
1087                 }
1088 
1089                 break;
1090 
1091 
1092         case UC_LEX_PLACEHOLDER:
1093                 lex->state = UC_LEX_IDENTIFY_TOKEN;
1094 
1095                 uc_vector_push(&lex->templates, 0);
1096 
1097                 return emit_op(lex, lex->source->off, TK_PLACEH, NULL);
1098 
1099 
1100         case UC_LEX_EOF:
1101                 break;
1102         }
1103 
1104         return NULL;
1105 }
1106 
1107 void
1108 uc_lexer_init(uc_lexer_t *lex, uc_parse_config_t *config, uc_source_t *source)
1109 {
1110         lex->state = UC_LEX_IDENTIFY_BLOCK;
1111 
1112         lex->config = config;
1113         lex->source = uc_source_get(source);
1114 
1115         lex->eof = 0;
1116         lex->is_escape = 0;
1117 
1118         lex->block = NONE;
1119         lex->modifier = UNSPEC;
1120 
1121         lex->buflen = 0;
1122         lex->buf = NULL;
1123         lex->bufstart = NULL;
1124         lex->bufend = NULL;
1125 
1126         lex->lookbehindlen = 0;
1127         lex->lookbehind = NULL;
1128 
1129         lex->tok = NULL;
1130 
1131         lex->esclen = 0;
1132         memset(lex->esc, 0, sizeof(lex->esc));
1133 
1134         lex->lead_surrogate = 0;
1135 
1136         lex->lastoff = 0;
1137 
1138         lex->templates.count = 0;
1139         lex->templates.entries = NULL;
1140 
1141         if (config && config->raw_mode) {
1142                 lex->state = UC_LEX_IDENTIFY_TOKEN;
1143                 lex->block = STATEMENTS;
1144         }
1145 }
1146 
1147 void
1148 uc_lexer_free(uc_lexer_t *lex)
1149 {
1150         uc_vector_clear(&lex->templates);
1151         uc_source_put(lex->source);
1152 
1153         free(lex->lookbehind);
1154         free(lex->buf);
1155 }
1156 
1157 uc_token_t *
1158 uc_lexer_next_token(uc_lexer_t *lex)
1159 {
1160         uc_token_t *rv = NULL;
1161 
1162         while (lex->state != UC_LEX_EOF) {
1163                 rv = lex_step(lex, lex->source->fp);
1164 
1165                 if (rv != NULL)
1166                         break;
1167         }
1168 
1169         if (rv) {
1170                 lex->no_keyword = false;
1171                 lex->no_regexp = false;
1172 
1173                 return rv;
1174         }
1175 
1176         return emit_op(lex, lex->source->off, TK_EOF, NULL);
1177 }
1178 
1179 const char *
1180 uc_tokenname(unsigned type)
1181 {
1182         static char buf[sizeof("'endfunction'")];
1183         size_t i;
1184 
1185         switch (type) {
1186         case 0:           return "End of file";
1187         case TK_TEMPLATE: return "Template";
1188         case TK_STRING:   return "String";
1189         case TK_LABEL:    return "Label";
1190         case TK_NUMBER:   return "Number";
1191         case TK_DOUBLE:   return "Double";
1192         case TK_REGEXP:   return "Regexp";
1193         }
1194 
1195         for (i = 0; i < ARRAY_SIZE(tokens); i++) {
1196                 if (tokens[i].type != type)
1197                         continue;
1198 
1199                 snprintf(buf, sizeof(buf), "'%s'", tokens[i].u.pat);
1200 
1201                 return buf;
1202         }
1203 
1204         for (i = 0; i < ARRAY_SIZE(reserved_words); i++) {
1205                 if (reserved_words[i].type != type)
1206                         continue;
1207 
1208                 snprintf(buf, sizeof(buf), "'%s'", reserved_words[i].pat);
1209 
1210                 return buf;
1211         }
1212 
1213         return "?";
1214 }
1215 
1216 bool
1217 uc_lexer_is_keyword(uc_value_t *label)
1218 {
1219         size_t i;
1220 
1221         if (ucv_type(label) != UC_STRING)
1222                 return false;
1223 
1224         for (i = 0; i < ARRAY_SIZE(reserved_words); i++)
1225                 if (!strcmp(reserved_words[i].pat, ucv_string_get(label)))
1226                         return true;
1227 
1228         return false;
1229 }
1230 
1231 #endif /* NO_COMPILE */
1232 
/*
 * Encodes the given codepoint as UTF-8 multibyte sequence.
 *
 * The sequence is written to the position *out points to. On success *out
 * is advanced past the written bytes and the number of written bytes is
 * subtracted from *rem.
 *
 * Returns false - without modifying *out or *rem - if the sequence needs
 * more than *rem bytes, otherwise true. Codepoints outside of the range
 * 0..0x10FFFF are ignored: nothing is written and true is returned.
 */

bool
utf8enc(char **out, int *rem, int code)
{
	char *dst = *out;
	int need;

	/* not encodable, silently skip */
	if (code < 0 || code > 0x10FFFF)
		return true;

	/* determine the length of the encoded sequence */
	if (code <= 0x7F)
		need = 1;
	else if (code <= 0x7FF)
		need = 2;
	else if (code <= 0xFFFF)
		need = 3;
	else
		need = 4;

	if (*rem < need)
		return false;

	switch (need) {
	case 1:
		dst[0] = code;
		break;

	case 2:
		dst[0] = ((code >>  6) & 0x1F) | 0xC0;
		dst[1] = ( code        & 0x3F) | 0x80;
		break;

	case 3:
		dst[0] = ((code >> 12) & 0x0F) | 0xE0;
		dst[1] = ((code >>  6) & 0x3F) | 0x80;
		dst[2] = ( code        & 0x3F) | 0x80;
		break;

	default:
		dst[0] = ((code >> 18) & 0x07) | 0xF0;
		dst[1] = ((code >> 12) & 0x3F) | 0x80;
		dst[2] = ((code >>  6) & 0x3F) | 0x80;
		dst[3] = ( code        & 0x3F) | 0x80;
		break;
	}

	*out = dst + need;
	*rem -= need;

	return true;
}
1286 

This page was automatically generated by LXR 0.3.1.  •  OpenWrt