lexer.c

  1 /*
  2  * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io>
  3  *
  4  * Permission to use, copy, modify, and/or distribute this software for any
  5  * purpose with or without fee is hereby granted, provided that the above
  6  * copyright notice and this permission notice appear in all copies.
  7  *
  8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15  */
 16 
 17 #include <stdbool.h>
 18 #include <stdlib.h>
 19 #include <string.h>
 20 #include <ctype.h>
 21 #include <regex.h>
 22 
 23 #include "ast.h"
 24 #include "lexer.h"
 25 #include "parser.h"
 26 
 27 
 28 struct token {
 29         int type;
 30         const char *pat;
 31         int plen;
 32         int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s);
 33 };
 34 
 35 #define dec(o) \
 36         ((o) - '')
 37 
 38 #define hex(x) \
 39         (((x) >= 'a') ? (10 + (x) - 'a') : \
 40                 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x)))
 41 
 42 /*
 43  * Stores the given codepoint as a utf8 multibyte sequence into the given
 44  * output buffer and substracts the required amount of bytes from  the given
 45  * length pointer.
 46  *
 47  * Returns false if the multibyte sequence would not fit into the buffer,
 48  * otherwise true.
 49  */
 50 
 51 static bool
 52 utf8enc(char **out, int *rem, int code)
 53 {
 54         if (code > 0 && code <= 0x7F)
 55         {
 56                 if (*rem < 1)
 57                         return false;
 58 
 59                 *(*out)++ = code; (*rem)--;
 60                 return true;
 61         }
 62         else if (code > 0 && code <= 0x7FF)
 63         {
 64                 if (*rem < 2)
 65                         return false;
 66 
 67                 *(*out)++ = ((code >>  6) & 0x1F) | 0xC0; (*rem)--;
 68                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
 69                 return true;
 70         }
 71         else if (code > 0 && code <= 0xFFFF)
 72         {
 73                 if (*rem < 3)
 74                         return false;
 75 
 76                 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--;
 77                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
 78                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
 79                 return true;
 80         }
 81         else if (code > 0 && code <= 0x10FFFF)
 82         {
 83                 if (*rem < 4)
 84                         return false;
 85 
 86                 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--;
 87                 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--;
 88                 *(*out)++ = ((code >>  6) & 0x3F) | 0x80; (*rem)--;
 89                 *(*out)++ = ( code        & 0x3F) | 0x80; (*rem)--;
 90                 return true;
 91         }
 92 
 93         return true;
 94 }
 95 
 96 
 97 /*
 98  * Parses a string literal from the given buffer.
 99  *
100  * Returns a negative value on error, otherwise the amount of consumed
101  * characters from the given buffer.
102  *
103  * Error values:
104  *  -1  Unterminated string
105  *  -2  Invalid escape sequence
106  *  -3  String literal too long
107  */
108 
109 static int
110 parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s)
111 {
112         char q = *(buf++);
113         char str[128] = { 0 };
114         char *out = str;
115         const char *in = buf;
116         bool esc = false;
117         int rem = sizeof(str) - 1;
118         int code;
119 
120         while (*in)
121         {
122                 /* continuation of escape sequence */
123                 if (esc)
124                 {
125                         /* \uFFFF */
126                         if (in[0] == 'u')
127                         {
128                                 if (isxdigit(in[1]) && isxdigit(in[2]) &&
129                                     isxdigit(in[3]) && isxdigit(in[4]))
130                                 {
131                                         if (!utf8enc(&out, &rem,
132                                                      hex(in[1]) * 16 * 16 * 16 +
133                                                      hex(in[2]) * 16 * 16 +
134                                                      hex(in[3]) * 16 +
135                                                      hex(in[4])))
136                                         {
137                                                 s->error_pos = s->off + (in - buf);
138                                                 return -3;
139                                         }
140 
141                                         in += 5;
142                                 }
143                                 else
144                                 {
145                                         s->error_pos = s->off + (in - buf);
146                                         return -2;
147                                 }
148                         }
149 
150                         /* \xFF */
151                         else if (in[0] == 'x')
152                         {
153                                 if (isxdigit(in[1]) && isxdigit(in[2]))
154                                 {
155                                         if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2])))
156                                         {
157                                                 s->error_pos = s->off + (in - buf);
158                                                 return -3;
159                                         }
160 
161                                         in += 3;
162                                 }
163                                 else
164                                 {
165                                         s->error_pos = s->off + (in - buf);
166                                         return -2;
167                                 }
168                         }
169 
170                         /* \377, \77 or \7 */
171                         else if (in[0] >= '' && in[0] <= '7')
172                         {
173                                 /* \377 */
174                                 if (in[1] >= '' && in[1] <= '7' &&
175                                     in[2] >= '' && in[2] <= '7')
176                                 {
177                                         code = dec(in[0]) * 8 * 8 +
178                                                dec(in[1]) * 8 +
179                                                dec(in[2]);
180 
181                                         if (code > 255)
182                                         {
183                                                 s->error_pos = s->off + (in - buf);
184                                                 return -2;
185                                         }
186 
187                                         if (!utf8enc(&out, &rem, code))
188                                         {
189                                                 s->error_pos = s->off + (in - buf);
190                                                 return -3;
191                                         }
192 
193                                         in += 3;
194                                 }
195 
196                                 /* \77 */
197                                 else if (in[1] >= '' && in[1] <= '7')
198                                 {
199                                         if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1])))
200                                         {
201                                                 s->error_pos = s->off + (in - buf);
202                                                 return -3;
203                                         }
204 
205                                         in += 2;
206                                 }
207 
208                                 /* \7 */
209                                 else
210                                 {
211                                         if (!utf8enc(&out, &rem, dec(in[0])))
212                                         {
213                                                 s->error_pos = s->off + (in - buf);
214                                                 return -3;
215                                         }
216 
217                                         in += 1;
218                                 }
219                         }
220 
221                         /* single character escape */
222                         else
223                         {
224                                 if (rem-- < 1)
225                                 {
226                                         s->error_pos = s->off + (in - buf);
227                                         return -3;
228                                 }
229 
230                                 switch (in[0])
231                                 {
232                                 case 'a': *out = '\a'; break;
233                                 case 'b': *out = '\b'; break;
234                                 case 'e': *out = '\e'; break;
235                                 case 'f': *out = '\f'; break;
236                                 case 'n': *out = '\n'; break;
237                                 case 'r': *out = '\r'; break;
238                                 case 't': *out = '\t'; break;
239                                 case 'v': *out = '\v'; break;
240                                 default:
241                                         /* in regexp mode, retain backslash */
242                                         if (q == '/')
243                                         {
244                                                 if (rem-- < 1)
245                                                 {
246                                                         s->error_pos = s->off + (in - buf);
247                                                         return -3;
248                                                 }
249 
250                                                 *out++ = '\\';
251                                         }
252 
253                                         *out = *in;
254                                         break;
255                                 }
256 
257                                 in++;
258                                 out++;
259                         }
260 
261                         esc = false;
262                 }
263 
264                 /* begin of escape sequence */
265                 else if (*in == '\\')
266                 {
267                         in++;
268                         esc = true;
269                 }
270 
271                 /* terminating quote */
272                 else if (*in == q)
273                 {
274                         op->str = strdup(str);
275                         return (in - buf) + 2;
276                 }
277 
278                 /* ordinary char */
279                 else
280                 {
281                         if (rem-- < 1)
282                         {
283                                 s->error_pos = s->off + (in - buf);
284                                 return -3;
285                         }
286 
287                         *out++ = *in++;
288                 }
289         }
290 
291         return -1;
292 }
293 
294 
295 /*
296  * Parses a regexp literal from the given buffer.
297  *
298  * Returns a negative value on error, otherwise the amount of consumed
299  * characters from the given buffer.
300  *
301  * Error values:
302  *  -1  Unterminated regexp
303  *  -2  Invalid escape sequence
304  *  -3  Regexp literal too long
305  */
306 
307 static int
308 parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s)
309 {
310         int len = parse_string(buf, op, s);
311         const char *p;
312 
313         if (len >= 2)
314         {
315                 op->num = REG_NOSUB | REG_NEWLINE;
316 
317                 for (p = buf + len; p; p++)
318                 {
319                         switch (*p)
320                         {
321                         case 'e':
322                                 op->num |= REG_EXTENDED;
323                                 len++;
324                                 break;
325 
326                         case 'i':
327                                 op->num |= REG_ICASE;
328                                 len++;
329                                 break;
330 
331                         case 's':
332                                 op->num &= ~REG_NEWLINE;
333                                 len++;
334                                 break;
335 
336                         default:
337                                 return len;
338                         }
339                 }
340 
341         }
342 
343         return len;
344 }
345 
346 
347 /*
348  * Parses a label from the given buffer.
349  *
350  * Returns a negative value on error, otherwise the amount of consumed
351  * characters from the given buffer.
352  *
353  * Error values:
354  *  -3  Label too long
355  */
356 
357 static int
358 parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s)
359 {
360         char str[128] = { 0 };
361         char *out = str;
362         const char *in = buf;
363         int rem = sizeof(str) - 1;
364 
365         while (*in == '_' || isalnum(*in))
366         {
367                 if (rem-- < 1)
368                 {
369                         s->error_pos = s->off + (in - buf);
370                         return -3;
371                 }
372 
373                 *out++ = *in++;
374         }
375 
376         if (!strcmp(str, "true") || !strcmp(str, "false"))
377         {
378                 op->num = (str[0] == 't');
379                 op->type = T_BOOL;
380         }
381         else
382         {
383                 op->str = strdup(str);
384         }
385 
386         return (in - buf);
387 }
388 
389 
390 /*
391  * Parses a number literal from the given buffer.
392  *
393  * Returns a negative value on error, otherwise the amount of consumed
394  * characters from the given buffer.
395  *
396  * Error values:
397  *  -2  Invalid number character
398  */
399 
400 static int
401 parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s)
402 {
403         char *e;
404         int n = strtol(buf, &e, 10);
405 
406         if (e == buf)
407         {
408                 s->error_pos = s->off;
409                 return -2;
410         }
411 
412         op->num = n;
413 
414         return (e - buf);
415 }
416 
417 static const struct token tokens[] = {
418         { 0,                    " ",     1 },
419         { 0,                    "\t",    1 },
420         { 0,                    "\n",    1 },
421         { T_LE,                 "<=",    2 },
422         { T_GE,                 ">=",    2 },
423         { T_NE,                 "!=",    2 },
424         { T_AND,                "&&",    2 },
425         { T_OR,                 "||",    2 },
426         { T_DOT,                ".",     1 },
427         { T_BROPEN,             "[",     1 },
428         { T_BRCLOSE,    "]",     1 },
429         { T_POPEN,              "(",     1 },
430         { T_PCLOSE,             ")",     1 },
431         { T_UNION,              ",",     1 },
432         { T_ROOT,               "$",     1 },
433         { T_THIS,               "@",     1 },
434         { T_LT,                 "<",     1 },
435         { T_GT,                 ">",     1 },
436         { T_EQ,                 "=",     1 },
437         { T_MATCH,              "~",     1 },
438         { T_NOT,                "!",     1 },
439         { T_WILDCARD,   "*",     1 },
440         { T_REGEXP,             "/",     1, parse_regexp },
441         { T_STRING,             "'",     1, parse_string },
442         { T_STRING,             "\"",    1, parse_string },
443         { T_LABEL,              "_",     1, parse_label  },
444         { T_LABEL,              "az",    0, parse_label  },
445         { T_LABEL,              "AZ",    0, parse_label  },
446         { T_NUMBER,             "-",     1, parse_number },
447         { T_NUMBER,             "09",    0, parse_number },
448 };
449 
450 const char *tokennames[25] = {
451         [0]                             = "End of file",
452         [T_AND]                 = "'&&'",
453         [T_OR]                  = "'||'",
454         [T_UNION]               = "','",
455         [T_EQ]                  = "'='",
456         [T_NE]                  = "'!='",
457         [T_GT]                  = "'>'",
458         [T_GE]                  = "'>='",
459         [T_LT]                  = "'<'",
460         [T_LE]                  = "'<='",
461         [T_MATCH]       = "'~'",
462         [T_NOT]                 = "'!'",
463         [T_LABEL]               = "Label",
464         [T_ROOT]                = "'$'",
465         [T_THIS]                = "'@'",
466         [T_DOT]                 = "'.'",
467         [T_WILDCARD]    = "'*'",
468         [T_REGEXP]      = "/.../",
469         [T_BROPEN]              = "'['",
470         [T_BRCLOSE]             = "']'",
471         [T_BOOL]                = "Bool",
472         [T_NUMBER]              = "Number",
473         [T_STRING]              = "String",
474         [T_POPEN]               = "'('",
475         [T_PCLOSE]              = "')'",
476 };
477 
478 
479 static int
480 match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s)
481 {
482         int i;
483         const struct token *tok;
484 
485         for (i = 0, tok = &tokens[0];
486              i < sizeof(tokens) / sizeof(tokens[0]);
487                  i++, tok = &tokens[i])
488         {
489                 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) ||
490                     (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1]))
491                 {
492                         op->type = tok->type;
493 
494                         if (tok->parse)
495                                 return tok->parse(ptr, op, s);
496 
497                         return tok->plen;
498                 }
499         }
500 
501         s->error_pos = s->off;
502         return -4;
503 }
504 
505 struct jp_opcode *
506 jp_get_token(struct jp_state *s, const char *input, int *mlen)
507 {
508         struct jp_opcode op = { 0 };
509 
510         *mlen = match_token(input, &op, s);
511 
512         if (*mlen < 0)
513         {
514                 s->error_code = *mlen;
515                 return NULL;
516         }
517         else if (op.type == 0)
518         {
519                 return NULL;
520         }
521 
522         return jp_alloc_op(s, op.type, op.num, op.str, NULL);
523 }
524
This page was automatically generated by LXR 0.3.1. • OpenWrt
OpenWrt.org Cross Reference

Sources/jsonpath/lexer.c