1 /* 2 * Copyright (C) 2013-2014 Jo-Philipp Wich <jo@mein.io> 3 * 4 * Permission to use, copy, modify, and/or distribute this software for any 5 * purpose with or without fee is hereby granted, provided that the above 6 * copyright notice and this permission notice appear in all copies. 7 * 8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 15 */ 16 17 #include <stdbool.h> 18 #include <stdlib.h> 19 #include <string.h> 20 #include <ctype.h> 21 #include <regex.h> 22 23 #include "ast.h" 24 #include "lexer.h" 25 #include "parser.h" 26 27 28 struct token { 29 int type; 30 const char *pat; 31 int plen; 32 int (*parse)(const char *buf, struct jp_opcode *op, struct jp_state *s); 33 }; 34 35 #define dec(o) \ 36 ((o) - '') 37 38 #define hex(x) \ 39 (((x) >= 'a') ? (10 + (x) - 'a') : \ 40 (((x) >= 'A') ? (10 + (x) - 'A') : dec(x))) 41 42 /* 43 * Stores the given codepoint as a utf8 multibyte sequence into the given 44 * output buffer and substracts the required amount of bytes from the given 45 * length pointer. 46 * 47 * Returns false if the multibyte sequence would not fit into the buffer, 48 * otherwise true. 
49 */ 50 51 static bool 52 utf8enc(char **out, int *rem, int code) 53 { 54 if (code > 0 && code <= 0x7F) 55 { 56 if (*rem < 1) 57 return false; 58 59 *(*out)++ = code; (*rem)--; 60 return true; 61 } 62 else if (code > 0 && code <= 0x7FF) 63 { 64 if (*rem < 2) 65 return false; 66 67 *(*out)++ = ((code >> 6) & 0x1F) | 0xC0; (*rem)--; 68 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 69 return true; 70 } 71 else if (code > 0 && code <= 0xFFFF) 72 { 73 if (*rem < 3) 74 return false; 75 76 *(*out)++ = ((code >> 12) & 0x0F) | 0xE0; (*rem)--; 77 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 78 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 79 return true; 80 } 81 else if (code > 0 && code <= 0x10FFFF) 82 { 83 if (*rem < 4) 84 return false; 85 86 *(*out)++ = ((code >> 18) & 0x07) | 0xF0; (*rem)--; 87 *(*out)++ = ((code >> 12) & 0x3F) | 0x80; (*rem)--; 88 *(*out)++ = ((code >> 6) & 0x3F) | 0x80; (*rem)--; 89 *(*out)++ = ( code & 0x3F) | 0x80; (*rem)--; 90 return true; 91 } 92 93 return true; 94 } 95 96 97 /* 98 * Parses a string literal from the given buffer. 99 * 100 * Returns a negative value on error, otherwise the amount of consumed 101 * characters from the given buffer. 
102 * 103 * Error values: 104 * -1 Unterminated string 105 * -2 Invalid escape sequence 106 * -3 String literal too long 107 */ 108 109 static int 110 parse_string(const char *buf, struct jp_opcode *op, struct jp_state *s) 111 { 112 char q = *(buf++); 113 char str[128] = { 0 }; 114 char *out = str; 115 const char *in = buf; 116 bool esc = false; 117 int rem = sizeof(str) - 1; 118 int code; 119 120 while (*in) 121 { 122 /* continuation of escape sequence */ 123 if (esc) 124 { 125 /* \uFFFF */ 126 if (in[0] == 'u') 127 { 128 if (isxdigit(in[1]) && isxdigit(in[2]) && 129 isxdigit(in[3]) && isxdigit(in[4])) 130 { 131 if (!utf8enc(&out, &rem, 132 hex(in[1]) * 16 * 16 * 16 + 133 hex(in[2]) * 16 * 16 + 134 hex(in[3]) * 16 + 135 hex(in[4]))) 136 { 137 s->error_pos = s->off + (in - buf); 138 return -3; 139 } 140 141 in += 5; 142 } 143 else 144 { 145 s->error_pos = s->off + (in - buf); 146 return -2; 147 } 148 } 149 150 /* \xFF */ 151 else if (in[0] == 'x') 152 { 153 if (isxdigit(in[1]) && isxdigit(in[2])) 154 { 155 if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2]))) 156 { 157 s->error_pos = s->off + (in - buf); 158 return -3; 159 } 160 161 in += 3; 162 } 163 else 164 { 165 s->error_pos = s->off + (in - buf); 166 return -2; 167 } 168 } 169 170 /* \377, \77 or \7 */ 171 else if (in[0] >= '' && in[0] <= '7') 172 { 173 /* \377 */ 174 if (in[1] >= '' && in[1] <= '7' && 175 in[2] >= '' && in[2] <= '7') 176 { 177 code = dec(in[0]) * 8 * 8 + 178 dec(in[1]) * 8 + 179 dec(in[2]); 180 181 if (code > 255) 182 { 183 s->error_pos = s->off + (in - buf); 184 return -2; 185 } 186 187 if (!utf8enc(&out, &rem, code)) 188 { 189 s->error_pos = s->off + (in - buf); 190 return -3; 191 } 192 193 in += 3; 194 } 195 196 /* \77 */ 197 else if (in[1] >= '' && in[1] <= '7') 198 { 199 if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1]))) 200 { 201 s->error_pos = s->off + (in - buf); 202 return -3; 203 } 204 205 in += 2; 206 } 207 208 /* \7 */ 209 else 210 { 211 if (!utf8enc(&out, &rem, dec(in[0]))) 
212 { 213 s->error_pos = s->off + (in - buf); 214 return -3; 215 } 216 217 in += 1; 218 } 219 } 220 221 /* single character escape */ 222 else 223 { 224 if (rem-- < 1) 225 { 226 s->error_pos = s->off + (in - buf); 227 return -3; 228 } 229 230 switch (in[0]) 231 { 232 case 'a': *out = '\a'; break; 233 case 'b': *out = '\b'; break; 234 case 'e': *out = '\e'; break; 235 case 'f': *out = '\f'; break; 236 case 'n': *out = '\n'; break; 237 case 'r': *out = '\r'; break; 238 case 't': *out = '\t'; break; 239 case 'v': *out = '\v'; break; 240 default: 241 /* in regexp mode, retain backslash */ 242 if (q == '/') 243 { 244 if (rem-- < 1) 245 { 246 s->error_pos = s->off + (in - buf); 247 return -3; 248 } 249 250 *out++ = '\\'; 251 } 252 253 *out = *in; 254 break; 255 } 256 257 in++; 258 out++; 259 } 260 261 esc = false; 262 } 263 264 /* begin of escape sequence */ 265 else if (*in == '\\') 266 { 267 in++; 268 esc = true; 269 } 270 271 /* terminating quote */ 272 else if (*in == q) 273 { 274 op->str = strdup(str); 275 return (in - buf) + 2; 276 } 277 278 /* ordinary char */ 279 else 280 { 281 if (rem-- < 1) 282 { 283 s->error_pos = s->off + (in - buf); 284 return -3; 285 } 286 287 *out++ = *in++; 288 } 289 } 290 291 return -1; 292 } 293 294 295 /* 296 * Parses a regexp literal from the given buffer. 297 * 298 * Returns a negative value on error, otherwise the amount of consumed 299 * characters from the given buffer. 
300 * 301 * Error values: 302 * -1 Unterminated regexp 303 * -2 Invalid escape sequence 304 * -3 Regexp literal too long 305 */ 306 307 static int 308 parse_regexp(const char *buf, struct jp_opcode *op, struct jp_state *s) 309 { 310 int len = parse_string(buf, op, s); 311 const char *p; 312 313 if (len >= 2) 314 { 315 op->num = REG_NOSUB | REG_NEWLINE; 316 317 for (p = buf + len; p; p++) 318 { 319 switch (*p) 320 { 321 case 'e': 322 op->num |= REG_EXTENDED; 323 len++; 324 break; 325 326 case 'i': 327 op->num |= REG_ICASE; 328 len++; 329 break; 330 331 case 's': 332 op->num &= ~REG_NEWLINE; 333 len++; 334 break; 335 336 default: 337 return len; 338 } 339 } 340 341 } 342 343 return len; 344 } 345 346 347 /* 348 * Parses a label from the given buffer. 349 * 350 * Returns a negative value on error, otherwise the amount of consumed 351 * characters from the given buffer. 352 * 353 * Error values: 354 * -3 Label too long 355 */ 356 357 static int 358 parse_label(const char *buf, struct jp_opcode *op, struct jp_state *s) 359 { 360 char str[128] = { 0 }; 361 char *out = str; 362 const char *in = buf; 363 int rem = sizeof(str) - 1; 364 365 while (*in == '_' || isalnum(*in)) 366 { 367 if (rem-- < 1) 368 { 369 s->error_pos = s->off + (in - buf); 370 return -3; 371 } 372 373 *out++ = *in++; 374 } 375 376 if (!strcmp(str, "true") || !strcmp(str, "false")) 377 { 378 op->num = (str[0] == 't'); 379 op->type = T_BOOL; 380 } 381 else 382 { 383 op->str = strdup(str); 384 } 385 386 return (in - buf); 387 } 388 389 390 /* 391 * Parses a number literal from the given buffer. 392 * 393 * Returns a negative value on error, otherwise the amount of consumed 394 * characters from the given buffer. 
395 * 396 * Error values: 397 * -2 Invalid number character 398 */ 399 400 static int 401 parse_number(const char *buf, struct jp_opcode *op, struct jp_state *s) 402 { 403 char *e; 404 int n = strtol(buf, &e, 10); 405 406 if (e == buf) 407 { 408 s->error_pos = s->off; 409 return -2; 410 } 411 412 op->num = n; 413 414 return (e - buf); 415 } 416 417 static const struct token tokens[] = { 418 { 0, " ", 1 }, 419 { 0, "\t", 1 }, 420 { 0, "\n", 1 }, 421 { T_LE, "<=", 2 }, 422 { T_GE, ">=", 2 }, 423 { T_NE, "!=", 2 }, 424 { T_AND, "&&", 2 }, 425 { T_OR, "||", 2 }, 426 { T_DOT, ".", 1 }, 427 { T_BROPEN, "[", 1 }, 428 { T_BRCLOSE, "]", 1 }, 429 { T_POPEN, "(", 1 }, 430 { T_PCLOSE, ")", 1 }, 431 { T_UNION, ",", 1 }, 432 { T_ROOT, "$", 1 }, 433 { T_THIS, "@", 1 }, 434 { T_LT, "<", 1 }, 435 { T_GT, ">", 1 }, 436 { T_EQ, "=", 1 }, 437 { T_MATCH, "~", 1 }, 438 { T_NOT, "!", 1 }, 439 { T_WILDCARD, "*", 1 }, 440 { T_REGEXP, "/", 1, parse_regexp }, 441 { T_STRING, "'", 1, parse_string }, 442 { T_STRING, "\"", 1, parse_string }, 443 { T_LABEL, "_", 1, parse_label }, 444 { T_LABEL, "az", 0, parse_label }, 445 { T_LABEL, "AZ", 0, parse_label }, 446 { T_NUMBER, "-", 1, parse_number }, 447 { T_NUMBER, "09", 0, parse_number }, 448 }; 449 450 const char *tokennames[25] = { 451 [0] = "End of file", 452 [T_AND] = "'&&'", 453 [T_OR] = "'||'", 454 [T_UNION] = "','", 455 [T_EQ] = "'='", 456 [T_NE] = "'!='", 457 [T_GT] = "'>'", 458 [T_GE] = "'>='", 459 [T_LT] = "'<'", 460 [T_LE] = "'<='", 461 [T_MATCH] = "'~'", 462 [T_NOT] = "'!'", 463 [T_LABEL] = "Label", 464 [T_ROOT] = "'$'", 465 [T_THIS] = "'@'", 466 [T_DOT] = "'.'", 467 [T_WILDCARD] = "'*'", 468 [T_REGEXP] = "/.../", 469 [T_BROPEN] = "'['", 470 [T_BRCLOSE] = "']'", 471 [T_BOOL] = "Bool", 472 [T_NUMBER] = "Number", 473 [T_STRING] = "String", 474 [T_POPEN] = "'('", 475 [T_PCLOSE] = "')'", 476 }; 477 478 479 static int 480 match_token(const char *ptr, struct jp_opcode *op, struct jp_state *s) 481 { 482 int i; 483 const struct token *tok; 484 
485 for (i = 0, tok = &tokens[0]; 486 i < sizeof(tokens) / sizeof(tokens[0]); 487 i++, tok = &tokens[i]) 488 { 489 if ((tok->plen > 0 && !strncmp(ptr, tok->pat, tok->plen)) || 490 (tok->plen == 0 && *ptr >= tok->pat[0] && *ptr <= tok->pat[1])) 491 { 492 op->type = tok->type; 493 494 if (tok->parse) 495 return tok->parse(ptr, op, s); 496 497 return tok->plen; 498 } 499 } 500 501 s->error_pos = s->off; 502 return -4; 503 } 504 505 struct jp_opcode * 506 jp_get_token(struct jp_state *s, const char *input, int *mlen) 507 { 508 struct jp_opcode op = { 0 }; 509 510 *mlen = match_token(input, &op, s); 511 512 if (*mlen < 0) 513 { 514 s->error_code = *mlen; 515 return NULL; 516 } 517 else if (op.type == 0) 518 { 519 return NULL; 520 } 521 522 return jp_alloc_op(s, op.type, op.num, op.str, NULL); 523 } 524
/* This page was automatically generated by LXR 0.3.1. • OpenWrt */