1 /* 2 * Copyright (C) 2015 John Crispin <blogic@openwrt.org> 3 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org> 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU Lesser General Public License version 2.1 7 * as published by the Free Software Foundation 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 */ 14 15 #define _GNU_SOURCE 16 #include <sys/mount.h> 17 #include <sys/prctl.h> 18 #include <sys/wait.h> 19 #include <sys/types.h> 20 #include <sys/time.h> 21 #include <sys/resource.h> 22 #include <sys/stat.h> 23 #include <sys/sysmacros.h> 24 25 /* musl only defined 15 limit types, make sure all 16 are supported */ 26 #ifndef RLIMIT_RTTIME 27 #define RLIMIT_RTTIME 15 28 #undef RLIMIT_NLIMITS 29 #define RLIMIT_NLIMITS 16 30 #undef RLIM_NLIMITS 31 #define RLIM_NLIMITS 16 32 #endif 33 34 #include <assert.h> 35 #include <stdlib.h> 36 #include <unistd.h> 37 #include <errno.h> 38 #include <pwd.h> 39 #include <grp.h> 40 #include <string.h> 41 #include <fcntl.h> 42 #include <sched.h> 43 #include <linux/filter.h> 44 #include <linux/limits.h> 45 #include <linux/nsfs.h> 46 #include <linux/securebits.h> 47 #include <signal.h> 48 #include <inttypes.h> 49 50 #include "capabilities.h" 51 #include "elf.h" 52 #include "fs.h" 53 #include "jail.h" 54 #include "log.h" 55 #include "seccomp-oci.h" 56 #include "cgroups.h" 57 #include "netifd.h" 58 59 #include <libubox/blobmsg.h> 60 #include <libubox/blobmsg_json.h> 61 #include <libubox/list.h> 62 #include <libubox/vlist.h> 63 #include <libubox/uloop.h> 64 #include <libubox/utils.h> 65 #include <libubus.h> 66 67 #ifndef CLONE_NEWCGROUP 68 #define CLONE_NEWCGROUP 0x02000000 69 #endif 70 71 #define STACK_SIZE (1024 * 1024) 72 #define OPT_ARGS "cC:d:De:EfFG:h:ij:J:ln:NoO:pP:r:R:sS:uU:w:t:T:y" 73 74 #define OCI_VERSION_STRING "1.0.2" 75 76 struct hook_execvpe { 77 char *file; 78 char **argv; 79 char **envp; 80 int timeout; 81 }; 82 83 struct sysctl_val { 84 char *entry; 85 char *value; 86 }; 87 88 struct mknod_args { 89 char *path; 90 mode_t mode; 91 dev_t dev; 92 uid_t uid; 93 gid_t gid; 94 }; 95 96 static struct { 97 char *name; 98 char *hostname; 99 char **jail_argv; 100 char *cwd; 101 char *seccomp; 102 struct sock_fprog *ociseccomp; 103 char *capabilities; 104 struct jail_capset capset; 105 char *user; 106 char *group; 107 char *extroot; 108 char *overlaydir; 109 char *tmpoverlaysize; 110 char **envp; 111 char *uidmap; 112 char *gidmap; 113 char *pidfile; 114 struct sysctl_val **sysctl; 115 int no_new_privs; 116 int namespace; 117 struct { 118 int pid; 119 int net; 120 int ns; 121 int ipc; 122 int uts; 123 int user; 124 int cgroup; 125 #ifdef CLONE_NEWTIME 126 int time; 127 #endif 128 } setns; 129 int procfs; 130 int ronly; 131 int sysfs; 132 int console; 133 int pw_uid; 134 int pw_gid; 135 int gr_gid; 136 int root_map_uid; 137 gid_t *additional_gids; 138 size_t num_additional_gids; 139 mode_t umask; 140 bool set_umask; 141 int require_jail; 142 struct { 143 struct hook_execvpe **createRuntime; 144 struct hook_execvpe **createContainer; 145 struct hook_execvpe **startContainer; 146 struct hook_execvpe **poststart; 147 struct hook_execvpe **poststop; 148 } hooks; 149 struct rlimit *rlimits[RLIM_NLIMITS]; 150 int oom_score_adj; 151 bool set_oom_score_adj; 152 struct mknod_args **devices; 153 char *ocibundle; 154 bool immediately; 155 struct blob_attr *annotations; 156 int term_timeout; 157 } opts; 158 159 static struct blob_buf ocibuf; 160 161 extern int pivot_root(const char *new_root, const char *put_old); 162 163 int debug = 0; 164 165 static char child_stack[STACK_SIZE]; 166 167 static struct ubus_context *parent_ctx; 168 169 int console_fd; 170 171 172 static inline bool has_namespaces(void) 173 { 174 return ((opts.setns.pid != -1) || 175 (opts.setns.net != -1) || 176 (opts.setns.ns != -1) || 177 (opts.setns.ipc != -1) || 178 (opts.setns.uts != -1) || 179 (opts.setns.user != -1) || 180 (opts.setns.cgroup != -1) || 181 #ifdef CLONE_NEWTIME 182 (opts.setns.time != -1) || 183 #endif 184 opts.namespace); 185 } 186 187 static void free_oci_envp(char **p) { 188 char **tmp; 189 190 if (p) { 191 tmp = p; 192 while (*tmp) 193 free(*(tmp++)); 194 195 free(p); 196 } 197 } 198 199 static void free_hooklist(struct hook_execvpe **hooklist) 200 { 201 struct hook_execvpe *cur; 202 203 if (!hooklist) 204 return; 205 206 cur = *hooklist; 207 while (cur) { 208 free_oci_envp(cur->argv); 209 free_oci_envp(cur->envp); 210 free(cur->file); 211 free(cur++); 212 } 213 free(hooklist); 214 } 215 216 static void free_sysctl(void) { 217 struct sysctl_val *cur; 218 219 if (!opts.sysctl) 220 return; 221 222 cur = *opts.sysctl; 223 224 while (cur) { 225 free(cur->entry); 226 free(cur->value); 227 free(cur++); 228 } 229 free(opts.sysctl); 230 } 231 232 static void free_devices(void) { 233 struct mknod_args **cur; 234 235 if (!opts.devices) 236 return; 237 238 cur = opts.devices; 239 240 while (*cur) { 241 free((*cur)->path); 242 free(*(cur++)); 243 } 244 free(opts.devices); 245 } 246 247 static void free_rlimits(void) { 248 int type; 249 250 for (type = 0; type < RLIM_NLIMITS; ++type) 251 free(opts.rlimits[type]); 252 } 253 254 static void free_opts(bool parent) { 255 256 free_library_search(); 257 mount_free(); 258 cgroups_free(); 259 260 /* we need to keep argv, envp and seccomp filter in child */ 261 if (parent) { /* parent-only */ 262 if (opts.ociseccomp) { 263 free(opts.ociseccomp->filter); 264 free(opts.ociseccomp); 265 } 266 267 free_oci_envp(opts.jail_argv); 268 free_oci_envp(opts.envp); 269 } 270 271 free_rlimits(); 272 free_sysctl(); 273 free_devices(); 274 free(opts.hostname); 275 free(opts.cwd); 276 free(opts.uidmap); 277 free(opts.gidmap); 278 free(opts.annotations); 279 free(opts.extroot); 280 free(opts.overlaydir); 281 free_hooklist(opts.hooks.createRuntime); 282 free_hooklist(opts.hooks.createContainer); 283 free_hooklist(opts.hooks.startContainer); 284 free_hooklist(opts.hooks.poststart); 285 free_hooklist(opts.hooks.poststop); 286 } 287 288 static int mount_overlay(char *jail_root, char *overlaydir) { 289 char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf; 290 const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s"; 291 int ret = -1, fd; 292 293 if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0) 294 goto out; 295 296 if (asprintf(&workdir, "%s%s", overlaydir, "/work") < 0) 297 goto upper_printf; 298 299 if (asprintf(&optsstr, mountoptsformat, jail_root, upperdir, workdir) < 0) 300 goto work_printf; 301 302 if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755)) 303 goto opts_printf; 304 305 /* 306 * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root 307 * this is to work-around a bug in overlayfs described in the overlayfs-userns 308 * patch: 309 * 3. modification of a file 'hithere' which is in l but not yet 310 * in u, and which is not owned by T, is not allowed, even if 311 * writes to u are allowed. This may be a bug in overlayfs, 312 * but it is safe behavior. 313 */ 314 if (asprintf(&upperetc, "%s/etc", upperdir) < 0) 315 goto opts_printf; 316 317 if (mkdir_p(upperetc, 0755)) 318 goto upper_etc_printf; 319 320 if (asprintf(&upperresolvconf, "%s/resolv.conf", upperetc) < 0) 321 goto upper_etc_printf; 322 323 fd = creat(upperresolvconf, 0644); 324 if (fd < 0) { 325 if (errno != EEXIST) 326 ERROR("creat(%s) failed: %m\n", upperresolvconf); 327 } else { 328 close(fd); 329 } 330 DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr); 331 332 if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr)) 333 goto upper_resolvconf_printf; 334 335 ret = 0; 336 337 upper_resolvconf_printf: 338 free(upperresolvconf); 339 upper_etc_printf: 340 free(upperetc); 341 opts_printf: 342 free(optsstr); 343 work_printf: 344 free(workdir); 345 upper_printf: 346 free(upperdir); 347 out: 348 return ret; 349 } 350 351 static void pass_console(int console_fd) 352 { 353 struct ubus_context *child_ctx = ubus_connect(NULL); 354 static struct blob_buf req; 355 uint32_t id; 356 357 if (!child_ctx) 358 return; 359 360 blob_buf_init(&req, 0); 361 blobmsg_add_string(&req, "name", opts.name); 362 363 if (ubus_lookup_id(child_ctx, "container", &id) || 364 ubus_invoke_fd(child_ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd)) 365 INFO("ubus request failed\n"); 366 else 367 close(console_fd); 368 369 blob_buf_free(&req); 370 ubus_free(child_ctx); 371 } 372 373 static int create_dev_console(const char *jail_root) 374 { 375 char *console_fname; 376 char dev_console_path[PATH_MAX]; 377 int slave_console_fd, dev_console_dummy; 378 379 /* Open UNIX/98 virtual console */ 380 console_fd = posix_openpt(O_RDWR | O_NOCTTY); 381 if (console_fd < 0) 382 return -1; 383 384 console_fname = ptsname(console_fd); 385 DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname); 386 if (!console_fname) 387 goto no_console; 388 389 grantpt(console_fd); 390 unlockpt(console_fd); 391 392 /* pass PTY master to procd */ 393 pass_console(console_fd); 394 395 /* mount-bind PTY slave to /dev/console in jail */ 396 snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root); 397 dev_console_dummy = creat(dev_console_path, 0620); 398 if (dev_console_dummy < 0) 399 goto no_console; 400 401 close(dev_console_dummy); 402 403 if (mount(console_fname, dev_console_path, "bind", MS_BIND, NULL)) 404 goto no_console; 405 406 /* use PTY slave for stdio */ 407 slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */ 408 if (slave_console_fd < 0) 409 goto no_console; 410 411 dup2(slave_console_fd, 0); 412 dup2(slave_console_fd, 1); 413 dup2(slave_console_fd, 2); 414 close(slave_console_fd); 415 416 INFO("using guest console %s\n", console_fname); 417 418 return 0; 419 420 no_console: 421 close(console_fd); 422 return 1; 423 } 424 425 static int hook_running = 0; 426 static int hook_return_code = 0; 427 static struct hook_execvpe **current_hook = NULL; 428 typedef void (*hook_return_handler)(void); 429 static hook_return_handler hook_return_cb = NULL; 430 431 static void hook_process_timeout_cb(struct uloop_timeout *t); 432 static struct uloop_timeout hook_process_timeout = { 433 .cb = hook_process_timeout_cb, 434 }; 435 436 static void run_hooklist(void); 437 static void hook_process_handler(struct uloop_process *c, int ret) 438 { 439 uloop_timeout_cancel(&hook_process_timeout); 440 441 if (WIFEXITED(ret)) { 442 hook_return_code = WEXITSTATUS(ret); 443 if (hook_return_code) 444 ERROR("hook (%d) exited with exit: %d\n", c->pid, hook_return_code); 445 else 446 DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code); 447 448 } else { 449 hook_return_code = WTERMSIG(ret); 450 ERROR("hook (%d) exited with signal: %d\n", c->pid, hook_return_code); 451 } 452 hook_running = 0; 453 ++current_hook; 454 run_hooklist(); 455 } 456 457 static struct uloop_process hook_process = { 458 .cb = hook_process_handler, 459 }; 460 461 static void hook_process_timeout_cb(struct uloop_timeout *t) 462 { 463 DEBUG("hook process failed to stop, sending SIGKILL\n"); 464 kill(hook_process.pid, SIGKILL); 465 } 466 467 static void run_hooklist(void) 468 { 469 struct hook_execvpe *hook = *current_hook; 470 struct stat s; 471 472 if (!hook) 473 return hook_return_cb(); 474 475 DEBUG("executing hook %s\n", hook->file); 476 477 if (stat(hook->file, &s)) 478 hook_process_handler(&hook_process, ENOENT); 479 480 if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) 481 hook_process_handler(&hook_process, EPERM); 482 483 hook_running = 1; 484 hook_process.pid = fork(); 485 if (hook_process.pid == 0) { 486 /* child */ 487 execve(hook->file, hook->argv, hook->envp); 488 ERROR("execve error %m\n"); 489 _exit(errno); 490 } else if (hook_process.pid < 0) { 491 /* fork error */ 492 ERROR("hook fork error\n"); 493 hook_running = 0; 494 hook_process_handler(&hook_process, errno); 495 } 496 497 /* parent */ 498 uloop_process_add(&hook_process); 499 500 if (hook->timeout > 0) 501 uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout); 502 503 uloop_run(); 504 if (hook_running) { 505 DEBUG("uloop interrupted, killing jail process\n"); 506 kill(hook_process.pid, SIGTERM); 507 uloop_timeout_set(&hook_process_timeout, 1000); 508 uloop_run(); 509 } 510 } 511 512 static void run_hooks(struct hook_execvpe **hooklist, hook_return_handler return_cb) 513 { 514 if (!hooklist) 515 return_cb(); 516 517 current_hook = hooklist; 518 hook_return_cb = return_cb; 519 520 run_hooklist(); 521 } 522 523 static int apply_sysctl(const char *jail_root) 524 { 525 struct sysctl_val **cur; 526 char *procdir, *fname; 527 int f; 528 529 if (!opts.sysctl) 530 return 0; 531 532 if (asprintf(&procdir, "%s/proc", jail_root) < 0) 533 return ENOMEM; 534 535 if (mkdir(procdir, 0700)) 536 return errno; 537 538 if (mount("proc", procdir, "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0)) 539 return EPERM; 540 541 cur = opts.sysctl; 542 543 while (*cur) { 544 if (asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry) < 0) 545 return ENOMEM; 546 547 DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname); 548 549 f = open(fname, O_WRONLY); 550 if (f < 0) { 551 ERROR("sysctl: can't open %s\n", fname); 552 free(fname); 553 return errno; 554 } 555 if (write(f, (*cur)->value, strlen((*cur)->value)) < 0) { 556 ERROR("sysctl: write to %s\n", fname); 557 free(fname); 558 close(f); 559 return errno; 560 } 561 562 free(fname); 563 close(f); 564 ++cur; 565 } 566 umount(procdir); 567 rmdir(procdir); 568 free(procdir); 569 570 return 0; 571 } 572 573 /* glibc defines makedev calling a function. make sure it's a pure macro */ 574 #if defined(__GLIBC__) 575 #undef makedev 576 /* from musl's sys/sysmacros.h */ 577 #define makedev(x,y) ( \ 578 (((x)&0xfffff000ULL) << 32) | \ 579 (((x)&0x00000fffULL) << 8) | \ 580 (((y)&0xffffff00ULL) << 12) | \ 581 (((y)&0x000000ffULL)) ) 582 #endif 583 584 static struct mknod_args default_devices[] = { 585 { .path = "/dev/null", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 3) }, 586 { .path = "/dev/zero", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 5) }, 587 { .path = "/dev/full", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 7) }, 588 { .path = "/dev/random", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 8) }, 589 { .path = "/dev/urandom", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 9) }, 590 { .path = "/dev/tty", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP), .dev = makedev(5, 0), .gid = 5 }, 591 { 0 }, 592 }; 593 594 static int create_devices(void) 595 { 596 struct mknod_args **cur, *curdef; 597 char *path, *tmp; 598 int ret; 599 600 if (!opts.devices) 601 goto only_default_devices; 602 603 cur = opts.devices; 604 605 while (*cur) { 606 path = (*cur)->path; 607 /* don't allow devices outside of /dev */ 608 if (strncmp(path, "/dev", 4)) 609 return EPERM; 610 611 /* make sure parent folder exists */ 612 tmp = strrchr(path, '/'); 613 if (!tmp) 614 return EINVAL; 615 616 *tmp = '\0'; 617 if (strcmp(path, "/dev")) { 618 DEBUG("creating directory %s\n", path); 619 620 if (mkdir_p(path, 0755)) 621 return errno; 622 } 623 *tmp = '/'; 624 625 DEBUG("creating %s (mode=%08o)\n", path, (*cur)->mode); 626 627 /* create device */ 628 if (mknod(path, (*cur)->mode, (*cur)->dev)) 629 return errno; 630 631 /* change owner, if needed */ 632 if (((*cur)->uid || (*cur)->gid) && 633 chown(path, (*cur)->uid, (*cur)->gid)) 634 return errno; 635 636 ++cur; 637 } 638 639 only_default_devices: 640 curdef = default_devices; 641 while(curdef->path) { 642 DEBUG("creating %s (mode=%08o)\n", curdef->path, curdef->mode); 643 if (mknod(curdef->path, curdef->mode, curdef->dev)) { 644 ++curdef; 645 continue; /* may already exist, eg. due to a bind-mount */ 646 } 647 if ((curdef->uid || curdef->gid) && 648 chown(curdef->path, curdef->uid, curdef->gid)) 649 return errno; 650 651 ++curdef; 652 } 653 654 /* Dev symbolic links as defined in OCI spec */ 655 ret = symlink("/dev/pts/ptmx", "/dev/ptmx"); 656 if (ret < 0) 657 WARNING("symlink() failed to create link to /dev/pts/ptmx"); 658 659 ret = symlink("/proc/self/fd", "/dev/fd"); 660 if (ret < 0) 661 WARNING("symlink() failed to create link to /proc/self/fd"); 662 663 ret = symlink("/proc/self/fd/0", "/dev/stdin"); 664 if (ret < 0) 665 WARNING("symlink() failed to create link to /proc/self/fd/0"); 666 667 ret = symlink("/proc/self/fd/1", "/dev/stdout"); 668 if (ret < 0) 669 WARNING("symlink() failed to create link to /proc/self/fd/1"); 670 671 ret = symlink("/proc/self/fd/2", "/dev/stderr"); 672 if (ret < 0) 673 WARNING("symlink() failed to create link to /proc/self/fd/2"); 674 675 return 0; 676 } 677 678 static char jail_root[] = "/tmp/ujail-XXXXXX"; 679 static char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX"; 680 static mode_t old_umask; 681 static void enter_jail_fs(void); 682 static int build_jail_fs(void) 683 { 684 char *overlaydir = NULL; 685 int ret; 686 687 old_umask = umask(0); 688 689 if (mkdtemp(jail_root) == NULL) { 690 ERROR("mkdtemp(%s) failed: %m\n", jail_root); 691 return -1; 692 } 693 694 if (apply_sysctl(jail_root)) { 695 ERROR("failed to apply sysctl values\n"); 696 return -1; 697 } 698 699 /* oldroot can't be MS_SHARED else pivot_root() fails */ 700 if (mount("none", "/", "none", MS_REC|MS_PRIVATE, NULL)) { 701 ERROR("private mount failed %m\n"); 702 return -1; 703 } 704 705 if (opts.extroot) { 706 if (mount(opts.extroot, jail_root, "bind", MS_BIND, NULL)) { 707 ERROR("extroot mount failed %m\n"); 708 return -1; 709 } 710 } else { 711 if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) { 712 ERROR("tmpfs mount failed %m\n"); 713 return -1; 714 } 715 } 716 717 if (opts.tmpoverlaysize) { 718 char mountoptsstr[] = "mode=0755,size=XXXXXXXX"; 719 720 snprintf(mountoptsstr, sizeof(mountoptsstr), 721 "mode=0755,size=%s", opts.tmpoverlaysize); 722 if (mkdtemp(tmpovdir) == NULL) { 723 ERROR("mkdtemp(%s) failed: %m\n", jail_root); 724 return -1; 725 } 726 if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME, 727 mountoptsstr)) { 728 ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize); 729 return -1; 730 } 731 overlaydir = tmpovdir; 732 } 733 734 if (opts.overlaydir) 735 overlaydir = opts.overlaydir; 736 737 if (overlaydir) { 738 ret = mount_overlay(jail_root, overlaydir); 739 if (ret) 740 return ret; 741 } 742 743 if (chdir(jail_root)) { 744 ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root); 745 return -1; 746 } 747 748 if (mount_all(jail_root)) { 749 ERROR("mount_all() failed\n"); 750 return -1; 751 } 752 753 if (opts.console) 754 create_dev_console(jail_root); 755 756 /* make sure /etc/resolv.conf exists if in new network namespace */ 757 if (opts.namespace & CLONE_NEWNET) { 758 char jailetc[PATH_MAX], jaillink[PATH_MAX]; 759 760 snprintf(jailetc, PATH_MAX, "%s/etc", jail_root); 761 if (mkdir_p(jailetc, 0755)) { 762 ERROR("mkdir(%s) failed: %m\n", jailetc); 763 return -1; 764 } 765 snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root); 766 if (overlaydir) 767 unlink(jaillink); 768 769 ret = symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink); 770 if (ret < 0) 771 WARNING("symlink() failed to create link to ../dev/resolv.conf.d/resolv.conf.auto"); 772 } 773 774 run_hooks(opts.hooks.createContainer, enter_jail_fs); 775 776 return 0; 777 } 778 779 static bool exit_from_child; 780 static void free_and_exit(int ret) 781 { 782 if (!exit_from_child && opts.ocibundle) 783 cgroups_free(); 784 785 if (!exit_from_child && parent_ctx) 786 ubus_free(parent_ctx); 787 788 free_opts(!exit_from_child); 789 790 exit(ret); 791 } 792 793 static void post_jail_fs(void); 794 static void enter_jail_fs(void) 795 { 796 char dirbuf[sizeof(jail_root) + 4]; 797 798 snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root); 799 if (mkdir(dirbuf, 0755)) { 800 ERROR("mkdir(%s) failed: %m\n", dirbuf); 801 free_and_exit(-1); 802 } 803 if (pivot_root(jail_root, dirbuf) == -1) { 804 ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf); 805 free_and_exit(-1); 806 } 807 if (chdir("/")) { 808 ERROR("chdir(/) (after pivot_root) failed: %m\n"); 809 free_and_exit(-1); 810 } 811 812 snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root); 813 umount2(dirbuf, MNT_DETACH); 814 rmdir(dirbuf); 815 if (opts.tmpoverlaysize) { 816 char tmpdirbuf[sizeof(tmpovdir) + 4]; 817 snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir); 818 umount2(tmpdirbuf, MNT_DETACH); 819 rmdir(tmpdirbuf); 820 } 821 822 umount2("/old", MNT_DETACH); 823 rmdir("/old"); 824 825 if (create_devices()) { 826 ERROR("create_devices() failed\n"); 827 free_and_exit(-1); 828 } 829 if (opts.ronly) 830 mount(NULL, "/", "bind", MS_REMOUNT | MS_BIND | MS_RDONLY, 0); 831 832 umask(old_umask); 833 post_jail_fs(); 834 } 835 836 static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr) 837 { 838 int map_file; 839 char map_path[64]; 840 841 if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", 842 child_pid, gidmap?"gid_map":"uid_map") < 0) 843 return -1; 844 845 if ((map_file = open(map_path, O_WRONLY)) < 0) 846 return -1; 847 848 if (dprintf(map_file, "%s", mapstr)) { 849 close(map_file); 850 return -1; 851 } 852 853 close(map_file); 854 return 0; 855 } 856 857 static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id) 858 { 859 int map_file; 860 char map_path[64]; 861 const char *map_format = "%d %d %d\n"; 862 if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", 863 child_pid, gidmap?"gid_map":"uid_map") < 0) 864 return -1; 865 866 if ((map_file = open(map_path, O_WRONLY)) < 0) 867 return -1; 868 869 if (dprintf(map_file, map_format, 0, id, 1) < 0) { 870 close(map_file); 871 return -1; 872 } 873 874 close(map_file); 875 return 0; 876 } 877 878 static int write_setgroups(pid_t child_pid, bool allow) 879 { 880 int setgroups_file; 881 char setgroups_path[64]; 882 883 if (snprintf(setgroups_path, sizeof(setgroups_path), "/proc/%d/setgroups", 884 child_pid) < 0) { 885 return -1; 886 } 887 888 if ((setgroups_file = open(setgroups_path, O_WRONLY)) < 0) { 889 return -1; 890 } 891 892 if (dprintf(setgroups_file, "%s", allow?"allow":"deny") == -1) { 893 close(setgroups_file); 894 return -1; 895 } 896 897 close(setgroups_file); 898 return 0; 899 } 900 901 static void get_jail_user(int *user, int *user_gid, int *gr_gid) 902 { 903 struct passwd *p = NULL; 904 struct group *g = NULL; 905 906 if (opts.user) { 907 p = getpwnam(opts.user); 908 if (!p) { 909 ERROR("failed to get uid/gid for user %s: %d (%s)\n", 910 opts.user, errno, strerror(errno)); 911 free_and_exit(EXIT_FAILURE); 912 } 913 *user = p->pw_uid; 914 *user_gid = p->pw_gid; 915 } else { 916 *user = -1; 917 *user_gid = -1; 918 } 919 920 if (opts.group) { 921 g = getgrnam(opts.group); 922 if (!g) { 923 ERROR("failed to get gid for group %s: %m\n", opts.group); 924 free_and_exit(EXIT_FAILURE); 925 } 926 *gr_gid = g->gr_gid; 927 } else { 928 *gr_gid = -1; 929 } 930 }; 931 932 static void set_jail_user(int pw_uid, int user_gid, int gr_gid) 933 { 934 if (opts.user && (user_gid != -1) && initgroups(opts.user, user_gid)) { 935 ERROR("failed to initgroups() for user %s: %m\n", opts.user); 936 free_and_exit(EXIT_FAILURE); 937 } 938 939 if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) { 940 ERROR("failed to set group id %d: %m\n", gr_gid); 941 free_and_exit(EXIT_FAILURE); 942 } 943 944 if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) { 945 ERROR("failed to set user id %d: %m\n", pw_uid); 946 free_and_exit(EXIT_FAILURE); 947 } 948 } 949 950 static int apply_rlimits(void) 951 { 952 int resource; 953 954 for (resource = 0; resource < RLIM_NLIMITS; ++resource) { 955 if (opts.rlimits[resource]) 956 DEBUG("applying limits to resource %u\n", resource); 957 958 if (opts.rlimits[resource] && 959 setrlimit(resource, opts.rlimits[resource])) 960 return errno; 961 } 962 963 return 0; 964 } 965 966 #define MAX_ENVP 64 967 static char** build_envp(const char *seccomp, char **ocienvp) 968 { 969 static char *envp[MAX_ENVP]; 970 static char preload_var[PATH_MAX]; 971 static char seccomp_var[PATH_MAX]; 972 static char seccomp_debug_var[20]; 973 static char debug_var[] = "LD_DEBUG=all"; 974 static char container_var[] = "container=ujail"; 975 const char *preload_lib = find_lib("libpreload-seccomp.so"); 976 char **addenv; 977 978 int count = 0; 979 980 if (seccomp && !preload_lib) { 981 ERROR("failed to add preload-lib to env\n"); 982 return NULL; 983 } 984 if (seccomp) { 985 snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp); 986 envp[count++] = seccomp_var; 987 snprintf(seccomp_debug_var, sizeof(seccomp_debug_var), "SECCOMP_DEBUG=%2d", debug); 988 envp[count++] = seccomp_debug_var; 989 snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib); 990 envp[count++] = preload_var; 991 } 992 993 envp[count++] = container_var; 994 995 if (debug > 1) 996 envp[count++] = debug_var; 997 998 addenv = ocienvp; 999 while (addenv && *addenv) { 1000 envp[count++] = *(addenv++); 1001 if (count >= MAX_ENVP) { 1002 ERROR("environment limited to %d extra records, truncating\n", MAX_ENVP); 1003 break; 1004 } 1005 } 1006 return envp; 1007 } 1008 1009 static void usage(void) 1010 { 1011 fprintf(stderr, "ujail <options> -- <binary> <params ...>\n"); 1012 fprintf(stderr, " -d <num>\tshow debug log (increase num to increase verbosity)\n"); 1013 fprintf(stderr, " -S <file>\tseccomp filter config\n"); 1014 fprintf(stderr, " -C <file>\tcapabilities drop config\n"); 1015 fprintf(stderr, " -c\t\tset PR_SET_NO_NEW_PRIVS\n"); 1016 fprintf(stderr, " -n <name>\tthe name of the jail\n"); 1017 fprintf(stderr, " -e <var>\timport environment variable\n"); 1018 fprintf(stderr, "namespace jail options:\n"); 1019 fprintf(stderr, " -h <hostname>\tchange the hostname of the jail\n"); 1020 fprintf(stderr, " -N\t\tjail has network namespace\n"); 1021 fprintf(stderr, " -f\t\tjail has user namespace\n"); 1022 fprintf(stderr, " -F\t\tjail has cgroups namespace\n"); 1023 fprintf(stderr, " -r <file>\treadonly files that should be staged\n"); 1024 fprintf(stderr, " -w <file>\twriteable files that should be staged\n"); 1025 fprintf(stderr, " -p\t\tjail has /proc\n"); 1026 fprintf(stderr, " -s\t\tjail has /sys\n"); 1027 fprintf(stderr, " -l\t\tjail has /dev/log\n"); 1028 fprintf(stderr, " -u\t\tjail has a ubus socket\n"); 1029 fprintf(stderr, " -D\t\tjail has a udebug socket\n"); 1030 fprintf(stderr, " -U <name>\tuser to run jailed process\n"); 1031 fprintf(stderr, " -G <name>\tgroup to run jailed process\n"); 1032 fprintf(stderr, " -o\t\tremont jail root (/) read only\n"); 1033 fprintf(stderr, " -R <dir>\texternal jail rootfs (system container)\n"); 1034 fprintf(stderr, " -O <dir>\tdirectory for r/w overlayfs\n"); 1035 fprintf(stderr, " -T <size>\tuse tmpfs r/w overlayfs with <size>\n"); 1036 fprintf(stderr, " -E\t\tfail if jail cannot be setup\n"); 1037 fprintf(stderr, " -y\t\tprovide jail console\n"); 1038 fprintf(stderr, " -J <dir>\tcreate container from OCI bundle\n"); 1039 fprintf(stderr, " -i\t\tstart container immediately\n"); 1040 fprintf(stderr, " -P <pidfile>\tcreate <pidfile>\n"); 1041 fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\ 1042 and he has the same powers as root outside the jail,\n\ 1043 thus he can escape the jail and/or break stuff.\n\ 1044 Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n\ 1045 If you use none of the namespace jail options,\n\ 1046 ujail will not use namespace/build a jail,\n\ 1047 and will only drop capabilities/apply seccomp filter.\n\n"); 1048 } 1049 1050 static int* get_namespace_fd(const unsigned int nstype) 1051 { 1052 switch (nstype) { 1053 case CLONE_NEWPID: 1054 return &opts.setns.pid; 1055 case CLONE_NEWNET: 1056 return &opts.setns.net; 1057 case CLONE_NEWNS: 1058 return &opts.setns.ns; 1059 case CLONE_NEWIPC: 1060 return &opts.setns.ipc; 1061 case CLONE_NEWUTS: 1062 return &opts.setns.uts; 1063 case CLONE_NEWUSER: 1064 return &opts.setns.user; 1065 case CLONE_NEWCGROUP: 1066 return &opts.setns.cgroup; 1067 #ifdef CLONE_NEWTIME 1068 case CLONE_NEWTIME: 1069 return &opts.setns.time; 1070 #endif 1071 default: 1072 return NULL; 1073 } 1074 } 1075 1076 static int setns_open(unsigned long nstype) 1077 { 1078 int *fd = get_namespace_fd(nstype); 1079 1080 assert(fd != NULL); 1081 1082 if (*fd < 0) 1083 return 0; 1084 1085 if (setns(*fd, nstype) == -1) { 1086 close(*fd); 1087 return errno; 1088 } 1089 1090 close(*fd); 1091 return 0; 1092 } 1093 1094 static int jail_running = 0; 1095 static int jail_return_code = 0; 1096 1097 static void jail_process_timeout_cb(struct uloop_timeout *t); 1098 static struct uloop_timeout jail_process_timeout = { 1099 .cb = jail_process_timeout_cb, 1100 }; 1101 static void poststop(void); 1102 static void jail_process_handler(struct uloop_process *c, int ret) 1103 { 1104 uloop_timeout_cancel(&jail_process_timeout); 1105 if (WIFEXITED(ret)) { 1106 jail_return_code = WEXITSTATUS(ret); 1107 INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code); 1108 } else { 1109 jail_return_code = WTERMSIG(ret); 1110 INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code); 1111 } 1112 jail_running = 0; 1113 poststop(); 1114 } 1115 1116 static struct uloop_process jail_process = { 1117 .cb = jail_process_handler, 1118 }; 1119 1120 static void jail_process_timeout_cb(struct uloop_timeout *t) 1121 { 1122 DEBUG("jail process failed to stop, sending SIGKILL\n"); 1123 kill(jail_process.pid, SIGKILL); 1124 } 1125 1126 static void jail_handle_signal(int signo) 1127 { 1128 if (hook_running) { 1129 DEBUG("forwarding signal %d to the hook process\n", signo); 1130 kill(hook_process.pid, signo); 1131 /* set timeout to send SIGKILL hook process in case SIGTERM doesn't succeed */ 1132 if (signo == SIGTERM) 1133 uloop_timeout_set(&hook_process_timeout, opts.term_timeout * 1000); 1134 } 1135 1136 if (jail_running) { 1137 DEBUG("forwarding signal %d to the jailed process\n", signo); 1138 kill(jail_process.pid, signo); 1139 /* set timeout to send SIGKILL jail process in case SIGTERM doesn't succeed */ 1140 if (signo == SIGTERM) 1141 uloop_timeout_set(&jail_process_timeout, opts.term_timeout * 1000); 1142 } 1143 } 1144 1145 static void signals_init(void) 1146 { 1147 int i; 1148 sigset_t sigmask; 1149 1150 sigfillset(&sigmask); 1151 for (i = 0; i < _NSIG; i++) { 1152 struct sigaction s = { 0 }; 1153 1154 if (!sigismember(&sigmask, i)) 1155 continue; 1156 if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV) || (i == SIGSTOP) || (i == SIGKILL)) 1157 continue; 1158 1159 s.sa_handler = jail_handle_signal; 1160 sigaction(i, &s, NULL); 1161 } 1162 } 1163 1164 static void pre_exec_jail(struct uloop_timeout *t); 1165 static struct uloop_timeout pre_exec_timeout = { 1166 .cb = pre_exec_jail, 1167 }; 1168 1169 int pipes[4]; 1170 static int exec_jail(void *arg) 1171 { 1172 char buf[1]; 1173 1174 exit_from_child = true; 1175 prctl(PR_SET_SECUREBITS, 0); 1176 1177 uloop_init(); 1178 signals_init(); 1179 1180 close(pipes[0]); 1181 close(pipes[3]); 1182 1183 setns_open(CLONE_NEWUSER); 1184 setns_open(CLONE_NEWNET); 1185 setns_open(CLONE_NEWNS); 1186 setns_open(CLONE_NEWIPC); 1187 setns_open(CLONE_NEWUTS); 1188 1189 buf[0] = 'i'; 1190 if (write(pipes[1], buf, 1) < 1) { 1191 ERROR("can't write to parent\n"); 1192 return EXIT_FAILURE; 1193 } 1194 close(pipes[1]); 1195 if (read(pipes[2], buf, 1) < 1) { 1196 ERROR("can't read from parent\n"); 1197 return EXIT_FAILURE; 1198 } 1199 if (buf[0] != 'O') { 1200 ERROR("parent had an error, child exiting\n"); 1201 return EXIT_FAILURE; 1202 } 1203 1204 if (opts.namespace & CLONE_NEWCGROUP) 1205 unshare(CLONE_NEWCGROUP); 1206 1207 setns_open(CLONE_NEWCGROUP); 1208 1209 if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) { 1210 if (setregid(0, 0) < 0) { 1211 ERROR("setgid\n"); 1212 free_and_exit(EXIT_FAILURE); 1213 } 1214 if (setreuid(0, 0) < 0) { 1215 ERROR("setuid\n"); 1216 free_and_exit(EXIT_FAILURE); 1217 } 1218 if (setgroups(0, NULL) < 0) { 1219 ERROR("setgroups\n"); 1220 free_and_exit(EXIT_FAILURE); 1221 } 1222 } 1223 1224 if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0 1225 && sethostname(opts.hostname, strlen(opts.hostname))) { 1226 ERROR("sethostname(%s) failed: %m\n", opts.hostname); 1227 free_and_exit(EXIT_FAILURE); 1228 } 1229 1230 uloop_timeout_add(&pre_exec_timeout); 1231 uloop_run(); 1232 1233 free_and_exit(-1); 1234 return -1; 1235 } 1236 1237 static void pre_exec_jail(struct uloop_timeout *t) 1238 { 1239 if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) { 1240 ERROR("failed to build jail fs\n"); 1241 free_and_exit(EXIT_FAILURE); 1242 } else { 1243 run_hooks(opts.hooks.createContainer, post_jail_fs); 1244 } 1245 } 1246 1247 static void post_start_hook(void); 1248 static void post_jail_fs(void) 1249 { 1250 char buf[1]; 1251 1252 if (read(pipes[2], buf, 1) < 1) { 1253 ERROR("can't read from parent\n"); 1254 free_and_exit(EXIT_FAILURE); 1255 } 1256 if (buf[0] != '!') { 1257 ERROR("parent had an error, child exiting\n"); 1258 free_and_exit(EXIT_FAILURE); 1259 } 1260 close(pipes[2]); 1261 1262 run_hooks(opts.hooks.startContainer, post_start_hook); 1263 } 1264 1265 static void post_start_hook(void) 1266 { 1267 int pw_uid, pw_gid, gr_gid; 1268 1269 /* 1270 * make sure setuid/setgid won't drop capabilities in case capabilities 1271 * have been specified explicitely. 1272 */ 1273 if (opts.capset.apply) { 1274 if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) { 1275 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 1276 free_and_exit(EXIT_FAILURE); 1277 } 1278 } 1279 1280 /* drop capabilities, retain those still needed to further setup jail */ 1281 if (applyOCIcapabilities(opts.capset, (1LLU << CAP_SETGID) | (1LLU << CAP_SETUID) | (1LLU << CAP_SETPCAP))) 1282 free_and_exit(EXIT_FAILURE); 1283 1284 /* use either cmdline-supplied user/group or uid/gid from OCI spec */ 1285 get_jail_user(&pw_uid, &pw_gid, &gr_gid); 1286 set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid); 1287 1288 if (opts.additional_gids && 1289 (setgroups(opts.num_additional_gids, opts.additional_gids) < 0)) { 1290 ERROR("setgroups failed: %m\n"); 1291 free_and_exit(EXIT_FAILURE); 1292 } 1293 1294 if (opts.set_umask) 1295 umask(opts.umask); 1296 1297 /* restore securebits back to normal (and lock them if not in userns) */ 1298 if (opts.capset.apply) { 1299 if (prctl(PR_SET_SECUREBITS, (opts.namespace & CLONE_NEWUSER)?0: 1300 SECBIT_KEEP_CAPS_LOCKED|SECBIT_NO_SETUID_FIXUP_LOCKED|SECBIT_NOROOT_LOCKED)) { 1301 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 1302 free_and_exit(EXIT_FAILURE); 1303 } 1304 } 1305 1306 /* drop remaining capabilities to end up with specified sets */ 1307 if (applyOCIcapabilities(opts.capset, 0)) 1308 free_and_exit(EXIT_FAILURE); 1309 1310 if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { 1311 ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n"); 1312 free_and_exit(EXIT_FAILURE); 1313 } 1314 1315 char **envp = build_envp(opts.seccomp, opts.envp); 1316 if (!envp) 1317 free_and_exit(EXIT_FAILURE); 1318 1319 if (opts.cwd && chdir(opts.cwd)) 1320 free_and_exit(EXIT_FAILURE); 1321 1322 if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp)) 1323 free_and_exit(EXIT_FAILURE); 1324 1325 uloop_end(); 1326 free_opts(false); 1327 INFO("exec-ing %s\n", *opts.jail_argv); 1328 if (opts.envp) /* respect PATH if potentially set in ENV */ 1329 execvpe(*opts.jail_argv, opts.jail_argv, envp); 1330 else 1331 execve(*opts.jail_argv, opts.jail_argv, envp); 1332 1333 /* we get there only if execve fails */ 1334 ERROR("failed to execve %s: %m\n", *opts.jail_argv); 1335 exit(EXIT_FAILURE); 1336 } 1337 1338 int ns_open_pid(const char *nstype, const pid_t target_ns) 1339 { 1340 char pid_pid_path[PATH_MAX]; 1341 1342 snprintf(pid_pid_path, sizeof(pid_pid_path), "/proc/%u/ns/%s", target_ns, nstype); 1343 1344 return open(pid_pid_path, O_RDONLY); 1345 } 1346 1347 static int parseOCIenvarray(struct blob_attr *msg, char ***envp) 1348 { 1349 struct blob_attr *cur; 1350 int sz = 0, rem; 1351 1352 blobmsg_for_each_attr(cur, msg, rem) 1353 ++sz; 1354 1355 if (sz > 0) { 1356 *envp = calloc(1 + sz, sizeof(char*)); 1357 if (!(*envp)) 1358 return ENOMEM; 1359 } else { 1360 *envp = NULL; 1361 return 0; 1362 } 1363 1364 sz = 0; 1365 blobmsg_for_each_attr(cur, msg, rem) 1366 (*envp)[sz++] = strdup(blobmsg_get_string(cur)); 1367 1368 if (sz) 1369 (*envp)[sz] = NULL; 1370 1371 return 0; 1372 } 1373 1374 enum { 1375 OCI_ROOT_PATH, 1376 OCI_ROOT_READONLY, 1377 __OCI_ROOT_MAX, 1378 }; 1379 1380 static const struct blobmsg_policy oci_root_policy[] = { 1381 [OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING }, 1382 [OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL }, 1383 }; 1384 1385 static int parseOCIroot(const char *jsonfile, struct blob_attr *msg) 1386 { 1387 char extroot[PATH_MAX] = { 0 }; 1388 struct blob_attr *tb[__OCI_ROOT_MAX]; 1389 char *cur; 1390 char *root_path; 1391 1392 blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1393 1394 if (!tb[OCI_ROOT_PATH]) 1395 return ENODATA; 1396 1397 root_path = blobmsg_get_string(tb[OCI_ROOT_PATH]); 1398 1399 /* prepend bundle directory in case of relative paths */ 1400 if (root_path[0] != '/') { 1401 strncpy(extroot, jsonfile, PATH_MAX - 1); 1402 1403 cur = strrchr(extroot, '/'); 1404 1405 if (!cur) 1406 return ENOTDIR; 1407 1408 *(++cur) = '\0'; 1409 } 1410 1411 strncat(extroot, root_path, PATH_MAX - (strlen(extroot) + 1)); 1412 1413 /* follow symbolic link(s) */ 1414 opts.extroot = realpath(extroot, NULL); 1415 if (!opts.extroot) 1416 return errno; 1417 1418 if (tb[OCI_ROOT_READONLY]) 1419 opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]); 1420 1421 return 0; 1422 } 1423 1424 1425 enum { 1426 OCI_HOOK_PATH, 1427 OCI_HOOK_ARGS, 1428 OCI_HOOK_ENV, 1429 OCI_HOOK_TIMEOUT, 1430 __OCI_HOOK_MAX, 1431 }; 1432 1433 static const struct blobmsg_policy oci_hook_policy[] = { 1434 [OCI_HOOK_PATH] = { "path", BLOBMSG_TYPE_STRING }, 1435 [OCI_HOOK_ARGS] = { "args", BLOBMSG_TYPE_ARRAY }, 1436 [OCI_HOOK_ENV] = { "env", BLOBMSG_TYPE_ARRAY }, 1437 [OCI_HOOK_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 }, 1438 }; 1439 1440 1441 static int parseOCIhook(struct hook_execvpe ***hooklist, struct blob_attr *msg) 1442 { 1443 struct blob_attr *tb[__OCI_HOOK_MAX]; 1444 struct blob_attr *cur; 1445 int rem, ret = 0; 1446 int idx = 0; 1447 1448 blobmsg_for_each_attr(cur, msg, rem) 1449 ++idx; 1450 1451 if (!idx) 1452 return 0; 1453 1454 *hooklist = calloc(idx + 1, sizeof(struct hook_execvpe *)); 1455 idx = 0; 1456 1457 if (!(*hooklist)) 1458 return ENOMEM; 1459 1460 blobmsg_for_each_attr(cur, msg, rem) { 1461 blobmsg_parse(oci_hook_policy, __OCI_HOOK_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 1462 1463 if (!tb[OCI_HOOK_PATH]) { 1464 ret = EINVAL; 1465 goto errout; 1466 } 1467 1468 (*hooklist)[idx] = calloc(1, sizeof(struct hook_execvpe)); 1469 if (tb[OCI_HOOK_ARGS]) { 1470 ret = parseOCIenvarray(tb[OCI_HOOK_ARGS], &((*hooklist)[idx]->argv)); 1471 if (ret) 1472 goto errout; 1473 } else { 1474 (*hooklist)[idx]->argv = calloc(2, sizeof(char *)); 1475 ((*hooklist)[idx]->argv)[0] = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH])); 1476 ((*hooklist)[idx]->argv)[1] = NULL; 1477 }; 1478 1479 1480 if (tb[OCI_HOOK_ENV]) { 1481 ret = parseOCIenvarray(tb[OCI_HOOK_ENV], &((*hooklist)[idx]->envp)); 1482 if (ret) 1483 goto errout; 1484 } 1485 1486 if (tb[OCI_HOOK_TIMEOUT]) 1487 (*hooklist)[idx]->timeout = blobmsg_get_u32(tb[OCI_HOOK_TIMEOUT]); 1488 1489 (*hooklist)[idx]->file = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH])); 1490 1491 ++idx; 1492 } 1493 1494 (*hooklist)[idx] = NULL; 1495 1496 DEBUG("added %d hooks\n", idx); 1497 1498 return 0; 1499 1500 errout: 1501 free_hooklist(*hooklist); 1502 *hooklist = NULL; 1503 1504 return ret; 1505 }; 1506 1507 1508 enum { 1509 OCI_HOOKS_PRESTART, 1510 OCI_HOOKS_CREATERUNTIME, 1511 OCI_HOOKS_CREATECONTAINER, 1512 OCI_HOOKS_STARTCONTAINER, 1513 OCI_HOOKS_POSTSTART, 1514 OCI_HOOKS_POSTSTOP, 1515 __OCI_HOOKS_MAX, 1516 }; 1517 1518 static const struct blobmsg_policy oci_hooks_policy[] = { 1519 [OCI_HOOKS_PRESTART] = { "prestart", BLOBMSG_TYPE_ARRAY }, 1520 [OCI_HOOKS_CREATERUNTIME] = { "createRuntime", BLOBMSG_TYPE_ARRAY }, 1521 [OCI_HOOKS_CREATECONTAINER] = { "createContainer", BLOBMSG_TYPE_ARRAY }, 1522 [OCI_HOOKS_STARTCONTAINER] = { "startContainer", BLOBMSG_TYPE_ARRAY }, 1523 [OCI_HOOKS_POSTSTART] = { "poststart", BLOBMSG_TYPE_ARRAY }, 1524 [OCI_HOOKS_POSTSTOP] = { "poststop", BLOBMSG_TYPE_ARRAY }, 1525 }; 1526 1527 static int parseOCIhooks(struct blob_attr *msg) 1528 { 1529 struct blob_attr *tb[__OCI_HOOKS_MAX]; 1530 int ret; 1531 1532 blobmsg_parse(oci_hooks_policy, __OCI_HOOKS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1533 1534 if (tb[OCI_HOOKS_PRESTART]) 1535 INFO("warning: ignoring deprecated prestart hook\n"); 1536 1537 if (tb[OCI_HOOKS_CREATERUNTIME]) { 1538 ret = parseOCIhook(&opts.hooks.createRuntime, tb[OCI_HOOKS_CREATERUNTIME]); 1539 if (ret) 1540 return ret; 1541 } 1542 1543 if (tb[OCI_HOOKS_CREATECONTAINER]) { 1544 ret = parseOCIhook(&opts.hooks.createContainer, tb[OCI_HOOKS_CREATECONTAINER]); 1545 if (ret) 1546 goto out_createruntime; 1547 } 1548 1549 if (tb[OCI_HOOKS_STARTCONTAINER]) { 1550 ret = parseOCIhook(&opts.hooks.startContainer, tb[OCI_HOOKS_STARTCONTAINER]); 1551 if (ret) 1552 goto out_createcontainer; 1553 } 1554 1555 if (tb[OCI_HOOKS_POSTSTART]) { 1556 ret = parseOCIhook(&opts.hooks.poststart, tb[OCI_HOOKS_POSTSTART]); 1557 if (ret) 1558 goto out_startcontainer; 1559 } 1560 1561 if (tb[OCI_HOOKS_POSTSTOP]) { 1562 ret = parseOCIhook(&opts.hooks.poststop, tb[OCI_HOOKS_POSTSTOP]); 1563 if (ret) 1564 goto out_poststart; 1565 } 1566 1567 return 0; 1568 1569 out_poststart: 1570 free_hooklist(opts.hooks.poststart); 1571 out_startcontainer: 1572 free_hooklist(opts.hooks.startContainer); 1573 out_createcontainer: 1574 free_hooklist(opts.hooks.createContainer); 1575 out_createruntime: 1576 free_hooklist(opts.hooks.createRuntime); 1577 1578 return ret; 1579 }; 1580 1581 1582 enum { 1583 OCI_PROCESS_USER_UID, 1584 OCI_PROCESS_USER_GID, 1585 OCI_PROCESS_USER_UMASK, 1586 OCI_PROCESS_USER_ADDITIONALGIDS, 1587 __OCI_PROCESS_USER_MAX, 1588 }; 1589 1590 static const struct blobmsg_policy oci_process_user_policy[] = { 1591 [OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 }, 1592 [OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 }, 1593 [OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 }, 1594 [OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY }, 1595 }; 1596 1597 static int parseOCIprocessuser(struct blob_attr *msg) { 1598 struct blob_attr *tb[__OCI_PROCESS_USER_MAX]; 1599 struct blob_attr *cur; 1600 int rem; 1601 int has_gid = 0; 1602 1603 blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1604 1605 if (tb[OCI_PROCESS_USER_UID]) 1606 opts.pw_uid = blobmsg_get_u32(tb[OCI_PROCESS_USER_UID]); 1607 1608 if (tb[OCI_PROCESS_USER_GID]) { 1609 opts.pw_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]); 1610 opts.gr_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]); 1611 has_gid = 1; 1612 } 1613 1614 if (tb[OCI_PROCESS_USER_ADDITIONALGIDS]) { 1615 size_t gidcnt = 0; 1616 1617 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) { 1618 ++gidcnt; 1619 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid)) 1620 continue; 1621 } 1622 1623 if (gidcnt) { 1624 opts.additional_gids = calloc(gidcnt + has_gid, sizeof(gid_t)); 1625 gidcnt = 0; 1626 1627 /* always add primary GID to set of GIDs if set */ 1628 if (has_gid) 1629 opts.additional_gids[gidcnt++] = opts.gr_gid; 1630 1631 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) { 1632 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid)) 1633 continue; 1634 opts.additional_gids[gidcnt++] = blobmsg_get_u32(cur); 1635 } 1636 opts.num_additional_gids = gidcnt; 1637 } 1638 DEBUG("read %zu additional groups\n", gidcnt); 1639 } 1640 1641 if (tb[OCI_PROCESS_USER_UMASK]) { 1642 opts.umask = blobmsg_get_u32(tb[OCI_PROCESS_USER_UMASK]); 1643 opts.set_umask = true; 1644 } 1645 1646 return 0; 1647 } 1648 1649 enum { 1650 OCI_PROCESS_RLIMIT_TYPE, 1651 OCI_PROCESS_RLIMIT_SOFT, 1652 OCI_PROCESS_RLIMIT_HARD, 1653 __OCI_PROCESS_RLIMIT_MAX, 1654 }; 1655 1656 static const struct blobmsg_policy oci_process_rlimit_policy[] = { 1657 [OCI_PROCESS_RLIMIT_TYPE] = { "type", BLOBMSG_TYPE_STRING }, 1658 [OCI_PROCESS_RLIMIT_SOFT] = { "soft", BLOBMSG_CAST_INT64 }, 1659 [OCI_PROCESS_RLIMIT_HARD] = { "hard", BLOBMSG_CAST_INT64 }, 1660 }; 1661 1662 /* from manpage GETRLIMIT(2) */ 1663 static const char* const rlimit_names[RLIM_NLIMITS] = { 1664 [RLIMIT_AS] = "AS", 1665 [RLIMIT_CORE] = "CORE", 1666 [RLIMIT_CPU] = "CPU", 1667 [RLIMIT_DATA] = "DATA", 1668 [RLIMIT_FSIZE] = "FSIZE", 1669 [RLIMIT_LOCKS] = "LOCKS", 1670 [RLIMIT_MEMLOCK] = "MEMLOCK", 1671 [RLIMIT_MSGQUEUE] = "MSGQUEUE", 1672 [RLIMIT_NICE] = "NICE", 1673 [RLIMIT_NOFILE] = "NOFILE", 1674 [RLIMIT_NPROC] = "NPROC", 1675 [RLIMIT_RSS] = "RSS", 1676 [RLIMIT_RTPRIO] = "RTPRIO", 1677 [RLIMIT_RTTIME] = "RTTIME", 1678 [RLIMIT_SIGPENDING] = "SIGPENDING", 1679 [RLIMIT_STACK] = "STACK", 1680 }; 1681 1682 static int resolve_rlimit(char *type) { 1683 unsigned int rltype; 1684 1685 for (rltype = 0; rltype < RLIM_NLIMITS; ++rltype) 1686 if (rlimit_names[rltype] && 1687 !strncmp("RLIMIT_", type, 7) && 1688 !strcmp(rlimit_names[rltype], type + 7)) 1689 return rltype; 1690 1691 return -1; 1692 } 1693 1694 1695 static int parseOCIrlimit(struct blob_attr *msg) 1696 { 1697 struct blob_attr *tb[__OCI_PROCESS_RLIMIT_MAX]; 1698 int limtype = -1; 1699 struct rlimit *curlim; 1700 1701 blobmsg_parse(oci_process_rlimit_policy, __OCI_PROCESS_RLIMIT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1702 1703 if (!tb[OCI_PROCESS_RLIMIT_TYPE] || 1704 !tb[OCI_PROCESS_RLIMIT_SOFT] || 1705 !tb[OCI_PROCESS_RLIMIT_HARD]) 1706 return ENODATA; 1707 1708 limtype = resolve_rlimit(blobmsg_get_string(tb[OCI_PROCESS_RLIMIT_TYPE])); 1709 1710 if (limtype < 0) 1711 return EINVAL; 1712 1713 if (opts.rlimits[limtype]) 1714 return ENOTUNIQ; 1715 1716 curlim = malloc(sizeof(struct rlimit)); 1717 curlim->rlim_cur = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_SOFT]); 1718 curlim->rlim_max = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_HARD]); 1719 1720 opts.rlimits[limtype] = curlim; 1721 1722 return 0; 1723 }; 1724 1725 enum { 1726 OCI_PROCESS_ARGS, 1727 OCI_PROCESS_CAPABILITIES, 1728 OCI_PROCESS_CWD, 1729 OCI_PROCESS_ENV, 1730 OCI_PROCESS_OOMSCOREADJ, 1731 OCI_PROCESS_NONEWPRIVILEGES, 1732 OCI_PROCESS_RLIMITS, 1733 OCI_PROCESS_TERMINAL, 1734 OCI_PROCESS_USER, 1735 __OCI_PROCESS_MAX, 1736 }; 1737 1738 static const struct blobmsg_policy oci_process_policy[] = { 1739 [OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY }, 1740 [OCI_PROCESS_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_TABLE }, 1741 [OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING }, 1742 [OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY }, 1743 [OCI_PROCESS_OOMSCOREADJ] = { "oomScoreAdj", BLOBMSG_TYPE_INT32 }, 1744 [OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL }, 1745 [OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY }, 1746 [OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL }, 1747 [OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE }, 1748 }; 1749 1750 1751 static int parseOCIprocess(struct blob_attr *msg) 1752 { 1753 struct blob_attr *tb[__OCI_PROCESS_MAX], *cur; 1754 int rem, res; 1755 1756 blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1757 1758 if (!tb[OCI_PROCESS_ARGS]) 1759 return ENOENT; 1760 1761 res = parseOCIenvarray(tb[OCI_PROCESS_ARGS], &opts.jail_argv); 1762 if (res) 1763 return res; 1764 1765 if (tb[OCI_PROCESS_TERMINAL]) 1766 opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]); 1767 1768 if (tb[OCI_PROCESS_NONEWPRIVILEGES]) 1769 opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]); 1770 1771 if (tb[OCI_PROCESS_CWD]) 1772 opts.cwd = strdup(blobmsg_get_string(tb[OCI_PROCESS_CWD])); 1773 1774 if (tb[OCI_PROCESS_ENV]) { 1775 res = parseOCIenvarray(tb[OCI_PROCESS_ENV], &opts.envp); 1776 if (res) 1777 return res; 1778 } 1779 1780 if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER]))) 1781 return res; 1782 1783 if (tb[OCI_PROCESS_CAPABILITIES] && 1784 (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES]))) 1785 return res; 1786 1787 if (tb[OCI_PROCESS_RLIMITS]) { 1788 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_RLIMITS], rem) { 1789 res = parseOCIrlimit(cur); 1790 if (res) 1791 return res; 1792 } 1793 } 1794 1795 if (tb[OCI_PROCESS_OOMSCOREADJ]) { 1796 opts.oom_score_adj = blobmsg_get_u32(tb[OCI_PROCESS_OOMSCOREADJ]); 1797 opts.set_oom_score_adj = true; 1798 } 1799 1800 return 0; 1801 } 1802 1803 enum { 1804 OCI_LINUX_NAMESPACE_TYPE, 1805 OCI_LINUX_NAMESPACE_PATH, 1806 __OCI_LINUX_NAMESPACE_MAX, 1807 }; 1808 1809 static const struct blobmsg_policy oci_linux_namespace_policy[] = { 1810 [OCI_LINUX_NAMESPACE_TYPE] = { "type", BLOBMSG_TYPE_STRING }, 1811 [OCI_LINUX_NAMESPACE_PATH] = { "path", BLOBMSG_TYPE_STRING }, 1812 }; 1813 1814 static int resolve_nstype(char *type) { 1815 if (!strcmp("pid", type)) 1816 return CLONE_NEWPID; 1817 else if (!strcmp("network", type)) 1818 return CLONE_NEWNET; 1819 else if (!strcmp("net", type)) 1820 return CLONE_NEWNET; 1821 else if (!strcmp("mount", type)) 1822 return CLONE_NEWNS; 1823 else if (!strcmp("ipc", type)) 1824 return CLONE_NEWIPC; 1825 else if (!strcmp("uts", type)) 1826 return CLONE_NEWUTS; 1827 else if (!strcmp("user", type)) 1828 return CLONE_NEWUSER; 1829 else if (!strcmp("cgroup", type)) 1830 return CLONE_NEWCGROUP; 1831 #ifdef CLONE_NEWTIME 1832 else if (!strcmp("time", type)) 1833 return CLONE_NEWTIME; 1834 #endif 1835 else 1836 return 0; 1837 } 1838 1839 static int parseOCIlinuxns(struct blob_attr *msg) 1840 { 1841 struct blob_attr *tb[__OCI_LINUX_NAMESPACE_MAX]; 1842 int nstype; 1843 int *setns; 1844 int fd; 1845 1846 blobmsg_parse(oci_linux_namespace_policy, __OCI_LINUX_NAMESPACE_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1847 1848 if (!tb[OCI_LINUX_NAMESPACE_TYPE]) 1849 return EINVAL; 1850 1851 nstype = resolve_nstype(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE])); 1852 if (!nstype) 1853 return EINVAL; 1854 1855 if (opts.namespace & nstype) 1856 return ENOTUNIQ; 1857 1858 setns = get_namespace_fd(nstype); 1859 1860 if (!setns) 1861 return EFAULT; 1862 1863 if (*setns != -1) 1864 return ENOTUNIQ; 1865 1866 if (tb[OCI_LINUX_NAMESPACE_PATH]) { 1867 DEBUG("opening existing %s namespace from path %s\n", 1868 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]), 1869 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH])); 1870 1871 fd = open(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]), O_RDONLY); 1872 if (fd < 0) 1873 return errno?:ESTALE; 1874 1875 if (ioctl(fd, NS_GET_NSTYPE) != nstype) { 1876 close(fd); 1877 return EINVAL; 1878 } 1879 1880 DEBUG("opened existing %s namespace got filehandler %u\n", 1881 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]), 1882 fd); 1883 1884 *setns = fd; 1885 } else { 1886 opts.namespace |= nstype; 1887 } 1888 1889 return 0; 1890 } 1891 1892 /* 1893 * join namespace of existing PID 1894 * The string argument is the reference PID followed by ':' and a 1895 * ',' separated list of namespaces to to join. 1896 */ 1897 static int jail_join_ns(char *arg) 1898 { 1899 pid_t pid; 1900 int fd; 1901 int nstype; 1902 char *tmp, *etmp, *nspath; 1903 int *setns; 1904 1905 tmp = strchr(arg, ':'); 1906 if (!tmp) 1907 return EINVAL; 1908 1909 *tmp = '\0'; 1910 pid = atoi(arg); 1911 1912 do { 1913 ++tmp; 1914 etmp = strchr(tmp, ','); 1915 if (etmp) 1916 *etmp = '\0'; 1917 1918 nstype = resolve_nstype(tmp); 1919 if (!nstype) 1920 return EINVAL; 1921 1922 if (opts.namespace & nstype) 1923 return ENOTUNIQ; 1924 1925 setns = get_namespace_fd(nstype); 1926 1927 if (!setns) 1928 return EFAULT; 1929 1930 if (*setns != -1) 1931 return ENOTUNIQ; 1932 1933 if (asprintf(&nspath, "/proc/%d/ns/%s", pid, tmp) < 0) 1934 return ENOMEM; 1935 1936 fd = open(nspath, O_RDONLY); 1937 free(nspath); 1938 1939 if (fd < 0) 1940 return errno?:ESTALE; 1941 1942 *setns = fd; 1943 1944 if (etmp) 1945 tmp = etmp; 1946 else 1947 tmp = NULL; 1948 } while (tmp); 1949 1950 return 0; 1951 } 1952 1953 static void get_jail_root_user(bool is_gidmap, uint32_t container_id, uint32_t host_id, uint32_t size) 1954 { 1955 if (container_id == 0 && size >= 1) 1956 if (!is_gidmap) 1957 opts.root_map_uid = host_id; 1958 } 1959 1960 enum { 1961 OCI_LINUX_UIDGIDMAP_CONTAINERID, 1962 OCI_LINUX_UIDGIDMAP_HOSTID, 1963 OCI_LINUX_UIDGIDMAP_SIZE, 1964 __OCI_LINUX_UIDGIDMAP_MAX, 1965 }; 1966 1967 static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = { 1968 [OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", BLOBMSG_TYPE_INT32 }, 1969 [OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 }, 1970 [OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 }, 1971 }; 1972 1973 static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap) 1974 { 1975 struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX]; 1976 struct blob_attr *cur; 1977 int rem; 1978 char *map; 1979 size_t len, pos, totallen = 0; 1980 1981 blobmsg_for_each_attr(cur, msg, rem) { 1982 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 1983 1984 if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] || 1985 !tb[OCI_LINUX_UIDGIDMAP_HOSTID] || 1986 !tb[OCI_LINUX_UIDGIDMAP_SIZE]) 1987 return EINVAL; 1988 1989 /* count length */ 1990 totallen += snprintf(NULL, 0, "%d %d %d\n", 1991 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 1992 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 1993 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 1994 } 1995 1996 /* allocate combined mapping string */ 1997 map = malloc(totallen + 1); 1998 if (!map) 1999 return ENOMEM; 2000 2001 pos = 0; 2002 blobmsg_for_each_attr(cur, msg, rem) { 2003 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 2004 2005 get_jail_root_user(is_gidmap, blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 2006 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 2007 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 2008 2009 /* write mapping line into pre-allocated string */ 2010 len = snprintf(&map[pos], totallen + 1, "%d %d %d\n", 2011 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 2012 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 2013 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 2014 pos += len; 2015 totallen -= len; 2016 } 2017 2018 assert(totallen == 0); 2019 2020 if (is_gidmap) 2021 opts.gidmap = map; 2022 else 2023 opts.uidmap = map; 2024 2025 return 0; 2026 } 2027 2028 enum { 2029 OCI_DEVICES_TYPE, 2030 OCI_DEVICES_PATH, 2031 OCI_DEVICES_MAJOR, 2032 OCI_DEVICES_MINOR, 2033 OCI_DEVICES_FILEMODE, 2034 OCI_DEVICES_UID, 2035 OCI_DEVICES_GID, 2036 __OCI_DEVICES_MAX, 2037 }; 2038 2039 static const struct blobmsg_policy oci_devices_policy[] = { 2040 [OCI_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING }, 2041 [OCI_DEVICES_PATH] = { "path", BLOBMSG_TYPE_STRING }, 2042 [OCI_DEVICES_MAJOR] = { "major", BLOBMSG_TYPE_INT32 }, 2043 [OCI_DEVICES_MINOR] = { "minor", BLOBMSG_TYPE_INT32 }, 2044 [OCI_DEVICES_FILEMODE] = { "fileMode", BLOBMSG_TYPE_INT32 }, 2045 [OCI_DEVICES_UID] = { "uid", BLOBMSG_TYPE_INT32 }, 2046 [OCI_DEVICES_GID] = { "uid", BLOBMSG_TYPE_INT32 }, 2047 }; 2048 2049 static mode_t resolve_devtype(char *tstr) 2050 { 2051 if (!strcmp("c", tstr) || 2052 !strcmp("u", tstr)) 2053 return S_IFCHR; 2054 else if (!strcmp("b", tstr)) 2055 return S_IFBLK; 2056 else if (!strcmp("p", tstr)) 2057 return S_IFIFO; 2058 else 2059 return 0; 2060 } 2061 2062 static int parseOCIdevices(struct blob_attr *msg) 2063 { 2064 struct blob_attr *tb[__OCI_DEVICES_MAX]; 2065 struct blob_attr *cur; 2066 int rem; 2067 size_t cnt = 0; 2068 struct mknod_args *tmp; 2069 2070 blobmsg_for_each_attr(cur, msg, rem) 2071 ++cnt; 2072 2073 opts.devices = calloc(cnt + 1, sizeof(struct mknod_args *)); 2074 2075 cnt = 0; 2076 blobmsg_for_each_attr(cur, msg, rem) { 2077 blobmsg_parse(oci_devices_policy, __OCI_DEVICES_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 2078 if (!tb[OCI_DEVICES_TYPE] || 2079 !tb[OCI_DEVICES_PATH]) 2080 return ENODATA; 2081 2082 tmp = calloc(1, sizeof(struct mknod_args)); 2083 if (!tmp) 2084 return ENOMEM; 2085 2086 tmp->mode = resolve_devtype(blobmsg_get_string(tb[OCI_DEVICES_TYPE])); 2087 if (!tmp->mode) { 2088 free(tmp); 2089 return EINVAL; 2090 } 2091 2092 if (tmp->mode != S_IFIFO) { 2093 if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR]) { 2094 free(tmp); 2095 return ENODATA; 2096 } 2097 2098 tmp->dev = makedev(blobmsg_get_u32(tb[OCI_DEVICES_MAJOR]), 2099 blobmsg_get_u32(tb[OCI_DEVICES_MINOR])); 2100 } 2101 2102 if (tb[OCI_DEVICES_FILEMODE]) { 2103 if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE])) { 2104 free(tmp); 2105 return EINVAL; 2106 } 2107 2108 tmp->mode |= blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]); 2109 } else { 2110 tmp->mode |= (S_IRUSR|S_IWUSR); /* 0600 */ 2111 } 2112 2113 tmp->path = strdup(blobmsg_get_string(tb[OCI_DEVICES_PATH])); 2114 2115 if (tb[OCI_DEVICES_UID]) 2116 tmp->uid = blobmsg_get_u32(tb[OCI_DEVICES_UID]); 2117 else 2118 tmp->uid = -1; 2119 2120 if (tb[OCI_DEVICES_GID]) 2121 tmp->gid = blobmsg_get_u32(tb[OCI_DEVICES_GID]); 2122 else 2123 tmp->gid = -1; 2124 2125 DEBUG("read device %s (%s)\n", blobmsg_get_string(tb[OCI_DEVICES_PATH]), blobmsg_get_string(tb[OCI_DEVICES_TYPE])); 2126 opts.devices[cnt++] = tmp; 2127 } 2128 2129 opts.devices[cnt] = NULL; 2130 2131 return 0; 2132 } 2133 2134 static int parseOCIsysctl(struct blob_attr *msg) 2135 { 2136 struct blob_attr *cur; 2137 int rem; 2138 char *tmp, *tc; 2139 size_t cnt = 0; 2140 2141 blobmsg_for_each_attr(cur, msg, rem) { 2142 if (!blobmsg_name(cur) || !blobmsg_get_string(cur)) 2143 return EINVAL; 2144 2145 ++cnt; 2146 } 2147 2148 if (!cnt) 2149 return 0; 2150 2151 opts.sysctl = calloc(cnt + 1, sizeof(struct sysctl_val *)); 2152 if (!opts.sysctl) 2153 return ENOMEM; 2154 2155 cnt = 0; 2156 blobmsg_for_each_attr(cur, msg, rem) { 2157 opts.sysctl[cnt] = malloc(sizeof(struct sysctl_val)); 2158 if (!opts.sysctl[cnt]) 2159 return ENOMEM; 2160 2161 /* replace '.' with '/' in entry name */ 2162 tc = tmp = strdup(blobmsg_name(cur)); 2163 while ((tc = strchr(tc, '.'))) 2164 *tc = '/'; 2165 2166 opts.sysctl[cnt]->value = strdup(blobmsg_get_string(cur)); 2167 opts.sysctl[cnt]->entry = tmp; 2168 2169 ++cnt; 2170 } 2171 2172 opts.sysctl[cnt] = NULL; 2173 2174 return 0; 2175 } 2176 2177 2178 enum { 2179 OCI_LINUX_CGROUPSPATH, 2180 OCI_LINUX_RESOURCES, 2181 OCI_LINUX_SECCOMP, 2182 OCI_LINUX_SYSCTL, 2183 OCI_LINUX_NAMESPACES, 2184 OCI_LINUX_DEVICES, 2185 OCI_LINUX_UIDMAPPINGS, 2186 OCI_LINUX_GIDMAPPINGS, 2187 OCI_LINUX_MASKEDPATHS, 2188 OCI_LINUX_READONLYPATHS, 2189 OCI_LINUX_ROOTFSPROPAGATION, 2190 __OCI_LINUX_MAX, 2191 }; 2192 2193 static const struct blobmsg_policy oci_linux_policy[] = { 2194 [OCI_LINUX_CGROUPSPATH] = { "cgroupsPath", BLOBMSG_TYPE_STRING }, 2195 [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE }, 2196 [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE }, 2197 [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE }, 2198 [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY }, 2199 [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY }, 2200 [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY }, 2201 [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY }, 2202 [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY }, 2203 [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY }, 2204 [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING }, 2205 }; 2206 2207 static int parseOCIlinux(struct blob_attr *msg) 2208 { 2209 struct blob_attr *tb[__OCI_LINUX_MAX]; 2210 struct blob_attr *cur; 2211 int rem; 2212 int res = 0; 2213 char *cgpath; 2214 char cgfullpath[256] = "/sys/fs/cgroup"; 2215 2216 blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 2217 2218 if (tb[OCI_LINUX_NAMESPACES]) { 2219 blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) { 2220 res = parseOCIlinuxns(cur); 2221 if (res) 2222 return res; 2223 } 2224 } 2225 2226 if (tb[OCI_LINUX_UIDMAPPINGS]) { 2227 res = parseOCIuidgidmappings(tb[OCI_LINUX_UIDMAPPINGS], 0); 2228 if (res) 2229 return res; 2230 } 2231 2232 if (tb[OCI_LINUX_GIDMAPPINGS]) { 2233 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1); 2234 if (res) 2235 return res; 2236 } 2237 2238 if (tb[OCI_LINUX_READONLYPATHS]) { 2239 blobmsg_for_each_attr(cur, tb[OCI_LINUX_READONLYPATHS], rem) { 2240 res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, 0, NULL, 0); 2241 if (res) 2242 return res; 2243 } 2244 } 2245 2246 if (tb[OCI_LINUX_MASKEDPATHS]) { 2247 blobmsg_for_each_attr(cur, tb[OCI_LINUX_MASKEDPATHS], rem) { 2248 res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, 0, NULL, 0); 2249 if (res) 2250 return res; 2251 } 2252 } 2253 2254 if (tb[OCI_LINUX_SYSCTL]) { 2255 res = parseOCIsysctl(tb[OCI_LINUX_SYSCTL]); 2256 if (res) 2257 return res; 2258 } 2259 2260 if (tb[OCI_LINUX_SECCOMP]) { 2261 opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]); 2262 if (!opts.ociseccomp) 2263 return EINVAL; 2264 } 2265 2266 if (tb[OCI_LINUX_DEVICES]) { 2267 res = parseOCIdevices(tb[OCI_LINUX_DEVICES]); 2268 if (res) 2269 return res; 2270 } 2271 2272 if (tb[OCI_LINUX_CGROUPSPATH]) { 2273 cgpath = blobmsg_get_string(tb[OCI_LINUX_CGROUPSPATH]); 2274 if (cgpath[0] == '/') { 2275 if (strlen(cgpath) + 1 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2276 return E2BIG; 2277 2278 strcat(cgfullpath, cgpath); 2279 } else { 2280 strcat(cgfullpath, "/containers/"); 2281 if (strlen(opts.name) + strlen(cgpath) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2282 return E2BIG; 2283 2284 strcat(cgfullpath, opts.name); /* should be container name rather than jail name */ 2285 strcat(cgfullpath, "/"); 2286 strcat(cgfullpath, cgpath); 2287 } 2288 } else { 2289 strcat(cgfullpath, "/containers/"); 2290 if (2 * strlen(opts.name) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2291 return E2BIG; 2292 2293 strcat(cgfullpath, opts.name); /* should be container name rather than jail name */ 2294 strcat(cgfullpath, "/"); 2295 strcat(cgfullpath, opts.name); /* should be container instance name rather than jail name */ 2296 } 2297 2298 cgroups_init(cgfullpath); 2299 2300 if (tb[OCI_LINUX_RESOURCES]) { 2301 res = parseOCIlinuxcgroups(tb[OCI_LINUX_RESOURCES]); 2302 if (res) 2303 return res; 2304 } 2305 2306 return 0; 2307 } 2308 2309 enum { 2310 OCI_VERSION, 2311 OCI_HOSTNAME, 2312 OCI_PROCESS, 2313 OCI_ROOT, 2314 OCI_MOUNTS, 2315 OCI_HOOKS, 2316 OCI_LINUX, 2317 OCI_ANNOTATIONS, 2318 __OCI_MAX, 2319 }; 2320 2321 static const struct blobmsg_policy oci_policy[] = { 2322 [OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING }, 2323 [OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING }, 2324 [OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE }, 2325 [OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE }, 2326 [OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY }, 2327 [OCI_HOOKS] = { "hooks", BLOBMSG_TYPE_TABLE }, 2328 [OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE }, 2329 [OCI_ANNOTATIONS] = { "annotations", BLOBMSG_TYPE_TABLE }, 2330 }; 2331 2332 static int parseOCI(const char *jsonfile) 2333 { 2334 struct blob_attr *tb[__OCI_MAX]; 2335 struct blob_attr *cur; 2336 int rem; 2337 int res; 2338 2339 blob_buf_init(&ocibuf, 0); 2340 2341 if (!blobmsg_add_json_from_file(&ocibuf, jsonfile)) { 2342 res=ENOENT; 2343 goto errout; 2344 } 2345 2346 blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head)); 2347 2348 if (!tb[OCI_VERSION]) { 2349 res=ENOMSG; 2350 goto errout; 2351 } 2352 2353 if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3)) { 2354 ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION])); 2355 res=ENOTSUP; 2356 goto errout; 2357 } 2358 2359 if (tb[OCI_HOSTNAME]) 2360 opts.hostname = strdup(blobmsg_get_string(tb[OCI_HOSTNAME])); 2361 2362 if (!tb[OCI_PROCESS]) { 2363 res=ENODATA; 2364 goto errout; 2365 } 2366 2367 if ((res = parseOCIprocess(tb[OCI_PROCESS]))) 2368 goto errout; 2369 2370 if (!tb[OCI_ROOT]) { 2371 res=ENODATA; 2372 goto errout; 2373 } 2374 if ((res = parseOCIroot(jsonfile, tb[OCI_ROOT]))) 2375 goto errout; 2376 2377 if (!tb[OCI_MOUNTS]) { 2378 res=ENODATA; 2379 goto errout; 2380 } 2381 2382 blobmsg_for_each_attr(cur, tb[OCI_MOUNTS], rem) 2383 if ((res = parseOCImount(cur))) 2384 goto errout; 2385 2386 if (tb[OCI_LINUX] && (res = parseOCIlinux(tb[OCI_LINUX]))) 2387 goto errout; 2388 2389 if (tb[OCI_HOOKS] && (res = parseOCIhooks(tb[OCI_HOOKS]))) 2390 goto errout; 2391 2392 if (tb[OCI_ANNOTATIONS]) 2393 opts.annotations = blob_memdup(tb[OCI_ANNOTATIONS]); 2394 2395 errout: 2396 blob_buf_free(&ocibuf); 2397 2398 return res; 2399 } 2400 2401 static int set_oom_score_adj(void) 2402 { 2403 int f; 2404 char fname[32]; 2405 2406 if (!opts.set_oom_score_adj) 2407 return 0; 2408 2409 snprintf(fname, sizeof(fname), "/proc/%u/oom_score_adj", jail_process.pid); 2410 f = open(fname, O_WRONLY | O_TRUNC); 2411 if (f < 0) 2412 return errno; 2413 2414 dprintf(f, "%d", opts.oom_score_adj); 2415 close(f); 2416 2417 return 0; 2418 } 2419 2420 2421 enum { 2422 OCI_STATE_CREATING, 2423 OCI_STATE_CREATED, 2424 OCI_STATE_RUNNING, 2425 OCI_STATE_STOPPED, 2426 }; 2427 2428 static int jail_oci_state = OCI_STATE_CREATED; 2429 static void pipe_send_start_container(struct uloop_timeout *t); 2430 static struct uloop_timeout start_container_timeout = { 2431 .cb = pipe_send_start_container, 2432 }; 2433 2434 static int handle_start(struct ubus_context *ctx, struct ubus_object *obj, 2435 struct ubus_request_data *req, const char *method, 2436 struct blob_attr *msg) 2437 { 2438 if (jail_oci_state != OCI_STATE_CREATED) 2439 return UBUS_STATUS_INVALID_ARGUMENT; 2440 2441 uloop_timeout_add(&start_container_timeout); 2442 2443 return UBUS_STATUS_OK; 2444 } 2445 2446 static struct blob_buf bb; 2447 static int handle_state(struct ubus_context *ctx, struct ubus_object *obj, 2448 struct ubus_request_data *req, const char *method, 2449 struct blob_attr *msg) 2450 { 2451 char *statusstr; 2452 2453 switch (jail_oci_state) { 2454 case OCI_STATE_CREATING: 2455 statusstr = "creating"; 2456 break; 2457 case OCI_STATE_CREATED: 2458 statusstr = "created"; 2459 break; 2460 case OCI_STATE_RUNNING: 2461 statusstr = "running"; 2462 break; 2463 case OCI_STATE_STOPPED: 2464 statusstr = "stopped"; 2465 break; 2466 default: 2467 statusstr = "unknown"; 2468 } 2469 2470 blob_buf_init(&bb, 0); 2471 blobmsg_add_string(&bb, "ociVersion", OCI_VERSION_STRING); 2472 blobmsg_add_string(&bb, "id", opts.name); 2473 blobmsg_add_string(&bb, "status", statusstr); 2474 if (jail_oci_state == OCI_STATE_CREATED || 2475 jail_oci_state == OCI_STATE_RUNNING) 2476 blobmsg_add_u32(&bb, "pid", jail_process.pid); 2477 2478 blobmsg_add_string(&bb, "bundle", opts.ocibundle); 2479 2480 if (opts.annotations) 2481 blobmsg_add_blob(&bb, opts.annotations); 2482 2483 ubus_send_reply(ctx, req, bb.head); 2484 2485 return UBUS_STATUS_OK; 2486 } 2487 2488 enum { 2489 CONTAINER_KILL_ATTR_SIGNAL, 2490 __CONTAINER_KILL_ATTR_MAX, 2491 }; 2492 2493 static const struct blobmsg_policy container_kill_attrs[__CONTAINER_KILL_ATTR_MAX] = { 2494 [CONTAINER_KILL_ATTR_SIGNAL] = { "signal", BLOBMSG_TYPE_INT32 }, 2495 }; 2496 2497 static int 2498 container_handle_kill(struct ubus_context *ctx, struct ubus_object *obj, 2499 struct ubus_request_data *req, const char *method, 2500 struct blob_attr *msg) 2501 { 2502 struct blob_attr *tb[__CONTAINER_KILL_ATTR_MAX], *cur; 2503 int sig = SIGTERM; 2504 2505 blobmsg_parse(container_kill_attrs, __CONTAINER_KILL_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg)); 2506 2507 cur = tb[CONTAINER_KILL_ATTR_SIGNAL]; 2508 if (cur) 2509 sig = blobmsg_get_u32(cur); 2510 2511 if (jail_oci_state == OCI_STATE_CREATING) 2512 return UBUS_STATUS_NOT_FOUND; 2513 2514 if (kill(jail_process.pid, sig) == 0) 2515 return 0; 2516 2517 switch (errno) { 2518 case EINVAL: return UBUS_STATUS_INVALID_ARGUMENT; 2519 case EPERM: return UBUS_STATUS_PERMISSION_DENIED; 2520 case ESRCH: return UBUS_STATUS_NOT_FOUND; 2521 } 2522 2523 return UBUS_STATUS_UNKNOWN_ERROR; 2524 } 2525 2526 static int 2527 jail_writepid(pid_t pid) 2528 { 2529 FILE *_pidfile; 2530 2531 if (!opts.pidfile) 2532 return 0; 2533 2534 _pidfile = fopen(opts.pidfile, "w"); 2535 if (_pidfile == NULL) 2536 return errno; 2537 2538 if (fprintf(_pidfile, "%d\n", pid) < 0) { 2539 fclose(_pidfile); 2540 return errno; 2541 } 2542 2543 if (fclose(_pidfile)) 2544 return errno; 2545 2546 return 0; 2547 } 2548 2549 static int checkpath(const char *path) 2550 { 2551 int dirfd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC); 2552 if (dirfd < 0) { 2553 ERROR("path %s open failed %m\n", path); 2554 return -1; 2555 } 2556 close(dirfd); 2557 2558 return 0; 2559 } 2560 2561 static struct ubus_method container_methods[] = { 2562 UBUS_METHOD_NOARG("start", handle_start), 2563 UBUS_METHOD_NOARG("state", handle_state), 2564 UBUS_METHOD("kill", container_handle_kill, container_kill_attrs), 2565 }; 2566 2567 static struct ubus_object_type container_object_type = 2568 UBUS_OBJECT_TYPE("container", container_methods); 2569 2570 static struct ubus_object container_object = { 2571 .type = &container_object_type, 2572 .methods = container_methods, 2573 .n_methods = ARRAY_SIZE(container_methods), 2574 }; 2575 2576 static void post_main(struct uloop_timeout *t); 2577 static struct uloop_timeout post_main_timeout = { 2578 .cb = post_main, 2579 }; 2580 static int netns_fd; 2581 static int pidns_fd; 2582 #ifdef CLONE_NEWTIME 2583 static int timens_fd; 2584 #endif 2585 static void post_create_runtime(void); 2586 2587 struct env_e { 2588 struct list_head list; 2589 char *envarg; 2590 }; 2591 2592 int main(int argc, char **argv) 2593 { 2594 uid_t uid = getuid(); 2595 const char log[] = "/dev/log"; 2596 const char ubus[] = "/var/run/ubus/ubus.sock"; 2597 const char udebug[] = "/var/run/udebug.sock"; 2598 int ret = EXIT_FAILURE; 2599 int ch; 2600 char *tmp; 2601 struct list_head envl = LIST_HEAD_INIT(envl); 2602 struct env_e *enve, *tmpenve; 2603 unsigned short int envn = 0, envc = 0; 2604 2605 if (uid) { 2606 ERROR("not root, aborting: %m\n"); 2607 return EXIT_FAILURE; 2608 } 2609 2610 /* those are filehandlers, so -1 indicates unused */ 2611 opts.setns.pid = -1; 2612 opts.setns.net = -1; 2613 opts.setns.ns = -1; 2614 opts.setns.ipc = -1; 2615 opts.setns.uts = -1; 2616 opts.setns.user = -1; 2617 opts.setns.cgroup = -1; 2618 #ifdef CLONE_NEWTIME 2619 opts.setns.time = -1; 2620 #endif 2621 2622 /* default 5 seconds timeout after SIGTERM before SIGKILL is sent */ 2623 opts.term_timeout = 5; 2624 2625 umask(022); 2626 mount_list_init(); 2627 init_library_search(); 2628 cgroups_prepare(); 2629 exit_from_child = false; 2630 2631 while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) { 2632 switch (ch) { 2633 case 'd': 2634 debug = atoi(optarg); 2635 break; 2636 case 'e': 2637 enve = calloc(1, sizeof(*enve)); 2638 enve->envarg = optarg; 2639 list_add_tail(&enve->list, &envl); 2640 break; 2641 case 'p': 2642 opts.namespace |= CLONE_NEWNS; 2643 opts.procfs = 1; 2644 break; 2645 case 'o': 2646 opts.namespace |= CLONE_NEWNS; 2647 opts.ronly = 1; 2648 break; 2649 case 'f': 2650 opts.namespace |= CLONE_NEWUSER; 2651 break; 2652 case 'F': 2653 opts.namespace |= CLONE_NEWCGROUP; 2654 break; 2655 case 'R': 2656 opts.extroot = realpath(optarg, NULL); 2657 break; 2658 case 's': 2659 opts.namespace |= CLONE_NEWNS; 2660 opts.sysfs = 1; 2661 break; 2662 case 'S': 2663 opts.seccomp = optarg; 2664 add_mount_bind(optarg, 1, -1); 2665 break; 2666 case 'C': 2667 opts.capabilities = optarg; 2668 break; 2669 case 'c': 2670 opts.no_new_privs = 1; 2671 break; 2672 case 'n': 2673 opts.name = optarg; 2674 break; 2675 case 'N': 2676 opts.namespace |= CLONE_NEWNET; 2677 break; 2678 case 'h': 2679 opts.namespace |= CLONE_NEWUTS; 2680 opts.hostname = strdup(optarg); 2681 break; 2682 case 'j': 2683 jail_join_ns(optarg); 2684 break; 2685 case 'r': 2686 opts.namespace |= CLONE_NEWNS; 2687 tmp = strchr(optarg, ':'); 2688 if (tmp) { 2689 *(tmp++) = '\0'; 2690 add_2paths_and_deps(optarg, tmp, 1, 0, 0); 2691 } else { 2692 add_path_and_deps(optarg, 1, 0, 0); 2693 } 2694 break; 2695 case 'w': 2696 opts.namespace |= CLONE_NEWNS; 2697 tmp = strchr(optarg, ':'); 2698 if (tmp) { 2699 *(tmp++) = '\0'; 2700 add_2paths_and_deps(optarg, tmp, 0, 0, 0); 2701 } else { 2702 add_path_and_deps(optarg, 0, 0, 0); 2703 } 2704 break; 2705 case 'u': 2706 opts.namespace |= CLONE_NEWNS; 2707 add_mount_bind(ubus, 0, -1); 2708 break; 2709 case 'D': 2710 opts.namespace |= CLONE_NEWNS; 2711 add_mount_bind(udebug, 0, 0); 2712 break; 2713 case 'l': 2714 opts.namespace |= CLONE_NEWNS; 2715 add_mount_bind(log, 0, -1); 2716 break; 2717 case 'U': 2718 opts.user = optarg; 2719 break; 2720 case 'G': 2721 opts.group = optarg; 2722 break; 2723 case 'O': 2724 opts.overlaydir = realpath(optarg, NULL); 2725 break; 2726 case 't': 2727 opts.term_timeout = atoi(optarg); 2728 break; 2729 case 'T': 2730 opts.tmpoverlaysize = optarg; 2731 break; 2732 case 'E': 2733 opts.require_jail = 1; 2734 break; 2735 case 'y': 2736 opts.console = 1; 2737 break; 2738 case 'J': 2739 opts.ocibundle = optarg; 2740 break; 2741 case 'i': 2742 opts.immediately = true; 2743 break; 2744 case 'P': 2745 opts.pidfile = optarg; 2746 break; 2747 } 2748 } 2749 2750 if (opts.namespace && !opts.ocibundle) 2751 opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID; 2752 2753 /* 2754 * env import from cmdline is not available for OCI containers 2755 */ 2756 if (opts.ocibundle && !list_empty(&envl)) { 2757 ret=-ENOTSUP; 2758 goto errout; 2759 } 2760 2761 /* 2762 * prepare list of env variables to import for slim containers 2763 */ 2764 if (!list_empty(&envl)) { 2765 list_for_each_entry(enve, &envl, list) 2766 ++envn; 2767 2768 opts.envp = calloc(1 + envn, sizeof(char*)); 2769 list_for_each_entry_safe(enve, tmpenve, &envl, list) { 2770 tmp = getenv(enve->envarg); 2771 if (tmp) { 2772 ret = asprintf(&opts.envp[envc++], "%s=%s", enve->envarg, tmp); 2773 if (ret < 0) { 2774 ERROR("filed to handle envargs %s\n", tmp); 2775 free(enve); 2776 goto errout; 2777 } 2778 } 2779 2780 list_del(&enve->list); 2781 free(enve); 2782 } 2783 2784 opts.envp[envc] = NULL; 2785 } 2786 2787 /* 2788 * uid in parent user namespace representing root user in new 2789 * user namespace, defaults to nobody unless specified in uidMappings 2790 */ 2791 opts.root_map_uid = 65534; 2792 2793 if (opts.capabilities && parseOCIcapabilities_from_file(&opts.capset, opts.capabilities)) { 2794 ERROR("failed to read capabilities from file %s\n", opts.capabilities); 2795 ret=-1; 2796 goto errout; 2797 } 2798 2799 if (opts.ocibundle) { 2800 char *jsonfile; 2801 int ocires; 2802 2803 if (!opts.name) { 2804 ERROR("OCI bundle needs a named jail\n"); 2805 ret=-1; 2806 goto errout; 2807 } 2808 if (asprintf(&jsonfile, "%s/config.json", opts.ocibundle) < 0) { 2809 ret=-ENOMEM; 2810 goto errout; 2811 } 2812 ocires = parseOCI(jsonfile); 2813 free(jsonfile); 2814 if (ocires) { 2815 ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires); 2816 ret=ocires; 2817 goto errout; 2818 } 2819 } 2820 2821 if (opts.namespace & CLONE_NEWNET) { 2822 if (!opts.name) { 2823 ERROR("netns needs a named jail\n"); 2824 ret=-1; 2825 goto errout; 2826 } 2827 } 2828 2829 2830 if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) { 2831 ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize); 2832 ret=-1; 2833 goto errout; 2834 } 2835 2836 if (opts.extroot && checkpath(opts.extroot)) { 2837 ERROR("invalid rootfs path '%s'", opts.extroot); 2838 ret=-1; 2839 goto errout; 2840 } 2841 2842 if (opts.overlaydir && checkpath(opts.overlaydir)) { 2843 ERROR("invalid rootfs overlay path '%s'", opts.overlaydir); 2844 ret=-1; 2845 goto errout; 2846 } 2847 2848 /* no <binary> param found */ 2849 if (!opts.ocibundle && (argc - optind < 1)) { 2850 usage(); 2851 ret=EXIT_FAILURE; 2852 goto errout; 2853 } 2854 if (!(opts.ocibundle||opts.namespace||opts.capabilities||opts.seccomp|| 2855 (opts.setns.net != -1) || 2856 (opts.setns.ns != -1) || 2857 (opts.setns.ipc != -1) || 2858 (opts.setns.uts != -1) || 2859 (opts.setns.user != -1) || 2860 (opts.setns.cgroup != -1))) { 2861 ERROR("Not using namespaces, capabilities or seccomp !!!\n\n"); 2862 usage(); 2863 ret=EXIT_FAILURE; 2864 goto errout; 2865 } 2866 DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n", 2867 opts.namespace, 2868 opts.capset.apply, 2869 opts.seccomp != 0 || opts.ociseccomp != 0); 2870 2871 uloop_init(); 2872 signals_init(); 2873 2874 parent_ctx = ubus_connect(NULL); 2875 if (!parent_ctx) { 2876 ERROR("Connection to ubus failed\n"); 2877 ret = -ECONNREFUSED; 2878 goto errout; 2879 } 2880 2881 ubus_add_uloop(parent_ctx); 2882 2883 if (opts.ocibundle) { 2884 char *objname; 2885 if (asprintf(&objname, "container.%s", opts.name) < 0) { 2886 ret=-ENOMEM; 2887 goto errout; 2888 } 2889 2890 container_object.name = objname; 2891 ret = ubus_add_object(parent_ctx, &container_object); 2892 if (ret) { 2893 ERROR("Failed to add object: %s\n", ubus_strerror(ret)); 2894 ret=-1; 2895 goto errout; 2896 } 2897 } 2898 2899 /* deliberately not using 'else' on unrelated conditional branches */ 2900 if (!opts.ocibundle) { 2901 /* allocate NULL-terminated array for argv */ 2902 opts.jail_argv = calloc(1 + argc - optind, sizeof(void *)); 2903 if (!opts.jail_argv) { 2904 ret=EXIT_FAILURE; 2905 goto errout; 2906 } 2907 for (size_t s = optind; s < argc; s++) 2908 opts.jail_argv[s - optind] = strdup(argv[s]); 2909 2910 if (opts.namespace & CLONE_NEWUSER) 2911 get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid); 2912 } 2913 2914 if (!opts.extroot) { 2915 if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) { 2916 ERROR("failed to load dependencies\n"); 2917 ret=-1; 2918 goto errout; 2919 } 2920 } 2921 2922 if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) { 2923 ERROR("failed to load libpreload-seccomp.so\n"); 2924 opts.seccomp = 0; 2925 if (opts.require_jail) { 2926 ret=-1; 2927 goto errout; 2928 } 2929 } 2930 2931 uloop_timeout_add(&post_main_timeout); 2932 uloop_run(); 2933 2934 errout: 2935 if (opts.ocibundle) 2936 cgroups_free(); 2937 2938 free_opts(true); 2939 2940 return ret; 2941 } 2942 2943 static void post_main(struct uloop_timeout *t) 2944 { 2945 if (apply_rlimits()) { 2946 ERROR("error applying resource limits\n"); 2947 free_and_exit(EXIT_FAILURE); 2948 } 2949 2950 if (opts.name) 2951 prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL); 2952 2953 if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0) 2954 free_and_exit(-1); 2955 2956 if (has_namespaces()) { 2957 if (opts.namespace & CLONE_NEWNS) { 2958 if (!opts.extroot && (opts.user || opts.group)) { 2959 add_mount_bind("/etc/passwd", 1, -1); 2960 add_mount_bind("/etc/group", 1, -1); 2961 } 2962 2963 #if defined(__GLIBC__) 2964 if (!opts.extroot) 2965 add_mount_bind("/etc/nsswitch.conf", 1, -1); 2966 #endif 2967 if (opts.setns.ns == -1) { 2968 if (!(opts.namespace & CLONE_NEWNET)) { 2969 add_mount_bind("/etc/resolv.conf", 1, 0); 2970 } else { 2971 /* new mount namespace to provide /dev/resolv.conf.d */ 2972 char hostdir[PATH_MAX]; 2973 2974 snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name); 2975 if (mkdir_p(hostdir, 0755)) { 2976 ERROR("mkdir(%s) failed: %m\n", hostdir); 2977 free_and_exit(-1); 2978 } 2979 add_mount(hostdir, "/dev/resolv.conf.d", NULL, 2980 MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, 0, NULL, 0); 2981 } 2982 } 2983 /* default mounts */ 2984 add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "size=1M", -1); 2985 add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, 0, "mode=1777", -1); 2986 add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0); 2987 2988 if (opts.procfs || opts.ocibundle) { 2989 add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0, NULL, -1); 2990 2991 /* 2992 * hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only 2993 * which cannot be expressed with OCI spec, but happends to be very useful. 2994 * Only apply it if '/proc/sys' is not already listed as mount, maskedPath or 2995 * readonlyPath. 2996 * If not running in a new network namespace, only make /proc/sys read-only. 2997 * If running in a new network namespace, temporarily stash (ie. mount-bind) 2998 * /proc/sys/net into (totally unrelated, but surely existing) /proc/self/net. 2999 * Then we mount-bind /proc/sys read-only and then mount-move /proc/self/net into 3000 * /proc/sys/net. 3001 * This works because mounts are executed in incrementing strcmp() order and 3002 * /proc/self/net appears there before /proc/sys/net and hence the operation 3003 * succeeds as the bind-mount of /proc/self/net is performed first and then 3004 * move-mount of /proc/sys/net follows because 'e' preceeds 'y' in the ASCII 3005 * table (and in the alphabet). 3006 */ 3007 if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, 0, NULL, -1)) 3008 if (opts.namespace & CLONE_NEWNET) 3009 if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, 0, NULL, -1)) 3010 add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, 0, NULL, -1); 3011 3012 } 3013 if (opts.sysfs || opts.ocibundle) 3014 add_mount("sysfs", "/sys", "sysfs", MS_RELATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0, NULL, -1); 3015 3016 } 3017 3018 if (opts.setns.pid != -1) { 3019 pidns_fd = ns_open_pid("pid", getpid()); 3020 setns_open(CLONE_NEWPID); 3021 } else { 3022 pidns_fd = -1; 3023 } 3024 3025 #ifdef CLONE_NEWTIME 3026 if (opts.setns.time != -1) { 3027 timens_fd = ns_open_pid("time", getpid()); 3028 setns_open(CLONE_NEWTIME); 3029 } else { 3030 timens_fd = -1; 3031 } 3032 #endif 3033 3034 if (opts.namespace & CLONE_NEWUSER) { 3035 if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) { 3036 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 3037 free_and_exit(EXIT_FAILURE); 3038 } 3039 if (seteuid(opts.root_map_uid)) { 3040 ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid); 3041 free_and_exit(EXIT_FAILURE); 3042 } 3043 } 3044 3045 jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | (opts.namespace & (~CLONE_NEWCGROUP)), NULL); 3046 } else { 3047 jail_process.pid = fork(); 3048 } 3049 3050 if (jail_process.pid > 0) { 3051 /* parent process */ 3052 char sig_buf[1]; 3053 3054 uloop_process_add(&jail_process); 3055 jail_running = 1; 3056 if (seteuid(0)) { 3057 ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid); 3058 free_and_exit(EXIT_FAILURE); 3059 } 3060 3061 prctl(PR_SET_SECUREBITS, 0); 3062 3063 if (pidns_fd != -1) { 3064 setns(pidns_fd, CLONE_NEWPID); 3065 close(pidns_fd); 3066 } 3067 #ifdef CLONE_NEWTIME 3068 if (timens_fd != -1) { 3069 setns(timens_fd, CLONE_NEWTIME); 3070 close(timens_fd); 3071 } 3072 #endif 3073 if (opts.setns.net != -1) 3074 close(opts.setns.net); 3075 if (opts.setns.ns != -1) 3076 close(opts.setns.ns); 3077 if (opts.setns.ipc != -1) 3078 close(opts.setns.ipc); 3079 if (opts.setns.uts != -1) 3080 close(opts.setns.uts); 3081 if (opts.setns.user != -1) 3082 close(opts.setns.user); 3083 if (opts.setns.cgroup != -1) 3084 close(opts.setns.cgroup); 3085 close(pipes[1]); 3086 close(pipes[2]); 3087 if (read(pipes[0], sig_buf, 1) < 1) { 3088 ERROR("can't read from child\n"); 3089 free_and_exit(-1); 3090 } 3091 close(pipes[0]); 3092 set_oom_score_adj(); 3093 3094 if (opts.ocibundle) 3095 cgroups_apply(jail_process.pid); 3096 3097 if (opts.namespace & CLONE_NEWUSER) { 3098 if (write_setgroups(jail_process.pid, true)) { 3099 ERROR("can't write setgroups\n"); 3100 free_and_exit(-1); 3101 } 3102 if (!opts.uidmap) { 3103 bool has_gr = (opts.gr_gid != -1); 3104 if (opts.pw_uid != -1) { 3105 write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid); 3106 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid); 3107 } else { 3108 write_single_uid_gid_map(jail_process.pid, 0, 65534); 3109 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534); 3110 } 3111 } else { 3112 write_uid_gid_map(jail_process.pid, 0, opts.uidmap); 3113 if (opts.gidmap) 3114 write_uid_gid_map(jail_process.pid, 1, opts.gidmap); 3115 } 3116 } 3117 3118 if (opts.namespace & CLONE_NEWNET) 3119 jail_network_start(parent_ctx, opts.name, jail_process.pid); 3120 3121 if (jail_writepid(jail_process.pid)) { 3122 ERROR("failed to write pidfile: %m\n"); 3123 free_and_exit(-1); 3124 } 3125 } else if (jail_process.pid == 0) { 3126 /* fork child process */ 3127 free_and_exit(exec_jail(NULL)); 3128 } else { 3129 ERROR("failed to clone/fork: %m\n"); 3130 free_and_exit(EXIT_FAILURE); 3131 } 3132 run_hooks(opts.hooks.createRuntime, post_create_runtime); 3133 } 3134 3135 static void post_poststart(void); 3136 static void post_create_runtime(void) 3137 { 3138 char sig_buf[1]; 3139 3140 sig_buf[0] = 'O'; 3141 if (write(pipes[3], sig_buf, 1) < 0) { 3142 ERROR("can't write to child\n"); 3143 free_and_exit(-1); 3144 } 3145 3146 jail_oci_state = OCI_STATE_CREATED; 3147 if (opts.ocibundle && !opts.immediately) 3148 uloop_run(); /* wait for 'start' command via ubus */ 3149 else 3150 pipe_send_start_container(NULL); 3151 } 3152 3153 static void pipe_send_start_container(struct uloop_timeout *t) 3154 { 3155 char sig_buf[1]; 3156 3157 jail_oci_state = OCI_STATE_RUNNING; 3158 sig_buf[0] = '!'; 3159 if (write(pipes[3], sig_buf, 1) < 0) { 3160 ERROR("can't write to child\n"); 3161 free_and_exit(-1); 3162 } 3163 close(pipes[3]); 3164 3165 run_hooks(opts.hooks.poststart, post_poststart); 3166 } 3167 3168 static void post_poststart(void) 3169 { 3170 uloop_run(); /* idle here while jail is running */ 3171 if (jail_running) { 3172 DEBUG("uloop interrupted, killing jail process\n"); 3173 kill(jail_process.pid, SIGTERM); 3174 uloop_timeout_set(&jail_process_timeout, 1000); 3175 uloop_run(); 3176 } 3177 uloop_done(); 3178 poststop(); 3179 } 3180 3181 static void post_poststop(void); 3182 static void poststop(void) { 3183 if (opts.namespace & CLONE_NEWNET) { 3184 setns(netns_fd, CLONE_NEWNET); 3185 jail_network_stop(); 3186 close(netns_fd); 3187 } 3188 run_hooks(opts.hooks.poststop, post_poststop); 3189 } 3190 3191 static void post_poststop(void) 3192 { 3193 free_opts(true); 3194 if (parent_ctx) 3195 ubus_free(parent_ctx); 3196 3197 exit(jail_return_code); 3198 } 3199
This page was automatically generated by LXR 0.3.1. • OpenWrt