1 /* 2 * Copyright (C) 2015 John Crispin <blogic@openwrt.org> 3 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org> 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU Lesser General Public License version 2.1 7 * as published by the Free Software Foundation 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 */ 14 15 #define _GNU_SOURCE 16 #include <sys/mount.h> 17 #include <sys/prctl.h> 18 #include <sys/wait.h> 19 #include <sys/types.h> 20 #include <sys/time.h> 21 #include <sys/resource.h> 22 #include <sys/stat.h> 23 #include <sys/sysmacros.h> 24 25 /* musl only defined 15 limit types, make sure all 16 are supported */ 26 #ifndef RLIMIT_RTTIME 27 #define RLIMIT_RTTIME 15 28 #undef RLIMIT_NLIMITS 29 #define RLIMIT_NLIMITS 16 30 #undef RLIM_NLIMITS 31 #define RLIM_NLIMITS 16 32 #endif 33 34 #include <assert.h> 35 #include <stdlib.h> 36 #include <unistd.h> 37 #include <errno.h> 38 #include <pwd.h> 39 #include <grp.h> 40 #include <string.h> 41 #include <fcntl.h> 42 #include <sched.h> 43 #include <linux/filter.h> 44 #include <linux/limits.h> 45 #include <linux/nsfs.h> 46 #include <linux/securebits.h> 47 #include <signal.h> 48 #include <inttypes.h> 49 50 #include "capabilities.h" 51 #include "elf.h" 52 #include "fs.h" 53 #include "jail.h" 54 #include "log.h" 55 #include "seccomp-oci.h" 56 #include "cgroups.h" 57 #include "netifd.h" 58 59 #include <libubox/blobmsg.h> 60 #include <libubox/blobmsg_json.h> 61 #include <libubox/list.h> 62 #include <libubox/vlist.h> 63 #include <libubox/uloop.h> 64 #include <libubox/utils.h> 65 #include <libubus.h> 66 67 #ifndef CLONE_NEWCGROUP 68 #define CLONE_NEWCGROUP 0x02000000 69 #endif 70 71 #define STACK_SIZE (1024 * 1024) 72 #define OPT_ARGS 
"cC:d:e:EfFG:h:ij:J:ln:NoO:pP:r:R:sS:uU:w:t:T:y" 73 74 #define OCI_VERSION_STRING "1.0.2" 75 76 struct hook_execvpe { 77 char *file; 78 char **argv; 79 char **envp; 80 int timeout; 81 }; 82 83 struct sysctl_val { 84 char *entry; 85 char *value; 86 }; 87 88 struct mknod_args { 89 char *path; 90 mode_t mode; 91 dev_t dev; 92 uid_t uid; 93 gid_t gid; 94 }; 95 96 static struct { 97 char *name; 98 char *hostname; 99 char **jail_argv; 100 char *cwd; 101 char *seccomp; 102 struct sock_fprog *ociseccomp; 103 char *capabilities; 104 struct jail_capset capset; 105 char *user; 106 char *group; 107 char *extroot; 108 char *overlaydir; 109 char *tmpoverlaysize; 110 char **envp; 111 char *uidmap; 112 char *gidmap; 113 char *pidfile; 114 struct sysctl_val **sysctl; 115 int no_new_privs; 116 int namespace; 117 struct { 118 int pid; 119 int net; 120 int ns; 121 int ipc; 122 int uts; 123 int user; 124 int cgroup; 125 #ifdef CLONE_NEWTIME 126 int time; 127 #endif 128 } setns; 129 int procfs; 130 int ronly; 131 int sysfs; 132 int console; 133 int pw_uid; 134 int pw_gid; 135 int gr_gid; 136 int root_map_uid; 137 gid_t *additional_gids; 138 size_t num_additional_gids; 139 mode_t umask; 140 bool set_umask; 141 int require_jail; 142 struct { 143 struct hook_execvpe **createRuntime; 144 struct hook_execvpe **createContainer; 145 struct hook_execvpe **startContainer; 146 struct hook_execvpe **poststart; 147 struct hook_execvpe **poststop; 148 } hooks; 149 struct rlimit *rlimits[RLIM_NLIMITS]; 150 int oom_score_adj; 151 bool set_oom_score_adj; 152 struct mknod_args **devices; 153 char *ocibundle; 154 bool immediately; 155 struct blob_attr *annotations; 156 int term_timeout; 157 } opts; 158 159 static struct blob_buf ocibuf; 160 161 extern int pivot_root(const char *new_root, const char *put_old); 162 163 int debug = 0; 164 165 static char child_stack[STACK_SIZE]; 166 167 static struct ubus_context *parent_ctx; 168 169 int console_fd; 170 171 172 static inline bool has_namespaces(void) 173 { 
174 return ((opts.setns.pid != -1) || 175 (opts.setns.net != -1) || 176 (opts.setns.ns != -1) || 177 (opts.setns.ipc != -1) || 178 (opts.setns.uts != -1) || 179 (opts.setns.user != -1) || 180 (opts.setns.cgroup != -1) || 181 #ifdef CLONE_NEWTIME 182 (opts.setns.time != -1) || 183 #endif 184 opts.namespace); 185 } 186 187 static void free_oci_envp(char **p) { 188 char **tmp; 189 190 if (p) { 191 tmp = p; 192 while (*tmp) 193 free(*(tmp++)); 194 195 free(p); 196 } 197 } 198 199 static void free_hooklist(struct hook_execvpe **hooklist) 200 { 201 struct hook_execvpe *cur; 202 203 if (!hooklist) 204 return; 205 206 cur = *hooklist; 207 while (cur) { 208 free_oci_envp(cur->argv); 209 free_oci_envp(cur->envp); 210 free(cur->file); 211 free(cur++); 212 } 213 free(hooklist); 214 } 215 216 static void free_sysctl(void) { 217 struct sysctl_val *cur; 218 219 if (!opts.sysctl) 220 return; 221 222 cur = *opts.sysctl; 223 224 while (cur) { 225 free(cur->entry); 226 free(cur->value); 227 free(cur++); 228 } 229 free(opts.sysctl); 230 } 231 232 static void free_devices(void) { 233 struct mknod_args **cur; 234 235 if (!opts.devices) 236 return; 237 238 cur = opts.devices; 239 240 while (*cur) { 241 free((*cur)->path); 242 free(*(cur++)); 243 } 244 free(opts.devices); 245 } 246 247 static void free_rlimits(void) { 248 int type; 249 250 for (type = 0; type < RLIM_NLIMITS; ++type) 251 free(opts.rlimits[type]); 252 } 253 254 static void free_opts(bool parent) { 255 256 free_library_search(); 257 mount_free(); 258 cgroups_free(); 259 260 /* we need to keep argv, envp and seccomp filter in child */ 261 if (parent) { /* parent-only */ 262 if (opts.ociseccomp) { 263 free(opts.ociseccomp->filter); 264 free(opts.ociseccomp); 265 } 266 267 free_oci_envp(opts.jail_argv); 268 free_oci_envp(opts.envp); 269 } 270 271 free_rlimits(); 272 free_sysctl(); 273 free_devices(); 274 free(opts.hostname); 275 free(opts.cwd); 276 free(opts.uidmap); 277 free(opts.gidmap); 278 free(opts.annotations); 279 
free(opts.extroot); 280 free(opts.overlaydir); 281 free_hooklist(opts.hooks.createRuntime); 282 free_hooklist(opts.hooks.createContainer); 283 free_hooklist(opts.hooks.startContainer); 284 free_hooklist(opts.hooks.poststart); 285 free_hooklist(opts.hooks.poststop); 286 } 287 288 static int mount_overlay(char *jail_root, char *overlaydir) { 289 char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf; 290 const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s"; 291 int ret = -1, fd; 292 293 if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0) 294 goto out; 295 296 if (asprintf(&workdir, "%s%s", overlaydir, "/work") < 0) 297 goto upper_printf; 298 299 if (asprintf(&optsstr, mountoptsformat, jail_root, upperdir, workdir) < 0) 300 goto work_printf; 301 302 if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755)) 303 goto opts_printf; 304 305 /* 306 * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root 307 * this is to work-around a bug in overlayfs described in the overlayfs-userns 308 * patch: 309 * 3. modification of a file 'hithere' which is in l but not yet 310 * in u, and which is not owned by T, is not allowed, even if 311 * writes to u are allowed. This may be a bug in overlayfs, 312 * but it is safe behavior. 
313 */ 314 if (asprintf(&upperetc, "%s/etc", upperdir) < 0) 315 goto opts_printf; 316 317 if (mkdir_p(upperetc, 0755)) 318 goto upper_etc_printf; 319 320 if (asprintf(&upperresolvconf, "%s/resolv.conf", upperetc) < 0) 321 goto upper_etc_printf; 322 323 fd = creat(upperresolvconf, 0644); 324 if (fd < 0) { 325 if (errno != EEXIST) 326 ERROR("creat(%s) failed: %m\n", upperresolvconf); 327 } else { 328 close(fd); 329 } 330 DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr); 331 332 if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr)) 333 goto upper_resolvconf_printf; 334 335 ret = 0; 336 337 upper_resolvconf_printf: 338 free(upperresolvconf); 339 upper_etc_printf: 340 free(upperetc); 341 opts_printf: 342 free(optsstr); 343 work_printf: 344 free(workdir); 345 upper_printf: 346 free(upperdir); 347 out: 348 return ret; 349 } 350 351 static void pass_console(int console_fd) 352 { 353 struct ubus_context *child_ctx = ubus_connect(NULL); 354 static struct blob_buf req; 355 uint32_t id; 356 357 if (!child_ctx) 358 return; 359 360 blob_buf_init(&req, 0); 361 blobmsg_add_string(&req, "name", opts.name); 362 363 if (ubus_lookup_id(child_ctx, "container", &id) || 364 ubus_invoke_fd(child_ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd)) 365 INFO("ubus request failed\n"); 366 else 367 close(console_fd); 368 369 blob_buf_free(&req); 370 ubus_free(child_ctx); 371 } 372 373 static int create_dev_console(const char *jail_root) 374 { 375 char *console_fname; 376 char dev_console_path[PATH_MAX]; 377 int slave_console_fd, dev_console_dummy; 378 379 /* Open UNIX/98 virtual console */ 380 console_fd = posix_openpt(O_RDWR | O_NOCTTY); 381 if (console_fd < 0) 382 return -1; 383 384 console_fname = ptsname(console_fd); 385 DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname); 386 if (!console_fname) 387 goto no_console; 388 389 grantpt(console_fd); 390 unlockpt(console_fd); 391 392 /* pass PTY master to procd */ 393 
pass_console(console_fd); 394 395 /* mount-bind PTY slave to /dev/console in jail */ 396 snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root); 397 dev_console_dummy = creat(dev_console_path, 0620); 398 if (dev_console_dummy < 0) 399 goto no_console; 400 401 close(dev_console_dummy); 402 403 if (mount(console_fname, dev_console_path, "bind", MS_BIND, NULL)) 404 goto no_console; 405 406 /* use PTY slave for stdio */ 407 slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */ 408 if (slave_console_fd < 0) 409 goto no_console; 410 411 dup2(slave_console_fd, 0); 412 dup2(slave_console_fd, 1); 413 dup2(slave_console_fd, 2); 414 close(slave_console_fd); 415 416 INFO("using guest console %s\n", console_fname); 417 418 return 0; 419 420 no_console: 421 close(console_fd); 422 return 1; 423 } 424 425 static int hook_running = 0; 426 static int hook_return_code = 0; 427 static struct hook_execvpe **current_hook = NULL; 428 typedef void (*hook_return_handler)(void); 429 static hook_return_handler hook_return_cb = NULL; 430 431 static void hook_process_timeout_cb(struct uloop_timeout *t); 432 static struct uloop_timeout hook_process_timeout = { 433 .cb = hook_process_timeout_cb, 434 }; 435 436 static void run_hooklist(void); 437 static void hook_process_handler(struct uloop_process *c, int ret) 438 { 439 uloop_timeout_cancel(&hook_process_timeout); 440 441 if (WIFEXITED(ret)) { 442 hook_return_code = WEXITSTATUS(ret); 443 if (hook_return_code) 444 ERROR("hook (%d) exited with exit: %d\n", c->pid, hook_return_code); 445 else 446 DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code); 447 448 } else { 449 hook_return_code = WTERMSIG(ret); 450 ERROR("hook (%d) exited with signal: %d\n", c->pid, hook_return_code); 451 } 452 hook_running = 0; 453 ++current_hook; 454 run_hooklist(); 455 } 456 457 static struct uloop_process hook_process = { 458 .cb = hook_process_handler, 459 }; 460 461 static void hook_process_timeout_cb(struct 
uloop_timeout *t) 462 { 463 DEBUG("hook process failed to stop, sending SIGKILL\n"); 464 kill(hook_process.pid, SIGKILL); 465 } 466 467 static void run_hooklist(void) 468 { 469 struct hook_execvpe *hook = *current_hook; 470 struct stat s; 471 472 if (!hook) 473 return hook_return_cb(); 474 475 DEBUG("executing hook %s\n", hook->file); 476 477 if (stat(hook->file, &s)) 478 hook_process_handler(&hook_process, ENOENT); 479 480 if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) 481 hook_process_handler(&hook_process, EPERM); 482 483 hook_running = 1; 484 hook_process.pid = fork(); 485 if (hook_process.pid == 0) { 486 /* child */ 487 execve(hook->file, hook->argv, hook->envp); 488 ERROR("execve error %m\n"); 489 _exit(errno); 490 } else if (hook_process.pid < 0) { 491 /* fork error */ 492 ERROR("hook fork error\n"); 493 hook_running = 0; 494 hook_process_handler(&hook_process, errno); 495 } 496 497 /* parent */ 498 uloop_process_add(&hook_process); 499 500 if (hook->timeout > 0) 501 uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout); 502 503 uloop_run(); 504 if (hook_running) { 505 DEBUG("uloop interrupted, killing jail process\n"); 506 kill(hook_process.pid, SIGTERM); 507 uloop_timeout_set(&hook_process_timeout, 1000); 508 uloop_run(); 509 } 510 } 511 512 static void run_hooks(struct hook_execvpe **hooklist, hook_return_handler return_cb) 513 { 514 if (!hooklist) 515 return_cb(); 516 517 current_hook = hooklist; 518 hook_return_cb = return_cb; 519 520 run_hooklist(); 521 } 522 523 static int apply_sysctl(const char *jail_root) 524 { 525 struct sysctl_val **cur; 526 char *procdir, *fname; 527 int f; 528 529 if (!opts.sysctl) 530 return 0; 531 532 if (asprintf(&procdir, "%s/proc", jail_root) < 0) 533 return ENOMEM; 534 535 mkdir(procdir, 0700); 536 if (mount("proc", procdir, "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0)) 537 return EPERM; 538 539 cur = opts.sysctl; 540 541 while (*cur) { 542 if (asprintf(&fname, "%s/sys/%s", procdir, 
(*cur)->entry) < 0) 543 return ENOMEM; 544 545 DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname); 546 547 f = open(fname, O_WRONLY); 548 if (f < 0) { 549 ERROR("sysctl: can't open %s\n", fname); 550 free(fname); 551 return errno; 552 } 553 if (write(f, (*cur)->value, strlen((*cur)->value)) < 0) { 554 ERROR("sysctl: write to %s\n", fname); 555 free(fname); 556 close(f); 557 return errno; 558 } 559 560 free(fname); 561 close(f); 562 ++cur; 563 } 564 umount(procdir); 565 rmdir(procdir); 566 free(procdir); 567 568 return 0; 569 } 570 571 /* glibc defines makedev calling a function. make sure it's a pure macro */ 572 #if defined(__GLIBC__) 573 #undef makedev 574 /* from musl's sys/sysmacros.h */ 575 #define makedev(x,y) ( \ 576 (((x)&0xfffff000ULL) << 32) | \ 577 (((x)&0x00000fffULL) << 8) | \ 578 (((y)&0xffffff00ULL) << 12) | \ 579 (((y)&0x000000ffULL)) ) 580 #endif 581 582 static struct mknod_args default_devices[] = { 583 { .path = "/dev/null", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 3) }, 584 { .path = "/dev/zero", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 5) }, 585 { .path = "/dev/full", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 7) }, 586 { .path = "/dev/random", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 8) }, 587 { .path = "/dev/urandom", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 9) }, 588 { .path = "/dev/tty", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP), .dev = makedev(5, 0), .gid = 5 }, 589 { 0 }, 590 }; 591 592 static int create_devices(void) 593 { 594 struct mknod_args **cur, *curdef; 595 char *path, *tmp; 596 int ret; 597 598 if (!opts.devices) 599 goto only_default_devices; 600 601 cur = opts.devices; 602 603 while (*cur) { 604 path = (*cur)->path; 605 /* don't allow devices outside of /dev */ 606 if (strncmp(path, "/dev", 4)) 
607 return EPERM; 608 609 /* make sure parent folder exists */ 610 tmp = strrchr(path, '/'); 611 if (!tmp) 612 return EINVAL; 613 614 *tmp = '\0'; 615 if (strcmp(path, "/dev")) { 616 DEBUG("creating directory %s\n", path); 617 618 mkdir_p(path, 0755); 619 } 620 *tmp = '/'; 621 622 DEBUG("creating %s (mode=%08o)\n", path, (*cur)->mode); 623 624 /* create device */ 625 if (mknod(path, (*cur)->mode, (*cur)->dev)) 626 return errno; 627 628 /* change owner, if needed */ 629 if (((*cur)->uid || (*cur)->gid) && 630 chown(path, (*cur)->uid, (*cur)->gid)) 631 return errno; 632 633 ++cur; 634 } 635 636 only_default_devices: 637 curdef = default_devices; 638 while(curdef->path) { 639 DEBUG("creating %s (mode=%08o)\n", curdef->path, curdef->mode); 640 if (mknod(curdef->path, curdef->mode, curdef->dev)) { 641 ++curdef; 642 continue; /* may already exist, eg. due to a bind-mount */ 643 } 644 if ((curdef->uid || curdef->gid) && 645 chown(curdef->path, curdef->uid, curdef->gid)) 646 return errno; 647 648 ++curdef; 649 } 650 651 /* Dev symbolic links as defined in OCI spec */ 652 ret = symlink("/dev/pts/ptmx", "/dev/ptmx"); 653 if (ret < 0) 654 WARNING("symlink() failed to create link to /dev/pts/ptmx"); 655 656 ret = symlink("/proc/self/fd", "/dev/fd"); 657 if (ret < 0) 658 WARNING("symlink() failed to create link to /proc/self/fd"); 659 660 ret = symlink("/proc/self/fd/0", "/dev/stdin"); 661 if (ret < 0) 662 WARNING("symlink() failed to create link to /proc/self/fd/0"); 663 664 ret = symlink("/proc/self/fd/1", "/dev/stdout"); 665 if (ret < 0) 666 WARNING("symlink() failed to create link to /proc/self/fd/1"); 667 668 ret = symlink("/proc/self/fd/2", "/dev/stderr"); 669 if (ret < 0) 670 WARNING("symlink() failed to create link to /proc/self/fd/2"); 671 672 return 0; 673 } 674 675 static char jail_root[] = "/tmp/ujail-XXXXXX"; 676 static char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX"; 677 static mode_t old_umask; 678 static void enter_jail_fs(void); 679 static int build_jail_fs(void) 
680 { 681 char *overlaydir = NULL; 682 int ret; 683 684 old_umask = umask(0); 685 686 if (mkdtemp(jail_root) == NULL) { 687 ERROR("mkdtemp(%s) failed: %m\n", jail_root); 688 return -1; 689 } 690 691 if (apply_sysctl(jail_root)) { 692 ERROR("failed to apply sysctl values\n"); 693 return -1; 694 } 695 696 /* oldroot can't be MS_SHARED else pivot_root() fails */ 697 if (mount("none", "/", "none", MS_REC|MS_PRIVATE, NULL)) { 698 ERROR("private mount failed %m\n"); 699 return -1; 700 } 701 702 if (opts.extroot) { 703 if (mount(opts.extroot, jail_root, "bind", MS_BIND, NULL)) { 704 ERROR("extroot mount failed %m\n"); 705 return -1; 706 } 707 } else { 708 if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) { 709 ERROR("tmpfs mount failed %m\n"); 710 return -1; 711 } 712 } 713 714 if (opts.tmpoverlaysize) { 715 char mountoptsstr[] = "mode=0755,size=XXXXXXXX"; 716 717 snprintf(mountoptsstr, sizeof(mountoptsstr), 718 "mode=0755,size=%s", opts.tmpoverlaysize); 719 if (mkdtemp(tmpovdir) == NULL) { 720 ERROR("mkdtemp(%s) failed: %m\n", jail_root); 721 return -1; 722 } 723 if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME, 724 mountoptsstr)) { 725 ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize); 726 return -1; 727 } 728 overlaydir = tmpovdir; 729 } 730 731 if (opts.overlaydir) 732 overlaydir = opts.overlaydir; 733 734 if (overlaydir) { 735 ret = mount_overlay(jail_root, overlaydir); 736 if (ret) 737 return ret; 738 } 739 740 if (chdir(jail_root)) { 741 ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root); 742 return -1; 743 } 744 745 if (mount_all(jail_root)) { 746 ERROR("mount_all() failed\n"); 747 return -1; 748 } 749 750 if (opts.console) 751 create_dev_console(jail_root); 752 753 /* make sure /etc/resolv.conf exists if in new network namespace */ 754 if (opts.namespace & CLONE_NEWNET) { 755 char jailetc[PATH_MAX], jaillink[PATH_MAX]; 756 757 snprintf(jailetc, PATH_MAX, "%s/etc", jail_root); 758 mkdir_p(jailetc, 0755); 759 
snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root); 760 if (overlaydir) 761 unlink(jaillink); 762 763 ret = symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink); 764 if (ret < 0) 765 WARNING("symlink() failed to create link to ../dev/resolv.conf.d/resolv.conf.auto"); 766 } 767 768 run_hooks(opts.hooks.createContainer, enter_jail_fs); 769 770 return 0; 771 } 772 773 static bool exit_from_child; 774 static void free_and_exit(int ret) 775 { 776 if (!exit_from_child && opts.ocibundle) 777 cgroups_free(); 778 779 if (!exit_from_child && parent_ctx) 780 ubus_free(parent_ctx); 781 782 free_opts(!exit_from_child); 783 784 exit(ret); 785 } 786 787 static void post_jail_fs(void); 788 static void enter_jail_fs(void) 789 { 790 char dirbuf[sizeof(jail_root) + 4]; 791 792 snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root); 793 mkdir(dirbuf, 0755); 794 795 if (pivot_root(jail_root, dirbuf) == -1) { 796 ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf); 797 free_and_exit(-1); 798 } 799 if (chdir("/")) { 800 ERROR("chdir(/) (after pivot_root) failed: %m\n"); 801 free_and_exit(-1); 802 } 803 804 snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root); 805 umount2(dirbuf, MNT_DETACH); 806 rmdir(dirbuf); 807 if (opts.tmpoverlaysize) { 808 char tmpdirbuf[sizeof(tmpovdir) + 4]; 809 snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir); 810 umount2(tmpdirbuf, MNT_DETACH); 811 rmdir(tmpdirbuf); 812 } 813 814 umount2("/old", MNT_DETACH); 815 rmdir("/old"); 816 817 if (create_devices()) { 818 ERROR("create_devices() failed\n"); 819 free_and_exit(-1); 820 } 821 if (opts.ronly) 822 mount(NULL, "/", "bind", MS_REMOUNT | MS_BIND | MS_RDONLY, 0); 823 824 umask(old_umask); 825 post_jail_fs(); 826 } 827 828 static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr) 829 { 830 int map_file; 831 char map_path[64]; 832 833 if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", 834 child_pid, gidmap?"gid_map":"uid_map") < 0) 835 return -1; 836 837 if 
((map_file = open(map_path, O_WRONLY)) < 0) 838 return -1; 839 840 if (dprintf(map_file, "%s", mapstr)) { 841 close(map_file); 842 return -1; 843 } 844 845 close(map_file); 846 return 0; 847 } 848 849 static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id) 850 { 851 int map_file; 852 char map_path[64]; 853 const char *map_format = "%d %d %d\n"; 854 if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", 855 child_pid, gidmap?"gid_map":"uid_map") < 0) 856 return -1; 857 858 if ((map_file = open(map_path, O_WRONLY)) < 0) 859 return -1; 860 861 if (dprintf(map_file, map_format, 0, id, 1) < 0) { 862 close(map_file); 863 return -1; 864 } 865 866 close(map_file); 867 return 0; 868 } 869 870 static int write_setgroups(pid_t child_pid, bool allow) 871 { 872 int setgroups_file; 873 char setgroups_path[64]; 874 875 if (snprintf(setgroups_path, sizeof(setgroups_path), "/proc/%d/setgroups", 876 child_pid) < 0) { 877 return -1; 878 } 879 880 if ((setgroups_file = open(setgroups_path, O_WRONLY)) < 0) { 881 return -1; 882 } 883 884 if (dprintf(setgroups_file, "%s", allow?"allow":"deny") == -1) { 885 close(setgroups_file); 886 return -1; 887 } 888 889 close(setgroups_file); 890 return 0; 891 } 892 893 static void get_jail_user(int *user, int *user_gid, int *gr_gid) 894 { 895 struct passwd *p = NULL; 896 struct group *g = NULL; 897 898 if (opts.user) { 899 p = getpwnam(opts.user); 900 if (!p) { 901 ERROR("failed to get uid/gid for user %s: %d (%s)\n", 902 opts.user, errno, strerror(errno)); 903 free_and_exit(EXIT_FAILURE); 904 } 905 *user = p->pw_uid; 906 *user_gid = p->pw_gid; 907 } else { 908 *user = -1; 909 *user_gid = -1; 910 } 911 912 if (opts.group) { 913 g = getgrnam(opts.group); 914 if (!g) { 915 ERROR("failed to get gid for group %s: %m\n", opts.group); 916 free_and_exit(EXIT_FAILURE); 917 } 918 *gr_gid = g->gr_gid; 919 } else { 920 *gr_gid = -1; 921 } 922 }; 923 924 static void set_jail_user(int pw_uid, int user_gid, int gr_gid) 925 { 926 if (opts.user 
&& (user_gid != -1) && initgroups(opts.user, user_gid)) { 927 ERROR("failed to initgroups() for user %s: %m\n", opts.user); 928 free_and_exit(EXIT_FAILURE); 929 } 930 931 if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) { 932 ERROR("failed to set group id %d: %m\n", gr_gid); 933 free_and_exit(EXIT_FAILURE); 934 } 935 936 if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) { 937 ERROR("failed to set user id %d: %m\n", pw_uid); 938 free_and_exit(EXIT_FAILURE); 939 } 940 } 941 942 static int apply_rlimits(void) 943 { 944 int resource; 945 946 for (resource = 0; resource < RLIM_NLIMITS; ++resource) { 947 if (opts.rlimits[resource]) 948 DEBUG("applying limits to resource %u\n", resource); 949 950 if (opts.rlimits[resource] && 951 setrlimit(resource, opts.rlimits[resource])) 952 return errno; 953 } 954 955 return 0; 956 } 957 958 #define MAX_ENVP 64 959 static char** build_envp(const char *seccomp, char **ocienvp) 960 { 961 static char *envp[MAX_ENVP]; 962 static char preload_var[PATH_MAX]; 963 static char seccomp_var[PATH_MAX]; 964 static char seccomp_debug_var[20]; 965 static char debug_var[] = "LD_DEBUG=all"; 966 static char container_var[] = "container=ujail"; 967 const char *preload_lib = find_lib("libpreload-seccomp.so"); 968 char **addenv; 969 970 int count = 0; 971 972 if (seccomp && !preload_lib) { 973 ERROR("failed to add preload-lib to env\n"); 974 return NULL; 975 } 976 if (seccomp) { 977 snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp); 978 envp[count++] = seccomp_var; 979 snprintf(seccomp_debug_var, sizeof(seccomp_debug_var), "SECCOMP_DEBUG=%2d", debug); 980 envp[count++] = seccomp_debug_var; 981 snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib); 982 envp[count++] = preload_var; 983 } 984 985 envp[count++] = container_var; 986 987 if (debug > 1) 988 envp[count++] = debug_var; 989 990 addenv = ocienvp; 991 while (addenv && *addenv) { 992 envp[count++] = *(addenv++); 993 if (count >= MAX_ENVP) { 994 ERROR("environment 
limited to %d extra records, truncating\n", MAX_ENVP); 995 break; 996 } 997 } 998 return envp; 999 } 1000 1001 static void usage(void) 1002 { 1003 fprintf(stderr, "ujail <options> -- <binary> <params ...>\n"); 1004 fprintf(stderr, " -d <num>\tshow debug log (increase num to increase verbosity)\n"); 1005 fprintf(stderr, " -S <file>\tseccomp filter config\n"); 1006 fprintf(stderr, " -C <file>\tcapabilities drop config\n"); 1007 fprintf(stderr, " -c\t\tset PR_SET_NO_NEW_PRIVS\n"); 1008 fprintf(stderr, " -n <name>\tthe name of the jail\n"); 1009 fprintf(stderr, " -e <var>\timport environment variable\n"); 1010 fprintf(stderr, "namespace jail options:\n"); 1011 fprintf(stderr, " -h <hostname>\tchange the hostname of the jail\n"); 1012 fprintf(stderr, " -N\t\tjail has network namespace\n"); 1013 fprintf(stderr, " -f\t\tjail has user namespace\n"); 1014 fprintf(stderr, " -F\t\tjail has cgroups namespace\n"); 1015 fprintf(stderr, " -r <file>\treadonly files that should be staged\n"); 1016 fprintf(stderr, " -w <file>\twriteable files that should be staged\n"); 1017 fprintf(stderr, " -p\t\tjail has /proc\n"); 1018 fprintf(stderr, " -s\t\tjail has /sys\n"); 1019 fprintf(stderr, " -l\t\tjail has /dev/log\n"); 1020 fprintf(stderr, " -u\t\tjail has a ubus socket\n"); 1021 fprintf(stderr, " -U <name>\tuser to run jailed process\n"); 1022 fprintf(stderr, " -G <name>\tgroup to run jailed process\n"); 1023 fprintf(stderr, " -o\t\tremont jail root (/) read only\n"); 1024 fprintf(stderr, " -R <dir>\texternal jail rootfs (system container)\n"); 1025 fprintf(stderr, " -O <dir>\tdirectory for r/w overlayfs\n"); 1026 fprintf(stderr, " -T <size>\tuse tmpfs r/w overlayfs with <size>\n"); 1027 fprintf(stderr, " -E\t\tfail if jail cannot be setup\n"); 1028 fprintf(stderr, " -y\t\tprovide jail console\n"); 1029 fprintf(stderr, " -J <dir>\tcreate container from OCI bundle\n"); 1030 fprintf(stderr, " -i\t\tstart container immediately\n"); 1031 fprintf(stderr, " -P <pidfile>\tcreate 
<pidfile>\n"); 1032 fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\ 1033 and he has the same powers as root outside the jail,\n\ 1034 thus he can escape the jail and/or break stuff.\n\ 1035 Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n\ 1036 If you use none of the namespace jail options,\n\ 1037 ujail will not use namespace/build a jail,\n\ 1038 and will only drop capabilities/apply seccomp filter.\n\n"); 1039 } 1040 1041 static int* get_namespace_fd(const unsigned int nstype) 1042 { 1043 switch (nstype) { 1044 case CLONE_NEWPID: 1045 return &opts.setns.pid; 1046 case CLONE_NEWNET: 1047 return &opts.setns.net; 1048 case CLONE_NEWNS: 1049 return &opts.setns.ns; 1050 case CLONE_NEWIPC: 1051 return &opts.setns.ipc; 1052 case CLONE_NEWUTS: 1053 return &opts.setns.uts; 1054 case CLONE_NEWUSER: 1055 return &opts.setns.user; 1056 case CLONE_NEWCGROUP: 1057 return &opts.setns.cgroup; 1058 #ifdef CLONE_NEWTIME 1059 case CLONE_NEWTIME: 1060 return &opts.setns.time; 1061 #endif 1062 default: 1063 return NULL; 1064 } 1065 } 1066 1067 static int setns_open(unsigned long nstype) 1068 { 1069 int *fd = get_namespace_fd(nstype); 1070 1071 assert(fd != NULL); 1072 1073 if (*fd < 0) 1074 return 0; 1075 1076 if (setns(*fd, nstype) == -1) { 1077 close(*fd); 1078 return errno; 1079 } 1080 1081 close(*fd); 1082 return 0; 1083 } 1084 1085 static int jail_running = 0; 1086 static int jail_return_code = 0; 1087 1088 static void jail_process_timeout_cb(struct uloop_timeout *t); 1089 static struct uloop_timeout jail_process_timeout = { 1090 .cb = jail_process_timeout_cb, 1091 }; 1092 static void poststop(void); 1093 static void jail_process_handler(struct uloop_process *c, int ret) 1094 { 1095 uloop_timeout_cancel(&jail_process_timeout); 1096 if (WIFEXITED(ret)) { 1097 jail_return_code = WEXITSTATUS(ret); 1098 INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code); 1099 } else { 1100 jail_return_code = WTERMSIG(ret); 1101 INFO("jail 
(%d) exited with signal: %d\n", c->pid, jail_return_code); 1102 } 1103 jail_running = 0; 1104 poststop(); 1105 } 1106 1107 static struct uloop_process jail_process = { 1108 .cb = jail_process_handler, 1109 }; 1110 1111 static void jail_process_timeout_cb(struct uloop_timeout *t) 1112 { 1113 DEBUG("jail process failed to stop, sending SIGKILL\n"); 1114 kill(jail_process.pid, SIGKILL); 1115 } 1116 1117 static void jail_handle_signal(int signo) 1118 { 1119 if (hook_running) { 1120 DEBUG("forwarding signal %d to the hook process\n", signo); 1121 kill(hook_process.pid, signo); 1122 /* set timeout to send SIGKILL hook process in case SIGTERM doesn't succeed */ 1123 if (signo == SIGTERM) 1124 uloop_timeout_set(&hook_process_timeout, opts.term_timeout * 1000); 1125 } 1126 1127 if (jail_running) { 1128 DEBUG("forwarding signal %d to the jailed process\n", signo); 1129 kill(jail_process.pid, signo); 1130 /* set timeout to send SIGKILL jail process in case SIGTERM doesn't succeed */ 1131 if (signo == SIGTERM) 1132 uloop_timeout_set(&jail_process_timeout, opts.term_timeout * 1000); 1133 } 1134 } 1135 1136 static void signals_init(void) 1137 { 1138 int i; 1139 sigset_t sigmask; 1140 1141 sigfillset(&sigmask); 1142 for (i = 0; i < _NSIG; i++) { 1143 struct sigaction s = { 0 }; 1144 1145 if (!sigismember(&sigmask, i)) 1146 continue; 1147 if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV) || (i == SIGSTOP) || (i == SIGKILL)) 1148 continue; 1149 1150 s.sa_handler = jail_handle_signal; 1151 sigaction(i, &s, NULL); 1152 } 1153 } 1154 1155 static void pre_exec_jail(struct uloop_timeout *t); 1156 static struct uloop_timeout pre_exec_timeout = { 1157 .cb = pre_exec_jail, 1158 }; 1159 1160 int pipes[4]; 1161 static int exec_jail(void *arg) 1162 { 1163 char buf[1]; 1164 1165 exit_from_child = true; 1166 prctl(PR_SET_SECUREBITS, 0); 1167 1168 uloop_init(); 1169 signals_init(); 1170 1171 close(pipes[0]); 1172 close(pipes[3]); 1173 1174 setns_open(CLONE_NEWUSER); 1175 
setns_open(CLONE_NEWNET); 1176 setns_open(CLONE_NEWNS); 1177 setns_open(CLONE_NEWIPC); 1178 setns_open(CLONE_NEWUTS); 1179 1180 buf[0] = 'i'; 1181 if (write(pipes[1], buf, 1) < 1) { 1182 ERROR("can't write to parent\n"); 1183 return EXIT_FAILURE; 1184 } 1185 close(pipes[1]); 1186 if (read(pipes[2], buf, 1) < 1) { 1187 ERROR("can't read from parent\n"); 1188 return EXIT_FAILURE; 1189 } 1190 if (buf[0] != 'O') { 1191 ERROR("parent had an error, child exiting\n"); 1192 return EXIT_FAILURE; 1193 } 1194 1195 if (opts.namespace & CLONE_NEWCGROUP) 1196 unshare(CLONE_NEWCGROUP); 1197 1198 setns_open(CLONE_NEWCGROUP); 1199 1200 if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) { 1201 if (setregid(0, 0) < 0) { 1202 ERROR("setgid\n"); 1203 free_and_exit(EXIT_FAILURE); 1204 } 1205 if (setreuid(0, 0) < 0) { 1206 ERROR("setuid\n"); 1207 free_and_exit(EXIT_FAILURE); 1208 } 1209 if (setgroups(0, NULL) < 0) { 1210 ERROR("setgroups\n"); 1211 free_and_exit(EXIT_FAILURE); 1212 } 1213 } 1214 1215 if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0 1216 && sethostname(opts.hostname, strlen(opts.hostname))) { 1217 ERROR("sethostname(%s) failed: %m\n", opts.hostname); 1218 free_and_exit(EXIT_FAILURE); 1219 } 1220 1221 uloop_timeout_add(&pre_exec_timeout); 1222 uloop_run(); 1223 1224 free_and_exit(-1); 1225 return -1; 1226 } 1227 1228 static void pre_exec_jail(struct uloop_timeout *t) 1229 { 1230 if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) { 1231 ERROR("failed to build jail fs\n"); 1232 free_and_exit(EXIT_FAILURE); 1233 } else { 1234 run_hooks(opts.hooks.createContainer, post_jail_fs); 1235 } 1236 } 1237 1238 static void post_start_hook(void); 1239 static void post_jail_fs(void) 1240 { 1241 char buf[1]; 1242 1243 if (read(pipes[2], buf, 1) < 1) { 1244 ERROR("can't read from parent\n"); 1245 free_and_exit(EXIT_FAILURE); 1246 } 1247 if (buf[0] != '!') { 1248 ERROR("parent had an error, child exiting\n"); 1249 free_and_exit(EXIT_FAILURE); 1250 } 1251 
close(pipes[2]); 1252 1253 run_hooks(opts.hooks.startContainer, post_start_hook); 1254 } 1255 1256 static void post_start_hook(void) 1257 { 1258 int pw_uid, pw_gid, gr_gid; 1259 1260 /* 1261 * make sure setuid/setgid won't drop capabilities in case capabilities 1262 * have been specified explicitely. 1263 */ 1264 if (opts.capset.apply) { 1265 if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) { 1266 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 1267 free_and_exit(EXIT_FAILURE); 1268 } 1269 } 1270 1271 /* drop capabilities, retain those still needed to further setup jail */ 1272 if (applyOCIcapabilities(opts.capset, (1LLU << CAP_SETGID) | (1LLU << CAP_SETUID) | (1LLU << CAP_SETPCAP))) 1273 free_and_exit(EXIT_FAILURE); 1274 1275 /* use either cmdline-supplied user/group or uid/gid from OCI spec */ 1276 get_jail_user(&pw_uid, &pw_gid, &gr_gid); 1277 set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid); 1278 1279 if (opts.additional_gids && 1280 (setgroups(opts.num_additional_gids, opts.additional_gids) < 0)) { 1281 ERROR("setgroups failed: %m\n"); 1282 free_and_exit(EXIT_FAILURE); 1283 } 1284 1285 if (opts.set_umask) 1286 umask(opts.umask); 1287 1288 /* restore securebits back to normal (and lock them if not in userns) */ 1289 if (opts.capset.apply) { 1290 if (prctl(PR_SET_SECUREBITS, (opts.namespace & CLONE_NEWUSER)?0: 1291 SECBIT_KEEP_CAPS_LOCKED|SECBIT_NO_SETUID_FIXUP_LOCKED|SECBIT_NOROOT_LOCKED)) { 1292 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 1293 free_and_exit(EXIT_FAILURE); 1294 } 1295 } 1296 1297 /* drop remaining capabilities to end up with specified sets */ 1298 if (applyOCIcapabilities(opts.capset, 0)) 1299 free_and_exit(EXIT_FAILURE); 1300 1301 if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { 1302 ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n"); 1303 free_and_exit(EXIT_FAILURE); 1304 } 1305 1306 char **envp = build_envp(opts.seccomp, opts.envp); 1307 if (!envp) 1308 free_and_exit(EXIT_FAILURE); 1309 
1310 if (opts.cwd && chdir(opts.cwd)) 1311 free_and_exit(EXIT_FAILURE); 1312 1313 if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp)) 1314 free_and_exit(EXIT_FAILURE); 1315 1316 uloop_end(); 1317 free_opts(false); 1318 INFO("exec-ing %s\n", *opts.jail_argv); 1319 if (opts.envp) /* respect PATH if potentially set in ENV */ 1320 execvpe(*opts.jail_argv, opts.jail_argv, envp); 1321 else 1322 execve(*opts.jail_argv, opts.jail_argv, envp); 1323 1324 /* we get there only if execve fails */ 1325 ERROR("failed to execve %s: %m\n", *opts.jail_argv); 1326 exit(EXIT_FAILURE); 1327 } 1328 1329 int ns_open_pid(const char *nstype, const pid_t target_ns) 1330 { 1331 char pid_pid_path[PATH_MAX]; 1332 1333 snprintf(pid_pid_path, sizeof(pid_pid_path), "/proc/%u/ns/%s", target_ns, nstype); 1334 1335 return open(pid_pid_path, O_RDONLY); 1336 } 1337 1338 static int parseOCIenvarray(struct blob_attr *msg, char ***envp) 1339 { 1340 struct blob_attr *cur; 1341 int sz = 0, rem; 1342 1343 blobmsg_for_each_attr(cur, msg, rem) 1344 ++sz; 1345 1346 if (sz > 0) { 1347 *envp = calloc(1 + sz, sizeof(char*)); 1348 if (!(*envp)) 1349 return ENOMEM; 1350 } else { 1351 *envp = NULL; 1352 return 0; 1353 } 1354 1355 sz = 0; 1356 blobmsg_for_each_attr(cur, msg, rem) 1357 (*envp)[sz++] = strdup(blobmsg_get_string(cur)); 1358 1359 if (sz) 1360 (*envp)[sz] = NULL; 1361 1362 return 0; 1363 } 1364 1365 enum { 1366 OCI_ROOT_PATH, 1367 OCI_ROOT_READONLY, 1368 __OCI_ROOT_MAX, 1369 }; 1370 1371 static const struct blobmsg_policy oci_root_policy[] = { 1372 [OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING }, 1373 [OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL }, 1374 }; 1375 1376 static int parseOCIroot(const char *jsonfile, struct blob_attr *msg) 1377 { 1378 char extroot[PATH_MAX] = { 0 }; 1379 struct blob_attr *tb[__OCI_ROOT_MAX]; 1380 char *cur; 1381 char *root_path; 1382 1383 blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1384 1385 if 
(!tb[OCI_ROOT_PATH]) 1386 return ENODATA; 1387 1388 root_path = blobmsg_get_string(tb[OCI_ROOT_PATH]); 1389 1390 /* prepend bundle directory in case of relative paths */ 1391 if (root_path[0] != '/') { 1392 strncpy(extroot, jsonfile, PATH_MAX - 1); 1393 1394 cur = strrchr(extroot, '/'); 1395 1396 if (!cur) 1397 return ENOTDIR; 1398 1399 *(++cur) = '\0'; 1400 } 1401 1402 strncat(extroot, root_path, PATH_MAX - (strlen(extroot) + 1)); 1403 1404 /* follow symbolic link(s) */ 1405 opts.extroot = realpath(extroot, NULL); 1406 if (!opts.extroot) 1407 return errno; 1408 1409 if (tb[OCI_ROOT_READONLY]) 1410 opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]); 1411 1412 return 0; 1413 } 1414 1415 1416 enum { 1417 OCI_HOOK_PATH, 1418 OCI_HOOK_ARGS, 1419 OCI_HOOK_ENV, 1420 OCI_HOOK_TIMEOUT, 1421 __OCI_HOOK_MAX, 1422 }; 1423 1424 static const struct blobmsg_policy oci_hook_policy[] = { 1425 [OCI_HOOK_PATH] = { "path", BLOBMSG_TYPE_STRING }, 1426 [OCI_HOOK_ARGS] = { "args", BLOBMSG_TYPE_ARRAY }, 1427 [OCI_HOOK_ENV] = { "env", BLOBMSG_TYPE_ARRAY }, 1428 [OCI_HOOK_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 }, 1429 }; 1430 1431 1432 static int parseOCIhook(struct hook_execvpe ***hooklist, struct blob_attr *msg) 1433 { 1434 struct blob_attr *tb[__OCI_HOOK_MAX]; 1435 struct blob_attr *cur; 1436 int rem, ret = 0; 1437 int idx = 0; 1438 1439 blobmsg_for_each_attr(cur, msg, rem) 1440 ++idx; 1441 1442 if (!idx) 1443 return 0; 1444 1445 *hooklist = calloc(idx + 1, sizeof(struct hook_execvpe *)); 1446 idx = 0; 1447 1448 if (!(*hooklist)) 1449 return ENOMEM; 1450 1451 blobmsg_for_each_attr(cur, msg, rem) { 1452 blobmsg_parse(oci_hook_policy, __OCI_HOOK_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 1453 1454 if (!tb[OCI_HOOK_PATH]) { 1455 ret = EINVAL; 1456 goto errout; 1457 } 1458 1459 (*hooklist)[idx] = calloc(1, sizeof(struct hook_execvpe)); 1460 if (tb[OCI_HOOK_ARGS]) { 1461 ret = parseOCIenvarray(tb[OCI_HOOK_ARGS], &((*hooklist)[idx]->argv)); 1462 if (ret) 1463 goto errout; 1464 } 
else { 1465 (*hooklist)[idx]->argv = calloc(2, sizeof(char *)); 1466 ((*hooklist)[idx]->argv)[0] = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH])); 1467 ((*hooklist)[idx]->argv)[1] = NULL; 1468 }; 1469 1470 1471 if (tb[OCI_HOOK_ENV]) { 1472 ret = parseOCIenvarray(tb[OCI_HOOK_ENV], &((*hooklist)[idx]->envp)); 1473 if (ret) 1474 goto errout; 1475 } 1476 1477 if (tb[OCI_HOOK_TIMEOUT]) 1478 (*hooklist)[idx]->timeout = blobmsg_get_u32(tb[OCI_HOOK_TIMEOUT]); 1479 1480 (*hooklist)[idx]->file = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH])); 1481 1482 ++idx; 1483 } 1484 1485 (*hooklist)[idx] = NULL; 1486 1487 DEBUG("added %d hooks\n", idx); 1488 1489 return 0; 1490 1491 errout: 1492 free_hooklist(*hooklist); 1493 *hooklist = NULL; 1494 1495 return ret; 1496 }; 1497 1498 1499 enum { 1500 OCI_HOOKS_PRESTART, 1501 OCI_HOOKS_CREATERUNTIME, 1502 OCI_HOOKS_CREATECONTAINER, 1503 OCI_HOOKS_STARTCONTAINER, 1504 OCI_HOOKS_POSTSTART, 1505 OCI_HOOKS_POSTSTOP, 1506 __OCI_HOOKS_MAX, 1507 }; 1508 1509 static const struct blobmsg_policy oci_hooks_policy[] = { 1510 [OCI_HOOKS_PRESTART] = { "prestart", BLOBMSG_TYPE_ARRAY }, 1511 [OCI_HOOKS_CREATERUNTIME] = { "createRuntime", BLOBMSG_TYPE_ARRAY }, 1512 [OCI_HOOKS_CREATECONTAINER] = { "createContainer", BLOBMSG_TYPE_ARRAY }, 1513 [OCI_HOOKS_STARTCONTAINER] = { "startContainer", BLOBMSG_TYPE_ARRAY }, 1514 [OCI_HOOKS_POSTSTART] = { "poststart", BLOBMSG_TYPE_ARRAY }, 1515 [OCI_HOOKS_POSTSTOP] = { "poststop", BLOBMSG_TYPE_ARRAY }, 1516 }; 1517 1518 static int parseOCIhooks(struct blob_attr *msg) 1519 { 1520 struct blob_attr *tb[__OCI_HOOKS_MAX]; 1521 int ret; 1522 1523 blobmsg_parse(oci_hooks_policy, __OCI_HOOKS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1524 1525 if (tb[OCI_HOOKS_PRESTART]) 1526 INFO("warning: ignoring deprecated prestart hook\n"); 1527 1528 if (tb[OCI_HOOKS_CREATERUNTIME]) { 1529 ret = parseOCIhook(&opts.hooks.createRuntime, tb[OCI_HOOKS_CREATERUNTIME]); 1530 if (ret) 1531 return ret; 1532 } 1533 1534 if 
(tb[OCI_HOOKS_CREATECONTAINER]) { 1535 ret = parseOCIhook(&opts.hooks.createContainer, tb[OCI_HOOKS_CREATECONTAINER]); 1536 if (ret) 1537 goto out_createruntime; 1538 } 1539 1540 if (tb[OCI_HOOKS_STARTCONTAINER]) { 1541 ret = parseOCIhook(&opts.hooks.startContainer, tb[OCI_HOOKS_STARTCONTAINER]); 1542 if (ret) 1543 goto out_createcontainer; 1544 } 1545 1546 if (tb[OCI_HOOKS_POSTSTART]) { 1547 ret = parseOCIhook(&opts.hooks.poststart, tb[OCI_HOOKS_POSTSTART]); 1548 if (ret) 1549 goto out_startcontainer; 1550 } 1551 1552 if (tb[OCI_HOOKS_POSTSTOP]) { 1553 ret = parseOCIhook(&opts.hooks.poststop, tb[OCI_HOOKS_POSTSTOP]); 1554 if (ret) 1555 goto out_poststart; 1556 } 1557 1558 return 0; 1559 1560 out_poststart: 1561 free_hooklist(opts.hooks.poststart); 1562 out_startcontainer: 1563 free_hooklist(opts.hooks.startContainer); 1564 out_createcontainer: 1565 free_hooklist(opts.hooks.createContainer); 1566 out_createruntime: 1567 free_hooklist(opts.hooks.createRuntime); 1568 1569 return ret; 1570 }; 1571 1572 1573 enum { 1574 OCI_PROCESS_USER_UID, 1575 OCI_PROCESS_USER_GID, 1576 OCI_PROCESS_USER_UMASK, 1577 OCI_PROCESS_USER_ADDITIONALGIDS, 1578 __OCI_PROCESS_USER_MAX, 1579 }; 1580 1581 static const struct blobmsg_policy oci_process_user_policy[] = { 1582 [OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 }, 1583 [OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 }, 1584 [OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 }, 1585 [OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY }, 1586 }; 1587 1588 static int parseOCIprocessuser(struct blob_attr *msg) { 1589 struct blob_attr *tb[__OCI_PROCESS_USER_MAX]; 1590 struct blob_attr *cur; 1591 int rem; 1592 int has_gid = 0; 1593 1594 blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1595 1596 if (tb[OCI_PROCESS_USER_UID]) 1597 opts.pw_uid = blobmsg_get_u32(tb[OCI_PROCESS_USER_UID]); 1598 1599 if (tb[OCI_PROCESS_USER_GID]) { 1600 opts.pw_gid = 
blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]); 1601 opts.gr_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]); 1602 has_gid = 1; 1603 } 1604 1605 if (tb[OCI_PROCESS_USER_ADDITIONALGIDS]) { 1606 size_t gidcnt = 0; 1607 1608 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) { 1609 ++gidcnt; 1610 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid)) 1611 continue; 1612 } 1613 1614 if (gidcnt) { 1615 opts.additional_gids = calloc(gidcnt + has_gid, sizeof(gid_t)); 1616 gidcnt = 0; 1617 1618 /* always add primary GID to set of GIDs if set */ 1619 if (has_gid) 1620 opts.additional_gids[gidcnt++] = opts.gr_gid; 1621 1622 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) { 1623 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid)) 1624 continue; 1625 opts.additional_gids[gidcnt++] = blobmsg_get_u32(cur); 1626 } 1627 opts.num_additional_gids = gidcnt; 1628 } 1629 DEBUG("read %zu additional groups\n", gidcnt); 1630 } 1631 1632 if (tb[OCI_PROCESS_USER_UMASK]) { 1633 opts.umask = blobmsg_get_u32(tb[OCI_PROCESS_USER_UMASK]); 1634 opts.set_umask = true; 1635 } 1636 1637 return 0; 1638 } 1639 1640 enum { 1641 OCI_PROCESS_RLIMIT_TYPE, 1642 OCI_PROCESS_RLIMIT_SOFT, 1643 OCI_PROCESS_RLIMIT_HARD, 1644 __OCI_PROCESS_RLIMIT_MAX, 1645 }; 1646 1647 static const struct blobmsg_policy oci_process_rlimit_policy[] = { 1648 [OCI_PROCESS_RLIMIT_TYPE] = { "type", BLOBMSG_TYPE_STRING }, 1649 [OCI_PROCESS_RLIMIT_SOFT] = { "soft", BLOBMSG_CAST_INT64 }, 1650 [OCI_PROCESS_RLIMIT_HARD] = { "hard", BLOBMSG_CAST_INT64 }, 1651 }; 1652 1653 /* from manpage GETRLIMIT(2) */ 1654 static const char* const rlimit_names[RLIM_NLIMITS] = { 1655 [RLIMIT_AS] = "AS", 1656 [RLIMIT_CORE] = "CORE", 1657 [RLIMIT_CPU] = "CPU", 1658 [RLIMIT_DATA] = "DATA", 1659 [RLIMIT_FSIZE] = "FSIZE", 1660 [RLIMIT_LOCKS] = "LOCKS", 1661 [RLIMIT_MEMLOCK] = "MEMLOCK", 1662 [RLIMIT_MSGQUEUE] = "MSGQUEUE", 1663 [RLIMIT_NICE] = "NICE", 1664 [RLIMIT_NOFILE] = "NOFILE", 1665 [RLIMIT_NPROC] = "NPROC", 1666 
[RLIMIT_RSS] = "RSS", 1667 [RLIMIT_RTPRIO] = "RTPRIO", 1668 [RLIMIT_RTTIME] = "RTTIME", 1669 [RLIMIT_SIGPENDING] = "SIGPENDING", 1670 [RLIMIT_STACK] = "STACK", 1671 }; 1672 1673 static int resolve_rlimit(char *type) { 1674 unsigned int rltype; 1675 1676 for (rltype = 0; rltype < RLIM_NLIMITS; ++rltype) 1677 if (rlimit_names[rltype] && 1678 !strncmp("RLIMIT_", type, 7) && 1679 !strcmp(rlimit_names[rltype], type + 7)) 1680 return rltype; 1681 1682 return -1; 1683 } 1684 1685 1686 static int parseOCIrlimit(struct blob_attr *msg) 1687 { 1688 struct blob_attr *tb[__OCI_PROCESS_RLIMIT_MAX]; 1689 int limtype = -1; 1690 struct rlimit *curlim; 1691 1692 blobmsg_parse(oci_process_rlimit_policy, __OCI_PROCESS_RLIMIT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1693 1694 if (!tb[OCI_PROCESS_RLIMIT_TYPE] || 1695 !tb[OCI_PROCESS_RLIMIT_SOFT] || 1696 !tb[OCI_PROCESS_RLIMIT_HARD]) 1697 return ENODATA; 1698 1699 limtype = resolve_rlimit(blobmsg_get_string(tb[OCI_PROCESS_RLIMIT_TYPE])); 1700 1701 if (limtype < 0) 1702 return EINVAL; 1703 1704 if (opts.rlimits[limtype]) 1705 return ENOTUNIQ; 1706 1707 curlim = malloc(sizeof(struct rlimit)); 1708 curlim->rlim_cur = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_SOFT]); 1709 curlim->rlim_max = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_HARD]); 1710 1711 opts.rlimits[limtype] = curlim; 1712 1713 return 0; 1714 }; 1715 1716 enum { 1717 OCI_PROCESS_ARGS, 1718 OCI_PROCESS_CAPABILITIES, 1719 OCI_PROCESS_CWD, 1720 OCI_PROCESS_ENV, 1721 OCI_PROCESS_OOMSCOREADJ, 1722 OCI_PROCESS_NONEWPRIVILEGES, 1723 OCI_PROCESS_RLIMITS, 1724 OCI_PROCESS_TERMINAL, 1725 OCI_PROCESS_USER, 1726 __OCI_PROCESS_MAX, 1727 }; 1728 1729 static const struct blobmsg_policy oci_process_policy[] = { 1730 [OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY }, 1731 [OCI_PROCESS_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_TABLE }, 1732 [OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING }, 1733 [OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY }, 1734 [OCI_PROCESS_OOMSCOREADJ] = { 
"oomScoreAdj", BLOBMSG_TYPE_INT32 }, 1735 [OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL }, 1736 [OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY }, 1737 [OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL }, 1738 [OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE }, 1739 }; 1740 1741 1742 static int parseOCIprocess(struct blob_attr *msg) 1743 { 1744 struct blob_attr *tb[__OCI_PROCESS_MAX], *cur; 1745 int rem, res; 1746 1747 blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1748 1749 if (!tb[OCI_PROCESS_ARGS]) 1750 return ENOENT; 1751 1752 res = parseOCIenvarray(tb[OCI_PROCESS_ARGS], &opts.jail_argv); 1753 if (res) 1754 return res; 1755 1756 if (tb[OCI_PROCESS_TERMINAL]) 1757 opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]); 1758 1759 if (tb[OCI_PROCESS_NONEWPRIVILEGES]) 1760 opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]); 1761 1762 if (tb[OCI_PROCESS_CWD]) 1763 opts.cwd = strdup(blobmsg_get_string(tb[OCI_PROCESS_CWD])); 1764 1765 if (tb[OCI_PROCESS_ENV]) { 1766 res = parseOCIenvarray(tb[OCI_PROCESS_ENV], &opts.envp); 1767 if (res) 1768 return res; 1769 } 1770 1771 if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER]))) 1772 return res; 1773 1774 if (tb[OCI_PROCESS_CAPABILITIES] && 1775 (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES]))) 1776 return res; 1777 1778 if (tb[OCI_PROCESS_RLIMITS]) { 1779 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_RLIMITS], rem) { 1780 res = parseOCIrlimit(cur); 1781 if (res) 1782 return res; 1783 } 1784 } 1785 1786 if (tb[OCI_PROCESS_OOMSCOREADJ]) { 1787 opts.oom_score_adj = blobmsg_get_u32(tb[OCI_PROCESS_OOMSCOREADJ]); 1788 opts.set_oom_score_adj = true; 1789 } 1790 1791 return 0; 1792 } 1793 1794 enum { 1795 OCI_LINUX_NAMESPACE_TYPE, 1796 OCI_LINUX_NAMESPACE_PATH, 1797 __OCI_LINUX_NAMESPACE_MAX, 1798 }; 1799 1800 static const struct blobmsg_policy oci_linux_namespace_policy[] = { 
1801 [OCI_LINUX_NAMESPACE_TYPE] = { "type", BLOBMSG_TYPE_STRING }, 1802 [OCI_LINUX_NAMESPACE_PATH] = { "path", BLOBMSG_TYPE_STRING }, 1803 }; 1804 1805 static int resolve_nstype(char *type) { 1806 if (!strcmp("pid", type)) 1807 return CLONE_NEWPID; 1808 else if (!strcmp("network", type)) 1809 return CLONE_NEWNET; 1810 else if (!strcmp("net", type)) 1811 return CLONE_NEWNET; 1812 else if (!strcmp("mount", type)) 1813 return CLONE_NEWNS; 1814 else if (!strcmp("ipc", type)) 1815 return CLONE_NEWIPC; 1816 else if (!strcmp("uts", type)) 1817 return CLONE_NEWUTS; 1818 else if (!strcmp("user", type)) 1819 return CLONE_NEWUSER; 1820 else if (!strcmp("cgroup", type)) 1821 return CLONE_NEWCGROUP; 1822 #ifdef CLONE_NEWTIME 1823 else if (!strcmp("time", type)) 1824 return CLONE_NEWTIME; 1825 #endif 1826 else 1827 return 0; 1828 } 1829 1830 static int parseOCIlinuxns(struct blob_attr *msg) 1831 { 1832 struct blob_attr *tb[__OCI_LINUX_NAMESPACE_MAX]; 1833 int nstype; 1834 int *setns; 1835 int fd; 1836 1837 blobmsg_parse(oci_linux_namespace_policy, __OCI_LINUX_NAMESPACE_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1838 1839 if (!tb[OCI_LINUX_NAMESPACE_TYPE]) 1840 return EINVAL; 1841 1842 nstype = resolve_nstype(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE])); 1843 if (!nstype) 1844 return EINVAL; 1845 1846 if (opts.namespace & nstype) 1847 return ENOTUNIQ; 1848 1849 setns = get_namespace_fd(nstype); 1850 1851 if (!setns) 1852 return EFAULT; 1853 1854 if (*setns != -1) 1855 return ENOTUNIQ; 1856 1857 if (tb[OCI_LINUX_NAMESPACE_PATH]) { 1858 DEBUG("opening existing %s namespace from path %s\n", 1859 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]), 1860 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH])); 1861 1862 fd = open(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]), O_RDONLY); 1863 if (fd < 0) 1864 return errno?:ESTALE; 1865 1866 if (ioctl(fd, NS_GET_NSTYPE) != nstype) { 1867 close(fd); 1868 return EINVAL; 1869 } 1870 1871 DEBUG("opened existing %s namespace got 
filehandler %u\n", 1872 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]), 1873 fd); 1874 1875 *setns = fd; 1876 } else { 1877 opts.namespace |= nstype; 1878 } 1879 1880 return 0; 1881 } 1882 1883 /* 1884 * join namespace of existing PID 1885 * The string argument is the reference PID followed by ':' and a 1886 * ',' separated list of namespaces to to join. 1887 */ 1888 static int jail_join_ns(char *arg) 1889 { 1890 pid_t pid; 1891 int fd; 1892 int nstype; 1893 char *tmp, *etmp, *nspath; 1894 int *setns; 1895 1896 tmp = strchr(arg, ':'); 1897 if (!tmp) 1898 return EINVAL; 1899 1900 *tmp = '\0'; 1901 pid = atoi(arg); 1902 1903 do { 1904 ++tmp; 1905 etmp = strchr(tmp, ','); 1906 if (etmp) 1907 *etmp = '\0'; 1908 1909 nstype = resolve_nstype(tmp); 1910 if (!nstype) 1911 return EINVAL; 1912 1913 if (opts.namespace & nstype) 1914 return ENOTUNIQ; 1915 1916 setns = get_namespace_fd(nstype); 1917 1918 if (!setns) 1919 return EFAULT; 1920 1921 if (*setns != -1) 1922 return ENOTUNIQ; 1923 1924 if (asprintf(&nspath, "/proc/%d/ns/%s", pid, tmp) < 0) 1925 return ENOMEM; 1926 1927 fd = open(nspath, O_RDONLY); 1928 free(nspath); 1929 1930 if (fd < 0) 1931 return errno?:ESTALE; 1932 1933 *setns = fd; 1934 1935 if (etmp) 1936 tmp = etmp; 1937 else 1938 tmp = NULL; 1939 } while (tmp); 1940 1941 return 0; 1942 } 1943 1944 static void get_jail_root_user(bool is_gidmap, uint32_t container_id, uint32_t host_id, uint32_t size) 1945 { 1946 if (container_id == 0 && size >= 1) 1947 if (!is_gidmap) 1948 opts.root_map_uid = host_id; 1949 } 1950 1951 enum { 1952 OCI_LINUX_UIDGIDMAP_CONTAINERID, 1953 OCI_LINUX_UIDGIDMAP_HOSTID, 1954 OCI_LINUX_UIDGIDMAP_SIZE, 1955 __OCI_LINUX_UIDGIDMAP_MAX, 1956 }; 1957 1958 static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = { 1959 [OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", BLOBMSG_TYPE_INT32 }, 1960 [OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 }, 1961 [OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 }, 1962 }; 
1963 1964 static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap) 1965 { 1966 struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX]; 1967 struct blob_attr *cur; 1968 int rem; 1969 char *map; 1970 size_t len, pos, totallen = 0; 1971 1972 blobmsg_for_each_attr(cur, msg, rem) { 1973 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 1974 1975 if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] || 1976 !tb[OCI_LINUX_UIDGIDMAP_HOSTID] || 1977 !tb[OCI_LINUX_UIDGIDMAP_SIZE]) 1978 return EINVAL; 1979 1980 /* count length */ 1981 totallen += snprintf(NULL, 0, "%d %d %d\n", 1982 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 1983 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 1984 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 1985 } 1986 1987 /* allocate combined mapping string */ 1988 map = malloc(totallen + 1); 1989 if (!map) 1990 return ENOMEM; 1991 1992 pos = 0; 1993 blobmsg_for_each_attr(cur, msg, rem) { 1994 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 1995 1996 get_jail_root_user(is_gidmap, blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 1997 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 1998 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 1999 2000 /* write mapping line into pre-allocated string */ 2001 len = snprintf(&map[pos], totallen + 1, "%d %d %d\n", 2002 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 2003 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 2004 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 2005 pos += len; 2006 totallen -= len; 2007 } 2008 2009 assert(totallen == 0); 2010 2011 if (is_gidmap) 2012 opts.gidmap = map; 2013 else 2014 opts.uidmap = map; 2015 2016 return 0; 2017 } 2018 2019 enum { 2020 OCI_DEVICES_TYPE, 2021 OCI_DEVICES_PATH, 2022 OCI_DEVICES_MAJOR, 2023 OCI_DEVICES_MINOR, 2024 OCI_DEVICES_FILEMODE, 2025 OCI_DEVICES_UID, 2026 OCI_DEVICES_GID, 2027 __OCI_DEVICES_MAX, 2028 }; 2029 2030 
static const struct blobmsg_policy oci_devices_policy[] = { 2031 [OCI_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING }, 2032 [OCI_DEVICES_PATH] = { "path", BLOBMSG_TYPE_STRING }, 2033 [OCI_DEVICES_MAJOR] = { "major", BLOBMSG_TYPE_INT32 }, 2034 [OCI_DEVICES_MINOR] = { "minor", BLOBMSG_TYPE_INT32 }, 2035 [OCI_DEVICES_FILEMODE] = { "fileMode", BLOBMSG_TYPE_INT32 }, 2036 [OCI_DEVICES_UID] = { "uid", BLOBMSG_TYPE_INT32 }, 2037 [OCI_DEVICES_GID] = { "uid", BLOBMSG_TYPE_INT32 }, 2038 }; 2039 2040 static mode_t resolve_devtype(char *tstr) 2041 { 2042 if (!strcmp("c", tstr) || 2043 !strcmp("u", tstr)) 2044 return S_IFCHR; 2045 else if (!strcmp("b", tstr)) 2046 return S_IFBLK; 2047 else if (!strcmp("p", tstr)) 2048 return S_IFIFO; 2049 else 2050 return 0; 2051 } 2052 2053 static int parseOCIdevices(struct blob_attr *msg) 2054 { 2055 struct blob_attr *tb[__OCI_DEVICES_MAX]; 2056 struct blob_attr *cur; 2057 int rem; 2058 size_t cnt = 0; 2059 struct mknod_args *tmp; 2060 2061 blobmsg_for_each_attr(cur, msg, rem) 2062 ++cnt; 2063 2064 opts.devices = calloc(cnt + 1, sizeof(struct mknod_args *)); 2065 2066 cnt = 0; 2067 blobmsg_for_each_attr(cur, msg, rem) { 2068 blobmsg_parse(oci_devices_policy, __OCI_DEVICES_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 2069 if (!tb[OCI_DEVICES_TYPE] || 2070 !tb[OCI_DEVICES_PATH]) 2071 return ENODATA; 2072 2073 tmp = calloc(1, sizeof(struct mknod_args)); 2074 if (!tmp) 2075 return ENOMEM; 2076 2077 tmp->mode = resolve_devtype(blobmsg_get_string(tb[OCI_DEVICES_TYPE])); 2078 if (!tmp->mode) { 2079 free(tmp); 2080 return EINVAL; 2081 } 2082 2083 if (tmp->mode != S_IFIFO) { 2084 if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR]) { 2085 free(tmp); 2086 return ENODATA; 2087 } 2088 2089 tmp->dev = makedev(blobmsg_get_u32(tb[OCI_DEVICES_MAJOR]), 2090 blobmsg_get_u32(tb[OCI_DEVICES_MINOR])); 2091 } 2092 2093 if (tb[OCI_DEVICES_FILEMODE]) { 2094 if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE])) { 2095 free(tmp); 2096 return 
EINVAL; 2097 } 2098 2099 tmp->mode |= blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]); 2100 } else { 2101 tmp->mode |= (S_IRUSR|S_IWUSR); /* 0600 */ 2102 } 2103 2104 tmp->path = strdup(blobmsg_get_string(tb[OCI_DEVICES_PATH])); 2105 2106 if (tb[OCI_DEVICES_UID]) 2107 tmp->uid = blobmsg_get_u32(tb[OCI_DEVICES_UID]); 2108 else 2109 tmp->uid = -1; 2110 2111 if (tb[OCI_DEVICES_GID]) 2112 tmp->gid = blobmsg_get_u32(tb[OCI_DEVICES_GID]); 2113 else 2114 tmp->gid = -1; 2115 2116 DEBUG("read device %s (%s)\n", blobmsg_get_string(tb[OCI_DEVICES_PATH]), blobmsg_get_string(tb[OCI_DEVICES_TYPE])); 2117 opts.devices[cnt++] = tmp; 2118 } 2119 2120 opts.devices[cnt] = NULL; 2121 2122 return 0; 2123 } 2124 2125 static int parseOCIsysctl(struct blob_attr *msg) 2126 { 2127 struct blob_attr *cur; 2128 int rem; 2129 char *tmp, *tc; 2130 size_t cnt = 0; 2131 2132 blobmsg_for_each_attr(cur, msg, rem) { 2133 if (!blobmsg_name(cur) || !blobmsg_get_string(cur)) 2134 return EINVAL; 2135 2136 ++cnt; 2137 } 2138 2139 if (!cnt) 2140 return 0; 2141 2142 opts.sysctl = calloc(cnt + 1, sizeof(struct sysctl_val *)); 2143 if (!opts.sysctl) 2144 return ENOMEM; 2145 2146 cnt = 0; 2147 blobmsg_for_each_attr(cur, msg, rem) { 2148 opts.sysctl[cnt] = malloc(sizeof(struct sysctl_val)); 2149 if (!opts.sysctl[cnt]) 2150 return ENOMEM; 2151 2152 /* replace '.' 
with '/' in entry name */ 2153 tc = tmp = strdup(blobmsg_name(cur)); 2154 while ((tc = strchr(tc, '.'))) 2155 *tc = '/'; 2156 2157 opts.sysctl[cnt]->value = strdup(blobmsg_get_string(cur)); 2158 opts.sysctl[cnt]->entry = tmp; 2159 2160 ++cnt; 2161 } 2162 2163 opts.sysctl[cnt] = NULL; 2164 2165 return 0; 2166 } 2167 2168 2169 enum { 2170 OCI_LINUX_CGROUPSPATH, 2171 OCI_LINUX_RESOURCES, 2172 OCI_LINUX_SECCOMP, 2173 OCI_LINUX_SYSCTL, 2174 OCI_LINUX_NAMESPACES, 2175 OCI_LINUX_DEVICES, 2176 OCI_LINUX_UIDMAPPINGS, 2177 OCI_LINUX_GIDMAPPINGS, 2178 OCI_LINUX_MASKEDPATHS, 2179 OCI_LINUX_READONLYPATHS, 2180 OCI_LINUX_ROOTFSPROPAGATION, 2181 __OCI_LINUX_MAX, 2182 }; 2183 2184 static const struct blobmsg_policy oci_linux_policy[] = { 2185 [OCI_LINUX_CGROUPSPATH] = { "cgroupsPath", BLOBMSG_TYPE_STRING }, 2186 [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE }, 2187 [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE }, 2188 [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE }, 2189 [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY }, 2190 [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY }, 2191 [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY }, 2192 [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY }, 2193 [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY }, 2194 [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY }, 2195 [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING }, 2196 }; 2197 2198 static int parseOCIlinux(struct blob_attr *msg) 2199 { 2200 struct blob_attr *tb[__OCI_LINUX_MAX]; 2201 struct blob_attr *cur; 2202 int rem; 2203 int res = 0; 2204 char *cgpath; 2205 char cgfullpath[256] = "/sys/fs/cgroup"; 2206 2207 blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 2208 2209 if (tb[OCI_LINUX_NAMESPACES]) { 2210 blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) { 2211 res = parseOCIlinuxns(cur); 2212 if (res) 2213 return 
res; 2214 } 2215 } 2216 2217 if (tb[OCI_LINUX_UIDMAPPINGS]) { 2218 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 0); 2219 if (res) 2220 return res; 2221 } 2222 2223 if (tb[OCI_LINUX_GIDMAPPINGS]) { 2224 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1); 2225 if (res) 2226 return res; 2227 } 2228 2229 if (tb[OCI_LINUX_READONLYPATHS]) { 2230 blobmsg_for_each_attr(cur, tb[OCI_LINUX_READONLYPATHS], rem) { 2231 res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, 0, NULL, 0); 2232 if (res) 2233 return res; 2234 } 2235 } 2236 2237 if (tb[OCI_LINUX_MASKEDPATHS]) { 2238 blobmsg_for_each_attr(cur, tb[OCI_LINUX_MASKEDPATHS], rem) { 2239 res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, 0, NULL, 0); 2240 if (res) 2241 return res; 2242 } 2243 } 2244 2245 if (tb[OCI_LINUX_SYSCTL]) { 2246 res = parseOCIsysctl(tb[OCI_LINUX_SYSCTL]); 2247 if (res) 2248 return res; 2249 } 2250 2251 if (tb[OCI_LINUX_SECCOMP]) { 2252 opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]); 2253 if (!opts.ociseccomp) 2254 return EINVAL; 2255 } 2256 2257 if (tb[OCI_LINUX_DEVICES]) { 2258 res = parseOCIdevices(tb[OCI_LINUX_DEVICES]); 2259 if (res) 2260 return res; 2261 } 2262 2263 if (tb[OCI_LINUX_CGROUPSPATH]) { 2264 cgpath = blobmsg_get_string(tb[OCI_LINUX_CGROUPSPATH]); 2265 if (cgpath[0] == '/') { 2266 if (strlen(cgpath) + 1 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2267 return E2BIG; 2268 2269 strcat(cgfullpath, cgpath); 2270 } else { 2271 strcat(cgfullpath, "/containers/"); 2272 if (strlen(opts.name) + strlen(cgpath) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2273 return E2BIG; 2274 2275 strcat(cgfullpath, opts.name); /* should be container name rather than jail name */ 2276 strcat(cgfullpath, "/"); 2277 strcat(cgfullpath, cgpath); 2278 } 2279 } else { 2280 strcat(cgfullpath, "/containers/"); 2281 if (2 * strlen(opts.name) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2282 return E2BIG; 2283 2284 strcat(cgfullpath, 
opts.name); /* should be container name rather than jail name */ 2285 strcat(cgfullpath, "/"); 2286 strcat(cgfullpath, opts.name); /* should be container instance name rather than jail name */ 2287 } 2288 2289 cgroups_init(cgfullpath); 2290 2291 if (tb[OCI_LINUX_RESOURCES]) { 2292 res = parseOCIlinuxcgroups(tb[OCI_LINUX_RESOURCES]); 2293 if (res) 2294 return res; 2295 } 2296 2297 return 0; 2298 } 2299 2300 enum { 2301 OCI_VERSION, 2302 OCI_HOSTNAME, 2303 OCI_PROCESS, 2304 OCI_ROOT, 2305 OCI_MOUNTS, 2306 OCI_HOOKS, 2307 OCI_LINUX, 2308 OCI_ANNOTATIONS, 2309 __OCI_MAX, 2310 }; 2311 2312 static const struct blobmsg_policy oci_policy[] = { 2313 [OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING }, 2314 [OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING }, 2315 [OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE }, 2316 [OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE }, 2317 [OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY }, 2318 [OCI_HOOKS] = { "hooks", BLOBMSG_TYPE_TABLE }, 2319 [OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE }, 2320 [OCI_ANNOTATIONS] = { "annotations", BLOBMSG_TYPE_TABLE }, 2321 }; 2322 2323 static int parseOCI(const char *jsonfile) 2324 { 2325 struct blob_attr *tb[__OCI_MAX]; 2326 struct blob_attr *cur; 2327 int rem; 2328 int res; 2329 2330 blob_buf_init(&ocibuf, 0); 2331 2332 if (!blobmsg_add_json_from_file(&ocibuf, jsonfile)) { 2333 res=ENOENT; 2334 goto errout; 2335 } 2336 2337 blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head)); 2338 2339 if (!tb[OCI_VERSION]) { 2340 res=ENOMSG; 2341 goto errout; 2342 } 2343 2344 if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3)) { 2345 ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION])); 2346 res=ENOTSUP; 2347 goto errout; 2348 } 2349 2350 if (tb[OCI_HOSTNAME]) 2351 opts.hostname = strdup(blobmsg_get_string(tb[OCI_HOSTNAME])); 2352 2353 if (!tb[OCI_PROCESS]) { 2354 res=ENODATA; 2355 goto errout; 2356 } 2357 2358 if ((res = parseOCIprocess(tb[OCI_PROCESS]))) 
2359 goto errout; 2360 2361 if (!tb[OCI_ROOT]) { 2362 res=ENODATA; 2363 goto errout; 2364 } 2365 if ((res = parseOCIroot(jsonfile, tb[OCI_ROOT]))) 2366 goto errout; 2367 2368 if (!tb[OCI_MOUNTS]) { 2369 res=ENODATA; 2370 goto errout; 2371 } 2372 2373 blobmsg_for_each_attr(cur, tb[OCI_MOUNTS], rem) 2374 if ((res = parseOCImount(cur))) 2375 goto errout; 2376 2377 if (tb[OCI_LINUX] && (res = parseOCIlinux(tb[OCI_LINUX]))) 2378 goto errout; 2379 2380 if (tb[OCI_HOOKS] && (res = parseOCIhooks(tb[OCI_HOOKS]))) 2381 goto errout; 2382 2383 if (tb[OCI_ANNOTATIONS]) 2384 opts.annotations = blob_memdup(tb[OCI_ANNOTATIONS]); 2385 2386 errout: 2387 blob_buf_free(&ocibuf); 2388 2389 return res; 2390 } 2391 2392 static int set_oom_score_adj(void) 2393 { 2394 int f; 2395 char fname[32]; 2396 2397 if (!opts.set_oom_score_adj) 2398 return 0; 2399 2400 snprintf(fname, sizeof(fname), "/proc/%u/oom_score_adj", jail_process.pid); 2401 f = open(fname, O_WRONLY | O_TRUNC); 2402 if (f < 0) 2403 return errno; 2404 2405 dprintf(f, "%d", opts.oom_score_adj); 2406 close(f); 2407 2408 return 0; 2409 } 2410 2411 2412 enum { 2413 OCI_STATE_CREATING, 2414 OCI_STATE_CREATED, 2415 OCI_STATE_RUNNING, 2416 OCI_STATE_STOPPED, 2417 }; 2418 2419 static int jail_oci_state = OCI_STATE_CREATED; 2420 static void pipe_send_start_container(struct uloop_timeout *t); 2421 static struct uloop_timeout start_container_timeout = { 2422 .cb = pipe_send_start_container, 2423 }; 2424 2425 static int handle_start(struct ubus_context *ctx, struct ubus_object *obj, 2426 struct ubus_request_data *req, const char *method, 2427 struct blob_attr *msg) 2428 { 2429 if (jail_oci_state != OCI_STATE_CREATED) 2430 return UBUS_STATUS_INVALID_ARGUMENT; 2431 2432 uloop_timeout_add(&start_container_timeout); 2433 2434 return UBUS_STATUS_OK; 2435 } 2436 2437 static struct blob_buf bb; 2438 static int handle_state(struct ubus_context *ctx, struct ubus_object *obj, 2439 struct ubus_request_data *req, const char *method, 2440 struct 
blob_attr *msg) 2441 { 2442 char *statusstr; 2443 2444 switch (jail_oci_state) { 2445 case OCI_STATE_CREATING: 2446 statusstr = "creating"; 2447 break; 2448 case OCI_STATE_CREATED: 2449 statusstr = "created"; 2450 break; 2451 case OCI_STATE_RUNNING: 2452 statusstr = "running"; 2453 break; 2454 case OCI_STATE_STOPPED: 2455 statusstr = "stopped"; 2456 break; 2457 default: 2458 statusstr = "unknown"; 2459 } 2460 2461 blob_buf_init(&bb, 0); 2462 blobmsg_add_string(&bb, "ociVersion", OCI_VERSION_STRING); 2463 blobmsg_add_string(&bb, "id", opts.name); 2464 blobmsg_add_string(&bb, "status", statusstr); 2465 if (jail_oci_state == OCI_STATE_CREATED || 2466 jail_oci_state == OCI_STATE_RUNNING) 2467 blobmsg_add_u32(&bb, "pid", jail_process.pid); 2468 2469 blobmsg_add_string(&bb, "bundle", opts.ocibundle); 2470 2471 if (opts.annotations) 2472 blobmsg_add_blob(&bb, opts.annotations); 2473 2474 ubus_send_reply(ctx, req, bb.head); 2475 2476 return UBUS_STATUS_OK; 2477 } 2478 2479 enum { 2480 CONTAINER_KILL_ATTR_SIGNAL, 2481 __CONTAINER_KILL_ATTR_MAX, 2482 }; 2483 2484 static const struct blobmsg_policy container_kill_attrs[__CONTAINER_KILL_ATTR_MAX] = { 2485 [CONTAINER_KILL_ATTR_SIGNAL] = { "signal", BLOBMSG_TYPE_INT32 }, 2486 }; 2487 2488 static int 2489 container_handle_kill(struct ubus_context *ctx, struct ubus_object *obj, 2490 struct ubus_request_data *req, const char *method, 2491 struct blob_attr *msg) 2492 { 2493 struct blob_attr *tb[__CONTAINER_KILL_ATTR_MAX], *cur; 2494 int sig = SIGTERM; 2495 2496 blobmsg_parse(container_kill_attrs, __CONTAINER_KILL_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg)); 2497 2498 cur = tb[CONTAINER_KILL_ATTR_SIGNAL]; 2499 if (cur) 2500 sig = blobmsg_get_u32(cur); 2501 2502 if (jail_oci_state == OCI_STATE_CREATING) 2503 return UBUS_STATUS_NOT_FOUND; 2504 2505 if (kill(jail_process.pid, sig) == 0) 2506 return 0; 2507 2508 switch (errno) { 2509 case EINVAL: return UBUS_STATUS_INVALID_ARGUMENT; 2510 case EPERM: return 
UBUS_STATUS_PERMISSION_DENIED; 2511 case ESRCH: return UBUS_STATUS_NOT_FOUND; 2512 } 2513 2514 return UBUS_STATUS_UNKNOWN_ERROR; 2515 } 2516 2517 static int 2518 jail_writepid(pid_t pid) 2519 { 2520 FILE *_pidfile; 2521 2522 if (!opts.pidfile) 2523 return 0; 2524 2525 _pidfile = fopen(opts.pidfile, "w"); 2526 if (_pidfile == NULL) 2527 return errno; 2528 2529 if (fprintf(_pidfile, "%d\n", pid) < 0) { 2530 fclose(_pidfile); 2531 return errno; 2532 } 2533 2534 if (fclose(_pidfile)) 2535 return errno; 2536 2537 return 0; 2538 } 2539 2540 static int checkpath(const char *path) 2541 { 2542 int dirfd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC); 2543 if (dirfd < 0) { 2544 ERROR("path %s open failed %m\n", path); 2545 return -1; 2546 } 2547 close(dirfd); 2548 2549 return 0; 2550 } 2551 2552 static struct ubus_method container_methods[] = { 2553 UBUS_METHOD_NOARG("start", handle_start), 2554 UBUS_METHOD_NOARG("state", handle_state), 2555 UBUS_METHOD("kill", container_handle_kill, container_kill_attrs), 2556 }; 2557 2558 static struct ubus_object_type container_object_type = 2559 UBUS_OBJECT_TYPE("container", container_methods); 2560 2561 static struct ubus_object container_object = { 2562 .type = &container_object_type, 2563 .methods = container_methods, 2564 .n_methods = ARRAY_SIZE(container_methods), 2565 }; 2566 2567 static void post_main(struct uloop_timeout *t); 2568 static struct uloop_timeout post_main_timeout = { 2569 .cb = post_main, 2570 }; 2571 static int netns_fd; 2572 static int pidns_fd; 2573 #ifdef CLONE_NEWTIME 2574 static int timens_fd; 2575 #endif 2576 static void post_create_runtime(void); 2577 2578 struct env_e { 2579 struct list_head list; 2580 char *envarg; 2581 }; 2582 2583 int main(int argc, char **argv) 2584 { 2585 uid_t uid = getuid(); 2586 const char log[] = "/dev/log"; 2587 const char ubus[] = "/var/run/ubus/ubus.sock"; 2588 int ret = EXIT_FAILURE; 2589 int ch; 2590 char *tmp; 2591 struct list_head envl = LIST_HEAD_INIT(envl); 2592 struct 
env_e *enve, *tmpenve; 2593 unsigned short int envn = 0, envc = 0; 2594 2595 if (uid) { 2596 ERROR("not root, aborting: %m\n"); 2597 return EXIT_FAILURE; 2598 } 2599 2600 /* those are filehandlers, so -1 indicates unused */ 2601 opts.setns.pid = -1; 2602 opts.setns.net = -1; 2603 opts.setns.ns = -1; 2604 opts.setns.ipc = -1; 2605 opts.setns.uts = -1; 2606 opts.setns.user = -1; 2607 opts.setns.cgroup = -1; 2608 #ifdef CLONE_NEWTIME 2609 opts.setns.time = -1; 2610 #endif 2611 2612 /* default 5 seconds timeout after SIGTERM before SIGKILL is sent */ 2613 opts.term_timeout = 5; 2614 2615 umask(022); 2616 mount_list_init(); 2617 init_library_search(); 2618 cgroups_prepare(); 2619 exit_from_child = false; 2620 2621 while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) { 2622 switch (ch) { 2623 case 'd': 2624 debug = atoi(optarg); 2625 break; 2626 case 'e': 2627 enve = calloc(1, sizeof(*enve)); 2628 enve->envarg = optarg; 2629 list_add_tail(&enve->list, &envl); 2630 break; 2631 case 'p': 2632 opts.namespace |= CLONE_NEWNS; 2633 opts.procfs = 1; 2634 break; 2635 case 'o': 2636 opts.namespace |= CLONE_NEWNS; 2637 opts.ronly = 1; 2638 break; 2639 case 'f': 2640 opts.namespace |= CLONE_NEWUSER; 2641 break; 2642 case 'F': 2643 opts.namespace |= CLONE_NEWCGROUP; 2644 break; 2645 case 'R': 2646 opts.extroot = realpath(optarg, NULL); 2647 break; 2648 case 's': 2649 opts.namespace |= CLONE_NEWNS; 2650 opts.sysfs = 1; 2651 break; 2652 case 'S': 2653 opts.seccomp = optarg; 2654 add_mount_bind(optarg, 1, -1); 2655 break; 2656 case 'C': 2657 opts.capabilities = optarg; 2658 break; 2659 case 'c': 2660 opts.no_new_privs = 1; 2661 break; 2662 case 'n': 2663 opts.name = optarg; 2664 break; 2665 case 'N': 2666 opts.namespace |= CLONE_NEWNET; 2667 break; 2668 case 'h': 2669 opts.namespace |= CLONE_NEWUTS; 2670 opts.hostname = strdup(optarg); 2671 break; 2672 case 'j': 2673 jail_join_ns(optarg); 2674 break; 2675 case 'r': 2676 opts.namespace |= CLONE_NEWNS; 2677 tmp = strchr(optarg, ':'); 2678 
if (tmp) { 2679 *(tmp++) = '\0'; 2680 add_2paths_and_deps(optarg, tmp, 1, 0, 0); 2681 } else { 2682 add_path_and_deps(optarg, 1, 0, 0); 2683 } 2684 break; 2685 case 'w': 2686 opts.namespace |= CLONE_NEWNS; 2687 tmp = strchr(optarg, ':'); 2688 if (tmp) { 2689 *(tmp++) = '\0'; 2690 add_2paths_and_deps(optarg, tmp, 0, 0, 0); 2691 } else { 2692 add_path_and_deps(optarg, 0, 0, 0); 2693 } 2694 break; 2695 case 'u': 2696 opts.namespace |= CLONE_NEWNS; 2697 add_mount_bind(ubus, 0, -1); 2698 break; 2699 case 'l': 2700 opts.namespace |= CLONE_NEWNS; 2701 add_mount_bind(log, 0, -1); 2702 break; 2703 case 'U': 2704 opts.user = optarg; 2705 break; 2706 case 'G': 2707 opts.group = optarg; 2708 break; 2709 case 'O': 2710 opts.overlaydir = realpath(optarg, NULL); 2711 break; 2712 case 't': 2713 opts.term_timeout = atoi(optarg); 2714 break; 2715 case 'T': 2716 opts.tmpoverlaysize = optarg; 2717 break; 2718 case 'E': 2719 opts.require_jail = 1; 2720 break; 2721 case 'y': 2722 opts.console = 1; 2723 break; 2724 case 'J': 2725 opts.ocibundle = optarg; 2726 break; 2727 case 'i': 2728 opts.immediately = true; 2729 break; 2730 case 'P': 2731 opts.pidfile = optarg; 2732 break; 2733 } 2734 } 2735 2736 if (opts.namespace && !opts.ocibundle) 2737 opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID; 2738 2739 /* 2740 * env import from cmdline is not available for OCI containers 2741 */ 2742 if (opts.ocibundle && !list_empty(&envl)) { 2743 ret=-ENOTSUP; 2744 goto errout; 2745 } 2746 2747 /* 2748 * prepare list of env variables to import for slim containers 2749 */ 2750 if (!list_empty(&envl)) { 2751 list_for_each_entry(enve, &envl, list) 2752 ++envn; 2753 2754 opts.envp = calloc(1 + envn, sizeof(char*)); 2755 list_for_each_entry_safe(enve, tmpenve, &envl, list) { 2756 tmp = getenv(enve->envarg); 2757 if (tmp) { 2758 ret = asprintf(&opts.envp[envc++], "%s=%s", enve->envarg, tmp); 2759 if (ret < 0) { 2760 ERROR("filed to handle envargs %s\n", tmp); 2761 free(enve); 2762 goto errout; 2763 } 2764 } 2765 
2766 list_del(&enve->list); 2767 free(enve); 2768 } 2769 2770 opts.envp[envc] = NULL; 2771 } 2772 2773 /* 2774 * uid in parent user namespace representing root user in new 2775 * user namespace, defaults to nobody unless specified in uidMappings 2776 */ 2777 opts.root_map_uid = 65534; 2778 2779 if (opts.capabilities && parseOCIcapabilities_from_file(&opts.capset, opts.capabilities)) { 2780 ERROR("failed to read capabilities from file %s\n", opts.capabilities); 2781 ret=-1; 2782 goto errout; 2783 } 2784 2785 if (opts.ocibundle) { 2786 char *jsonfile; 2787 int ocires; 2788 2789 if (!opts.name) { 2790 ERROR("OCI bundle needs a named jail\n"); 2791 ret=-1; 2792 goto errout; 2793 } 2794 if (asprintf(&jsonfile, "%s/config.json", opts.ocibundle) < 0) { 2795 ret=-ENOMEM; 2796 goto errout; 2797 } 2798 ocires = parseOCI(jsonfile); 2799 free(jsonfile); 2800 if (ocires) { 2801 ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires); 2802 ret=ocires; 2803 goto errout; 2804 } 2805 } 2806 2807 if (opts.namespace & CLONE_NEWNET) { 2808 if (!opts.name) { 2809 ERROR("netns needs a named jail\n"); 2810 ret=-1; 2811 goto errout; 2812 } 2813 } 2814 2815 2816 if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) { 2817 ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize); 2818 ret=-1; 2819 goto errout; 2820 } 2821 2822 if (opts.extroot && checkpath(opts.extroot)) { 2823 ERROR("invalid rootfs path '%s'", opts.extroot); 2824 ret=-1; 2825 goto errout; 2826 } 2827 2828 if (opts.overlaydir && checkpath(opts.overlaydir)) { 2829 ERROR("invalid rootfs overlay path '%s'", opts.overlaydir); 2830 ret=-1; 2831 goto errout; 2832 } 2833 2834 /* no <binary> param found */ 2835 if (!opts.ocibundle && (argc - optind < 1)) { 2836 usage(); 2837 ret=EXIT_FAILURE; 2838 goto errout; 2839 } 2840 if (!(opts.ocibundle||opts.namespace||opts.capabilities||opts.seccomp|| 2841 (opts.setns.net != -1) || 2842 (opts.setns.ns != -1) || 2843 (opts.setns.ipc != -1) || 2844 
(opts.setns.uts != -1) || 2845 (opts.setns.user != -1) || 2846 (opts.setns.cgroup != -1))) { 2847 ERROR("Not using namespaces, capabilities or seccomp !!!\n\n"); 2848 usage(); 2849 ret=EXIT_FAILURE; 2850 goto errout; 2851 } 2852 DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n", 2853 opts.namespace, 2854 opts.capset.apply, 2855 opts.seccomp != 0 || opts.ociseccomp != 0); 2856 2857 uloop_init(); 2858 signals_init(); 2859 2860 parent_ctx = ubus_connect(NULL); 2861 ubus_add_uloop(parent_ctx); 2862 2863 if (opts.ocibundle) { 2864 char *objname; 2865 if (asprintf(&objname, "container.%s", opts.name) < 0) { 2866 ret=-ENOMEM; 2867 goto errout; 2868 } 2869 2870 container_object.name = objname; 2871 ret = ubus_add_object(parent_ctx, &container_object); 2872 if (ret) { 2873 ERROR("Failed to add object: %s\n", ubus_strerror(ret)); 2874 ret=-1; 2875 goto errout; 2876 } 2877 } 2878 2879 /* deliberately not using 'else' on unrelated conditional branches */ 2880 if (!opts.ocibundle) { 2881 /* allocate NULL-terminated array for argv */ 2882 opts.jail_argv = calloc(1 + argc - optind, sizeof(void *)); 2883 if (!opts.jail_argv) { 2884 ret=EXIT_FAILURE; 2885 goto errout; 2886 } 2887 for (size_t s = optind; s < argc; s++) 2888 opts.jail_argv[s - optind] = strdup(argv[s]); 2889 2890 if (opts.namespace & CLONE_NEWUSER) 2891 get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid); 2892 } 2893 2894 if (!opts.extroot) { 2895 if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) { 2896 ERROR("failed to load dependencies\n"); 2897 ret=-1; 2898 goto errout; 2899 } 2900 } 2901 2902 if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) { 2903 ERROR("failed to load libpreload-seccomp.so\n"); 2904 opts.seccomp = 0; 2905 if (opts.require_jail) { 2906 ret=-1; 2907 goto errout; 2908 } 2909 } 2910 2911 uloop_timeout_add(&post_main_timeout); 2912 uloop_run(); 2913 2914 errout: 2915 if (opts.ocibundle) 2916 cgroups_free(); 2917 2918 
free_opts(true); 2919 2920 return ret; 2921 } 2922 2923 static void post_main(struct uloop_timeout *t) 2924 { 2925 if (apply_rlimits()) { 2926 ERROR("error applying resource limits\n"); 2927 free_and_exit(EXIT_FAILURE); 2928 } 2929 2930 if (opts.name) 2931 prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL); 2932 2933 if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0) 2934 free_and_exit(-1); 2935 2936 if (has_namespaces()) { 2937 if (opts.namespace & CLONE_NEWNS) { 2938 if (!opts.extroot && (opts.user || opts.group)) { 2939 add_mount_bind("/etc/passwd", 1, -1); 2940 add_mount_bind("/etc/group", 1, -1); 2941 } 2942 2943 #if defined(__GLIBC__) 2944 if (!opts.extroot) 2945 add_mount_bind("/etc/nsswitch.conf", 1, -1); 2946 #endif 2947 if (opts.setns.ns == -1) { 2948 if (!(opts.namespace & CLONE_NEWNET)) { 2949 add_mount_bind("/etc/resolv.conf", 1, 0); 2950 } else { 2951 /* new mount namespace to provide /dev/resolv.conf.d */ 2952 char hostdir[PATH_MAX]; 2953 2954 snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name); 2955 mkdir_p(hostdir, 0755); 2956 add_mount(hostdir, "/dev/resolv.conf.d", NULL, 2957 MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, 0, NULL, 0); 2958 } 2959 } 2960 /* default mounts */ 2961 add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "size=1M", -1); 2962 add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0); 2963 2964 if (opts.procfs || opts.ocibundle) { 2965 add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0, NULL, -1); 2966 2967 /* 2968 * hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only 2969 * which cannot be expressed with OCI spec, but happends to be very useful. 2970 * Only apply it if '/proc/sys' is not already listed as mount, maskedPath or 2971 * readonlyPath. 2972 * If not running in a new network namespace, only make /proc/sys read-only. 
2973 * If running in a new network namespace, temporarily stash (ie. mount-bind) 2974 * /proc/sys/net into (totally unrelated, but surely existing) /proc/self/net. 2975 * Then we mount-bind /proc/sys read-only and then mount-move /proc/self/net into 2976 * /proc/sys/net. 2977 * This works because mounts are executed in incrementing strcmp() order and 2978 * /proc/self/net appears there before /proc/sys/net and hence the operation 2979 * succeeds as the bind-mount of /proc/self/net is performed first and then 2980 * move-mount of /proc/sys/net follows because 'e' preceeds 'y' in the ASCII 2981 * table (and in the alphabet). 2982 */ 2983 if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, 0, NULL, -1)) 2984 if (opts.namespace & CLONE_NEWNET) 2985 if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, 0, NULL, -1)) 2986 add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, 0, NULL, -1); 2987 2988 } 2989 if (opts.sysfs || opts.ocibundle) 2990 add_mount("sysfs", "/sys", "sysfs", MS_RELATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0, NULL, -1); 2991 2992 if (opts.ocibundle) 2993 add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, 0, "mode=1777", -1); 2994 2995 } 2996 2997 if (opts.setns.pid != -1) { 2998 pidns_fd = ns_open_pid("pid", getpid()); 2999 setns_open(CLONE_NEWPID); 3000 } else { 3001 pidns_fd = -1; 3002 } 3003 3004 #ifdef CLONE_NEWTIME 3005 if (opts.setns.time != -1) { 3006 timens_fd = ns_open_pid("time", getpid()); 3007 setns_open(CLONE_NEWTIME); 3008 } else { 3009 timens_fd = -1; 3010 } 3011 #endif 3012 3013 if (opts.namespace & CLONE_NEWUSER) { 3014 if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) { 3015 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 3016 free_and_exit(EXIT_FAILURE); 3017 } 3018 if (seteuid(opts.root_map_uid)) { 3019 ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid); 3020 free_and_exit(EXIT_FAILURE); 3021 } 3022 } 3023 3024 jail_process.pid = clone(exec_jail, 
child_stack + STACK_SIZE, SIGCHLD | (opts.namespace & (~CLONE_NEWCGROUP)), NULL); 3025 } else { 3026 jail_process.pid = fork(); 3027 } 3028 3029 if (jail_process.pid > 0) { 3030 /* parent process */ 3031 char sig_buf[1]; 3032 3033 uloop_process_add(&jail_process); 3034 jail_running = 1; 3035 if (seteuid(0)) { 3036 ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid); 3037 free_and_exit(EXIT_FAILURE); 3038 } 3039 3040 prctl(PR_SET_SECUREBITS, 0); 3041 3042 if (pidns_fd != -1) { 3043 setns(pidns_fd, CLONE_NEWPID); 3044 close(pidns_fd); 3045 } 3046 #ifdef CLONE_NEWTIME 3047 if (timens_fd != -1) { 3048 setns(timens_fd, CLONE_NEWTIME); 3049 close(timens_fd); 3050 } 3051 #endif 3052 if (opts.setns.net != -1) 3053 close(opts.setns.net); 3054 if (opts.setns.ns != -1) 3055 close(opts.setns.ns); 3056 if (opts.setns.ipc != -1) 3057 close(opts.setns.ipc); 3058 if (opts.setns.uts != -1) 3059 close(opts.setns.uts); 3060 if (opts.setns.user != -1) 3061 close(opts.setns.user); 3062 if (opts.setns.cgroup != -1) 3063 close(opts.setns.cgroup); 3064 close(pipes[1]); 3065 close(pipes[2]); 3066 if (read(pipes[0], sig_buf, 1) < 1) { 3067 ERROR("can't read from child\n"); 3068 free_and_exit(-1); 3069 } 3070 close(pipes[0]); 3071 set_oom_score_adj(); 3072 3073 if (opts.ocibundle) 3074 cgroups_apply(jail_process.pid); 3075 3076 if (opts.namespace & CLONE_NEWUSER) { 3077 if (write_setgroups(jail_process.pid, true)) { 3078 ERROR("can't write setgroups\n"); 3079 free_and_exit(-1); 3080 } 3081 if (!opts.uidmap) { 3082 bool has_gr = (opts.gr_gid != -1); 3083 if (opts.pw_uid != -1) { 3084 write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid); 3085 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid); 3086 } else { 3087 write_single_uid_gid_map(jail_process.pid, 0, 65534); 3088 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534); 3089 } 3090 } else { 3091 write_uid_gid_map(jail_process.pid, 0, opts.uidmap); 3092 if (opts.gidmap) 3093 
write_uid_gid_map(jail_process.pid, 1, opts.gidmap); 3094 } 3095 } 3096 3097 if (opts.namespace & CLONE_NEWNET) 3098 jail_network_start(parent_ctx, opts.name, jail_process.pid); 3099 3100 if (jail_writepid(jail_process.pid)) { 3101 ERROR("failed to write pidfile: %m\n"); 3102 free_and_exit(-1); 3103 } 3104 } else if (jail_process.pid == 0) { 3105 /* fork child process */ 3106 free_and_exit(exec_jail(NULL)); 3107 } else { 3108 ERROR("failed to clone/fork: %m\n"); 3109 free_and_exit(EXIT_FAILURE); 3110 } 3111 run_hooks(opts.hooks.createRuntime, post_create_runtime); 3112 } 3113 3114 static void post_poststart(void); 3115 static void post_create_runtime(void) 3116 { 3117 char sig_buf[1]; 3118 3119 sig_buf[0] = 'O'; 3120 if (write(pipes[3], sig_buf, 1) < 0) { 3121 ERROR("can't write to child\n"); 3122 free_and_exit(-1); 3123 } 3124 3125 jail_oci_state = OCI_STATE_CREATED; 3126 if (opts.ocibundle && !opts.immediately) 3127 uloop_run(); /* wait for 'start' command via ubus */ 3128 else 3129 pipe_send_start_container(NULL); 3130 } 3131 3132 static void pipe_send_start_container(struct uloop_timeout *t) 3133 { 3134 char sig_buf[1]; 3135 3136 jail_oci_state = OCI_STATE_RUNNING; 3137 sig_buf[0] = '!'; 3138 if (write(pipes[3], sig_buf, 1) < 0) { 3139 ERROR("can't write to child\n"); 3140 free_and_exit(-1); 3141 } 3142 close(pipes[3]); 3143 3144 run_hooks(opts.hooks.poststart, post_poststart); 3145 } 3146 3147 static void post_poststart(void) 3148 { 3149 uloop_run(); /* idle here while jail is running */ 3150 if (jail_running) { 3151 DEBUG("uloop interrupted, killing jail process\n"); 3152 kill(jail_process.pid, SIGTERM); 3153 uloop_timeout_set(&jail_process_timeout, 1000); 3154 uloop_run(); 3155 } 3156 uloop_done(); 3157 poststop(); 3158 } 3159 3160 static void post_poststop(void); 3161 static void poststop(void) { 3162 if (opts.namespace & CLONE_NEWNET) { 3163 setns(netns_fd, CLONE_NEWNET); 3164 jail_network_stop(); 3165 close(netns_fd); 3166 } 3167 
run_hooks(opts.hooks.poststop, post_poststop); 3168 } 3169 3170 static void post_poststop(void) 3171 { 3172 free_opts(true); 3173 if (parent_ctx) 3174 ubus_free(parent_ctx); 3175 3176 exit(jail_return_code); 3177 } 3178
/* This page was automatically generated by LXR 0.3.1. • OpenWrt */