1 /* 2 * Copyright (C) 2015 John Crispin <blogic@openwrt.org> 3 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org> 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU Lesser General Public License version 2.1 7 * as published by the Free Software Foundation 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 */ 14 15 #define _GNU_SOURCE 16 #include <sys/mount.h> 17 #include <sys/prctl.h> 18 #include <sys/wait.h> 19 #include <sys/types.h> 20 #include <sys/time.h> 21 #include <sys/resource.h> 22 #include <sys/stat.h> 23 #include <sys/sysmacros.h> 24 25 /* musl only defined 15 limit types, make sure all 16 are supported */ 26 #ifndef RLIMIT_RTTIME 27 #define RLIMIT_RTTIME 15 28 #undef RLIMIT_NLIMITS 29 #define RLIMIT_NLIMITS 16 30 #undef RLIM_NLIMITS 31 #define RLIM_NLIMITS 16 32 #endif 33 34 #include <assert.h> 35 #include <stdlib.h> 36 #include <unistd.h> 37 #include <errno.h> 38 #include <pwd.h> 39 #include <grp.h> 40 #include <string.h> 41 #include <fcntl.h> 42 #include <sched.h> 43 #include <linux/filter.h> 44 #include <linux/limits.h> 45 #include <linux/nsfs.h> 46 #include <linux/securebits.h> 47 #include <signal.h> 48 #include <inttypes.h> 49 50 #include "capabilities.h" 51 #include "elf.h" 52 #include "fs.h" 53 #include "jail.h" 54 #include "log.h" 55 #include "seccomp-oci.h" 56 #include "cgroups.h" 57 #include "netifd.h" 58 59 #include <libubox/blobmsg.h> 60 #include <libubox/blobmsg_json.h> 61 #include <libubox/list.h> 62 #include <libubox/vlist.h> 63 #include <libubox/uloop.h> 64 #include <libubox/utils.h> 65 #include <libubus.h> 66 67 #ifndef CLONE_NEWCGROUP 68 #define CLONE_NEWCGROUP 0x02000000 69 #endif 70 71 #define STACK_SIZE (1024 * 1024) 72 #define OPT_ARGS 
"cC:d:e:EfFG:h:ij:J:ln:NoO:pP:r:R:sS:uU:w:t:T:y" 73 74 #define OCI_VERSION_STRING "1.0.2" 75 76 struct hook_execvpe { 77 char *file; 78 char **argv; 79 char **envp; 80 int timeout; 81 }; 82 83 struct sysctl_val { 84 char *entry; 85 char *value; 86 }; 87 88 struct mknod_args { 89 char *path; 90 mode_t mode; 91 dev_t dev; 92 uid_t uid; 93 gid_t gid; 94 }; 95 96 static struct { 97 char *name; 98 char *hostname; 99 char **jail_argv; 100 char *cwd; 101 char *seccomp; 102 struct sock_fprog *ociseccomp; 103 char *capabilities; 104 struct jail_capset capset; 105 char *user; 106 char *group; 107 char *extroot; 108 char *overlaydir; 109 char *tmpoverlaysize; 110 char **envp; 111 char *uidmap; 112 char *gidmap; 113 char *pidfile; 114 struct sysctl_val **sysctl; 115 int no_new_privs; 116 int namespace; 117 struct { 118 int pid; 119 int net; 120 int ns; 121 int ipc; 122 int uts; 123 int user; 124 int cgroup; 125 #ifdef CLONE_NEWTIME 126 int time; 127 #endif 128 } setns; 129 int procfs; 130 int ronly; 131 int sysfs; 132 int console; 133 int pw_uid; 134 int pw_gid; 135 int gr_gid; 136 int root_map_uid; 137 gid_t *additional_gids; 138 size_t num_additional_gids; 139 mode_t umask; 140 bool set_umask; 141 int require_jail; 142 struct { 143 struct hook_execvpe **createRuntime; 144 struct hook_execvpe **createContainer; 145 struct hook_execvpe **startContainer; 146 struct hook_execvpe **poststart; 147 struct hook_execvpe **poststop; 148 } hooks; 149 struct rlimit *rlimits[RLIM_NLIMITS]; 150 int oom_score_adj; 151 bool set_oom_score_adj; 152 struct mknod_args **devices; 153 char *ocibundle; 154 bool immediately; 155 struct blob_attr *annotations; 156 int term_timeout; 157 } opts; 158 159 static struct blob_buf ocibuf; 160 161 extern int pivot_root(const char *new_root, const char *put_old); 162 163 int debug = 0; 164 165 static char child_stack[STACK_SIZE]; 166 167 static struct ubus_context *parent_ctx; 168 169 int console_fd; 170 171 172 static inline bool has_namespaces(void) 173 { 
174 return ((opts.setns.pid != -1) || 175 (opts.setns.net != -1) || 176 (opts.setns.ns != -1) || 177 (opts.setns.ipc != -1) || 178 (opts.setns.uts != -1) || 179 (opts.setns.user != -1) || 180 (opts.setns.cgroup != -1) || 181 #ifdef CLONE_NEWTIME 182 (opts.setns.time != -1) || 183 #endif 184 opts.namespace); 185 } 186 187 static void free_oci_envp(char **p) { 188 char **tmp; 189 190 if (p) { 191 tmp = p; 192 while (*tmp) 193 free(*(tmp++)); 194 195 free(p); 196 } 197 } 198 199 static void free_hooklist(struct hook_execvpe **hooklist) 200 { 201 struct hook_execvpe *cur; 202 203 if (!hooklist) 204 return; 205 206 cur = *hooklist; 207 while (cur) { 208 free_oci_envp(cur->argv); 209 free_oci_envp(cur->envp); 210 free(cur->file); 211 free(cur++); 212 } 213 free(hooklist); 214 } 215 216 static void free_sysctl(void) { 217 struct sysctl_val *cur; 218 219 if (!opts.sysctl) 220 return; 221 222 cur = *opts.sysctl; 223 224 while (cur) { 225 free(cur->entry); 226 free(cur->value); 227 free(cur++); 228 } 229 free(opts.sysctl); 230 } 231 232 static void free_devices(void) { 233 struct mknod_args **cur; 234 235 if (!opts.devices) 236 return; 237 238 cur = opts.devices; 239 240 while (*cur) { 241 free((*cur)->path); 242 free(*(cur++)); 243 } 244 free(opts.devices); 245 } 246 247 static void free_rlimits(void) { 248 int type; 249 250 for (type = 0; type < RLIM_NLIMITS; ++type) 251 free(opts.rlimits[type]); 252 } 253 254 static void free_opts(bool parent) { 255 256 free_library_search(); 257 mount_free(); 258 cgroups_free(); 259 260 /* we need to keep argv, envp and seccomp filter in child */ 261 if (parent) { /* parent-only */ 262 if (opts.ociseccomp) { 263 free(opts.ociseccomp->filter); 264 free(opts.ociseccomp); 265 } 266 267 free_oci_envp(opts.jail_argv); 268 free_oci_envp(opts.envp); 269 } 270 271 free_rlimits(); 272 free_sysctl(); 273 free_devices(); 274 free(opts.hostname); 275 free(opts.cwd); 276 free(opts.uidmap); 277 free(opts.gidmap); 278 free(opts.annotations); 279 
free(opts.extroot); 280 free(opts.overlaydir); 281 free_hooklist(opts.hooks.createRuntime); 282 free_hooklist(opts.hooks.createContainer); 283 free_hooklist(opts.hooks.startContainer); 284 free_hooklist(opts.hooks.poststart); 285 free_hooklist(opts.hooks.poststop); 286 } 287 288 static int mount_overlay(char *jail_root, char *overlaydir) { 289 char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf; 290 const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s"; 291 int ret = -1, fd; 292 293 if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0) 294 goto out; 295 296 if (asprintf(&workdir, "%s%s", overlaydir, "/work") < 0) 297 goto upper_printf; 298 299 if (asprintf(&optsstr, mountoptsformat, jail_root, upperdir, workdir) < 0) 300 goto work_printf; 301 302 if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755)) 303 goto opts_printf; 304 305 /* 306 * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root 307 * this is to work-around a bug in overlayfs described in the overlayfs-userns 308 * patch: 309 * 3. modification of a file 'hithere' which is in l but not yet 310 * in u, and which is not owned by T, is not allowed, even if 311 * writes to u are allowed. This may be a bug in overlayfs, 312 * but it is safe behavior. 
313 */ 314 if (asprintf(&upperetc, "%s/etc", upperdir) < 0) 315 goto opts_printf; 316 317 if (mkdir_p(upperetc, 0755)) 318 goto upper_etc_printf; 319 320 if (asprintf(&upperresolvconf, "%s/resolv.conf", upperetc) < 0) 321 goto upper_etc_printf; 322 323 fd = creat(upperresolvconf, 0644); 324 if (fd < 0) { 325 if (errno != EEXIST) 326 ERROR("creat(%s) failed: %m\n", upperresolvconf); 327 } else { 328 close(fd); 329 } 330 DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr); 331 332 if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr)) 333 goto upper_resolvconf_printf; 334 335 ret = 0; 336 337 upper_resolvconf_printf: 338 free(upperresolvconf); 339 upper_etc_printf: 340 free(upperetc); 341 opts_printf: 342 free(optsstr); 343 work_printf: 344 free(workdir); 345 upper_printf: 346 free(upperdir); 347 out: 348 return ret; 349 } 350 351 static void pass_console(int console_fd) 352 { 353 struct ubus_context *child_ctx = ubus_connect(NULL); 354 static struct blob_buf req; 355 uint32_t id; 356 357 if (!child_ctx) 358 return; 359 360 blob_buf_init(&req, 0); 361 blobmsg_add_string(&req, "name", opts.name); 362 363 if (ubus_lookup_id(child_ctx, "container", &id) || 364 ubus_invoke_fd(child_ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd)) 365 INFO("ubus request failed\n"); 366 else 367 close(console_fd); 368 369 blob_buf_free(&req); 370 ubus_free(child_ctx); 371 } 372 373 static int create_dev_console(const char *jail_root) 374 { 375 char *console_fname; 376 char dev_console_path[PATH_MAX]; 377 int slave_console_fd, dev_console_dummy; 378 379 /* Open UNIX/98 virtual console */ 380 console_fd = posix_openpt(O_RDWR | O_NOCTTY); 381 if (console_fd < 0) 382 return -1; 383 384 console_fname = ptsname(console_fd); 385 DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname); 386 if (!console_fname) 387 goto no_console; 388 389 grantpt(console_fd); 390 unlockpt(console_fd); 391 392 /* pass PTY master to procd */ 393 
pass_console(console_fd); 394 395 /* mount-bind PTY slave to /dev/console in jail */ 396 snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root); 397 dev_console_dummy = creat(dev_console_path, 0620); 398 if (dev_console_dummy < 0) 399 goto no_console; 400 401 close(dev_console_dummy); 402 403 if (mount(console_fname, dev_console_path, "bind", MS_BIND, NULL)) 404 goto no_console; 405 406 /* use PTY slave for stdio */ 407 slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */ 408 if (slave_console_fd < 0) 409 goto no_console; 410 411 dup2(slave_console_fd, 0); 412 dup2(slave_console_fd, 1); 413 dup2(slave_console_fd, 2); 414 close(slave_console_fd); 415 416 INFO("using guest console %s\n", console_fname); 417 418 return 0; 419 420 no_console: 421 close(console_fd); 422 return 1; 423 } 424 425 static int hook_running = 0; 426 static int hook_return_code = 0; 427 static struct hook_execvpe **current_hook = NULL; 428 typedef void (*hook_return_handler)(void); 429 static hook_return_handler hook_return_cb = NULL; 430 431 static void hook_process_timeout_cb(struct uloop_timeout *t); 432 static struct uloop_timeout hook_process_timeout = { 433 .cb = hook_process_timeout_cb, 434 }; 435 436 static void run_hooklist(void); 437 static void hook_process_handler(struct uloop_process *c, int ret) 438 { 439 uloop_timeout_cancel(&hook_process_timeout); 440 441 if (WIFEXITED(ret)) { 442 hook_return_code = WEXITSTATUS(ret); 443 if (hook_return_code) 444 ERROR("hook (%d) exited with exit: %d\n", c->pid, hook_return_code); 445 else 446 DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code); 447 448 } else { 449 hook_return_code = WTERMSIG(ret); 450 ERROR("hook (%d) exited with signal: %d\n", c->pid, hook_return_code); 451 } 452 hook_running = 0; 453 ++current_hook; 454 run_hooklist(); 455 } 456 457 static struct uloop_process hook_process = { 458 .cb = hook_process_handler, 459 }; 460 461 static void hook_process_timeout_cb(struct 
uloop_timeout *t) 462 { 463 DEBUG("hook process failed to stop, sending SIGKILL\n"); 464 kill(hook_process.pid, SIGKILL); 465 } 466 467 static void run_hooklist(void) 468 { 469 struct hook_execvpe *hook = *current_hook; 470 struct stat s; 471 472 if (!hook) 473 return hook_return_cb(); 474 475 DEBUG("executing hook %s\n", hook->file); 476 477 if (stat(hook->file, &s)) 478 hook_process_handler(&hook_process, ENOENT); 479 480 if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) 481 hook_process_handler(&hook_process, EPERM); 482 483 hook_running = 1; 484 hook_process.pid = fork(); 485 if (hook_process.pid == 0) { 486 /* child */ 487 execve(hook->file, hook->argv, hook->envp); 488 ERROR("execve error %m\n"); 489 _exit(errno); 490 } else if (hook_process.pid < 0) { 491 /* fork error */ 492 ERROR("hook fork error\n"); 493 hook_running = 0; 494 hook_process_handler(&hook_process, errno); 495 } 496 497 /* parent */ 498 uloop_process_add(&hook_process); 499 500 if (hook->timeout > 0) 501 uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout); 502 503 uloop_run(); 504 if (hook_running) { 505 DEBUG("uloop interrupted, killing jail process\n"); 506 kill(hook_process.pid, SIGTERM); 507 uloop_timeout_set(&hook_process_timeout, 1000); 508 uloop_run(); 509 } 510 } 511 512 static void run_hooks(struct hook_execvpe **hooklist, hook_return_handler return_cb) 513 { 514 if (!hooklist) 515 return_cb(); 516 517 current_hook = hooklist; 518 hook_return_cb = return_cb; 519 520 run_hooklist(); 521 } 522 523 static int apply_sysctl(const char *jail_root) 524 { 525 struct sysctl_val **cur; 526 char *procdir, *fname; 527 int f; 528 529 if (!opts.sysctl) 530 return 0; 531 532 if (asprintf(&procdir, "%s/proc", jail_root) < 0) 533 return ENOMEM; 534 535 if (mkdir(procdir, 0700)) 536 return errno; 537 538 if (mount("proc", procdir, "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0)) 539 return EPERM; 540 541 cur = opts.sysctl; 542 543 while (*cur) { 544 if 
(asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry) < 0) 545 return ENOMEM; 546 547 DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname); 548 549 f = open(fname, O_WRONLY); 550 if (f < 0) { 551 ERROR("sysctl: can't open %s\n", fname); 552 free(fname); 553 return errno; 554 } 555 if (write(f, (*cur)->value, strlen((*cur)->value)) < 0) { 556 ERROR("sysctl: write to %s\n", fname); 557 free(fname); 558 close(f); 559 return errno; 560 } 561 562 free(fname); 563 close(f); 564 ++cur; 565 } 566 umount(procdir); 567 rmdir(procdir); 568 free(procdir); 569 570 return 0; 571 } 572 573 /* glibc defines makedev calling a function. make sure it's a pure macro */ 574 #if defined(__GLIBC__) 575 #undef makedev 576 /* from musl's sys/sysmacros.h */ 577 #define makedev(x,y) ( \ 578 (((x)&0xfffff000ULL) << 32) | \ 579 (((x)&0x00000fffULL) << 8) | \ 580 (((y)&0xffffff00ULL) << 12) | \ 581 (((y)&0x000000ffULL)) ) 582 #endif 583 584 static struct mknod_args default_devices[] = { 585 { .path = "/dev/null", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 3) }, 586 { .path = "/dev/zero", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 5) }, 587 { .path = "/dev/full", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 7) }, 588 { .path = "/dev/random", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 8) }, 589 { .path = "/dev/urandom", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 9) }, 590 { .path = "/dev/tty", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP), .dev = makedev(5, 0), .gid = 5 }, 591 { 0 }, 592 }; 593 594 static int create_devices(void) 595 { 596 struct mknod_args **cur, *curdef; 597 char *path, *tmp; 598 int ret; 599 600 if (!opts.devices) 601 goto only_default_devices; 602 603 cur = opts.devices; 604 605 while (*cur) { 606 path = (*cur)->path; 607 /* don't allow devices outside of /dev 
*/ 608 if (strncmp(path, "/dev", 4)) 609 return EPERM; 610 611 /* make sure parent folder exists */ 612 tmp = strrchr(path, '/'); 613 if (!tmp) 614 return EINVAL; 615 616 *tmp = '\0'; 617 if (strcmp(path, "/dev")) { 618 DEBUG("creating directory %s\n", path); 619 620 if (mkdir_p(path, 0755)) 621 return errno; 622 } 623 *tmp = '/'; 624 625 DEBUG("creating %s (mode=%08o)\n", path, (*cur)->mode); 626 627 /* create device */ 628 if (mknod(path, (*cur)->mode, (*cur)->dev)) 629 return errno; 630 631 /* change owner, if needed */ 632 if (((*cur)->uid || (*cur)->gid) && 633 chown(path, (*cur)->uid, (*cur)->gid)) 634 return errno; 635 636 ++cur; 637 } 638 639 only_default_devices: 640 curdef = default_devices; 641 while(curdef->path) { 642 DEBUG("creating %s (mode=%08o)\n", curdef->path, curdef->mode); 643 if (mknod(curdef->path, curdef->mode, curdef->dev)) { 644 ++curdef; 645 continue; /* may already exist, eg. due to a bind-mount */ 646 } 647 if ((curdef->uid || curdef->gid) && 648 chown(curdef->path, curdef->uid, curdef->gid)) 649 return errno; 650 651 ++curdef; 652 } 653 654 /* Dev symbolic links as defined in OCI spec */ 655 ret = symlink("/dev/pts/ptmx", "/dev/ptmx"); 656 if (ret < 0) 657 WARNING("symlink() failed to create link to /dev/pts/ptmx"); 658 659 ret = symlink("/proc/self/fd", "/dev/fd"); 660 if (ret < 0) 661 WARNING("symlink() failed to create link to /proc/self/fd"); 662 663 ret = symlink("/proc/self/fd/0", "/dev/stdin"); 664 if (ret < 0) 665 WARNING("symlink() failed to create link to /proc/self/fd/0"); 666 667 ret = symlink("/proc/self/fd/1", "/dev/stdout"); 668 if (ret < 0) 669 WARNING("symlink() failed to create link to /proc/self/fd/1"); 670 671 ret = symlink("/proc/self/fd/2", "/dev/stderr"); 672 if (ret < 0) 673 WARNING("symlink() failed to create link to /proc/self/fd/2"); 674 675 return 0; 676 } 677 678 static char jail_root[] = "/tmp/ujail-XXXXXX"; 679 static char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX"; 680 static mode_t old_umask; 681 static 
void enter_jail_fs(void); 682 static int build_jail_fs(void) 683 { 684 char *overlaydir = NULL; 685 int ret; 686 687 old_umask = umask(0); 688 689 if (mkdtemp(jail_root) == NULL) { 690 ERROR("mkdtemp(%s) failed: %m\n", jail_root); 691 return -1; 692 } 693 694 if (apply_sysctl(jail_root)) { 695 ERROR("failed to apply sysctl values\n"); 696 return -1; 697 } 698 699 /* oldroot can't be MS_SHARED else pivot_root() fails */ 700 if (mount("none", "/", "none", MS_REC|MS_PRIVATE, NULL)) { 701 ERROR("private mount failed %m\n"); 702 return -1; 703 } 704 705 if (opts.extroot) { 706 if (mount(opts.extroot, jail_root, "bind", MS_BIND, NULL)) { 707 ERROR("extroot mount failed %m\n"); 708 return -1; 709 } 710 } else { 711 if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) { 712 ERROR("tmpfs mount failed %m\n"); 713 return -1; 714 } 715 } 716 717 if (opts.tmpoverlaysize) { 718 char mountoptsstr[] = "mode=0755,size=XXXXXXXX"; 719 720 snprintf(mountoptsstr, sizeof(mountoptsstr), 721 "mode=0755,size=%s", opts.tmpoverlaysize); 722 if (mkdtemp(tmpovdir) == NULL) { 723 ERROR("mkdtemp(%s) failed: %m\n", jail_root); 724 return -1; 725 } 726 if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME, 727 mountoptsstr)) { 728 ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize); 729 return -1; 730 } 731 overlaydir = tmpovdir; 732 } 733 734 if (opts.overlaydir) 735 overlaydir = opts.overlaydir; 736 737 if (overlaydir) { 738 ret = mount_overlay(jail_root, overlaydir); 739 if (ret) 740 return ret; 741 } 742 743 if (chdir(jail_root)) { 744 ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root); 745 return -1; 746 } 747 748 if (mount_all(jail_root)) { 749 ERROR("mount_all() failed\n"); 750 return -1; 751 } 752 753 if (opts.console) 754 create_dev_console(jail_root); 755 756 /* make sure /etc/resolv.conf exists if in new network namespace */ 757 if (opts.namespace & CLONE_NEWNET) { 758 char jailetc[PATH_MAX], jaillink[PATH_MAX]; 759 760 snprintf(jailetc, PATH_MAX, 
"%s/etc", jail_root); 761 if (mkdir_p(jailetc, 0755)) { 762 ERROR("mkdir(%s) failed: %m\n", jailetc); 763 return -1; 764 } 765 snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root); 766 if (overlaydir) 767 unlink(jaillink); 768 769 ret = symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink); 770 if (ret < 0) 771 WARNING("symlink() failed to create link to ../dev/resolv.conf.d/resolv.conf.auto"); 772 } 773 774 run_hooks(opts.hooks.createContainer, enter_jail_fs); 775 776 return 0; 777 } 778 779 static bool exit_from_child; 780 static void free_and_exit(int ret) 781 { 782 if (!exit_from_child && opts.ocibundle) 783 cgroups_free(); 784 785 if (!exit_from_child && parent_ctx) 786 ubus_free(parent_ctx); 787 788 free_opts(!exit_from_child); 789 790 exit(ret); 791 } 792 793 static void post_jail_fs(void); 794 static void enter_jail_fs(void) 795 { 796 char dirbuf[sizeof(jail_root) + 4]; 797 798 snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root); 799 if (mkdir(dirbuf, 0755)) { 800 ERROR("mkdir(%s) failed: %m\n", dirbuf); 801 free_and_exit(-1); 802 } 803 if (pivot_root(jail_root, dirbuf) == -1) { 804 ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf); 805 free_and_exit(-1); 806 } 807 if (chdir("/")) { 808 ERROR("chdir(/) (after pivot_root) failed: %m\n"); 809 free_and_exit(-1); 810 } 811 812 snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root); 813 umount2(dirbuf, MNT_DETACH); 814 rmdir(dirbuf); 815 if (opts.tmpoverlaysize) { 816 char tmpdirbuf[sizeof(tmpovdir) + 4]; 817 snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir); 818 umount2(tmpdirbuf, MNT_DETACH); 819 rmdir(tmpdirbuf); 820 } 821 822 umount2("/old", MNT_DETACH); 823 rmdir("/old"); 824 825 if (create_devices()) { 826 ERROR("create_devices() failed\n"); 827 free_and_exit(-1); 828 } 829 if (opts.ronly) 830 mount(NULL, "/", "bind", MS_REMOUNT | MS_BIND | MS_RDONLY, 0); 831 832 umask(old_umask); 833 post_jail_fs(); 834 } 835 836 static int write_uid_gid_map(pid_t child_pid, bool gidmap, 
char *mapstr) 837 { 838 int map_file; 839 char map_path[64]; 840 841 if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", 842 child_pid, gidmap?"gid_map":"uid_map") < 0) 843 return -1; 844 845 if ((map_file = open(map_path, O_WRONLY)) < 0) 846 return -1; 847 848 if (dprintf(map_file, "%s", mapstr)) { 849 close(map_file); 850 return -1; 851 } 852 853 close(map_file); 854 return 0; 855 } 856 857 static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id) 858 { 859 int map_file; 860 char map_path[64]; 861 const char *map_format = "%d %d %d\n"; 862 if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", 863 child_pid, gidmap?"gid_map":"uid_map") < 0) 864 return -1; 865 866 if ((map_file = open(map_path, O_WRONLY)) < 0) 867 return -1; 868 869 if (dprintf(map_file, map_format, 0, id, 1) < 0) { 870 close(map_file); 871 return -1; 872 } 873 874 close(map_file); 875 return 0; 876 } 877 878 static int write_setgroups(pid_t child_pid, bool allow) 879 { 880 int setgroups_file; 881 char setgroups_path[64]; 882 883 if (snprintf(setgroups_path, sizeof(setgroups_path), "/proc/%d/setgroups", 884 child_pid) < 0) { 885 return -1; 886 } 887 888 if ((setgroups_file = open(setgroups_path, O_WRONLY)) < 0) { 889 return -1; 890 } 891 892 if (dprintf(setgroups_file, "%s", allow?"allow":"deny") == -1) { 893 close(setgroups_file); 894 return -1; 895 } 896 897 close(setgroups_file); 898 return 0; 899 } 900 901 static void get_jail_user(int *user, int *user_gid, int *gr_gid) 902 { 903 struct passwd *p = NULL; 904 struct group *g = NULL; 905 906 if (opts.user) { 907 p = getpwnam(opts.user); 908 if (!p) { 909 ERROR("failed to get uid/gid for user %s: %d (%s)\n", 910 opts.user, errno, strerror(errno)); 911 free_and_exit(EXIT_FAILURE); 912 } 913 *user = p->pw_uid; 914 *user_gid = p->pw_gid; 915 } else { 916 *user = -1; 917 *user_gid = -1; 918 } 919 920 if (opts.group) { 921 g = getgrnam(opts.group); 922 if (!g) { 923 ERROR("failed to get gid for group %s: %m\n", opts.group); 924 
free_and_exit(EXIT_FAILURE); 925 } 926 *gr_gid = g->gr_gid; 927 } else { 928 *gr_gid = -1; 929 } 930 }; 931 932 static void set_jail_user(int pw_uid, int user_gid, int gr_gid) 933 { 934 if (opts.user && (user_gid != -1) && initgroups(opts.user, user_gid)) { 935 ERROR("failed to initgroups() for user %s: %m\n", opts.user); 936 free_and_exit(EXIT_FAILURE); 937 } 938 939 if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) { 940 ERROR("failed to set group id %d: %m\n", gr_gid); 941 free_and_exit(EXIT_FAILURE); 942 } 943 944 if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) { 945 ERROR("failed to set user id %d: %m\n", pw_uid); 946 free_and_exit(EXIT_FAILURE); 947 } 948 } 949 950 static int apply_rlimits(void) 951 { 952 int resource; 953 954 for (resource = 0; resource < RLIM_NLIMITS; ++resource) { 955 if (opts.rlimits[resource]) 956 DEBUG("applying limits to resource %u\n", resource); 957 958 if (opts.rlimits[resource] && 959 setrlimit(resource, opts.rlimits[resource])) 960 return errno; 961 } 962 963 return 0; 964 } 965 966 #define MAX_ENVP 64 967 static char** build_envp(const char *seccomp, char **ocienvp) 968 { 969 static char *envp[MAX_ENVP]; 970 static char preload_var[PATH_MAX]; 971 static char seccomp_var[PATH_MAX]; 972 static char seccomp_debug_var[20]; 973 static char debug_var[] = "LD_DEBUG=all"; 974 static char container_var[] = "container=ujail"; 975 const char *preload_lib = find_lib("libpreload-seccomp.so"); 976 char **addenv; 977 978 int count = 0; 979 980 if (seccomp && !preload_lib) { 981 ERROR("failed to add preload-lib to env\n"); 982 return NULL; 983 } 984 if (seccomp) { 985 snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp); 986 envp[count++] = seccomp_var; 987 snprintf(seccomp_debug_var, sizeof(seccomp_debug_var), "SECCOMP_DEBUG=%2d", debug); 988 envp[count++] = seccomp_debug_var; 989 snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib); 990 envp[count++] = preload_var; 991 } 992 993 envp[count++] = 
container_var; 994 995 if (debug > 1) 996 envp[count++] = debug_var; 997 998 addenv = ocienvp; 999 while (addenv && *addenv) { 1000 envp[count++] = *(addenv++); 1001 if (count >= MAX_ENVP) { 1002 ERROR("environment limited to %d extra records, truncating\n", MAX_ENVP); 1003 break; 1004 } 1005 } 1006 return envp; 1007 } 1008 1009 static void usage(void) 1010 { 1011 fprintf(stderr, "ujail <options> -- <binary> <params ...>\n"); 1012 fprintf(stderr, " -d <num>\tshow debug log (increase num to increase verbosity)\n"); 1013 fprintf(stderr, " -S <file>\tseccomp filter config\n"); 1014 fprintf(stderr, " -C <file>\tcapabilities drop config\n"); 1015 fprintf(stderr, " -c\t\tset PR_SET_NO_NEW_PRIVS\n"); 1016 fprintf(stderr, " -n <name>\tthe name of the jail\n"); 1017 fprintf(stderr, " -e <var>\timport environment variable\n"); 1018 fprintf(stderr, "namespace jail options:\n"); 1019 fprintf(stderr, " -h <hostname>\tchange the hostname of the jail\n"); 1020 fprintf(stderr, " -N\t\tjail has network namespace\n"); 1021 fprintf(stderr, " -f\t\tjail has user namespace\n"); 1022 fprintf(stderr, " -F\t\tjail has cgroups namespace\n"); 1023 fprintf(stderr, " -r <file>\treadonly files that should be staged\n"); 1024 fprintf(stderr, " -w <file>\twriteable files that should be staged\n"); 1025 fprintf(stderr, " -p\t\tjail has /proc\n"); 1026 fprintf(stderr, " -s\t\tjail has /sys\n"); 1027 fprintf(stderr, " -l\t\tjail has /dev/log\n"); 1028 fprintf(stderr, " -u\t\tjail has a ubus socket\n"); 1029 fprintf(stderr, " -U <name>\tuser to run jailed process\n"); 1030 fprintf(stderr, " -G <name>\tgroup to run jailed process\n"); 1031 fprintf(stderr, " -o\t\tremont jail root (/) read only\n"); 1032 fprintf(stderr, " -R <dir>\texternal jail rootfs (system container)\n"); 1033 fprintf(stderr, " -O <dir>\tdirectory for r/w overlayfs\n"); 1034 fprintf(stderr, " -T <size>\tuse tmpfs r/w overlayfs with <size>\n"); 1035 fprintf(stderr, " -E\t\tfail if jail cannot be setup\n"); 1036 fprintf(stderr, " 
-y\t\tprovide jail console\n"); 1037 fprintf(stderr, " -J <dir>\tcreate container from OCI bundle\n"); 1038 fprintf(stderr, " -i\t\tstart container immediately\n"); 1039 fprintf(stderr, " -P <pidfile>\tcreate <pidfile>\n"); 1040 fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\ 1041 and he has the same powers as root outside the jail,\n\ 1042 thus he can escape the jail and/or break stuff.\n\ 1043 Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n\ 1044 If you use none of the namespace jail options,\n\ 1045 ujail will not use namespace/build a jail,\n\ 1046 and will only drop capabilities/apply seccomp filter.\n\n"); 1047 } 1048 1049 static int* get_namespace_fd(const unsigned int nstype) 1050 { 1051 switch (nstype) { 1052 case CLONE_NEWPID: 1053 return &opts.setns.pid; 1054 case CLONE_NEWNET: 1055 return &opts.setns.net; 1056 case CLONE_NEWNS: 1057 return &opts.setns.ns; 1058 case CLONE_NEWIPC: 1059 return &opts.setns.ipc; 1060 case CLONE_NEWUTS: 1061 return &opts.setns.uts; 1062 case CLONE_NEWUSER: 1063 return &opts.setns.user; 1064 case CLONE_NEWCGROUP: 1065 return &opts.setns.cgroup; 1066 #ifdef CLONE_NEWTIME 1067 case CLONE_NEWTIME: 1068 return &opts.setns.time; 1069 #endif 1070 default: 1071 return NULL; 1072 } 1073 } 1074 1075 static int setns_open(unsigned long nstype) 1076 { 1077 int *fd = get_namespace_fd(nstype); 1078 1079 assert(fd != NULL); 1080 1081 if (*fd < 0) 1082 return 0; 1083 1084 if (setns(*fd, nstype) == -1) { 1085 close(*fd); 1086 return errno; 1087 } 1088 1089 close(*fd); 1090 return 0; 1091 } 1092 1093 static int jail_running = 0; 1094 static int jail_return_code = 0; 1095 1096 static void jail_process_timeout_cb(struct uloop_timeout *t); 1097 static struct uloop_timeout jail_process_timeout = { 1098 .cb = jail_process_timeout_cb, 1099 }; 1100 static void poststop(void); 1101 static void jail_process_handler(struct uloop_process *c, int ret) 1102 { 1103 uloop_timeout_cancel(&jail_process_timeout); 
1104 if (WIFEXITED(ret)) { 1105 jail_return_code = WEXITSTATUS(ret); 1106 INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code); 1107 } else { 1108 jail_return_code = WTERMSIG(ret); 1109 INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code); 1110 } 1111 jail_running = 0; 1112 poststop(); 1113 } 1114 1115 static struct uloop_process jail_process = { 1116 .cb = jail_process_handler, 1117 }; 1118 1119 static void jail_process_timeout_cb(struct uloop_timeout *t) 1120 { 1121 DEBUG("jail process failed to stop, sending SIGKILL\n"); 1122 kill(jail_process.pid, SIGKILL); 1123 } 1124 1125 static void jail_handle_signal(int signo) 1126 { 1127 if (hook_running) { 1128 DEBUG("forwarding signal %d to the hook process\n", signo); 1129 kill(hook_process.pid, signo); 1130 /* set timeout to send SIGKILL hook process in case SIGTERM doesn't succeed */ 1131 if (signo == SIGTERM) 1132 uloop_timeout_set(&hook_process_timeout, opts.term_timeout * 1000); 1133 } 1134 1135 if (jail_running) { 1136 DEBUG("forwarding signal %d to the jailed process\n", signo); 1137 kill(jail_process.pid, signo); 1138 /* set timeout to send SIGKILL jail process in case SIGTERM doesn't succeed */ 1139 if (signo == SIGTERM) 1140 uloop_timeout_set(&jail_process_timeout, opts.term_timeout * 1000); 1141 } 1142 } 1143 1144 static void signals_init(void) 1145 { 1146 int i; 1147 sigset_t sigmask; 1148 1149 sigfillset(&sigmask); 1150 for (i = 0; i < _NSIG; i++) { 1151 struct sigaction s = { 0 }; 1152 1153 if (!sigismember(&sigmask, i)) 1154 continue; 1155 if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV) || (i == SIGSTOP) || (i == SIGKILL)) 1156 continue; 1157 1158 s.sa_handler = jail_handle_signal; 1159 sigaction(i, &s, NULL); 1160 } 1161 } 1162 1163 static void pre_exec_jail(struct uloop_timeout *t); 1164 static struct uloop_timeout pre_exec_timeout = { 1165 .cb = pre_exec_jail, 1166 }; 1167 1168 int pipes[4]; 1169 static int exec_jail(void *arg) 1170 { 1171 char buf[1]; 1172 1173 
exit_from_child = true; 1174 prctl(PR_SET_SECUREBITS, 0); 1175 1176 uloop_init(); 1177 signals_init(); 1178 1179 close(pipes[0]); 1180 close(pipes[3]); 1181 1182 setns_open(CLONE_NEWUSER); 1183 setns_open(CLONE_NEWNET); 1184 setns_open(CLONE_NEWNS); 1185 setns_open(CLONE_NEWIPC); 1186 setns_open(CLONE_NEWUTS); 1187 1188 buf[0] = 'i'; 1189 if (write(pipes[1], buf, 1) < 1) { 1190 ERROR("can't write to parent\n"); 1191 return EXIT_FAILURE; 1192 } 1193 close(pipes[1]); 1194 if (read(pipes[2], buf, 1) < 1) { 1195 ERROR("can't read from parent\n"); 1196 return EXIT_FAILURE; 1197 } 1198 if (buf[0] != 'O') { 1199 ERROR("parent had an error, child exiting\n"); 1200 return EXIT_FAILURE; 1201 } 1202 1203 if (opts.namespace & CLONE_NEWCGROUP) 1204 unshare(CLONE_NEWCGROUP); 1205 1206 setns_open(CLONE_NEWCGROUP); 1207 1208 if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) { 1209 if (setregid(0, 0) < 0) { 1210 ERROR("setgid\n"); 1211 free_and_exit(EXIT_FAILURE); 1212 } 1213 if (setreuid(0, 0) < 0) { 1214 ERROR("setuid\n"); 1215 free_and_exit(EXIT_FAILURE); 1216 } 1217 if (setgroups(0, NULL) < 0) { 1218 ERROR("setgroups\n"); 1219 free_and_exit(EXIT_FAILURE); 1220 } 1221 } 1222 1223 if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0 1224 && sethostname(opts.hostname, strlen(opts.hostname))) { 1225 ERROR("sethostname(%s) failed: %m\n", opts.hostname); 1226 free_and_exit(EXIT_FAILURE); 1227 } 1228 1229 uloop_timeout_add(&pre_exec_timeout); 1230 uloop_run(); 1231 1232 free_and_exit(-1); 1233 return -1; 1234 } 1235 1236 static void pre_exec_jail(struct uloop_timeout *t) 1237 { 1238 if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) { 1239 ERROR("failed to build jail fs\n"); 1240 free_and_exit(EXIT_FAILURE); 1241 } else { 1242 run_hooks(opts.hooks.createContainer, post_jail_fs); 1243 } 1244 } 1245 1246 static void post_start_hook(void); 1247 static void post_jail_fs(void) 1248 { 1249 char buf[1]; 1250 1251 if (read(pipes[2], buf, 1) < 1) { 1252 
ERROR("can't read from parent\n"); 1253 free_and_exit(EXIT_FAILURE); 1254 } 1255 if (buf[0] != '!') { 1256 ERROR("parent had an error, child exiting\n"); 1257 free_and_exit(EXIT_FAILURE); 1258 } 1259 close(pipes[2]); 1260 1261 run_hooks(opts.hooks.startContainer, post_start_hook); 1262 } 1263 1264 static void post_start_hook(void) 1265 { 1266 int pw_uid, pw_gid, gr_gid; 1267 1268 /* 1269 * make sure setuid/setgid won't drop capabilities in case capabilities 1270 * have been specified explicitely. 1271 */ 1272 if (opts.capset.apply) { 1273 if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) { 1274 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 1275 free_and_exit(EXIT_FAILURE); 1276 } 1277 } 1278 1279 /* drop capabilities, retain those still needed to further setup jail */ 1280 if (applyOCIcapabilities(opts.capset, (1LLU << CAP_SETGID) | (1LLU << CAP_SETUID) | (1LLU << CAP_SETPCAP))) 1281 free_and_exit(EXIT_FAILURE); 1282 1283 /* use either cmdline-supplied user/group or uid/gid from OCI spec */ 1284 get_jail_user(&pw_uid, &pw_gid, &gr_gid); 1285 set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid); 1286 1287 if (opts.additional_gids && 1288 (setgroups(opts.num_additional_gids, opts.additional_gids) < 0)) { 1289 ERROR("setgroups failed: %m\n"); 1290 free_and_exit(EXIT_FAILURE); 1291 } 1292 1293 if (opts.set_umask) 1294 umask(opts.umask); 1295 1296 /* restore securebits back to normal (and lock them if not in userns) */ 1297 if (opts.capset.apply) { 1298 if (prctl(PR_SET_SECUREBITS, (opts.namespace & CLONE_NEWUSER)?0: 1299 SECBIT_KEEP_CAPS_LOCKED|SECBIT_NO_SETUID_FIXUP_LOCKED|SECBIT_NOROOT_LOCKED)) { 1300 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 1301 free_and_exit(EXIT_FAILURE); 1302 } 1303 } 1304 1305 /* drop remaining capabilities to end up with specified sets */ 1306 if (applyOCIcapabilities(opts.capset, 0)) 1307 free_and_exit(EXIT_FAILURE); 1308 1309 if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { 1310 
ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n"); 1311 free_and_exit(EXIT_FAILURE); 1312 } 1313 1314 char **envp = build_envp(opts.seccomp, opts.envp); 1315 if (!envp) 1316 free_and_exit(EXIT_FAILURE); 1317 1318 if (opts.cwd && chdir(opts.cwd)) 1319 free_and_exit(EXIT_FAILURE); 1320 1321 if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp)) 1322 free_and_exit(EXIT_FAILURE); 1323 1324 uloop_end(); 1325 free_opts(false); 1326 INFO("exec-ing %s\n", *opts.jail_argv); 1327 if (opts.envp) /* respect PATH if potentially set in ENV */ 1328 execvpe(*opts.jail_argv, opts.jail_argv, envp); 1329 else 1330 execve(*opts.jail_argv, opts.jail_argv, envp); 1331 1332 /* we get there only if execve fails */ 1333 ERROR("failed to execve %s: %m\n", *opts.jail_argv); 1334 exit(EXIT_FAILURE); 1335 } 1336 1337 int ns_open_pid(const char *nstype, const pid_t target_ns) 1338 { 1339 char pid_pid_path[PATH_MAX]; 1340 1341 snprintf(pid_pid_path, sizeof(pid_pid_path), "/proc/%u/ns/%s", target_ns, nstype); 1342 1343 return open(pid_pid_path, O_RDONLY); 1344 } 1345 1346 static int parseOCIenvarray(struct blob_attr *msg, char ***envp) 1347 { 1348 struct blob_attr *cur; 1349 int sz = 0, rem; 1350 1351 blobmsg_for_each_attr(cur, msg, rem) 1352 ++sz; 1353 1354 if (sz > 0) { 1355 *envp = calloc(1 + sz, sizeof(char*)); 1356 if (!(*envp)) 1357 return ENOMEM; 1358 } else { 1359 *envp = NULL; 1360 return 0; 1361 } 1362 1363 sz = 0; 1364 blobmsg_for_each_attr(cur, msg, rem) 1365 (*envp)[sz++] = strdup(blobmsg_get_string(cur)); 1366 1367 if (sz) 1368 (*envp)[sz] = NULL; 1369 1370 return 0; 1371 } 1372 1373 enum { 1374 OCI_ROOT_PATH, 1375 OCI_ROOT_READONLY, 1376 __OCI_ROOT_MAX, 1377 }; 1378 1379 static const struct blobmsg_policy oci_root_policy[] = { 1380 [OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING }, 1381 [OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL }, 1382 }; 1383 1384 static int parseOCIroot(const char *jsonfile, struct blob_attr *msg) 1385 { 1386 char extroot[PATH_MAX] = { 0 }; 1387 
struct blob_attr *tb[__OCI_ROOT_MAX]; 1388 char *cur; 1389 char *root_path; 1390 1391 blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1392 1393 if (!tb[OCI_ROOT_PATH]) 1394 return ENODATA; 1395 1396 root_path = blobmsg_get_string(tb[OCI_ROOT_PATH]); 1397 1398 /* prepend bundle directory in case of relative paths */ 1399 if (root_path[0] != '/') { 1400 strncpy(extroot, jsonfile, PATH_MAX - 1); 1401 1402 cur = strrchr(extroot, '/'); 1403 1404 if (!cur) 1405 return ENOTDIR; 1406 1407 *(++cur) = '\0'; 1408 } 1409 1410 strncat(extroot, root_path, PATH_MAX - (strlen(extroot) + 1)); 1411 1412 /* follow symbolic link(s) */ 1413 opts.extroot = realpath(extroot, NULL); 1414 if (!opts.extroot) 1415 return errno; 1416 1417 if (tb[OCI_ROOT_READONLY]) 1418 opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]); 1419 1420 return 0; 1421 } 1422 1423 1424 enum { 1425 OCI_HOOK_PATH, 1426 OCI_HOOK_ARGS, 1427 OCI_HOOK_ENV, 1428 OCI_HOOK_TIMEOUT, 1429 __OCI_HOOK_MAX, 1430 }; 1431 1432 static const struct blobmsg_policy oci_hook_policy[] = { 1433 [OCI_HOOK_PATH] = { "path", BLOBMSG_TYPE_STRING }, 1434 [OCI_HOOK_ARGS] = { "args", BLOBMSG_TYPE_ARRAY }, 1435 [OCI_HOOK_ENV] = { "env", BLOBMSG_TYPE_ARRAY }, 1436 [OCI_HOOK_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 }, 1437 }; 1438 1439 1440 static int parseOCIhook(struct hook_execvpe ***hooklist, struct blob_attr *msg) 1441 { 1442 struct blob_attr *tb[__OCI_HOOK_MAX]; 1443 struct blob_attr *cur; 1444 int rem, ret = 0; 1445 int idx = 0; 1446 1447 blobmsg_for_each_attr(cur, msg, rem) 1448 ++idx; 1449 1450 if (!idx) 1451 return 0; 1452 1453 *hooklist = calloc(idx + 1, sizeof(struct hook_execvpe *)); 1454 idx = 0; 1455 1456 if (!(*hooklist)) 1457 return ENOMEM; 1458 1459 blobmsg_for_each_attr(cur, msg, rem) { 1460 blobmsg_parse(oci_hook_policy, __OCI_HOOK_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 1461 1462 if (!tb[OCI_HOOK_PATH]) { 1463 ret = EINVAL; 1464 goto errout; 1465 } 1466 1467 (*hooklist)[idx] = 
calloc(1, sizeof(struct hook_execvpe)); 1468 if (tb[OCI_HOOK_ARGS]) { 1469 ret = parseOCIenvarray(tb[OCI_HOOK_ARGS], &((*hooklist)[idx]->argv)); 1470 if (ret) 1471 goto errout; 1472 } else { 1473 (*hooklist)[idx]->argv = calloc(2, sizeof(char *)); 1474 ((*hooklist)[idx]->argv)[0] = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH])); 1475 ((*hooklist)[idx]->argv)[1] = NULL; 1476 }; 1477 1478 1479 if (tb[OCI_HOOK_ENV]) { 1480 ret = parseOCIenvarray(tb[OCI_HOOK_ENV], &((*hooklist)[idx]->envp)); 1481 if (ret) 1482 goto errout; 1483 } 1484 1485 if (tb[OCI_HOOK_TIMEOUT]) 1486 (*hooklist)[idx]->timeout = blobmsg_get_u32(tb[OCI_HOOK_TIMEOUT]); 1487 1488 (*hooklist)[idx]->file = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH])); 1489 1490 ++idx; 1491 } 1492 1493 (*hooklist)[idx] = NULL; 1494 1495 DEBUG("added %d hooks\n", idx); 1496 1497 return 0; 1498 1499 errout: 1500 free_hooklist(*hooklist); 1501 *hooklist = NULL; 1502 1503 return ret; 1504 }; 1505 1506 1507 enum { 1508 OCI_HOOKS_PRESTART, 1509 OCI_HOOKS_CREATERUNTIME, 1510 OCI_HOOKS_CREATECONTAINER, 1511 OCI_HOOKS_STARTCONTAINER, 1512 OCI_HOOKS_POSTSTART, 1513 OCI_HOOKS_POSTSTOP, 1514 __OCI_HOOKS_MAX, 1515 }; 1516 1517 static const struct blobmsg_policy oci_hooks_policy[] = { 1518 [OCI_HOOKS_PRESTART] = { "prestart", BLOBMSG_TYPE_ARRAY }, 1519 [OCI_HOOKS_CREATERUNTIME] = { "createRuntime", BLOBMSG_TYPE_ARRAY }, 1520 [OCI_HOOKS_CREATECONTAINER] = { "createContainer", BLOBMSG_TYPE_ARRAY }, 1521 [OCI_HOOKS_STARTCONTAINER] = { "startContainer", BLOBMSG_TYPE_ARRAY }, 1522 [OCI_HOOKS_POSTSTART] = { "poststart", BLOBMSG_TYPE_ARRAY }, 1523 [OCI_HOOKS_POSTSTOP] = { "poststop", BLOBMSG_TYPE_ARRAY }, 1524 }; 1525 1526 static int parseOCIhooks(struct blob_attr *msg) 1527 { 1528 struct blob_attr *tb[__OCI_HOOKS_MAX]; 1529 int ret; 1530 1531 blobmsg_parse(oci_hooks_policy, __OCI_HOOKS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1532 1533 if (tb[OCI_HOOKS_PRESTART]) 1534 INFO("warning: ignoring deprecated prestart hook\n"); 1535 1536 if 
(tb[OCI_HOOKS_CREATERUNTIME]) { 1537 ret = parseOCIhook(&opts.hooks.createRuntime, tb[OCI_HOOKS_CREATERUNTIME]); 1538 if (ret) 1539 return ret; 1540 } 1541 1542 if (tb[OCI_HOOKS_CREATECONTAINER]) { 1543 ret = parseOCIhook(&opts.hooks.createContainer, tb[OCI_HOOKS_CREATECONTAINER]); 1544 if (ret) 1545 goto out_createruntime; 1546 } 1547 1548 if (tb[OCI_HOOKS_STARTCONTAINER]) { 1549 ret = parseOCIhook(&opts.hooks.startContainer, tb[OCI_HOOKS_STARTCONTAINER]); 1550 if (ret) 1551 goto out_createcontainer; 1552 } 1553 1554 if (tb[OCI_HOOKS_POSTSTART]) { 1555 ret = parseOCIhook(&opts.hooks.poststart, tb[OCI_HOOKS_POSTSTART]); 1556 if (ret) 1557 goto out_startcontainer; 1558 } 1559 1560 if (tb[OCI_HOOKS_POSTSTOP]) { 1561 ret = parseOCIhook(&opts.hooks.poststop, tb[OCI_HOOKS_POSTSTOP]); 1562 if (ret) 1563 goto out_poststart; 1564 } 1565 1566 return 0; 1567 1568 out_poststart: 1569 free_hooklist(opts.hooks.poststart); 1570 out_startcontainer: 1571 free_hooklist(opts.hooks.startContainer); 1572 out_createcontainer: 1573 free_hooklist(opts.hooks.createContainer); 1574 out_createruntime: 1575 free_hooklist(opts.hooks.createRuntime); 1576 1577 return ret; 1578 }; 1579 1580 1581 enum { 1582 OCI_PROCESS_USER_UID, 1583 OCI_PROCESS_USER_GID, 1584 OCI_PROCESS_USER_UMASK, 1585 OCI_PROCESS_USER_ADDITIONALGIDS, 1586 __OCI_PROCESS_USER_MAX, 1587 }; 1588 1589 static const struct blobmsg_policy oci_process_user_policy[] = { 1590 [OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 }, 1591 [OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 }, 1592 [OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 }, 1593 [OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY }, 1594 }; 1595 1596 static int parseOCIprocessuser(struct blob_attr *msg) { 1597 struct blob_attr *tb[__OCI_PROCESS_USER_MAX]; 1598 struct blob_attr *cur; 1599 int rem; 1600 int has_gid = 0; 1601 1602 blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 
1603 1604 if (tb[OCI_PROCESS_USER_UID]) 1605 opts.pw_uid = blobmsg_get_u32(tb[OCI_PROCESS_USER_UID]); 1606 1607 if (tb[OCI_PROCESS_USER_GID]) { 1608 opts.pw_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]); 1609 opts.gr_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]); 1610 has_gid = 1; 1611 } 1612 1613 if (tb[OCI_PROCESS_USER_ADDITIONALGIDS]) { 1614 size_t gidcnt = 0; 1615 1616 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) { 1617 ++gidcnt; 1618 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid)) 1619 continue; 1620 } 1621 1622 if (gidcnt) { 1623 opts.additional_gids = calloc(gidcnt + has_gid, sizeof(gid_t)); 1624 gidcnt = 0; 1625 1626 /* always add primary GID to set of GIDs if set */ 1627 if (has_gid) 1628 opts.additional_gids[gidcnt++] = opts.gr_gid; 1629 1630 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) { 1631 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid)) 1632 continue; 1633 opts.additional_gids[gidcnt++] = blobmsg_get_u32(cur); 1634 } 1635 opts.num_additional_gids = gidcnt; 1636 } 1637 DEBUG("read %zu additional groups\n", gidcnt); 1638 } 1639 1640 if (tb[OCI_PROCESS_USER_UMASK]) { 1641 opts.umask = blobmsg_get_u32(tb[OCI_PROCESS_USER_UMASK]); 1642 opts.set_umask = true; 1643 } 1644 1645 return 0; 1646 } 1647 1648 enum { 1649 OCI_PROCESS_RLIMIT_TYPE, 1650 OCI_PROCESS_RLIMIT_SOFT, 1651 OCI_PROCESS_RLIMIT_HARD, 1652 __OCI_PROCESS_RLIMIT_MAX, 1653 }; 1654 1655 static const struct blobmsg_policy oci_process_rlimit_policy[] = { 1656 [OCI_PROCESS_RLIMIT_TYPE] = { "type", BLOBMSG_TYPE_STRING }, 1657 [OCI_PROCESS_RLIMIT_SOFT] = { "soft", BLOBMSG_CAST_INT64 }, 1658 [OCI_PROCESS_RLIMIT_HARD] = { "hard", BLOBMSG_CAST_INT64 }, 1659 }; 1660 1661 /* from manpage GETRLIMIT(2) */ 1662 static const char* const rlimit_names[RLIM_NLIMITS] = { 1663 [RLIMIT_AS] = "AS", 1664 [RLIMIT_CORE] = "CORE", 1665 [RLIMIT_CPU] = "CPU", 1666 [RLIMIT_DATA] = "DATA", 1667 [RLIMIT_FSIZE] = "FSIZE", 1668 [RLIMIT_LOCKS] = "LOCKS", 1669 
[RLIMIT_MEMLOCK] = "MEMLOCK", 1670 [RLIMIT_MSGQUEUE] = "MSGQUEUE", 1671 [RLIMIT_NICE] = "NICE", 1672 [RLIMIT_NOFILE] = "NOFILE", 1673 [RLIMIT_NPROC] = "NPROC", 1674 [RLIMIT_RSS] = "RSS", 1675 [RLIMIT_RTPRIO] = "RTPRIO", 1676 [RLIMIT_RTTIME] = "RTTIME", 1677 [RLIMIT_SIGPENDING] = "SIGPENDING", 1678 [RLIMIT_STACK] = "STACK", 1679 }; 1680 1681 static int resolve_rlimit(char *type) { 1682 unsigned int rltype; 1683 1684 for (rltype = 0; rltype < RLIM_NLIMITS; ++rltype) 1685 if (rlimit_names[rltype] && 1686 !strncmp("RLIMIT_", type, 7) && 1687 !strcmp(rlimit_names[rltype], type + 7)) 1688 return rltype; 1689 1690 return -1; 1691 } 1692 1693 1694 static int parseOCIrlimit(struct blob_attr *msg) 1695 { 1696 struct blob_attr *tb[__OCI_PROCESS_RLIMIT_MAX]; 1697 int limtype = -1; 1698 struct rlimit *curlim; 1699 1700 blobmsg_parse(oci_process_rlimit_policy, __OCI_PROCESS_RLIMIT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1701 1702 if (!tb[OCI_PROCESS_RLIMIT_TYPE] || 1703 !tb[OCI_PROCESS_RLIMIT_SOFT] || 1704 !tb[OCI_PROCESS_RLIMIT_HARD]) 1705 return ENODATA; 1706 1707 limtype = resolve_rlimit(blobmsg_get_string(tb[OCI_PROCESS_RLIMIT_TYPE])); 1708 1709 if (limtype < 0) 1710 return EINVAL; 1711 1712 if (opts.rlimits[limtype]) 1713 return ENOTUNIQ; 1714 1715 curlim = malloc(sizeof(struct rlimit)); 1716 curlim->rlim_cur = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_SOFT]); 1717 curlim->rlim_max = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_HARD]); 1718 1719 opts.rlimits[limtype] = curlim; 1720 1721 return 0; 1722 }; 1723 1724 enum { 1725 OCI_PROCESS_ARGS, 1726 OCI_PROCESS_CAPABILITIES, 1727 OCI_PROCESS_CWD, 1728 OCI_PROCESS_ENV, 1729 OCI_PROCESS_OOMSCOREADJ, 1730 OCI_PROCESS_NONEWPRIVILEGES, 1731 OCI_PROCESS_RLIMITS, 1732 OCI_PROCESS_TERMINAL, 1733 OCI_PROCESS_USER, 1734 __OCI_PROCESS_MAX, 1735 }; 1736 1737 static const struct blobmsg_policy oci_process_policy[] = { 1738 [OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY }, 1739 [OCI_PROCESS_CAPABILITIES] = { "capabilities", 
BLOBMSG_TYPE_TABLE }, 1740 [OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING }, 1741 [OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY }, 1742 [OCI_PROCESS_OOMSCOREADJ] = { "oomScoreAdj", BLOBMSG_TYPE_INT32 }, 1743 [OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL }, 1744 [OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY }, 1745 [OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL }, 1746 [OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE }, 1747 }; 1748 1749 1750 static int parseOCIprocess(struct blob_attr *msg) 1751 { 1752 struct blob_attr *tb[__OCI_PROCESS_MAX], *cur; 1753 int rem, res; 1754 1755 blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 1756 1757 if (!tb[OCI_PROCESS_ARGS]) 1758 return ENOENT; 1759 1760 res = parseOCIenvarray(tb[OCI_PROCESS_ARGS], &opts.jail_argv); 1761 if (res) 1762 return res; 1763 1764 if (tb[OCI_PROCESS_TERMINAL]) 1765 opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]); 1766 1767 if (tb[OCI_PROCESS_NONEWPRIVILEGES]) 1768 opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]); 1769 1770 if (tb[OCI_PROCESS_CWD]) 1771 opts.cwd = strdup(blobmsg_get_string(tb[OCI_PROCESS_CWD])); 1772 1773 if (tb[OCI_PROCESS_ENV]) { 1774 res = parseOCIenvarray(tb[OCI_PROCESS_ENV], &opts.envp); 1775 if (res) 1776 return res; 1777 } 1778 1779 if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER]))) 1780 return res; 1781 1782 if (tb[OCI_PROCESS_CAPABILITIES] && 1783 (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES]))) 1784 return res; 1785 1786 if (tb[OCI_PROCESS_RLIMITS]) { 1787 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_RLIMITS], rem) { 1788 res = parseOCIrlimit(cur); 1789 if (res) 1790 return res; 1791 } 1792 } 1793 1794 if (tb[OCI_PROCESS_OOMSCOREADJ]) { 1795 opts.oom_score_adj = blobmsg_get_u32(tb[OCI_PROCESS_OOMSCOREADJ]); 1796 opts.set_oom_score_adj = true; 1797 } 1798 1799 return 0; 1800 } 1801 1802 enum { 1803 
/*
 * Translate a namespace type name ("pid", "network"/"net", "mount", "ipc",
 * "uts", "user", "cgroup" and, where available, "time") into the matching
 * CLONE_NEW* flag. Returns 0 if the name is not known.
 */
static int resolve_nstype(char *type) {
	static const struct {
		const char *name;
		int flag;
	} nstypes[] = {
		{ "pid", CLONE_NEWPID },
		{ "network", CLONE_NEWNET },
		{ "net", CLONE_NEWNET },
		{ "mount", CLONE_NEWNS },
		{ "ipc", CLONE_NEWIPC },
		{ "uts", CLONE_NEWUTS },
		{ "user", CLONE_NEWUSER },
		{ "cgroup", CLONE_NEWCGROUP },
#ifdef CLONE_NEWTIME
		{ "time", CLONE_NEWTIME },
#endif
	};
	size_t i;

	for (i = 0; i < sizeof(nstypes) / sizeof(nstypes[0]); ++i) {
		if (strcmp(nstypes[i].name, type) == 0)
			return nstypes[i].flag;
	}

	return 0;
}
return errno?:ESTALE; 1873 1874 if (ioctl(fd, NS_GET_NSTYPE) != nstype) { 1875 close(fd); 1876 return EINVAL; 1877 } 1878 1879 DEBUG("opened existing %s namespace got filehandler %u\n", 1880 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]), 1881 fd); 1882 1883 *setns = fd; 1884 } else { 1885 opts.namespace |= nstype; 1886 } 1887 1888 return 0; 1889 } 1890 1891 /* 1892 * join namespace of existing PID 1893 * The string argument is the reference PID followed by ':' and a 1894 * ',' separated list of namespaces to to join. 1895 */ 1896 static int jail_join_ns(char *arg) 1897 { 1898 pid_t pid; 1899 int fd; 1900 int nstype; 1901 char *tmp, *etmp, *nspath; 1902 int *setns; 1903 1904 tmp = strchr(arg, ':'); 1905 if (!tmp) 1906 return EINVAL; 1907 1908 *tmp = '\0'; 1909 pid = atoi(arg); 1910 1911 do { 1912 ++tmp; 1913 etmp = strchr(tmp, ','); 1914 if (etmp) 1915 *etmp = '\0'; 1916 1917 nstype = resolve_nstype(tmp); 1918 if (!nstype) 1919 return EINVAL; 1920 1921 if (opts.namespace & nstype) 1922 return ENOTUNIQ; 1923 1924 setns = get_namespace_fd(nstype); 1925 1926 if (!setns) 1927 return EFAULT; 1928 1929 if (*setns != -1) 1930 return ENOTUNIQ; 1931 1932 if (asprintf(&nspath, "/proc/%d/ns/%s", pid, tmp) < 0) 1933 return ENOMEM; 1934 1935 fd = open(nspath, O_RDONLY); 1936 free(nspath); 1937 1938 if (fd < 0) 1939 return errno?:ESTALE; 1940 1941 *setns = fd; 1942 1943 if (etmp) 1944 tmp = etmp; 1945 else 1946 tmp = NULL; 1947 } while (tmp); 1948 1949 return 0; 1950 } 1951 1952 static void get_jail_root_user(bool is_gidmap, uint32_t container_id, uint32_t host_id, uint32_t size) 1953 { 1954 if (container_id == 0 && size >= 1) 1955 if (!is_gidmap) 1956 opts.root_map_uid = host_id; 1957 } 1958 1959 enum { 1960 OCI_LINUX_UIDGIDMAP_CONTAINERID, 1961 OCI_LINUX_UIDGIDMAP_HOSTID, 1962 OCI_LINUX_UIDGIDMAP_SIZE, 1963 __OCI_LINUX_UIDGIDMAP_MAX, 1964 }; 1965 1966 static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = { 1967 [OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", 
BLOBMSG_TYPE_INT32 }, 1968 [OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 }, 1969 [OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 }, 1970 }; 1971 1972 static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap) 1973 { 1974 struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX]; 1975 struct blob_attr *cur; 1976 int rem; 1977 char *map; 1978 size_t len, pos, totallen = 0; 1979 1980 blobmsg_for_each_attr(cur, msg, rem) { 1981 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 1982 1983 if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] || 1984 !tb[OCI_LINUX_UIDGIDMAP_HOSTID] || 1985 !tb[OCI_LINUX_UIDGIDMAP_SIZE]) 1986 return EINVAL; 1987 1988 /* count length */ 1989 totallen += snprintf(NULL, 0, "%d %d %d\n", 1990 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 1991 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 1992 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 1993 } 1994 1995 /* allocate combined mapping string */ 1996 map = malloc(totallen + 1); 1997 if (!map) 1998 return ENOMEM; 1999 2000 pos = 0; 2001 blobmsg_for_each_attr(cur, msg, rem) { 2002 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 2003 2004 get_jail_root_user(is_gidmap, blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 2005 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 2006 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 2007 2008 /* write mapping line into pre-allocated string */ 2009 len = snprintf(&map[pos], totallen + 1, "%d %d %d\n", 2010 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]), 2011 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]), 2012 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE])); 2013 pos += len; 2014 totallen -= len; 2015 } 2016 2017 assert(totallen == 0); 2018 2019 if (is_gidmap) 2020 opts.gidmap = map; 2021 else 2022 opts.uidmap = map; 2023 2024 return 0; 2025 } 2026 2027 enum { 2028 OCI_DEVICES_TYPE, 2029 
OCI_DEVICES_PATH, 2030 OCI_DEVICES_MAJOR, 2031 OCI_DEVICES_MINOR, 2032 OCI_DEVICES_FILEMODE, 2033 OCI_DEVICES_UID, 2034 OCI_DEVICES_GID, 2035 __OCI_DEVICES_MAX, 2036 }; 2037 2038 static const struct blobmsg_policy oci_devices_policy[] = { 2039 [OCI_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING }, 2040 [OCI_DEVICES_PATH] = { "path", BLOBMSG_TYPE_STRING }, 2041 [OCI_DEVICES_MAJOR] = { "major", BLOBMSG_TYPE_INT32 }, 2042 [OCI_DEVICES_MINOR] = { "minor", BLOBMSG_TYPE_INT32 }, 2043 [OCI_DEVICES_FILEMODE] = { "fileMode", BLOBMSG_TYPE_INT32 }, 2044 [OCI_DEVICES_UID] = { "uid", BLOBMSG_TYPE_INT32 }, 2045 [OCI_DEVICES_GID] = { "uid", BLOBMSG_TYPE_INT32 }, 2046 }; 2047 2048 static mode_t resolve_devtype(char *tstr) 2049 { 2050 if (!strcmp("c", tstr) || 2051 !strcmp("u", tstr)) 2052 return S_IFCHR; 2053 else if (!strcmp("b", tstr)) 2054 return S_IFBLK; 2055 else if (!strcmp("p", tstr)) 2056 return S_IFIFO; 2057 else 2058 return 0; 2059 } 2060 2061 static int parseOCIdevices(struct blob_attr *msg) 2062 { 2063 struct blob_attr *tb[__OCI_DEVICES_MAX]; 2064 struct blob_attr *cur; 2065 int rem; 2066 size_t cnt = 0; 2067 struct mknod_args *tmp; 2068 2069 blobmsg_for_each_attr(cur, msg, rem) 2070 ++cnt; 2071 2072 opts.devices = calloc(cnt + 1, sizeof(struct mknod_args *)); 2073 2074 cnt = 0; 2075 blobmsg_for_each_attr(cur, msg, rem) { 2076 blobmsg_parse(oci_devices_policy, __OCI_DEVICES_MAX, tb, blobmsg_data(cur), blobmsg_len(cur)); 2077 if (!tb[OCI_DEVICES_TYPE] || 2078 !tb[OCI_DEVICES_PATH]) 2079 return ENODATA; 2080 2081 tmp = calloc(1, sizeof(struct mknod_args)); 2082 if (!tmp) 2083 return ENOMEM; 2084 2085 tmp->mode = resolve_devtype(blobmsg_get_string(tb[OCI_DEVICES_TYPE])); 2086 if (!tmp->mode) { 2087 free(tmp); 2088 return EINVAL; 2089 } 2090 2091 if (tmp->mode != S_IFIFO) { 2092 if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR]) { 2093 free(tmp); 2094 return ENODATA; 2095 } 2096 2097 tmp->dev = makedev(blobmsg_get_u32(tb[OCI_DEVICES_MAJOR]), 2098 
blobmsg_get_u32(tb[OCI_DEVICES_MINOR])); 2099 } 2100 2101 if (tb[OCI_DEVICES_FILEMODE]) { 2102 if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE])) { 2103 free(tmp); 2104 return EINVAL; 2105 } 2106 2107 tmp->mode |= blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]); 2108 } else { 2109 tmp->mode |= (S_IRUSR|S_IWUSR); /* 0600 */ 2110 } 2111 2112 tmp->path = strdup(blobmsg_get_string(tb[OCI_DEVICES_PATH])); 2113 2114 if (tb[OCI_DEVICES_UID]) 2115 tmp->uid = blobmsg_get_u32(tb[OCI_DEVICES_UID]); 2116 else 2117 tmp->uid = -1; 2118 2119 if (tb[OCI_DEVICES_GID]) 2120 tmp->gid = blobmsg_get_u32(tb[OCI_DEVICES_GID]); 2121 else 2122 tmp->gid = -1; 2123 2124 DEBUG("read device %s (%s)\n", blobmsg_get_string(tb[OCI_DEVICES_PATH]), blobmsg_get_string(tb[OCI_DEVICES_TYPE])); 2125 opts.devices[cnt++] = tmp; 2126 } 2127 2128 opts.devices[cnt] = NULL; 2129 2130 return 0; 2131 } 2132 2133 static int parseOCIsysctl(struct blob_attr *msg) 2134 { 2135 struct blob_attr *cur; 2136 int rem; 2137 char *tmp, *tc; 2138 size_t cnt = 0; 2139 2140 blobmsg_for_each_attr(cur, msg, rem) { 2141 if (!blobmsg_name(cur) || !blobmsg_get_string(cur)) 2142 return EINVAL; 2143 2144 ++cnt; 2145 } 2146 2147 if (!cnt) 2148 return 0; 2149 2150 opts.sysctl = calloc(cnt + 1, sizeof(struct sysctl_val *)); 2151 if (!opts.sysctl) 2152 return ENOMEM; 2153 2154 cnt = 0; 2155 blobmsg_for_each_attr(cur, msg, rem) { 2156 opts.sysctl[cnt] = malloc(sizeof(struct sysctl_val)); 2157 if (!opts.sysctl[cnt]) 2158 return ENOMEM; 2159 2160 /* replace '.' 
with '/' in entry name */ 2161 tc = tmp = strdup(blobmsg_name(cur)); 2162 while ((tc = strchr(tc, '.'))) 2163 *tc = '/'; 2164 2165 opts.sysctl[cnt]->value = strdup(blobmsg_get_string(cur)); 2166 opts.sysctl[cnt]->entry = tmp; 2167 2168 ++cnt; 2169 } 2170 2171 opts.sysctl[cnt] = NULL; 2172 2173 return 0; 2174 } 2175 2176 2177 enum { 2178 OCI_LINUX_CGROUPSPATH, 2179 OCI_LINUX_RESOURCES, 2180 OCI_LINUX_SECCOMP, 2181 OCI_LINUX_SYSCTL, 2182 OCI_LINUX_NAMESPACES, 2183 OCI_LINUX_DEVICES, 2184 OCI_LINUX_UIDMAPPINGS, 2185 OCI_LINUX_GIDMAPPINGS, 2186 OCI_LINUX_MASKEDPATHS, 2187 OCI_LINUX_READONLYPATHS, 2188 OCI_LINUX_ROOTFSPROPAGATION, 2189 __OCI_LINUX_MAX, 2190 }; 2191 2192 static const struct blobmsg_policy oci_linux_policy[] = { 2193 [OCI_LINUX_CGROUPSPATH] = { "cgroupsPath", BLOBMSG_TYPE_STRING }, 2194 [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE }, 2195 [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE }, 2196 [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE }, 2197 [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY }, 2198 [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY }, 2199 [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY }, 2200 [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY }, 2201 [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY }, 2202 [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY }, 2203 [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING }, 2204 }; 2205 2206 static int parseOCIlinux(struct blob_attr *msg) 2207 { 2208 struct blob_attr *tb[__OCI_LINUX_MAX]; 2209 struct blob_attr *cur; 2210 int rem; 2211 int res = 0; 2212 char *cgpath; 2213 char cgfullpath[256] = "/sys/fs/cgroup"; 2214 2215 blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg)); 2216 2217 if (tb[OCI_LINUX_NAMESPACES]) { 2218 blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) { 2219 res = parseOCIlinuxns(cur); 2220 if (res) 2221 return 
res; 2222 } 2223 } 2224 2225 if (tb[OCI_LINUX_UIDMAPPINGS]) { 2226 res = parseOCIuidgidmappings(tb[OCI_LINUX_UIDMAPPINGS], 0); 2227 if (res) 2228 return res; 2229 } 2230 2231 if (tb[OCI_LINUX_GIDMAPPINGS]) { 2232 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1); 2233 if (res) 2234 return res; 2235 } 2236 2237 if (tb[OCI_LINUX_READONLYPATHS]) { 2238 blobmsg_for_each_attr(cur, tb[OCI_LINUX_READONLYPATHS], rem) { 2239 res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, 0, NULL, 0); 2240 if (res) 2241 return res; 2242 } 2243 } 2244 2245 if (tb[OCI_LINUX_MASKEDPATHS]) { 2246 blobmsg_for_each_attr(cur, tb[OCI_LINUX_MASKEDPATHS], rem) { 2247 res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, 0, NULL, 0); 2248 if (res) 2249 return res; 2250 } 2251 } 2252 2253 if (tb[OCI_LINUX_SYSCTL]) { 2254 res = parseOCIsysctl(tb[OCI_LINUX_SYSCTL]); 2255 if (res) 2256 return res; 2257 } 2258 2259 if (tb[OCI_LINUX_SECCOMP]) { 2260 opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]); 2261 if (!opts.ociseccomp) 2262 return EINVAL; 2263 } 2264 2265 if (tb[OCI_LINUX_DEVICES]) { 2266 res = parseOCIdevices(tb[OCI_LINUX_DEVICES]); 2267 if (res) 2268 return res; 2269 } 2270 2271 if (tb[OCI_LINUX_CGROUPSPATH]) { 2272 cgpath = blobmsg_get_string(tb[OCI_LINUX_CGROUPSPATH]); 2273 if (cgpath[0] == '/') { 2274 if (strlen(cgpath) + 1 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2275 return E2BIG; 2276 2277 strcat(cgfullpath, cgpath); 2278 } else { 2279 strcat(cgfullpath, "/containers/"); 2280 if (strlen(opts.name) + strlen(cgpath) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2281 return E2BIG; 2282 2283 strcat(cgfullpath, opts.name); /* should be container name rather than jail name */ 2284 strcat(cgfullpath, "/"); 2285 strcat(cgfullpath, cgpath); 2286 } 2287 } else { 2288 strcat(cgfullpath, "/containers/"); 2289 if (2 * strlen(opts.name) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath))) 2290 return E2BIG; 2291 2292 strcat(cgfullpath, 
opts.name); /* should be container name rather than jail name */ 2293 strcat(cgfullpath, "/"); 2294 strcat(cgfullpath, opts.name); /* should be container instance name rather than jail name */ 2295 } 2296 2297 cgroups_init(cgfullpath); 2298 2299 if (tb[OCI_LINUX_RESOURCES]) { 2300 res = parseOCIlinuxcgroups(tb[OCI_LINUX_RESOURCES]); 2301 if (res) 2302 return res; 2303 } 2304 2305 return 0; 2306 } 2307 2308 enum { 2309 OCI_VERSION, 2310 OCI_HOSTNAME, 2311 OCI_PROCESS, 2312 OCI_ROOT, 2313 OCI_MOUNTS, 2314 OCI_HOOKS, 2315 OCI_LINUX, 2316 OCI_ANNOTATIONS, 2317 __OCI_MAX, 2318 }; 2319 2320 static const struct blobmsg_policy oci_policy[] = { 2321 [OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING }, 2322 [OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING }, 2323 [OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE }, 2324 [OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE }, 2325 [OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY }, 2326 [OCI_HOOKS] = { "hooks", BLOBMSG_TYPE_TABLE }, 2327 [OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE }, 2328 [OCI_ANNOTATIONS] = { "annotations", BLOBMSG_TYPE_TABLE }, 2329 }; 2330 2331 static int parseOCI(const char *jsonfile) 2332 { 2333 struct blob_attr *tb[__OCI_MAX]; 2334 struct blob_attr *cur; 2335 int rem; 2336 int res; 2337 2338 blob_buf_init(&ocibuf, 0); 2339 2340 if (!blobmsg_add_json_from_file(&ocibuf, jsonfile)) { 2341 res=ENOENT; 2342 goto errout; 2343 } 2344 2345 blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head)); 2346 2347 if (!tb[OCI_VERSION]) { 2348 res=ENOMSG; 2349 goto errout; 2350 } 2351 2352 if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3)) { 2353 ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION])); 2354 res=ENOTSUP; 2355 goto errout; 2356 } 2357 2358 if (tb[OCI_HOSTNAME]) 2359 opts.hostname = strdup(blobmsg_get_string(tb[OCI_HOSTNAME])); 2360 2361 if (!tb[OCI_PROCESS]) { 2362 res=ENODATA; 2363 goto errout; 2364 } 2365 2366 if ((res = parseOCIprocess(tb[OCI_PROCESS]))) 
2367 goto errout; 2368 2369 if (!tb[OCI_ROOT]) { 2370 res=ENODATA; 2371 goto errout; 2372 } 2373 if ((res = parseOCIroot(jsonfile, tb[OCI_ROOT]))) 2374 goto errout; 2375 2376 if (!tb[OCI_MOUNTS]) { 2377 res=ENODATA; 2378 goto errout; 2379 } 2380 2381 blobmsg_for_each_attr(cur, tb[OCI_MOUNTS], rem) 2382 if ((res = parseOCImount(cur))) 2383 goto errout; 2384 2385 if (tb[OCI_LINUX] && (res = parseOCIlinux(tb[OCI_LINUX]))) 2386 goto errout; 2387 2388 if (tb[OCI_HOOKS] && (res = parseOCIhooks(tb[OCI_HOOKS]))) 2389 goto errout; 2390 2391 if (tb[OCI_ANNOTATIONS]) 2392 opts.annotations = blob_memdup(tb[OCI_ANNOTATIONS]); 2393 2394 errout: 2395 blob_buf_free(&ocibuf); 2396 2397 return res; 2398 } 2399 2400 static int set_oom_score_adj(void) 2401 { 2402 int f; 2403 char fname[32]; 2404 2405 if (!opts.set_oom_score_adj) 2406 return 0; 2407 2408 snprintf(fname, sizeof(fname), "/proc/%u/oom_score_adj", jail_process.pid); 2409 f = open(fname, O_WRONLY | O_TRUNC); 2410 if (f < 0) 2411 return errno; 2412 2413 dprintf(f, "%d", opts.oom_score_adj); 2414 close(f); 2415 2416 return 0; 2417 } 2418 2419 2420 enum { 2421 OCI_STATE_CREATING, 2422 OCI_STATE_CREATED, 2423 OCI_STATE_RUNNING, 2424 OCI_STATE_STOPPED, 2425 }; 2426 2427 static int jail_oci_state = OCI_STATE_CREATED; 2428 static void pipe_send_start_container(struct uloop_timeout *t); 2429 static struct uloop_timeout start_container_timeout = { 2430 .cb = pipe_send_start_container, 2431 }; 2432 2433 static int handle_start(struct ubus_context *ctx, struct ubus_object *obj, 2434 struct ubus_request_data *req, const char *method, 2435 struct blob_attr *msg) 2436 { 2437 if (jail_oci_state != OCI_STATE_CREATED) 2438 return UBUS_STATUS_INVALID_ARGUMENT; 2439 2440 uloop_timeout_add(&start_container_timeout); 2441 2442 return UBUS_STATUS_OK; 2443 } 2444 2445 static struct blob_buf bb; 2446 static int handle_state(struct ubus_context *ctx, struct ubus_object *obj, 2447 struct ubus_request_data *req, const char *method, 2448 struct 
blob_attr *msg) 2449 { 2450 char *statusstr; 2451 2452 switch (jail_oci_state) { 2453 case OCI_STATE_CREATING: 2454 statusstr = "creating"; 2455 break; 2456 case OCI_STATE_CREATED: 2457 statusstr = "created"; 2458 break; 2459 case OCI_STATE_RUNNING: 2460 statusstr = "running"; 2461 break; 2462 case OCI_STATE_STOPPED: 2463 statusstr = "stopped"; 2464 break; 2465 default: 2466 statusstr = "unknown"; 2467 } 2468 2469 blob_buf_init(&bb, 0); 2470 blobmsg_add_string(&bb, "ociVersion", OCI_VERSION_STRING); 2471 blobmsg_add_string(&bb, "id", opts.name); 2472 blobmsg_add_string(&bb, "status", statusstr); 2473 if (jail_oci_state == OCI_STATE_CREATED || 2474 jail_oci_state == OCI_STATE_RUNNING) 2475 blobmsg_add_u32(&bb, "pid", jail_process.pid); 2476 2477 blobmsg_add_string(&bb, "bundle", opts.ocibundle); 2478 2479 if (opts.annotations) 2480 blobmsg_add_blob(&bb, opts.annotations); 2481 2482 ubus_send_reply(ctx, req, bb.head); 2483 2484 return UBUS_STATUS_OK; 2485 } 2486 2487 enum { 2488 CONTAINER_KILL_ATTR_SIGNAL, 2489 __CONTAINER_KILL_ATTR_MAX, 2490 }; 2491 2492 static const struct blobmsg_policy container_kill_attrs[__CONTAINER_KILL_ATTR_MAX] = { 2493 [CONTAINER_KILL_ATTR_SIGNAL] = { "signal", BLOBMSG_TYPE_INT32 }, 2494 }; 2495 2496 static int 2497 container_handle_kill(struct ubus_context *ctx, struct ubus_object *obj, 2498 struct ubus_request_data *req, const char *method, 2499 struct blob_attr *msg) 2500 { 2501 struct blob_attr *tb[__CONTAINER_KILL_ATTR_MAX], *cur; 2502 int sig = SIGTERM; 2503 2504 blobmsg_parse(container_kill_attrs, __CONTAINER_KILL_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg)); 2505 2506 cur = tb[CONTAINER_KILL_ATTR_SIGNAL]; 2507 if (cur) 2508 sig = blobmsg_get_u32(cur); 2509 2510 if (jail_oci_state == OCI_STATE_CREATING) 2511 return UBUS_STATUS_NOT_FOUND; 2512 2513 if (kill(jail_process.pid, sig) == 0) 2514 return 0; 2515 2516 switch (errno) { 2517 case EINVAL: return UBUS_STATUS_INVALID_ARGUMENT; 2518 case EPERM: return 
UBUS_STATUS_PERMISSION_DENIED; 2519 case ESRCH: return UBUS_STATUS_NOT_FOUND; 2520 } 2521 2522 return UBUS_STATUS_UNKNOWN_ERROR; 2523 } 2524 2525 static int 2526 jail_writepid(pid_t pid) 2527 { 2528 FILE *_pidfile; 2529 2530 if (!opts.pidfile) 2531 return 0; 2532 2533 _pidfile = fopen(opts.pidfile, "w"); 2534 if (_pidfile == NULL) 2535 return errno; 2536 2537 if (fprintf(_pidfile, "%d\n", pid) < 0) { 2538 fclose(_pidfile); 2539 return errno; 2540 } 2541 2542 if (fclose(_pidfile)) 2543 return errno; 2544 2545 return 0; 2546 } 2547 2548 static int checkpath(const char *path) 2549 { 2550 int dirfd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC); 2551 if (dirfd < 0) { 2552 ERROR("path %s open failed %m\n", path); 2553 return -1; 2554 } 2555 close(dirfd); 2556 2557 return 0; 2558 } 2559 2560 static struct ubus_method container_methods[] = { 2561 UBUS_METHOD_NOARG("start", handle_start), 2562 UBUS_METHOD_NOARG("state", handle_state), 2563 UBUS_METHOD("kill", container_handle_kill, container_kill_attrs), 2564 }; 2565 2566 static struct ubus_object_type container_object_type = 2567 UBUS_OBJECT_TYPE("container", container_methods); 2568 2569 static struct ubus_object container_object = { 2570 .type = &container_object_type, 2571 .methods = container_methods, 2572 .n_methods = ARRAY_SIZE(container_methods), 2573 }; 2574 2575 static void post_main(struct uloop_timeout *t); 2576 static struct uloop_timeout post_main_timeout = { 2577 .cb = post_main, 2578 }; 2579 static int netns_fd; 2580 static int pidns_fd; 2581 #ifdef CLONE_NEWTIME 2582 static int timens_fd; 2583 #endif 2584 static void post_create_runtime(void); 2585 2586 struct env_e { 2587 struct list_head list; 2588 char *envarg; 2589 }; 2590 2591 int main(int argc, char **argv) 2592 { 2593 uid_t uid = getuid(); 2594 const char log[] = "/dev/log"; 2595 const char ubus[] = "/var/run/ubus/ubus.sock"; 2596 int ret = EXIT_FAILURE; 2597 int ch; 2598 char *tmp; 2599 struct list_head envl = LIST_HEAD_INIT(envl); 2600 struct 
env_e *enve, *tmpenve; 2601 unsigned short int envn = 0, envc = 0; 2602 2603 if (uid) { 2604 ERROR("not root, aborting: %m\n"); 2605 return EXIT_FAILURE; 2606 } 2607 2608 /* those are filehandlers, so -1 indicates unused */ 2609 opts.setns.pid = -1; 2610 opts.setns.net = -1; 2611 opts.setns.ns = -1; 2612 opts.setns.ipc = -1; 2613 opts.setns.uts = -1; 2614 opts.setns.user = -1; 2615 opts.setns.cgroup = -1; 2616 #ifdef CLONE_NEWTIME 2617 opts.setns.time = -1; 2618 #endif 2619 2620 /* default 5 seconds timeout after SIGTERM before SIGKILL is sent */ 2621 opts.term_timeout = 5; 2622 2623 umask(022); 2624 mount_list_init(); 2625 init_library_search(); 2626 cgroups_prepare(); 2627 exit_from_child = false; 2628 2629 while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) { 2630 switch (ch) { 2631 case 'd': 2632 debug = atoi(optarg); 2633 break; 2634 case 'e': 2635 enve = calloc(1, sizeof(*enve)); 2636 enve->envarg = optarg; 2637 list_add_tail(&enve->list, &envl); 2638 break; 2639 case 'p': 2640 opts.namespace |= CLONE_NEWNS; 2641 opts.procfs = 1; 2642 break; 2643 case 'o': 2644 opts.namespace |= CLONE_NEWNS; 2645 opts.ronly = 1; 2646 break; 2647 case 'f': 2648 opts.namespace |= CLONE_NEWUSER; 2649 break; 2650 case 'F': 2651 opts.namespace |= CLONE_NEWCGROUP; 2652 break; 2653 case 'R': 2654 opts.extroot = realpath(optarg, NULL); 2655 break; 2656 case 's': 2657 opts.namespace |= CLONE_NEWNS; 2658 opts.sysfs = 1; 2659 break; 2660 case 'S': 2661 opts.seccomp = optarg; 2662 add_mount_bind(optarg, 1, -1); 2663 break; 2664 case 'C': 2665 opts.capabilities = optarg; 2666 break; 2667 case 'c': 2668 opts.no_new_privs = 1; 2669 break; 2670 case 'n': 2671 opts.name = optarg; 2672 break; 2673 case 'N': 2674 opts.namespace |= CLONE_NEWNET; 2675 break; 2676 case 'h': 2677 opts.namespace |= CLONE_NEWUTS; 2678 opts.hostname = strdup(optarg); 2679 break; 2680 case 'j': 2681 jail_join_ns(optarg); 2682 break; 2683 case 'r': 2684 opts.namespace |= CLONE_NEWNS; 2685 tmp = strchr(optarg, ':'); 2686 
if (tmp) { 2687 *(tmp++) = '\0'; 2688 add_2paths_and_deps(optarg, tmp, 1, 0, 0); 2689 } else { 2690 add_path_and_deps(optarg, 1, 0, 0); 2691 } 2692 break; 2693 case 'w': 2694 opts.namespace |= CLONE_NEWNS; 2695 tmp = strchr(optarg, ':'); 2696 if (tmp) { 2697 *(tmp++) = '\0'; 2698 add_2paths_and_deps(optarg, tmp, 0, 0, 0); 2699 } else { 2700 add_path_and_deps(optarg, 0, 0, 0); 2701 } 2702 break; 2703 case 'u': 2704 opts.namespace |= CLONE_NEWNS; 2705 add_mount_bind(ubus, 0, -1); 2706 break; 2707 case 'l': 2708 opts.namespace |= CLONE_NEWNS; 2709 add_mount_bind(log, 0, -1); 2710 break; 2711 case 'U': 2712 opts.user = optarg; 2713 break; 2714 case 'G': 2715 opts.group = optarg; 2716 break; 2717 case 'O': 2718 opts.overlaydir = realpath(optarg, NULL); 2719 break; 2720 case 't': 2721 opts.term_timeout = atoi(optarg); 2722 break; 2723 case 'T': 2724 opts.tmpoverlaysize = optarg; 2725 break; 2726 case 'E': 2727 opts.require_jail = 1; 2728 break; 2729 case 'y': 2730 opts.console = 1; 2731 break; 2732 case 'J': 2733 opts.ocibundle = optarg; 2734 break; 2735 case 'i': 2736 opts.immediately = true; 2737 break; 2738 case 'P': 2739 opts.pidfile = optarg; 2740 break; 2741 } 2742 } 2743 2744 if (opts.namespace && !opts.ocibundle) 2745 opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID; 2746 2747 /* 2748 * env import from cmdline is not available for OCI containers 2749 */ 2750 if (opts.ocibundle && !list_empty(&envl)) { 2751 ret=-ENOTSUP; 2752 goto errout; 2753 } 2754 2755 /* 2756 * prepare list of env variables to import for slim containers 2757 */ 2758 if (!list_empty(&envl)) { 2759 list_for_each_entry(enve, &envl, list) 2760 ++envn; 2761 2762 opts.envp = calloc(1 + envn, sizeof(char*)); 2763 list_for_each_entry_safe(enve, tmpenve, &envl, list) { 2764 tmp = getenv(enve->envarg); 2765 if (tmp) { 2766 ret = asprintf(&opts.envp[envc++], "%s=%s", enve->envarg, tmp); 2767 if (ret < 0) { 2768 ERROR("filed to handle envargs %s\n", tmp); 2769 free(enve); 2770 goto errout; 2771 } 2772 } 2773 
2774 list_del(&enve->list); 2775 free(enve); 2776 } 2777 2778 opts.envp[envc] = NULL; 2779 } 2780 2781 /* 2782 * uid in parent user namespace representing root user in new 2783 * user namespace, defaults to nobody unless specified in uidMappings 2784 */ 2785 opts.root_map_uid = 65534; 2786 2787 if (opts.capabilities && parseOCIcapabilities_from_file(&opts.capset, opts.capabilities)) { 2788 ERROR("failed to read capabilities from file %s\n", opts.capabilities); 2789 ret=-1; 2790 goto errout; 2791 } 2792 2793 if (opts.ocibundle) { 2794 char *jsonfile; 2795 int ocires; 2796 2797 if (!opts.name) { 2798 ERROR("OCI bundle needs a named jail\n"); 2799 ret=-1; 2800 goto errout; 2801 } 2802 if (asprintf(&jsonfile, "%s/config.json", opts.ocibundle) < 0) { 2803 ret=-ENOMEM; 2804 goto errout; 2805 } 2806 ocires = parseOCI(jsonfile); 2807 free(jsonfile); 2808 if (ocires) { 2809 ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires); 2810 ret=ocires; 2811 goto errout; 2812 } 2813 } 2814 2815 if (opts.namespace & CLONE_NEWNET) { 2816 if (!opts.name) { 2817 ERROR("netns needs a named jail\n"); 2818 ret=-1; 2819 goto errout; 2820 } 2821 } 2822 2823 2824 if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) { 2825 ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize); 2826 ret=-1; 2827 goto errout; 2828 } 2829 2830 if (opts.extroot && checkpath(opts.extroot)) { 2831 ERROR("invalid rootfs path '%s'", opts.extroot); 2832 ret=-1; 2833 goto errout; 2834 } 2835 2836 if (opts.overlaydir && checkpath(opts.overlaydir)) { 2837 ERROR("invalid rootfs overlay path '%s'", opts.overlaydir); 2838 ret=-1; 2839 goto errout; 2840 } 2841 2842 /* no <binary> param found */ 2843 if (!opts.ocibundle && (argc - optind < 1)) { 2844 usage(); 2845 ret=EXIT_FAILURE; 2846 goto errout; 2847 } 2848 if (!(opts.ocibundle||opts.namespace||opts.capabilities||opts.seccomp|| 2849 (opts.setns.net != -1) || 2850 (opts.setns.ns != -1) || 2851 (opts.setns.ipc != -1) || 2852 
(opts.setns.uts != -1) || 2853 (opts.setns.user != -1) || 2854 (opts.setns.cgroup != -1))) { 2855 ERROR("Not using namespaces, capabilities or seccomp !!!\n\n"); 2856 usage(); 2857 ret=EXIT_FAILURE; 2858 goto errout; 2859 } 2860 DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n", 2861 opts.namespace, 2862 opts.capset.apply, 2863 opts.seccomp != 0 || opts.ociseccomp != 0); 2864 2865 uloop_init(); 2866 signals_init(); 2867 2868 parent_ctx = ubus_connect(NULL); 2869 if (!parent_ctx) { 2870 ERROR("Connection to ubus failed\n"); 2871 ret = -ECONNREFUSED; 2872 goto errout; 2873 } 2874 2875 ubus_add_uloop(parent_ctx); 2876 2877 if (opts.ocibundle) { 2878 char *objname; 2879 if (asprintf(&objname, "container.%s", opts.name) < 0) { 2880 ret=-ENOMEM; 2881 goto errout; 2882 } 2883 2884 container_object.name = objname; 2885 ret = ubus_add_object(parent_ctx, &container_object); 2886 if (ret) { 2887 ERROR("Failed to add object: %s\n", ubus_strerror(ret)); 2888 ret=-1; 2889 goto errout; 2890 } 2891 } 2892 2893 /* deliberately not using 'else' on unrelated conditional branches */ 2894 if (!opts.ocibundle) { 2895 /* allocate NULL-terminated array for argv */ 2896 opts.jail_argv = calloc(1 + argc - optind, sizeof(void *)); 2897 if (!opts.jail_argv) { 2898 ret=EXIT_FAILURE; 2899 goto errout; 2900 } 2901 for (size_t s = optind; s < argc; s++) 2902 opts.jail_argv[s - optind] = strdup(argv[s]); 2903 2904 if (opts.namespace & CLONE_NEWUSER) 2905 get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid); 2906 } 2907 2908 if (!opts.extroot) { 2909 if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) { 2910 ERROR("failed to load dependencies\n"); 2911 ret=-1; 2912 goto errout; 2913 } 2914 } 2915 2916 if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) { 2917 ERROR("failed to load libpreload-seccomp.so\n"); 2918 opts.seccomp = 0; 2919 if (opts.require_jail) { 2920 ret=-1; 2921 goto errout; 2922 } 2923 } 2924 2925 
uloop_timeout_add(&post_main_timeout); 2926 uloop_run(); 2927 2928 errout: 2929 if (opts.ocibundle) 2930 cgroups_free(); 2931 2932 free_opts(true); 2933 2934 return ret; 2935 } 2936 2937 static void post_main(struct uloop_timeout *t) 2938 { 2939 if (apply_rlimits()) { 2940 ERROR("error applying resource limits\n"); 2941 free_and_exit(EXIT_FAILURE); 2942 } 2943 2944 if (opts.name) 2945 prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL); 2946 2947 if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0) 2948 free_and_exit(-1); 2949 2950 if (has_namespaces()) { 2951 if (opts.namespace & CLONE_NEWNS) { 2952 if (!opts.extroot && (opts.user || opts.group)) { 2953 add_mount_bind("/etc/passwd", 1, -1); 2954 add_mount_bind("/etc/group", 1, -1); 2955 } 2956 2957 #if defined(__GLIBC__) 2958 if (!opts.extroot) 2959 add_mount_bind("/etc/nsswitch.conf", 1, -1); 2960 #endif 2961 if (opts.setns.ns == -1) { 2962 if (!(opts.namespace & CLONE_NEWNET)) { 2963 add_mount_bind("/etc/resolv.conf", 1, 0); 2964 } else { 2965 /* new mount namespace to provide /dev/resolv.conf.d */ 2966 char hostdir[PATH_MAX]; 2967 2968 snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name); 2969 if (mkdir_p(hostdir, 0755)) { 2970 ERROR("mkdir(%s) failed: %m\n", hostdir); 2971 free_and_exit(-1); 2972 } 2973 add_mount(hostdir, "/dev/resolv.conf.d", NULL, 2974 MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, 0, NULL, 0); 2975 } 2976 } 2977 /* default mounts */ 2978 add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "size=1M", -1); 2979 add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0); 2980 2981 if (opts.procfs || opts.ocibundle) { 2982 add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0, NULL, -1); 2983 2984 /* 2985 * hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only 2986 * which cannot be expressed with OCI spec, but happends to be very 
useful. 2987 * Only apply it if '/proc/sys' is not already listed as mount, maskedPath or 2988 * readonlyPath. 2989 * If not running in a new network namespace, only make /proc/sys read-only. 2990 * If running in a new network namespace, temporarily stash (ie. mount-bind) 2991 * /proc/sys/net into (totally unrelated, but surely existing) /proc/self/net. 2992 * Then we mount-bind /proc/sys read-only and then mount-move /proc/self/net into 2993 * /proc/sys/net. 2994 * This works because mounts are executed in incrementing strcmp() order and 2995 * /proc/self/net appears there before /proc/sys/net and hence the operation 2996 * succeeds as the bind-mount of /proc/self/net is performed first and then 2997 * move-mount of /proc/sys/net follows because 'e' preceeds 'y' in the ASCII 2998 * table (and in the alphabet). 2999 */ 3000 if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, 0, NULL, -1)) 3001 if (opts.namespace & CLONE_NEWNET) 3002 if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, 0, NULL, -1)) 3003 add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, 0, NULL, -1); 3004 3005 } 3006 if (opts.sysfs || opts.ocibundle) 3007 add_mount("sysfs", "/sys", "sysfs", MS_RELATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0, NULL, -1); 3008 3009 if (opts.ocibundle) 3010 add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, 0, "mode=1777", -1); 3011 3012 } 3013 3014 if (opts.setns.pid != -1) { 3015 pidns_fd = ns_open_pid("pid", getpid()); 3016 setns_open(CLONE_NEWPID); 3017 } else { 3018 pidns_fd = -1; 3019 } 3020 3021 #ifdef CLONE_NEWTIME 3022 if (opts.setns.time != -1) { 3023 timens_fd = ns_open_pid("time", getpid()); 3024 setns_open(CLONE_NEWTIME); 3025 } else { 3026 timens_fd = -1; 3027 } 3028 #endif 3029 3030 if (opts.namespace & CLONE_NEWUSER) { 3031 if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) { 3032 ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n"); 3033 free_and_exit(EXIT_FAILURE); 3034 } 3035 if 
(seteuid(opts.root_map_uid)) { 3036 ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid); 3037 free_and_exit(EXIT_FAILURE); 3038 } 3039 } 3040 3041 jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | (opts.namespace & (~CLONE_NEWCGROUP)), NULL); 3042 } else { 3043 jail_process.pid = fork(); 3044 } 3045 3046 if (jail_process.pid > 0) { 3047 /* parent process */ 3048 char sig_buf[1]; 3049 3050 uloop_process_add(&jail_process); 3051 jail_running = 1; 3052 if (seteuid(0)) { 3053 ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid); 3054 free_and_exit(EXIT_FAILURE); 3055 } 3056 3057 prctl(PR_SET_SECUREBITS, 0); 3058 3059 if (pidns_fd != -1) { 3060 setns(pidns_fd, CLONE_NEWPID); 3061 close(pidns_fd); 3062 } 3063 #ifdef CLONE_NEWTIME 3064 if (timens_fd != -1) { 3065 setns(timens_fd, CLONE_NEWTIME); 3066 close(timens_fd); 3067 } 3068 #endif 3069 if (opts.setns.net != -1) 3070 close(opts.setns.net); 3071 if (opts.setns.ns != -1) 3072 close(opts.setns.ns); 3073 if (opts.setns.ipc != -1) 3074 close(opts.setns.ipc); 3075 if (opts.setns.uts != -1) 3076 close(opts.setns.uts); 3077 if (opts.setns.user != -1) 3078 close(opts.setns.user); 3079 if (opts.setns.cgroup != -1) 3080 close(opts.setns.cgroup); 3081 close(pipes[1]); 3082 close(pipes[2]); 3083 if (read(pipes[0], sig_buf, 1) < 1) { 3084 ERROR("can't read from child\n"); 3085 free_and_exit(-1); 3086 } 3087 close(pipes[0]); 3088 set_oom_score_adj(); 3089 3090 if (opts.ocibundle) 3091 cgroups_apply(jail_process.pid); 3092 3093 if (opts.namespace & CLONE_NEWUSER) { 3094 if (write_setgroups(jail_process.pid, true)) { 3095 ERROR("can't write setgroups\n"); 3096 free_and_exit(-1); 3097 } 3098 if (!opts.uidmap) { 3099 bool has_gr = (opts.gr_gid != -1); 3100 if (opts.pw_uid != -1) { 3101 write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid); 3102 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid); 3103 } else { 3104 write_single_uid_gid_map(jail_process.pid, 0, 65534); 3105 
write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534); 3106 } 3107 } else { 3108 write_uid_gid_map(jail_process.pid, 0, opts.uidmap); 3109 if (opts.gidmap) 3110 write_uid_gid_map(jail_process.pid, 1, opts.gidmap); 3111 } 3112 } 3113 3114 if (opts.namespace & CLONE_NEWNET) 3115 jail_network_start(parent_ctx, opts.name, jail_process.pid); 3116 3117 if (jail_writepid(jail_process.pid)) { 3118 ERROR("failed to write pidfile: %m\n"); 3119 free_and_exit(-1); 3120 } 3121 } else if (jail_process.pid == 0) { 3122 /* fork child process */ 3123 free_and_exit(exec_jail(NULL)); 3124 } else { 3125 ERROR("failed to clone/fork: %m\n"); 3126 free_and_exit(EXIT_FAILURE); 3127 } 3128 run_hooks(opts.hooks.createRuntime, post_create_runtime); 3129 } 3130 3131 static void post_poststart(void); 3132 static void post_create_runtime(void) 3133 { 3134 char sig_buf[1]; 3135 3136 sig_buf[0] = 'O'; 3137 if (write(pipes[3], sig_buf, 1) < 0) { 3138 ERROR("can't write to child\n"); 3139 free_and_exit(-1); 3140 } 3141 3142 jail_oci_state = OCI_STATE_CREATED; 3143 if (opts.ocibundle && !opts.immediately) 3144 uloop_run(); /* wait for 'start' command via ubus */ 3145 else 3146 pipe_send_start_container(NULL); 3147 } 3148 3149 static void pipe_send_start_container(struct uloop_timeout *t) 3150 { 3151 char sig_buf[1]; 3152 3153 jail_oci_state = OCI_STATE_RUNNING; 3154 sig_buf[0] = '!'; 3155 if (write(pipes[3], sig_buf, 1) < 0) { 3156 ERROR("can't write to child\n"); 3157 free_and_exit(-1); 3158 } 3159 close(pipes[3]); 3160 3161 run_hooks(opts.hooks.poststart, post_poststart); 3162 } 3163 3164 static void post_poststart(void) 3165 { 3166 uloop_run(); /* idle here while jail is running */ 3167 if (jail_running) { 3168 DEBUG("uloop interrupted, killing jail process\n"); 3169 kill(jail_process.pid, SIGTERM); 3170 uloop_timeout_set(&jail_process_timeout, 1000); 3171 uloop_run(); 3172 } 3173 uloop_done(); 3174 poststop(); 3175 } 3176 3177 static void post_poststop(void); 3178 static void 
poststop(void) { 3179 if (opts.namespace & CLONE_NEWNET) { 3180 setns(netns_fd, CLONE_NEWNET); 3181 jail_network_stop(); 3182 close(netns_fd); 3183 } 3184 run_hooks(opts.hooks.poststop, post_poststop); 3185 } 3186 3187 static void post_poststop(void) 3188 { 3189 free_opts(true); 3190 if (parent_ctx) 3191 ubus_free(parent_ctx); 3192 3193 exit(jail_return_code); 3194 } 3195
/* This page was automatically generated by LXR 0.3.1. • OpenWrt */