• source navigation  • diff markup  • identifier search  • freetext search  • 

Sources/procd/jail/cgroups-bpf.c

  1 /*
  2  * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
  3  *
  4  * This program is free software; you can redistribute it and/or modify
  5  * it under the terms of the GNU Lesser General Public License version 2.1
  6  * as published by the Free Software Foundation
  7  *
  8  * This program is distributed in the hope that it will be useful,
  9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11  * GNU General Public License for more details.
 12  *
 13  * somehow emulate devices.allow/devices.deny using eBPF
 14  *
 15  * OCI run-time spec defines the syntax for allowing/denying access
 16  * to devices according to the definition of cgroup-v1 in the Kernel
 17  * as described in Documentation/admin-guide/cgroup-v1.
 18  */
 19 
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

#include <linux/bpf.h>
#ifdef __GLIBC__
#include <sys/cdefs.h>
#else
#include <sys/reg.h>
#endif
#include <sys/syscall.h>

#include <libubox/blobmsg.h>
#include <libubox/blobmsg_json.h>
#include <libubox/list.h>

#include "cgroups.h"
#include "cgroups-bpf.h"
#include "log.h"
 36 
 37 static struct bpf_insn *program = NULL;
 38 static int bpf_total_insn = 0;
 39 static const char *license = "GPL";
 40 
 41 static int
 42 syscall_bpf (int cmd, union bpf_attr *attr, unsigned int size)
 43 {
 44         return (int) syscall (__NR_bpf, cmd, attr, size);
 45 }
 46 
 47 /* from crun/src/libcrun/ebpf.c */
 48 #define BPF_ALU32_IMM(OP, DST, IMM) \
 49         ((struct bpf_insn){ .code = BPF_ALU | BPF_OP (OP) | BPF_K, .dst_reg = DST, .src_reg = 0, .off = 0, .imm = IMM })
 50 
 51 #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
 52         ((struct bpf_insn){                    \
 53                 .code = BPF_LDX | BPF_SIZE (SIZE) | BPF_MEM, .dst_reg = DST, .src_reg = SRC, .off = OFF, .imm = 0 })
 54 
 55 #define BPF_MOV64_REG(DST, SRC) \
 56         ((struct bpf_insn){ .code = BPF_ALU64 | BPF_MOV | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = 0, .imm = 0 })
 57 
 58 #define BPF_JMP_A(OFF) \
 59         ((struct bpf_insn){ .code = BPF_JMP | BPF_JA, .dst_reg = 0, .src_reg = 0, .off = OFF, .imm = 0 })
 60 
 61 #define BPF_JMP_IMM(OP, DST, IMM, OFF) \
 62         ((struct bpf_insn){ .code = BPF_JMP | BPF_OP (OP) | BPF_K, .dst_reg = DST, .src_reg = 0, .off = OFF, .imm = IMM })
 63 
 64 #define BPF_JMP_REG(OP, DST, SRC, OFF) \
 65         ((struct bpf_insn){ .code = BPF_JMP | BPF_OP (OP) | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = OFF, .imm = 0 })
 66 
 67 #define BPF_MOV64_IMM(DST, IMM) \
 68         ((struct bpf_insn){ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = DST, .src_reg = 0, .off = 0, .imm = IMM })
 69 
 70 #define BPF_MOV32_REG(DST, SRC) \
 71         ((struct bpf_insn){ .code = BPF_ALU | BPF_MOV | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = 0, .imm = 0 })
 72 
 73 #define BPF_EXIT_INSN() \
 74         ((struct bpf_insn){ .code = BPF_JMP | BPF_EXIT, .dst_reg = 0, .src_reg = 0, .off = 0, .imm = 0 })
 75 
 76 /* taken from systemd.  */
 77 static const struct bpf_insn pre_insn[] = {
 78         /* type -> R2.  */
 79         BPF_LDX_MEM (BPF_W, BPF_REG_2, BPF_REG_1, 0),
 80         BPF_ALU32_IMM (BPF_AND, BPF_REG_2, 0xFFFF),
 81         /* access -> R3.  */
 82         BPF_LDX_MEM (BPF_W, BPF_REG_3, BPF_REG_1, 0),
 83         BPF_ALU32_IMM (BPF_RSH, BPF_REG_3, 16),
 84         /* major -> R4.  */
 85         BPF_LDX_MEM (BPF_W, BPF_REG_4, BPF_REG_1, 4),
 86         /* minor -> R5.  */
 87         BPF_LDX_MEM (BPF_W, BPF_REG_5, BPF_REG_1, 8),
 88 };
 89 
 90 enum {
 91         OCI_LINUX_CGROUPS_DEVICES_ALLOW,
 92         OCI_LINUX_CGROUPS_DEVICES_TYPE,
 93         OCI_LINUX_CGROUPS_DEVICES_MAJOR,
 94         OCI_LINUX_CGROUPS_DEVICES_MINOR,
 95         OCI_LINUX_CGROUPS_DEVICES_ACCESS,
 96         __OCI_LINUX_CGROUPS_DEVICES_MAX,
 97 };
 98 
 99 static const struct blobmsg_policy oci_linux_cgroups_devices_policy[] = {
100         [OCI_LINUX_CGROUPS_DEVICES_ALLOW] = { "allow", BLOBMSG_TYPE_BOOL },
101         [OCI_LINUX_CGROUPS_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
102         [OCI_LINUX_CGROUPS_DEVICES_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
103         [OCI_LINUX_CGROUPS_DEVICES_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
104         [OCI_LINUX_CGROUPS_DEVICES_ACCESS] = { "access", BLOBMSG_TYPE_STRING },
105 };
106 
/*
 * cgroup-v1 devices consist of a (default) behaviour plus a list of
 * exceptions; the datatypes below mirror the legacy kernel code.
 *
 * DEVCG_DEV_ALL covers both device types, DEVCG_ACC_ALL all three kinds
 * of access.
 */
#define DEVCG_DEV_ALL \
        (BPF_DEVCG_DEV_CHAR | BPF_DEVCG_DEV_BLOCK)
#define DEVCG_ACC_ALL \
        (BPF_DEVCG_ACC_MKNOD | BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE)
113 
/* verdict applied to an access that no exception matches */
enum devcg_behavior {
        DEVCG_DEFAULT_NONE = 0,
        DEVCG_DEFAULT_ALLOW,    /* allow unless an exception says otherwise */
        DEVCG_DEFAULT_DENY,     /* deny unless an exception says otherwise */
};
119 
120 struct dev_exception_item {
121         uint32_t                major, minor;
122         short                   type;
123         short                   access;
124         struct list_head        list;
125         bool                    allow;
126 };
127 
128 /*
129  * add a bunch of default rules
130  */
131 static int add_default_exceptions(struct list_head *exceptions)
132 {
133         int i, ret = 0;
134         struct dev_exception_item *cur;
135         /* from crun/src/libcrun/cgroup.c */
136         const struct dev_exception_item defrules[] = {
137                 /* always allow mknod */
138                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = ~0,  .minor = ~0,  .access = BPF_DEVCG_ACC_MKNOD },
139                 { .allow = true, .type = BPF_DEVCG_DEV_BLOCK, .major = ~0,  .minor = ~0,  .access = BPF_DEVCG_ACC_MKNOD },
140                 /* /dev/null */
141                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 3,   .access = DEVCG_ACC_ALL },
142                 /* /dev/random */
143                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 8,   .access = DEVCG_ACC_ALL },
144                 /* /dev/full */
145                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 7,   .access = DEVCG_ACC_ALL },
146                 /* /dev/tty */
147                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 5,   .minor = 0,   .access = DEVCG_ACC_ALL },
148                 /* /dev/zero */
149                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 5,   .access = DEVCG_ACC_ALL },
150                 /* /dev/urandom */
151                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 9,   .access = DEVCG_ACC_ALL },
152                 /* /dev/console */
153                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 5,   .minor = 1,   .access = DEVCG_ACC_ALL },
154                 /* /dev/pts/[0-255] */
155                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 136, .minor = ~0,  .access = DEVCG_ACC_ALL },
156                 /* /dev/ptmx */
157                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 5,   .minor = 2,   .access = DEVCG_ACC_ALL },
158                 /* /dev/net/tun */
159                 { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 10,  .minor = 200, .access = DEVCG_ACC_ALL },
160         };
161 
162         for (i = 0; i < (sizeof(defrules) / sizeof(struct dev_exception_item)); ++i) {
163                 cur = malloc(sizeof(struct dev_exception_item));
164                 if (!cur) {
165                         ret = ENOMEM;
166                         break;
167                 }
168                 /* add defaults to list in reverse order (last item will be first in list) */
169                 memcpy(cur, &defrules[i], sizeof(struct dev_exception_item));
170                 list_add(&cur->list, exceptions);
171         }
172 
173         return ret;
174 }
175 
176 /*
177  * free all exceptions in the list
178  */
179 static void flush_exceptions(struct list_head *freelist)
180 {
181         struct dev_exception_item *dl, *dln;
182 
183         if (!list_empty(freelist))
184                 list_for_each_entry_safe(dl, dln, freelist, list) {
185                         list_del(&dl->list);
186                         free(dl);
187                 }
188 }
189 
190 /*
191  * parse OCI cgroups devices and translate into cgroups-v2 eBPF program
192  */
193 int parseOCIlinuxcgroups_devices(struct blob_attr *msg)
194 {
195         struct blob_attr *tb[__OCI_LINUX_CGROUPS_DEVICES_MAX];
196         struct blob_attr *cur;
197         int rem, ret = 0;
198         int bpf_type, bpf_access;
199         unsigned char acidx;
200         bool allow = false,
201              has_access = false,
202              has_type = false,
203              has_major = false,
204              has_minor = false;
205         int total_ins = 0,
206             cur_ins = 0,
207             pre_insn_len = sizeof(pre_insn) / sizeof(struct bpf_insn),
208             next_ins;
209         char *access, *devtype;
210         uint32_t devmajor, devminor;
211         struct dev_exception_item *dl;
212         struct list_head exceptions;
213         enum devcg_behavior behavior = DEVCG_DEFAULT_ALLOW;
214         INIT_LIST_HEAD(&exceptions);
215 
216         /* parse according to OCI spec */
217         blobmsg_for_each_attr(cur, msg, rem) {
218                 blobmsg_parse(oci_linux_cgroups_devices_policy, __OCI_LINUX_CGROUPS_DEVICES_MAX,
219                               tb, blobmsg_data(cur), blobmsg_len(cur));
220 
221                 if (!tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]) {
222                         ret = EINVAL;
223                         goto out;
224                 }
225 
226                 allow = blobmsg_get_bool(tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]);
227 
228                 bpf_access = 0;
229                 if (tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]) {
230                         access = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]);
231                         if ((strlen(access) > 3) || (strlen(access) == 0)) {
232                                 ret = EINVAL;
233                                 goto out;
234                         }
235 
236                         for (acidx = 0; acidx < strlen(access); ++acidx) {
237                                 switch (access[acidx]) {
238                                         case 'r':
239                                                 bpf_access |= BPF_DEVCG_ACC_READ;
240                                                 break;
241                                         case 'w':
242                                                 bpf_access |= BPF_DEVCG_ACC_WRITE;
243                                                 break;
244                                         case 'm':
245                                                 bpf_access |= BPF_DEVCG_ACC_MKNOD;
246                                                 break;
247                                         default:
248                                                 ret = EINVAL;
249                                                 goto out;
250                                 }
251                         }
252                 }
253 
254                 if (!bpf_access)
255                         bpf_access = DEVCG_ACC_ALL;
256 
257                 bpf_type = 0;
258                 if (tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]) {
259                         devtype = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]);
260 
261                         switch (devtype[0]) {
262                                 case 'c':
263                                         bpf_type = BPF_DEVCG_DEV_CHAR;
264                                         break;
265                                 case 'b':
266                                         bpf_type = BPF_DEVCG_DEV_BLOCK;
267                                         break;
268                                 case 'a':
269                                         bpf_type = DEVCG_DEV_ALL;
270                                         break;
271                                 default:
272                                         ret = EINVAL;
273                                         goto out;
274                         }
275                 }
276 
277                 if (!bpf_type)
278                         bpf_type = DEVCG_DEV_ALL;
279 
280                 if (tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR])
281                         devmajor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR]);
282                 else
283                         devmajor = ~0;
284 
285                 if (tb[OCI_LINUX_CGROUPS_DEVICES_MINOR])
286                         devminor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MINOR]);
287                 else
288                         devminor = ~0;
289 
290                 if (bpf_type == DEVCG_DEV_ALL) {
291                         /* wildcard => change default policy and flush all existing rules */
292                         flush_exceptions(&exceptions);
293                         behavior = allow?DEVCG_DEFAULT_ALLOW:DEVCG_DEFAULT_DENY;
294                 } else {
295                         /* allocate and populate record for exception */
296                         dl = malloc(sizeof(struct dev_exception_item));
297                         if (!dl) {
298                                 ret = ENOSPC;
299                                 break;
300                         }
301                         dl->allow = allow;
302                         dl->type = bpf_type;
303                         dl->access = bpf_access;
304                         dl->major = devmajor;
305                         dl->minor = devminor;
306 
307                         /* push to exceptions list, last goes first */
308                         list_add(&dl->list, &exceptions);
309                 }
310         }
311         if (ret)
312                 goto out;
313 
314         /* add default rules */
315         ret = add_default_exceptions(&exceptions);
316         if (ret)
317                 goto out;
318 
319         /* calculate number of instructions to allocate */
320         list_for_each_entry(dl, &exceptions, list) {
321                 has_access = dl->access != DEVCG_ACC_ALL;
322                 has_type = dl->type != DEVCG_DEV_ALL;
323                 has_major = dl->major != ~0;
324                 has_minor = dl->minor != ~0;
325 
326                 total_ins += (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 2;
327         }
328 
329         /* acccount for loader instructions */
330         total_ins += pre_insn_len;
331 
332         /* final accept/deny block */
333         total_ins += 2;
334 
335         /* allocate memory for eBPF program */
336         program = calloc(total_ins, sizeof(struct bpf_insn));
337         if (!program) {
338                 ret = ENOMEM;
339                 goto out;
340         }
341 
342         /* copy program loader instructions */
343         memcpy(program, &pre_insn, sizeof(pre_insn));
344         cur_ins = pre_insn_len;
345 
346         /* generate eBPF program */
347         list_for_each_entry(dl, &exceptions, list) {
348                 has_access = dl->access != DEVCG_ACC_ALL;
349                 has_type = dl->type != DEVCG_DEV_ALL;
350                 has_major = dl->major != ~0;
351                 has_minor = dl->minor != ~0;
352 
353                 next_ins = (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 1;
354 
355                 if (has_type) {
356                         program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dl->type, next_ins);
357                         --next_ins;
358                 }
359 
360                 if (has_access) {
361                         program[cur_ins++] = BPF_MOV32_REG(BPF_REG_1, BPF_REG_3);
362                         program[cur_ins++] = BPF_ALU32_IMM(BPF_AND, BPF_REG_1, dl->access);
363                         program[cur_ins++] = BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, next_ins - 2);
364                         next_ins -= 3;
365                 }
366 
367                 if (has_major) {
368                         program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_4, dl->major, next_ins);
369                         --next_ins;
370                 }
371 
372                 if (has_minor) {
373                         program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_5, dl->minor, next_ins);
374                         --next_ins;
375                 }
376 
377                 program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, dl->allow ? 1 : 0);
378                 program[cur_ins++] = BPF_EXIT_INSN();
379         }
380 
381         /* default behavior */
382         program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, (behavior == DEVCG_DEFAULT_ALLOW)?1:0);
383         program[cur_ins++] = BPF_EXIT_INSN();
384 
385         if (debug) {
386                 fprintf(stderr, "cgroup devices:\na > devices.%s\n",
387                         (behavior == DEVCG_DEFAULT_ALLOW)?"allow":"deny");
388 
389                 list_for_each_entry(dl, &exceptions, list)
390                         fprintf(stderr, "%c %d:%d %s%s%s > devices.%s\n",
391                                 (dl->type == DEVCG_DEV_ALL)?'a':
392                                         (dl->type == BPF_DEVCG_DEV_CHAR)?'c':'b',
393                                 (dl->major == ~0)?-1:dl->major,
394                                 (dl->minor == ~0)?-1:dl->minor,
395                                 (dl->access & BPF_DEVCG_ACC_READ)?"r":"",
396                                 (dl->access & BPF_DEVCG_ACC_WRITE)?"w":"",
397                                 (dl->access & BPF_DEVCG_ACC_MKNOD)?"m":"",
398                                 (dl->allow)?"allow":"deny");
399 
400                 fprintf(stderr, "generated cgroup-devices eBPF program:\n");
401                 fprintf(stderr, " [idx]\tcode\t dest\t src\t off\t imm\n");
402                 for (cur_ins=0; cur_ins<total_ins; cur_ins++)
403                         fprintf(stderr, " [%03d]\t%02hhx\t%3hhu\t%3hhu\t%04hx\t%d\n", cur_ins,
404                                 program[cur_ins].code,
405                                 program[cur_ins].dst_reg,
406                                 program[cur_ins].src_reg,
407                                 program[cur_ins].off,
408                                 program[cur_ins].imm);
409         }
410 
411         assert(cur_ins == total_ins);
412         bpf_total_insn = total_ins;
413         ret = 0;
414 
415 out:
416         flush_exceptions(&exceptions);
417         return ret;
418 }
419 
420 /*
421  * attach eBPF program to cgroup
422  */
423 int attach_cgroups_ebpf(int cgroup_dirfd) {
424         int prog_fd;
425 #if ( __WORDSIZE == 64 )
426         uint64_t program_ptr = (uint64_t)program;
427         uint64_t license_ptr = (uint64_t)license;
428 #elif ( __WORDSIZE == 32 )
429         uint32_t program_ptr = (uint32_t)program;
430         uint32_t license_ptr = (uint32_t)license;
431 #else
432 #error
433 #endif
434         union bpf_attr load_attr = {
435                 .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE,
436                 .license   = license_ptr,
437                 .insns     = program_ptr,
438                 .insn_cnt  = bpf_total_insn,
439         };
440 
441         if (!program)
442                 return 0;
443 
444         prog_fd = syscall_bpf(BPF_PROG_LOAD, &load_attr, sizeof(load_attr));
445         if (prog_fd < 0)
446                 return EIO;
447 
448         union bpf_attr attach_attr = {
449                 .attach_type = BPF_CGROUP_DEVICE,
450                 .target_fd = cgroup_dirfd,
451                 .attach_bpf_fd = prog_fd,
452         };
453 
454         return syscall_bpf(BPF_PROG_ATTACH, &attach_attr, sizeof (attach_attr));
455 }
456 

This page was automatically generated by LXR 0.3.1.  •  OpenWrt