Skip to content

Commit 04fd61a

Browse files
Alexei Starovoitov, davem330
Alexei Starovoitov
authored and committed
bpf: allow bpf programs to tail-call other bpf programs
introduce bpf_tail_call(ctx, &jmp_table, index) helper function which can be used from BPF programs like: int bpf_prog(struct pt_regs *ctx) { ... bpf_tail_call(ctx, &jmp_table, index); ... } that is roughly equivalent to: int bpf_prog(struct pt_regs *ctx) { ... if (jmp_table[index]) return (*jmp_table[index])(ctx); ... } The important detail is that it's not a normal call, but a tail call. The kernel stack is precious, so this helper reuses the current stack frame and jumps into another BPF program without adding an extra call frame. It's trivially done in the interpreter and a bit trickier in JITs. In case of x64 JIT the bigger part of generated assembler prologue is common for all programs, so it is simply skipped while jumping. Other JITs can do similar prologue-skipping optimization or do stack unwind before jumping into the next program. bpf_tail_call() arguments: ctx - context pointer jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table index - index in the jump table Since all BPF programs are identified by file descriptor, user space needs to populate the jmp_table with FDs of other BPF programs. If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere and program execution continues as normal. New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can populate this jmp_table array with FDs of other bpf programs. Programs can share the same jmp_table array or use multiple jmp_tables. The chain of tail calls can form unpredictable dynamic loops, therefore tail_call_cnt is used to limit the number of calls and currently is set to 32. Use cases: Acked-by: Daniel Borkmann <[email protected]> ========== - simplify complex programs by splitting them into a sequence of small programs - dispatch routine For tracing and future seccomp the program may be triggered on all system calls, but processing of syscall arguments will be different. 
It's more efficient to implement them as: int syscall_entry(struct seccomp_data *ctx) { bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */); ... default: process unknown syscall ... } int sys_write_event(struct seccomp_data *ctx) {...} int sys_read_event(struct seccomp_data *ctx) {...} syscall_jmp_table[__NR_write] = sys_write_event; syscall_jmp_table[__NR_read] = sys_read_event; For networking the program may call into different parsers depending on packet format, like: int packet_parser(struct __sk_buff *skb) { ... parse L2, L3 here ... __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol)); bpf_tail_call(skb, &ipproto_jmp_table, ipproto); ... default: process unknown protocol ... } int parse_tcp(struct __sk_buff *skb) {...} int parse_udp(struct __sk_buff *skb) {...} ipproto_jmp_table[IPPROTO_TCP] = parse_tcp; ipproto_jmp_table[IPPROTO_UDP] = parse_udp; - for the TC use case, bpf_tail_call() allows implementing reclassify-like logic - bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table are atomic, so user space can build chains of BPF programs on the fly Implementation details: ======================= - high performance of bpf_tail_call() is the goal. It could have been implemented without JIT changes as a wrapper on top of BPF_PROG_RUN() macro, but with two downsides: . all programs would have to pay a performance penalty for this feature and the tail call itself would be slower, since a mandatory stack unwind, return, stack allocate would be done for every tailcall. . tailcall would be limited to programs running preempt_disabled, since generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would need to be either a global per_cpu variable accessed by helper and by wrapper or a global variable protected by locks. In this implementation x64 JIT bypasses stack unwind and jumps into the callee program after prologue. 
- bpf_prog_array_compatible() ensures that prog_type of callee and caller are the same and JITed/non-JITed flag is the same, since calling JITed program from non-JITed is invalid, since stack frames are different. Similarly calling kprobe type program from socket type program is invalid. - jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map' abstraction, its user space API and all of verifier logic. It's in the existing arraymap.c file, since several functions are shared with regular array map. Signed-off-by: Alexei Starovoitov <[email protected]> Signed-off-by: David S. Miller <[email protected]>
1 parent e7582ba commit 04fd61a

File tree

9 files changed

+255
-9
lines changed

9 files changed

+255
-9
lines changed

Diff for: include/linux/bpf.h

+22
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,27 @@ struct bpf_prog_aux {
126126
struct work_struct work;
127127
};
128128

129+
struct bpf_array {
130+
struct bpf_map map;
131+
u32 elem_size;
132+
/* 'ownership' of prog_array is claimed by the first program that
133+
* is going to use this map or by the first program which FD is stored
134+
* in the map to make sure that all callers and callees have the same
135+
* prog_type and JITed flag
136+
*/
137+
enum bpf_prog_type owner_prog_type;
138+
bool owner_jited;
139+
union {
140+
char value[0] __aligned(8);
141+
struct bpf_prog *prog[0] __aligned(8);
142+
};
143+
};
144+
#define MAX_TAIL_CALL_CNT 32
145+
146+
u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
147+
void bpf_prog_array_map_clear(struct bpf_map *map);
148+
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
149+
129150
#ifdef CONFIG_BPF_SYSCALL
130151
void bpf_register_prog_type(struct bpf_prog_type_list *tl);
131152
void bpf_register_map_type(struct bpf_map_type_list *tl);
@@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto;
160181

161182
extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
162183
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
184+
extern const struct bpf_func_proto bpf_tail_call_proto;
163185

164186
#endif /* _LINUX_BPF_H */

Diff for: include/linux/filter.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
378378

379379
int sk_filter(struct sock *sk, struct sk_buff *skb);
380380

381-
void bpf_prog_select_runtime(struct bpf_prog *fp);
381+
int bpf_prog_select_runtime(struct bpf_prog *fp);
382382
void bpf_prog_free(struct bpf_prog *fp);
383383

384384
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);

Diff for: include/uapi/linux/bpf.h

+10
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ enum bpf_map_type {
113113
BPF_MAP_TYPE_UNSPEC,
114114
BPF_MAP_TYPE_HASH,
115115
BPF_MAP_TYPE_ARRAY,
116+
BPF_MAP_TYPE_PROG_ARRAY,
116117
};
117118

118119
enum bpf_prog_type {
@@ -210,6 +211,15 @@ enum bpf_func_id {
210211
* Return: 0 on success
211212
*/
212213
BPF_FUNC_l4_csum_replace,
214+
215+
/**
216+
* bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program
217+
* @ctx: context pointer passed to next program
218+
* @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY
219+
* @index: index inside array that selects specific program to run
220+
* Return: 0 on success
221+
*/
222+
BPF_FUNC_tail_call,
213223
__BPF_FUNC_MAX_ID,
214224
};
215225

Diff for: kernel/bpf/arraymap.c

+107-6
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,7 @@
1414
#include <linux/vmalloc.h>
1515
#include <linux/slab.h>
1616
#include <linux/mm.h>
17-
18-
struct bpf_array {
19-
struct bpf_map map;
20-
u32 elem_size;
21-
char value[0] __aligned(8);
22-
};
17+
#include <linux/filter.h>
2318

2419
/* Called from syscall */
2520
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -154,3 +149,109 @@ static int __init register_array_map(void)
154149
return 0;
155150
}
156151
late_initcall(register_array_map);
152+
153+
static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
154+
{
155+
/* only bpf_prog file descriptors can be stored in prog_array map */
156+
if (attr->value_size != sizeof(u32))
157+
return ERR_PTR(-EINVAL);
158+
return array_map_alloc(attr);
159+
}
160+
161+
static void prog_array_map_free(struct bpf_map *map)
162+
{
163+
struct bpf_array *array = container_of(map, struct bpf_array, map);
164+
int i;
165+
166+
synchronize_rcu();
167+
168+
/* make sure it's empty */
169+
for (i = 0; i < array->map.max_entries; i++)
170+
BUG_ON(array->prog[i] != NULL);
171+
kvfree(array);
172+
}
173+
174+
static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
175+
{
176+
return NULL;
177+
}
178+
179+
/* only called from syscall */
180+
static int prog_array_map_update_elem(struct bpf_map *map, void *key,
181+
void *value, u64 map_flags)
182+
{
183+
struct bpf_array *array = container_of(map, struct bpf_array, map);
184+
struct bpf_prog *prog, *old_prog;
185+
u32 index = *(u32 *)key, ufd;
186+
187+
if (map_flags != BPF_ANY)
188+
return -EINVAL;
189+
190+
if (index >= array->map.max_entries)
191+
return -E2BIG;
192+
193+
ufd = *(u32 *)value;
194+
prog = bpf_prog_get(ufd);
195+
if (IS_ERR(prog))
196+
return PTR_ERR(prog);
197+
198+
if (!bpf_prog_array_compatible(array, prog)) {
199+
bpf_prog_put(prog);
200+
return -EINVAL;
201+
}
202+
203+
old_prog = xchg(array->prog + index, prog);
204+
if (old_prog)
205+
bpf_prog_put(old_prog);
206+
207+
return 0;
208+
}
209+
210+
static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
211+
{
212+
struct bpf_array *array = container_of(map, struct bpf_array, map);
213+
struct bpf_prog *old_prog;
214+
u32 index = *(u32 *)key;
215+
216+
if (index >= array->map.max_entries)
217+
return -E2BIG;
218+
219+
old_prog = xchg(array->prog + index, NULL);
220+
if (old_prog) {
221+
bpf_prog_put(old_prog);
222+
return 0;
223+
} else {
224+
return -ENOENT;
225+
}
226+
}
227+
228+
/* decrement refcnt of all bpf_progs that are stored in this map */
229+
void bpf_prog_array_map_clear(struct bpf_map *map)
230+
{
231+
struct bpf_array *array = container_of(map, struct bpf_array, map);
232+
int i;
233+
234+
for (i = 0; i < array->map.max_entries; i++)
235+
prog_array_map_delete_elem(map, &i);
236+
}
237+
238+
static const struct bpf_map_ops prog_array_ops = {
239+
.map_alloc = prog_array_map_alloc,
240+
.map_free = prog_array_map_free,
241+
.map_get_next_key = array_map_get_next_key,
242+
.map_lookup_elem = prog_array_map_lookup_elem,
243+
.map_update_elem = prog_array_map_update_elem,
244+
.map_delete_elem = prog_array_map_delete_elem,
245+
};
246+
247+
static struct bpf_map_type_list prog_array_type __read_mostly = {
248+
.ops = &prog_array_ops,
249+
.type = BPF_MAP_TYPE_PROG_ARRAY,
250+
};
251+
252+
static int __init register_prog_array_map(void)
253+
{
254+
bpf_register_map_type(&prog_array_type);
255+
return 0;
256+
}
257+
late_initcall(register_prog_array_map);

Diff for: kernel/bpf/core.c

+72-1
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,15 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
176176
return 0;
177177
}
178178

179+
const struct bpf_func_proto bpf_tail_call_proto = {
180+
.func = NULL,
181+
.gpl_only = false,
182+
.ret_type = RET_VOID,
183+
.arg1_type = ARG_PTR_TO_CTX,
184+
.arg2_type = ARG_CONST_MAP_PTR,
185+
.arg3_type = ARG_ANYTHING,
186+
};
187+
179188
/**
180189
* __bpf_prog_run - run eBPF program on a given context
181190
* @ctx: is the data we are operating on
@@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
244253
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
245254
/* Call instruction */
246255
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
256+
[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
247257
/* Jumps */
248258
[BPF_JMP | BPF_JA] = &&JMP_JA,
249259
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
286296
[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
287297
[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
288298
};
299+
u32 tail_call_cnt = 0;
289300
void *ptr;
290301
int off;
291302

@@ -431,6 +442,30 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
431442
BPF_R4, BPF_R5);
432443
CONT;
433444

445+
JMP_TAIL_CALL: {
446+
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
447+
struct bpf_array *array = container_of(map, struct bpf_array, map);
448+
struct bpf_prog *prog;
449+
u64 index = BPF_R3;
450+
451+
if (unlikely(index >= array->map.max_entries))
452+
goto out;
453+
454+
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
455+
goto out;
456+
457+
tail_call_cnt++;
458+
459+
prog = READ_ONCE(array->prog[index]);
460+
if (unlikely(!prog))
461+
goto out;
462+
463+
ARG1 = BPF_R1;
464+
insn = prog->insnsi;
465+
goto select_insn;
466+
out:
467+
CONT;
468+
}
434469
/* JMP */
435470
JMP_JA:
436471
insn += insn->off;
@@ -619,21 +654,57 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
619654
{
620655
}
621656

657+
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp)
658+
{
659+
if (array->owner_prog_type) {
660+
if (array->owner_prog_type != fp->type)
661+
return false;
662+
if (array->owner_jited != fp->jited)
663+
return false;
664+
} else {
665+
array->owner_prog_type = fp->type;
666+
array->owner_jited = fp->jited;
667+
}
668+
return true;
669+
}
670+
671+
static int check_tail_call(const struct bpf_prog *fp)
672+
{
673+
struct bpf_prog_aux *aux = fp->aux;
674+
int i;
675+
676+
for (i = 0; i < aux->used_map_cnt; i++) {
677+
struct bpf_array *array;
678+
struct bpf_map *map;
679+
680+
map = aux->used_maps[i];
681+
if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
682+
continue;
683+
array = container_of(map, struct bpf_array, map);
684+
if (!bpf_prog_array_compatible(array, fp))
685+
return -EINVAL;
686+
}
687+
688+
return 0;
689+
}
690+
622691
/**
623692
* bpf_prog_select_runtime - select execution runtime for BPF program
624693
* @fp: bpf_prog populated with internal BPF program
625694
*
626695
* try to JIT internal BPF program, if JIT is not available select interpreter
627696
* BPF program will be executed via BPF_PROG_RUN() macro
628697
*/
629-
void bpf_prog_select_runtime(struct bpf_prog *fp)
698+
int bpf_prog_select_runtime(struct bpf_prog *fp)
630699
{
631700
fp->bpf_func = (void *) __bpf_prog_run;
632701

633702
/* Probe if internal BPF can be JITed */
634703
bpf_int_jit_compile(fp);
635704
/* Lock whole bpf_prog as read-only */
636705
bpf_prog_lock_ro(fp);
706+
707+
return check_tail_call(fp);
637708
}
638709
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
639710

Diff for: kernel/bpf/syscall.c

+22-1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
6868
{
6969
struct bpf_map *map = filp->private_data;
7070

71+
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
72+
/* prog_array stores refcnt-ed bpf_prog pointers
73+
* release them all when user space closes prog_array_fd
74+
*/
75+
bpf_prog_array_map_clear(map);
76+
7177
bpf_map_put(map);
7278
return 0;
7379
}
@@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
392398
*/
393399
BUG_ON(!prog->aux->ops->get_func_proto);
394400

401+
if (insn->imm == BPF_FUNC_tail_call) {
402+
/* mark bpf_tail_call as different opcode
403+
* to avoid conditional branch in
404+
* interpeter for every normal call
405+
* and to prevent accidental JITing by
406+
* JIT compiler that doesn't support
407+
* bpf_tail_call yet
408+
*/
409+
insn->imm = 0;
410+
insn->code |= BPF_X;
411+
continue;
412+
}
413+
395414
fn = prog->aux->ops->get_func_proto(insn->imm);
396415
/* all functions that have prototype and verifier allowed
397416
* programs to call them, must be real in-kernel functions
@@ -532,7 +551,9 @@ static int bpf_prog_load(union bpf_attr *attr)
532551
fixup_bpf_calls(prog);
533552

534553
/* eBPF program is ready to be JITed */
535-
bpf_prog_select_runtime(prog);
554+
err = bpf_prog_select_runtime(prog);
555+
if (err < 0)
556+
goto free_used_maps;
536557

537558
err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
538559
if (err < 0)

Diff for: kernel/bpf/verifier.c

+17
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id)
907907
fn->ret_type, func_id);
908908
return -EINVAL;
909909
}
910+
911+
if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
912+
func_id != BPF_FUNC_tail_call)
913+
/* prog_array map type needs extra care:
914+
* only allow to pass it into bpf_tail_call() for now.
915+
* bpf_map_delete_elem() can be allowed in the future,
916+
* while bpf_map_update_elem() must only be done via syscall
917+
*/
918+
return -EINVAL;
919+
920+
if (func_id == BPF_FUNC_tail_call &&
921+
map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
922+
/* don't allow any other map type to be passed into
923+
* bpf_tail_call()
924+
*/
925+
return -EINVAL;
926+
910927
return 0;
911928
}
912929

Diff for: kernel/trace/bpf_trace.c

+2
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
172172
return &bpf_probe_read_proto;
173173
case BPF_FUNC_ktime_get_ns:
174174
return &bpf_ktime_get_ns_proto;
175+
case BPF_FUNC_tail_call:
176+
return &bpf_tail_call_proto;
175177

176178
case BPF_FUNC_trace_printk:
177179
/*

Diff for: net/core/filter.c

+2
Original file line numberDiff line numberDiff line change
@@ -1421,6 +1421,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
14211421
return &bpf_get_prandom_u32_proto;
14221422
case BPF_FUNC_get_smp_processor_id:
14231423
return &bpf_get_smp_processor_id_proto;
1424+
case BPF_FUNC_tail_call:
1425+
return &bpf_tail_call_proto;
14241426
default:
14251427
return NULL;
14261428
}

0 commit comments

Comments
 (0)