diff --git a/cover.txt b/cover.txt
new file mode 100644
index 0000000000000..149b0cf659b79
--- /dev/null
+++ b/cover.txt
@@ -0,0 +1,82 @@
+mm: BPF OOM
+
+This patchset adds the ability to customize the out of memory
+handling using bpf.
+
+It focuses on two parts:
+1) OOM handling policy,
+2) PSI-based OOM invocation.
+
+The idea of using bpf to customize OOM handling is not new, but
+unlike the previous proposal [1], which augmented the existing task
+ranking policy, this one tries to be as generic as possible and
+leverage the full power of modern bpf.
+
+It provides a generic interface which is called before the existing OOM
+killer code and allows implementing any policy, e.g. picking a victim
+task or memory cgroup, or potentially even releasing memory in other
+ways, e.g. deleting tmpfs files (the last one might require some
+additional but relatively simple changes).
+
+The past attempt to implement a memory-cgroup aware policy [2] showed
+that there are multiple opinions on what the best policy is. As it's
+highly workload-dependent and specific to a concrete way of organizing
+workloads, the structure of the cgroup tree etc., a customizable
+bpf-based implementation is preferable over an in-kernel implementation
+with a dozen sysctls.
+
+The second part is related to the fundamental question of when to
+declare the OOM event. It's a trade-off between the risk of
+unnecessary OOM kills and the associated work losses, and the risk of
+infinite thrashing and effective soft lockups. In the last few years
+several PSI-based userspace solutions were developed (e.g. OOMd [3] or
+systemd-OOMd [4]). The common idea was to use userspace daemons to
+implement custom OOM logic as well as rely on PSI monitoring to avoid
+stalls. In this scenario the userspace daemon was supposed to handle
+the majority of OOMs, while the in-kernel OOM killer worked as the
+last-resort measure to guarantee that the system would never deadlock
+on memory. But this approach creates additional infrastructure
+churn: a userspace OOM daemon is a separate entity which needs to be
+deployed, updated and monitored. A completely different pipeline needs to
+be built to monitor both types of OOM events and collect associated
+logs. A userspace daemon is more restricted in terms of what data is
+available to it. Implementing a daemon which can work reliably under
+heavy memory pressure in the system is also tricky.
+
+This patchset includes the code, tests and many ideas from the patchset
+of JP Kobryn, which implemented bpf kfuncs to provide a faster method
+to access memcg data [5].
+
+[1]: https://lwn.net/ml/linux-kernel/20230810081319.65668-1-zhouchuyi@bytedance.com/
+[2]: https://lore.kernel.org/lkml/20171130152824.1591-1-guro@fb.com/
+[3]: https://github.com/facebookincubator/oomd
+[4]: https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html
+[5]: https://lkml.org/lkml/2025/10/15/1554
+
+----
+v2:
+ 1) A single bpf_oom can be attached system-wide and a single bpf_oom per memcg.
+    (by Alexei Starovoitov)
+ 2) Initial support for attaching struct ops to cgroups (Martin KaFai Lau,
+    Andrii Nakryiko and others)
+ 3) bpf memcontrol kfunc enhancements and tests (co-developed by JP Kobryn)
+ 4) Many small-ish fixes and cleanups (suggested by Andrew Morton, Suren Baghdasaryan,
+    Andrii Nakryiko and Kumar Kartikeya Dwivedi)
+ 5) bpf_out_of_memory() now takes u64 flags instead of bool wait_on_oom_lock
+    (suggested by Kumar Kartikeya Dwivedi)
+ 6) bpf_get_mem_cgroup() got the KF_RCU flag (suggested by Kumar Kartikeya Dwivedi)
+ 7) cgroup online and offline callbacks for bpf_psi, cgroup offline for bpf_oom
+
+v1:
+ 1) Both OOM and PSI parts are now implemented using bpf struct ops,
+    providing a path for future extensions (suggested by Kumar Kartikeya Dwivedi,
+    Song Liu and Matt Bobrowski)
+ 2) It's possible to create PSI triggers from BPF, no need for an additional
+    userspace agent (suggested by Suren Baghdasaryan).
+    Also there is now a callback for the cgroup release event.
+ 3) Added the ability to block on oom_lock instead of bailing out (suggested by Michal Hocko)
+ 4) Added bpf_task_is_oom_victim (suggested by Michal Hocko)
+ 5) PSI callbacks are scheduled using a separate workqueue (suggested by Suren Baghdasaryan)
+
+RFC:
+  https://lwn.net/ml/all/20250428033617.3797686-1-roman.gushchin@linux.dev/
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e53cda0aabb68..4abef08b3ed90 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1837,6 +1837,13 @@ struct bpf_raw_tp_link {
 	u64 cookie;
 };
 
+struct bpf_struct_ops_link {
+	struct bpf_link link;
+	struct bpf_map __rcu *map;
+	wait_queue_head_t wait_hup;
+	u64 cgroup_id;
+};
+
 struct bpf_link_primer {
 	struct bpf_link *link;
 	struct file *file;
diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h
new file mode 100644
index 0000000000000..d93dba501a006
--- /dev/null
+++ b/include/linux/bpf_oom.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_OOM_H
+#define __BPF_OOM_H
+
+struct oom_control;
+
+#define BPF_OOM_NAME_MAX_LEN 64
+
+struct bpf_oom_ctx {
+	/*
+	 * If bpf_oom_ops is attached to a cgroup, id of this cgroup.
+	 * 0 otherwise.
+	 */
+	u64 cgroup_id;
+};
+
+struct bpf_oom_ops {
+	/**
+	 * @handle_out_of_memory: Out of memory bpf handler, called before
+	 * the in-kernel OOM killer.
+	 * @oc: OOM control structure
+	 * @ctx: Execution context
+	 *
+	 * Should return 1 if some memory was freed up, otherwise
+	 * the in-kernel OOM killer is invoked.
+	 */
+	int (*handle_out_of_memory)(struct oom_control *oc, struct bpf_oom_ctx *ctx);
+
+	/**
+	 * @handle_cgroup_offline: Cgroup offline callback
+	 * @cgroup_id: Id of the deleted cgroup
+	 *
+	 * Called if the cgroup with the attached bpf_oom_ops is deleted.
+	 */
+	void (*handle_cgroup_offline)(u64 cgroup_id, struct bpf_oom_ctx *ctx);
+
+	/**
+	 * @name: BPF OOM policy name
+	 */
+	char name[BPF_OOM_NAME_MAX_LEN];
+};
+
+#ifdef CONFIG_BPF_SYSCALL
+/**
+ * @bpf_handle_oom: handle out of memory condition using bpf
+ * @oc: OOM control structure
+ *
+ * Returns true if some memory was freed.
+ */
+bool bpf_handle_oom(struct oom_control *oc);
+
+
+/**
+ * @bpf_oom_memcg_offline: handle memcg offlining
+ * @memcg: Memory cgroup being offlined
+ *
+ * When a memory cgroup is about to be deleted and there is an
+ * attached BPF OOM structure, it has to be detached.
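+ * If the attached struct_ops implements handle_cgroup_offline(), that
+ * callback is notified with the id of the cgroup being deleted.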
+ */ +void bpf_oom_memcg_offline(struct mem_cgroup *memcg); + +#else /* CONFIG_BPF_SYSCALL */ +static inline bool bpf_handle_oom(struct oom_control *oc) +{ + return false; +} + +static inline void bpf_oom_memcg_offline(struct mem_cgroup *memcg) {} + +#endif /* CONFIG_BPF_SYSCALL */ + +#endif /* __BPF_OOM_H */ diff --git a/include/linux/bpf_psi.h b/include/linux/bpf_psi.h new file mode 100644 index 0000000000000..df00778e474ee --- /dev/null +++ b/include/linux/bpf_psi.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __BPF_PSI_H +#define __BPF_PSI_H + +#include +#include +#include +#include + +struct cgroup; +struct bpf_psi; +struct psi_trigger; +struct psi_trigger_params; + +#define BPF_PSI_FULL 0x80000000 + +struct bpf_psi_ops { + /** + * @init: Initialization callback, suited for creating psi triggers. + * @bpf_psi: bpf_psi pointer, can be passed to bpf_psi_create_trigger(). + * + * A non-0 return value means the initialization has been failed. + */ + int (*init)(struct bpf_psi *bpf_psi); + + /** + * @handle_psi_event: PSI event callback + * @t: psi_trigger pointer + */ + void (*handle_psi_event)(struct psi_trigger *t); + + /** + * @handle_cgroup_online: Cgroup online callback + * @cgroup_id: Id of the new cgroup + * + * Called every time a new cgroup is created. Can be used + * to create new psi triggers. + */ + void (*handle_cgroup_online)(u64 cgroup_id); + + /** + * @handle_cgroup_offline: Cgroup offline callback + * @cgroup_id: Id of offlined cgroup + * + * Called every time a cgroup with an attached bpf psi trigger is + * offlined. + */ + void (*handle_cgroup_offline)(u64 cgroup_id); + + /* private */ + struct bpf_psi *bpf_psi; +}; + +struct bpf_psi { + spinlock_t lock; + struct list_head triggers; + struct bpf_psi_ops *ops; + struct srcu_struct srcu; + struct list_head node; /* Protected by bpf_psi_lock */ +}; + +#ifdef CONFIG_BPF_SYSCALL +void bpf_psi_add_trigger(struct psi_trigger *t, + const struct psi_trigger_params *params); +void bpf_psi_remove_trigger(struct psi_trigger *t); +void bpf_psi_handle_event(struct psi_trigger *t); + +#else /* CONFIG_BPF_SYSCALL */ +static inline void bpf_psi_add_trigger(struct psi_trigger *t, + const struct psi_trigger_params *params) {} +static inline void bpf_psi_remove_trigger(struct psi_trigger *t) {} +static inline void bpf_psi_handle_event(struct psi_trigger *t) {} + +#endif /* CONFIG_BPF_SYSCALL */ + +#if (defined(CONFIG_CGROUPS) && defined(CONFIG_PSI) && defined(CONFIG_BPF_SYSCALL)) +void bpf_psi_cgroup_online(struct cgroup *cgroup); +void bpf_psi_cgroup_offline(struct cgroup *cgroup); + +#else /* CONFIG_CGROUPS && CONFIG_PSI && CONFIG_BPF_SYSCALL */ +static inline void bpf_psi_cgroup_online(struct cgroup *cgroup) {} +static inline void bpf_psi_cgroup_offline(struct cgroup *cgroup) {} + +#endif /* CONFIG_CGROUPS && CONFIG_PSI && CONFIG_BPF_SYSCALL */ + +#endif /* __BPF_PSI_H */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6ed477338b166..1a99da44999ed 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -707,6 +707,10 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} +static inline struct cgroup *cgroup_get_from_id(u64 id) +{ + return NULL; +} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d9..b9e08dddd7ada 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,6 
+29,7 @@ struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; +struct bpf_oom_ops; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -226,6 +227,10 @@ struct mem_cgroup { */ bool oom_group; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_oom_ops *bpf_oom; +#endif + int swappiness; /* memory.events and memory.events.local */ @@ -832,9 +837,9 @@ static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } +#endif struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -948,7 +953,10 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } +unsigned long memcg_events(struct mem_cgroup *memcg, int event); +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); +unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); @@ -1331,12 +1339,12 @@ static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } +#endif static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { diff --git a/include/linux/oom.h b/include/linux/oom.h index 7b02bc1d0a7ea..704fc0e786c62 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -19,6 +19,12 @@ enum oom_constraint { CONSTRAINT_CPUSET, CONSTRAINT_MEMORY_POLICY, CONSTRAINT_MEMCG, + CONSTRAINT_BPF, +}; + +enum bpf_oom_flags { + BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK = 1 << 0, + BPF_OOM_FLAGS_LAST = 1 << 1, }; /* @@ -51,6 +57,17 @@ struct oom_control { /* Used to print the constraint info. */ enum oom_constraint constraint; + +#ifdef CONFIG_BPF_SYSCALL + /* Used by the bpf oom implementation to mark the forward progress */ + bool bpf_memory_freed; + + /* Policy name */ + const char *bpf_policy_name; + + /* BPF-specific constraint name */ + const char *bpf_constraint; +#endif }; extern struct mutex oom_lock; diff --git a/include/linux/psi.h b/include/linux/psi.h index e0745873e3f26..8ffe84cd8571a 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -23,14 +23,23 @@ void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); -struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, - enum psi_res res, struct file *file, - struct kernfs_open_file *of); +int psi_trigger_parse(struct psi_trigger_params *params, const char *buf); +struct psi_trigger *psi_trigger_create(struct psi_group *group, + const struct psi_trigger_params *param); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); +static inline bool psi_file_privileged(struct file *file) +{ + /* + * Checking the privilege here on file->f_cred implies that a privileged user + * could open the file and delegate the write to an unprivileged one. 
+ */ + return cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); +} + #ifdef CONFIG_CGROUPS static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { @@ -41,6 +50,12 @@ int psi_cgroup_alloc(struct cgroup *cgrp); void psi_cgroup_free(struct cgroup *cgrp); void cgroup_move_task(struct task_struct *p, struct css_set *to); void psi_cgroup_restart(struct psi_group *group); + +#else +static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) +{ + return &psi_system; +} #endif #else /* CONFIG_PSI */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index dd10c22299ab8..e551df9d6336c 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -119,7 +119,46 @@ struct psi_window { u64 prev_growth; }; +enum psi_trigger_type { + PSI_SYSTEM, + PSI_CGROUP, + PSI_BPF, +}; + +struct psi_trigger_params { + /* Trigger type */ + enum psi_trigger_type type; + + /* Resource to be monitored */ + enum psi_res res; + + /* True if all threads should be stalled to trigger */ + bool full; + + /* Threshold in us */ + u32 threshold_us; + + /* Window in us */ + u32 window_us; + + /* Privileged triggers are treated differently */ + bool privileged; + + union { + /* Link to kernfs open file, only for PSI_CGROUP */ + struct kernfs_open_file *of; + +#ifdef CONFIG_BPF_SYSCALL + /* Link to bpf_psi structure, only for BPF_PSI */ + struct bpf_psi *bpf_psi; +#endif + }; +}; + struct psi_trigger { + /* Trigger type */ + enum psi_trigger_type type; + /* PSI state being monitored by the trigger */ enum psi_states state; @@ -135,7 +174,7 @@ struct psi_trigger { /* Wait queue for polling */ wait_queue_head_t event_wait; - /* Kernfs file for cgroup triggers */ + /* Kernfs file for PSI_CGROUP triggers */ struct kernfs_open_file *of; /* Pending event flag */ @@ -155,6 +194,31 @@ struct psi_trigger { /* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */ enum psi_aggregators aggregator; + +#ifdef CONFIG_BPF_SYSCALL + /* Fields specific to PSI_BPF triggers */ + + /* Bpf psi structure for events handling */ + struct bpf_psi *bpf_psi; + + /* List node inside bpf_psi->triggers list */ + struct list_head bpf_psi_node; + + /* List node inside group->bpf_triggers list */ + struct list_head bpf_group_node; + + /* Work structure, used to execute event handlers */ + struct work_struct bpf_work; + + /* + * Whether the trigger is being pinned in memory. + * Protected by group->bpf_triggers_lock. 
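+	 * It is set once, under this lock, by whichever teardown path
+	 * (cgroup offline or struct_ops unreg) reaches the trigger first,
+	 * so the trigger is destroyed only once.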
+ */ + bool pinned; + + /* Cgroup Id */ + u64 cgroup_id; +#endif }; struct psi_group { @@ -203,6 +267,12 @@ struct psi_group { u64 rtpoll_total[NR_PSI_STATES - 1]; u64 rtpoll_next_update; u64 rtpoll_until; + +#ifdef CONFIG_BPF_SYSCALL + /* List of triggers owned by bpf and corresponding lock */ + spinlock_t bpf_triggers_lock; + struct list_head bpf_triggers; +#endif }; #else /* CONFIG_PSI */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index a41e6730edcf3..58664779a2b6f 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -13,6 +13,7 @@ #include #include #include +#include struct bpf_struct_ops_value { struct bpf_struct_ops_common_value common; @@ -55,12 +56,6 @@ struct bpf_struct_ops_map { struct bpf_struct_ops_value kvalue; }; -struct bpf_struct_ops_link { - struct bpf_link link; - struct bpf_map __rcu *map; - wait_queue_head_t wait_hup; -}; - static DEFINE_MUTEX(update_mutex); #define VALUE_PREFIX "bpf_struct_ops_" @@ -1365,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL, attr->link_create.attach_type); +#ifdef CONFIG_CGROUPS + if (attr->link_create.cgroup.relative_fd) { + struct cgroup *cgrp; + + cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + + link->cgroup_id = cgroup_id(cgrp); + cgroup_put(cgrp); + } +#endif /* CONFIG_CGROUPS */ err = bpf_link_prime(&link->link, &link_primer); if (err) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 248f517d66d04..4df4c49ba1793 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -557,9 +558,11 @@ static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, switch (action) { case CGROUP_LIFETIME_ONLINE: + bpf_psi_cgroup_online(cgrp); ret = cgroup_bpf_inherit(cgrp); break; case CGROUP_LIFETIME_OFFLINE: + bpf_psi_cgroup_offline(cgrp); cgroup_bpf_offline(cgrp); break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6d175849e57ac..7ef954760078d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7101,6 +7101,10 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) { struct file *vm_file; }; +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control) { + struct mem_cgroup *memcg; +}; + static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, const char *field_name, u32 btf_id) @@ -7143,6 +7147,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6ae5f48cf64e3..836b28676abcb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4000,6 +4000,12 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; + struct psi_trigger_params params; + int err; + + err = psi_trigger_parse(¶ms, buf); + if (err) + return err; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) @@ -4015,7 +4021,13 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, } 
psi = cgroup_psi(cgrp); - new = psi_trigger_create(psi, buf, res, of->file, of); + + params.type = PSI_CGROUP; + params.res = res; + params.privileged = psi_file_privileged(of->file); + params.of = of; + + new = psi_trigger_create(psi, ¶ms); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c new file mode 100644 index 0000000000000..952c7bd3ff3d2 --- /dev/null +++ b/kernel/sched/bpf_psi.c @@ -0,0 +1,392 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * BPF PSI event handlers + * + * Author: Roman Gushchin + */ + +#include +#include + +struct bpf_struct_ops bpf_psi_bpf_ops; +static struct workqueue_struct *bpf_psi_wq; + +static DEFINE_MUTEX(bpf_psi_lock); +static LIST_HEAD(bpf_psi_notify_list); +static DEFINE_STATIC_KEY_FALSE(bpf_psi_notify_key); + +static struct bpf_psi *bpf_psi_create(struct bpf_psi_ops *ops) +{ + struct bpf_psi *bpf_psi; + + bpf_psi = kzalloc(sizeof(*bpf_psi), GFP_KERNEL); + if (!bpf_psi) + return NULL; + + if (init_srcu_struct(&bpf_psi->srcu)) { + kfree(bpf_psi); + return NULL; + } + + spin_lock_init(&bpf_psi->lock); + bpf_psi->ops = ops; + INIT_LIST_HEAD(&bpf_psi->triggers); + ops->bpf_psi = bpf_psi; + + if (ops->handle_cgroup_online) { + mutex_lock(&bpf_psi_lock); + list_add(&bpf_psi->node, &bpf_psi_notify_list); + mutex_unlock(&bpf_psi_lock); + static_branch_inc(&bpf_psi_notify_key); + } else { + INIT_LIST_HEAD(&bpf_psi->node); + } + + return bpf_psi; +} + +static void bpf_psi_handle_event_fn(struct work_struct *work) +{ + struct psi_trigger *t; + struct bpf_psi *bpf_psi; + int idx; + + t = container_of(work, struct psi_trigger, bpf_work); + bpf_psi = READ_ONCE(t->bpf_psi); + + if (likely(bpf_psi)) { + idx = srcu_read_lock(&bpf_psi->srcu); + bpf_psi->ops->handle_psi_event(t); + srcu_read_unlock(&bpf_psi->srcu, idx); + } +} + +void bpf_psi_add_trigger(struct psi_trigger *t, + const struct psi_trigger_params *params) +{ + t->bpf_psi = params->bpf_psi; + t->pinned = false; + INIT_WORK(&t->bpf_work, bpf_psi_handle_event_fn); + + spin_lock(&t->bpf_psi->lock); + list_add(&t->bpf_psi_node, &t->bpf_psi->triggers); + spin_unlock(&t->bpf_psi->lock); + + spin_lock(&t->group->bpf_triggers_lock); + list_add(&t->bpf_group_node, &t->group->bpf_triggers); + spin_unlock(&t->group->bpf_triggers_lock); +} + +void bpf_psi_remove_trigger(struct psi_trigger *t) +{ + spin_lock(&t->group->bpf_triggers_lock); + list_del(&t->bpf_group_node); + spin_unlock(&t->group->bpf_triggers_lock); + + spin_lock(&t->bpf_psi->lock); + list_del(&t->bpf_psi_node); + spin_unlock(&t->bpf_psi->lock); +} + +#ifdef CONFIG_CGROUPS +void bpf_psi_cgroup_online(struct cgroup *cgroup) +{ + struct bpf_psi *bpf_psi; + int idx; + + if (!static_branch_likely(&bpf_psi_notify_key)) + return; + + mutex_lock(&bpf_psi_lock); + list_for_each_entry(bpf_psi, &bpf_psi_notify_list, node) { + idx = srcu_read_lock(&bpf_psi->srcu); + if (bpf_psi->ops->handle_cgroup_online) + bpf_psi->ops->handle_cgroup_online(cgroup_id(cgroup)); + srcu_read_unlock(&bpf_psi->srcu, idx); + } + mutex_unlock(&bpf_psi_lock); +} + +void bpf_psi_cgroup_offline(struct cgroup *cgroup) +{ + struct psi_group *group = cgroup->psi; + u64 cgrp_id = cgroup_id(cgroup); + struct psi_trigger *t, *p; + struct bpf_psi *bpf_psi; + LIST_HEAD(to_destroy); + int idx; + + spin_lock(&group->bpf_triggers_lock); + list_for_each_entry_safe(t, p, &group->bpf_triggers, bpf_group_node) { + if (!t->pinned) { + t->pinned = true; + list_move(&t->bpf_group_node, &to_destroy); + } + } + 
spin_unlock(&group->bpf_triggers_lock); + + list_for_each_entry_safe(t, p, &to_destroy, bpf_group_node) { + bpf_psi = READ_ONCE(t->bpf_psi); + + idx = srcu_read_lock(&bpf_psi->srcu); + if (bpf_psi->ops->handle_cgroup_offline) + bpf_psi->ops->handle_cgroup_offline(cgrp_id); + srcu_read_unlock(&bpf_psi->srcu, idx); + + spin_lock(&bpf_psi->lock); + list_del(&t->bpf_psi_node); + spin_unlock(&bpf_psi->lock); + + WRITE_ONCE(t->bpf_psi, NULL); + flush_workqueue(bpf_psi_wq); + synchronize_srcu(&bpf_psi->srcu); + psi_trigger_destroy(t); + } +} +#endif + +void bpf_psi_handle_event(struct psi_trigger *t) +{ + queue_work(bpf_psi_wq, &t->bpf_work); +} + +/* BPF struct ops */ + +static int __bpf_psi_init(struct bpf_psi *bpf_psi) { return 0; } +static void __bpf_psi_handle_psi_event(struct psi_trigger *t) {} +static void __bpf_psi_handle_cgroup_online(u64 cgroup_id) {} +static void __bpf_psi_handle_cgroup_offline(u64 cgroup_id) {} + +static struct bpf_psi_ops __bpf_psi_ops = { + .init = __bpf_psi_init, + .handle_psi_event = __bpf_psi_handle_psi_event, + .handle_cgroup_online = __bpf_psi_handle_cgroup_online, + .handle_cgroup_offline = __bpf_psi_handle_cgroup_offline, +}; + +static const struct bpf_func_proto * +bpf_psi_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return tracing_prog_func_proto(func_id, prog); +} + +static bool bpf_psi_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static const struct bpf_verifier_ops bpf_psi_verifier_ops = { + .get_func_proto = bpf_psi_func_proto, + .is_valid_access = bpf_psi_ops_is_valid_access, +}; + +__bpf_kfunc_start_defs(); + +/** + * bpf_psi_create_trigger - Create a PSI trigger + * @bpf_psi: bpf_psi struct to attach the trigger to + * @cgroup_id: cgroup Id to attach the trigger; 0 for system-wide scope + * @resource: resource to monitor (PSI_MEM, PSI_IO, etc) and the full bit. + * @threshold_us: threshold in us + * @window_us: window in us + * + * Creates a PSI trigger and attached is to bpf_psi. The trigger will be + * active unless bpf struct ops is unloaded or the corresponding cgroup + * is deleted. + * + * Resource's most significant bit encodes whether "some" or "full" + * PSI state should be tracked. + * + * Returns 0 on success and the error code on failure. 
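+ *
+ * Example (illustrative): calling
+ * bpf_psi_create_trigger(bpf_psi, 0, PSI_MEM | BPF_PSI_FULL, 100000, 1000000)
+ * from the init() callback sets up a system-wide trigger firing after 100ms
+ * of full memory stall within a 1s window.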
+ */ +__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi, + u64 cgroup_id, u32 resource, + u32 threshold_us, u32 window_us) +{ + enum psi_res res = resource & ~BPF_PSI_FULL; + bool full = resource & BPF_PSI_FULL; + struct psi_trigger_params params; + struct cgroup *cgroup __maybe_unused = NULL; + struct psi_group *group; + struct psi_trigger *t; + int ret = 0; + + if (res >= NR_PSI_RESOURCES) + return -EINVAL; + + if (IS_ENABLED(CONFIG_CGROUPS) && cgroup_id) { + cgroup = cgroup_get_from_id(cgroup_id); + if (IS_ERR_OR_NULL(cgroup)) + return PTR_ERR(cgroup); + + group = cgroup_psi(cgroup); + } else { + group = &psi_system; + } + + params.type = PSI_BPF; + params.bpf_psi = bpf_psi; + params.privileged = capable(CAP_SYS_RESOURCE); + params.res = res; + params.full = full; + params.threshold_us = threshold_us; + params.window_us = window_us; + + t = psi_trigger_create(group, ¶ms); + if (IS_ERR(t)) + ret = PTR_ERR(t); + else + t->cgroup_id = cgroup_id; + +#ifdef CONFIG_CGROUPS + if (cgroup) + cgroup_put(cgroup); +#endif + + return ret; +} +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_psi_kfuncs) +BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_psi_kfuncs) + +static int bpf_psi_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (btf_id_set8_contains(&bpf_psi_kfuncs, kfunc_id) && + prog->aux->st_ops != &bpf_psi_bpf_ops) + return -EACCES; + + return 0; +} + +static const struct btf_kfunc_id_set bpf_psi_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_psi_kfuncs, + .filter = bpf_psi_kfunc_filter, +}; + +static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_psi_ops *ops = kdata; + struct bpf_psi *bpf_psi; + + bpf_psi = bpf_psi_create(ops); + if (!bpf_psi) + return -ENOMEM; + + return ops->init(bpf_psi); +} + +static void bpf_psi_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_psi_ops *ops = kdata; + struct bpf_psi *bpf_psi = ops->bpf_psi; + struct psi_trigger *t, *p; + LIST_HEAD(to_destroy); + + spin_lock(&bpf_psi->lock); + list_for_each_entry_safe(t, p, &bpf_psi->triggers, bpf_psi_node) { + spin_lock(&t->group->bpf_triggers_lock); + if (!t->pinned) { + t->pinned = true; + list_move(&t->bpf_group_node, &to_destroy); + list_del(&t->bpf_psi_node); + + WRITE_ONCE(t->bpf_psi, NULL); + } + spin_unlock(&t->group->bpf_triggers_lock); + } + spin_unlock(&bpf_psi->lock); + + flush_workqueue(bpf_psi_wq); + synchronize_srcu(&bpf_psi->srcu); + + list_for_each_entry_safe(t, p, &to_destroy, bpf_group_node) + psi_trigger_destroy(t); + + if (!list_empty(&bpf_psi->node)) { + mutex_lock(&bpf_psi_lock); + list_del(&bpf_psi->node); + mutex_unlock(&bpf_psi_lock); + static_branch_dec(&bpf_psi_notify_key); + } + + cleanup_srcu_struct(&bpf_psi->srcu); + kfree(bpf_psi); +} + +static int bpf_psi_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_psi_ops, init): + fallthrough; + case offsetof(struct bpf_psi_ops, handle_psi_event): + if (!prog) + return -EINVAL; + break; + } + + return 0; +} + +static int bpf_psi_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static int bpf_psi_ops_init(struct btf *btf) +{ + return 0; +} + +struct bpf_struct_ops bpf_psi_bpf_ops = { + .verifier_ops = &bpf_psi_verifier_ops, + .reg = bpf_psi_ops_reg, + .unreg = bpf_psi_ops_unreg, + .check_member = 
bpf_psi_ops_check_member, + .init_member = bpf_psi_ops_init_member, + .init = bpf_psi_ops_init, + .name = "bpf_psi_ops", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_psi_ops +}; + +static int __init bpf_psi_struct_ops_init(void) +{ + int wq_flags = WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI; + int err; + + bpf_psi_wq = alloc_workqueue("bpf_psi_wq", wq_flags, 0); + if (!bpf_psi_wq) + return -ENOMEM; + + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &bpf_psi_kfunc_set); + if (err) { + pr_warn("error while registering bpf psi kfuncs: %d", err); + goto err; + } + + err = register_bpf_struct_ops(&bpf_psi_bpf_ops, bpf_psi_ops); + if (err) { + pr_warn("error while registering bpf psi struct ops: %d", err); + goto err; + } + + return 0; + +err: + destroy_workqueue(bpf_psi_wq); + return err; +} +late_initcall(bpf_psi_struct_ops_init); diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index e2cf3b08d4e95..1f90781781a12 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -91,6 +92,9 @@ #ifdef CONFIG_PSI # include "psi.c" +# ifdef CONFIG_BPF_SYSCALL +# include "bpf_psi.c" +# endif #endif #ifdef CONFIG_MEMBARRIER diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 59fdb7ebbf22a..26de772750e82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -223,6 +223,10 @@ static void group_init(struct psi_group *group) init_waitqueue_head(&group->rtpoll_wait); timer_setup(&group->rtpoll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->rtpoll_task, NULL); +#ifdef CONFIG_BPF_SYSCALL + spin_lock_init(&group->bpf_triggers_lock); + INIT_LIST_HEAD(&group->bpf_triggers); +#endif } void __init psi_init(void) @@ -511,10 +515,17 @@ static void update_triggers(struct psi_group *group, u64 now, /* Generate an event */ if (cmpxchg(&t->event, 0, 1) == 0) { - if (t->of) - kernfs_notify(t->of->kn); - else + switch (t->type) { + case PSI_SYSTEM: wake_up_interruptible(&t->event_wait); + break; + case PSI_CGROUP: + kernfs_notify(t->of->kn); + break; + case PSI_BPF: + bpf_psi_handle_event(t); + break; + } } t->last_event_time = now; /* Reset threshold breach flag once event got generated */ @@ -1292,74 +1303,91 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, - enum psi_res res, struct file *file, - struct kernfs_open_file *of) +int psi_trigger_parse(struct psi_trigger_params *params, const char *buf) { - struct psi_trigger *t; - enum psi_states state; - u32 threshold_us; - bool privileged; - u32 window_us; + u32 threshold_us, window_us; if (static_branch_likely(&psi_disabled)) - return ERR_PTR(-EOPNOTSUPP); - - /* - * Checking the privilege here on file->f_cred implies that a privileged user - * could open the file and delegate the write to an unprivileged one. 
- */ - privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); + return -EOPNOTSUPP; if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) - state = PSI_IO_SOME + res * 2; + params->full = false; else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) - state = PSI_IO_FULL + res * 2; + params->full = true; else - return ERR_PTR(-EINVAL); + return -EINVAL; + + params->threshold_us = threshold_us; + params->window_us = window_us; + return 0; +} + +struct psi_trigger *psi_trigger_create(struct psi_group *group, + const struct psi_trigger_params *params) +{ + struct psi_trigger *t; + enum psi_states state; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + state = params->full ? PSI_IO_FULL : PSI_IO_SOME; + state += params->res * 2; #ifdef CONFIG_IRQ_TIME_ACCOUNTING - if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + if (params->res == PSI_IRQ && --state != PSI_IRQ_FULL) return ERR_PTR(-EINVAL); #endif if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); - if (window_us == 0 || window_us > WINDOW_MAX_US) + if (params->window_us == 0 || params->window_us > WINDOW_MAX_US) return ERR_PTR(-EINVAL); /* * Unprivileged users can only use 2s windows so that averages aggregation * work is used, and no RT threads need to be spawned. */ - if (!privileged && window_us % 2000000) + if (!params->privileged && params->window_us % 2000000) return ERR_PTR(-EINVAL); /* Check threshold */ - if (threshold_us == 0 || threshold_us > window_us) + if (params->threshold_us == 0 || params->threshold_us > params->window_us) return ERR_PTR(-EINVAL); t = kmalloc(sizeof(*t), GFP_KERNEL); if (!t) return ERR_PTR(-ENOMEM); + t->type = params->type; t->group = group; t->state = state; - t->threshold = threshold_us * NSEC_PER_USEC; - t->win.size = window_us * NSEC_PER_USEC; + t->threshold = params->threshold_us * NSEC_PER_USEC; + t->win.size = params->window_us * NSEC_PER_USEC; window_reset(&t->win, sched_clock(), group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; - t->of = of; - if (!of) + + switch (params->type) { + case PSI_SYSTEM: init_waitqueue_head(&t->event_wait); + t->of = NULL; + break; + case PSI_CGROUP: + t->of = params->of; + break; + case PSI_BPF: + bpf_psi_add_trigger(t, params); + break; + } + t->pending_event = false; - t->aggregator = privileged ? PSI_POLL : PSI_AVGS; + t->aggregator = params->privileged ? PSI_POLL : PSI_AVGS; - if (privileged) { + if (params->privileged) { mutex_lock(&group->rtpoll_trigger_lock); if (!rcu_access_pointer(group->rtpoll_task)) { @@ -1367,8 +1395,10 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, task = kthread_create(psi_rtpoll_worker, group, "psimon"); if (IS_ERR(task)) { - kfree(t); mutex_unlock(&group->rtpoll_trigger_lock); + if (t->type == PSI_BPF) + bpf_psi_remove_trigger(t); + kfree(t); return ERR_CAST(task); } atomic_set(&group->rtpoll_wakeup, 0); @@ -1412,10 +1442,16 @@ void psi_trigger_destroy(struct psi_trigger *t) * being accessed later. Can happen if cgroup is deleted from under a * polling process. 
*/ - if (t->of) - kernfs_notify(t->of->kn); - else + switch (t->type) { + case PSI_SYSTEM: wake_up_interruptible(&t->event_wait); + break; + case PSI_CGROUP: + kernfs_notify(t->of->kn); + break; + case PSI_BPF: + break; + } if (t->aggregator == PSI_AVGS) { mutex_lock(&group->avgs_lock); @@ -1492,10 +1528,16 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - if (t->of) - kernfs_generic_poll(t->of, wait); - else + switch (t->type) { + case PSI_SYSTEM: poll_wait(file, &t->event_wait, wait); + break; + case PSI_CGROUP: + kernfs_generic_poll(t->of, wait); + break; + case PSI_BPF: + break; + } if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; @@ -1541,6 +1583,8 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, size_t buf_size; struct seq_file *seq; struct psi_trigger *new; + struct psi_trigger_params params; + int err; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; @@ -1554,6 +1598,10 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; + err = psi_trigger_parse(¶ms, buf); + if (err) + return err; + seq = file->private_data; /* Take seq->lock to protect seq->private from concurrent writes */ @@ -1565,7 +1613,11 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, res, file, NULL); + params.type = PSI_SYSTEM; + params.res = res; + params.privileged = psi_file_privileged(file); + + new = psi_trigger_create(&psi_system, ¶ms); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); diff --git a/mm/Makefile b/mm/Makefile index 21abb33535501..2d8f9beb3c710 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -105,6 +105,10 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o ifdef CONFIG_SWAP obj-$(CONFIG_MEMCG) += swap_cgroup.o endif +ifdef CONFIG_BPF_SYSCALL +obj-y += bpf_oom.o +obj-$(CONFIG_MEMCG) += bpf_memcontrol.o +endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c new file mode 100644 index 0000000000000..458ad022b036f --- /dev/null +++ b/mm/bpf_memcontrol.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Memory Controller-related BPF kfuncs and auxiliary code + * + * Author: Roman Gushchin + */ + +#include +#include + +__bpf_kfunc_start_defs(); + +/** + * bpf_get_root_mem_cgroup - Returns a pointer to the root memory cgroup + * + * The function has KF_ACQUIRE semantics, even though the root memory + * cgroup is never destroyed after being created and doesn't require + * reference counting. And it's perfectly safe to pass it to + * bpf_put_mem_cgroup() + */ +__bpf_kfunc struct mem_cgroup *bpf_get_root_mem_cgroup(void) +{ + /* css_get() is not needed */ + return root_mem_cgroup; +} + +/** + * bpf_get_mem_cgroup - Get a reference to a memory cgroup + * @css: pointer to the css structure + * + * Returns a pointer to a mem_cgroup structure after bumping + * the corresponding css's reference counter. + * + * It's fine to pass a css which belongs to any cgroup controller, + * e.g. unified hierarchy's main css. + * + * Implements KF_ACQUIRE semantics. 
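+ *
+ * The returned reference is expected to be released with bpf_put_mem_cgroup()
+ * once the caller is done with the memcg.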
+ */ +__bpf_kfunc struct mem_cgroup * +bpf_get_mem_cgroup(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = NULL; + bool rcu_unlock = false; + + if (!root_mem_cgroup) + return NULL; + + if (root_mem_cgroup->css.ss != css->ss) { + struct cgroup *cgroup = css->cgroup; + int ssid = root_mem_cgroup->css.ss->id; + + rcu_read_lock(); + rcu_unlock = true; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + } + + if (css && css_tryget(css)) + memcg = container_of(css, struct mem_cgroup, css); + + if (rcu_unlock) + rcu_read_unlock(); + + return memcg; +} + +/** + * bpf_put_mem_cgroup - Put a reference to a memory cgroup + * @memcg: memory cgroup to release + * + * Releases a previously acquired memcg reference. + * Implements KF_RELEASE semantics. + */ +__bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg) +{ + css_put(&memcg->css); +} + +/** + * bpf_mem_cgroup_vm_events - Read memory cgroup's vm event counter + * @memcg: memory cgroup + * @event: event id + * + * Allows to read memory cgroup event counters. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_vm_events(struct mem_cgroup *memcg, + enum vm_event_item event) +{ + return memcg_events(memcg, event); +} + +/** + * bpf_mem_cgroup_usage - Read memory cgroup's usage + * @memcg: memory cgroup + * + * Returns current memory cgroup size in bytes. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_usage(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory); +} + +/** + * bpf_mem_cgroup_events - Read memory cgroup's page state counter + * bpf_mem_cgroup_memory_events - Read memory cgroup's memory event value + * @memcg: memory cgroup + * @event: memory event id + * + * Returns current memory event count. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_memory_events(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + if (event >= MEMCG_NR_MEMORY_EVENTS) + return (unsigned long)-1; + + return atomic_long_read(&memcg->memory_events[event]); +} + +/** + * bpf_mem_cgroup_page_state - Read memory cgroup's page state counter + * @memcg: memory cgroup + * @idx: counter idx + * + * Allows to read memory cgroup statistics. The output is in bytes. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_page_state(struct mem_cgroup *memcg, int idx) +{ + if (idx < 0 || idx >= MEMCG_NR_STAT) + return (unsigned long)-1; + + return memcg_page_state_output(memcg, idx); +} + +/** + * bpf_mem_cgroup_flush_stats - Flush memory cgroup's statistics + * @memcg: memory cgroup + * + * Propagate memory cgroup's statistics up the cgroup tree. 
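+ * The kfunc is sleepable (registered with KF_SLEEPABLE); flushing before
+ * reading counters via bpf_mem_cgroup_page_state() typically yields more
+ * up-to-date values.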
+ */ +__bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem_cgroup *memcg) +{ + mem_cgroup_flush_stats(memcg); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_memcontrol_kfuncs) +BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) +BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) + +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) + +BTF_KFUNCS_END(bpf_memcontrol_kfuncs) + +static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_memcontrol_kfuncs, +}; + +static int __init bpf_memcontrol_init(void) +{ + int err; + + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, + &bpf_memcontrol_kfunc_set); + if (err) + pr_warn("error while registering bpf memcontrol kfuncs: %d", err); + + return err; +} +late_initcall(bpf_memcontrol_init); diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c new file mode 100644 index 0000000000000..a7e021c9db44b --- /dev/null +++ b/mm/bpf_oom.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * BPF-driven OOM killer customization + * + * Author: Roman Gushchin + */ + +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_SRCU(bpf_oom_srcu); +static struct bpf_oom_ops *system_bpf_oom; + +static int bpf_ops_handle_oom(struct bpf_oom_ops *bpf_oom_ops, + struct mem_cgroup *memcg, + struct oom_control *oc) +{ + struct bpf_oom_ctx exec_ctx; + int ret; + + if (memcg) + exec_ctx.cgroup_id = cgroup_id(memcg->css.cgroup); + else + exec_ctx.cgroup_id = 0; + + oc->bpf_policy_name = &bpf_oom_ops->name[0]; + oc->bpf_memory_freed = false; + ret = bpf_oom_ops->handle_out_of_memory(oc, &exec_ctx); + oc->bpf_policy_name = NULL; + + return ret; +} + +bool bpf_handle_oom(struct oom_control *oc) +{ + struct bpf_oom_ops *bpf_oom_ops = NULL; + struct mem_cgroup *memcg; + int idx, ret = 0; + + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */ + idx = srcu_read_lock(&bpf_oom_srcu); + + /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */ + for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) { + bpf_oom_ops = READ_ONCE(memcg->bpf_oom); + if (!bpf_oom_ops) + continue; + + /* Call BPF OOM handler */ + ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc); + if (ret && oc->bpf_memory_freed) + goto exit; + } + /* + * System-wide OOM or per-memcg BPF OOM handler wasn't successful? + * Try system_bpf_oom. 
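+	 * (a handler is considered successful only if it returned a non-zero
+	 * value and set oc->bpf_memory_freed)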
+ */ + bpf_oom_ops = READ_ONCE(system_bpf_oom); + if (!bpf_oom_ops) + goto exit; + + /* Call BPF OOM handler */ + ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc); +exit: + srcu_read_unlock(&bpf_oom_srcu, idx); + return ret && oc->bpf_memory_freed; +} + +static int __handle_out_of_memory(struct oom_control *oc, + struct bpf_oom_ctx *exec_ctx) +{ + return 0; +} + +static void __handle_cgroup_offline(u64 cgroup_id, struct bpf_oom_ctx *exec_ctx) +{ +} + +static struct bpf_oom_ops __bpf_oom_ops = { + .handle_out_of_memory = __handle_out_of_memory, + .handle_cgroup_offline = __handle_cgroup_offline, +}; + +static const struct bpf_func_proto * +bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return tracing_prog_func_proto(func_id, prog); +} + +static bool bpf_oom_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static const struct bpf_verifier_ops bpf_oom_verifier_ops = { + .get_func_proto = bpf_oom_func_proto, + .is_valid_access = bpf_oom_ops_is_valid_access, +}; + +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link); + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL; + struct bpf_oom_ops *bpf_oom_ops = kdata; + struct mem_cgroup *memcg = NULL; + int err = 0; + + if (ops_link->cgroup_id) { + /* Attach to a memory cgroup? */ + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + return PTR_ERR(memcg); + bpf_oom_ops_ptr = &memcg->bpf_oom; + } else { + /* System-wide OOM handler */ + bpf_oom_ops_ptr = &system_bpf_oom; + } + + /* Another struct ops attached? */ + if (READ_ONCE(*bpf_oom_ops_ptr)) { + err = -EBUSY; + goto exit; + } + + /* Expose bpf_oom_ops structure */ + WRITE_ONCE(*bpf_oom_ops_ptr, bpf_oom_ops); +exit: + mem_cgroup_put(memcg); + return err; +} + +static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link); + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL; + struct bpf_oom_ops *bpf_oom_ops = kdata; + struct mem_cgroup *memcg = NULL; + + if (ops_link->cgroup_id) { + /* Detach from a memory cgroup? 
*/ + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + goto exit; + bpf_oom_ops_ptr = &memcg->bpf_oom; + } else { + /* System-wide OOM handler */ + bpf_oom_ops_ptr = &system_bpf_oom; + } + + /* Hide bpf_oom_ops from new callers */ + if (!WARN_ON(READ_ONCE(*bpf_oom_ops_ptr) != bpf_oom_ops)) + WRITE_ONCE(*bpf_oom_ops_ptr, NULL); + + mem_cgroup_put(memcg); + +exit: + /* Release bpf_oom_ops after a srcu grace period */ + synchronize_srcu(&bpf_oom_srcu); +} + +void bpf_oom_memcg_offline(struct mem_cgroup *memcg) +{ + struct bpf_oom_ops *bpf_oom_ops; + struct bpf_oom_ctx exec_ctx; + u64 cgrp_id; + int idx; + + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */ + idx = srcu_read_lock(&bpf_oom_srcu); + + bpf_oom_ops = READ_ONCE(memcg->bpf_oom); + WRITE_ONCE(memcg->bpf_oom, NULL); + + if (bpf_oom_ops && bpf_oom_ops->handle_cgroup_offline) { + cgrp_id = cgroup_id(memcg->css.cgroup); + exec_ctx.cgroup_id = cgrp_id; + bpf_oom_ops->handle_cgroup_offline(cgrp_id, &exec_ctx); + } + + srcu_read_unlock(&bpf_oom_srcu, idx); +} + +static int bpf_oom_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_oom_ops, handle_out_of_memory): + if (!prog) + return -EINVAL; + break; + } + + return 0; +} + +static int bpf_oom_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct bpf_oom_ops *uops = udata; + struct bpf_oom_ops *ops = kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_oom_ops, name): + if (uops->name[0]) + strscpy_pad(ops->name, uops->name, sizeof(ops->name)); + else + strscpy_pad(ops->name, "bpf_defined_policy"); + return 1; + } + return 0; +} + +static int bpf_oom_ops_init(struct btf *btf) +{ + return 0; +} + +static struct bpf_struct_ops bpf_oom_bpf_ops = { + .verifier_ops = &bpf_oom_verifier_ops, + .reg = bpf_oom_ops_reg, + .unreg = bpf_oom_ops_unreg, + .check_member = bpf_oom_ops_check_member, + .init_member = bpf_oom_ops_init_member, + .init = bpf_oom_ops_init, + .name = "bpf_oom_ops", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_oom_ops +}; + +static int __init bpf_oom_struct_ops_init(void) +{ + return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops); +} +late_initcall(bpf_oom_struct_ops_init); diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 6358464bb4160..a304ad418cdfe 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -27,7 +27,6 @@ unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); -unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); int memory_stat_show(struct seq_file *m, void *v); void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4deda33625f41..d44c1f293e168 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -3618,7 +3619,6 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return xa_load(&mem_cgroup_ids, id); } -#ifdef CONFIG_SHRINKER_DEBUG struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { struct cgroup *cgrp; @@ -3639,7 +3639,6 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned 
long ino) return memcg; } -#endif static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) { @@ -3887,6 +3886,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) zswap_memcg_offline_cleanup(memcg); + bpf_oom_memcg_offline(memcg); memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c145b0feecc1f..65a3b4c1fc725 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "internal.h" @@ -239,12 +240,35 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) return points; } -static const char * const oom_constraint_text[] = { - [CONSTRAINT_NONE] = "CONSTRAINT_NONE", - [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", - [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", - [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", -}; +static const char *oom_policy_name(struct oom_control *oc) +{ +#ifdef CONFIG_BPF_SYSCALL + if (oc->bpf_policy_name) + return oc->bpf_policy_name; +#endif + return "default"; +} + +static const char *oom_constraint_text(struct oom_control *oc) +{ + switch (oc->constraint) { + case CONSTRAINT_NONE: + return "CONSTRAINT_NONE"; + case CONSTRAINT_CPUSET: + return "CONSTRAINT_CPUSET"; + case CONSTRAINT_MEMORY_POLICY: + return "CONSTRAINT_MEMORY_POLICY"; + case CONSTRAINT_MEMCG: + return "CONSTRAINT_MEMCG"; +#ifdef CONFIG_BPF_SYSCALL + case CONSTRAINT_BPF: + return oc->bpf_constraint ? : "CONSTRAINT_BPF"; +#endif + default: + WARN_ON_ONCE(1); + return ""; + } +} /* * Determine the type of allocation constraint. @@ -257,6 +281,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) bool cpuset_limited = false; int nid; + if (oc->constraint == CONSTRAINT_BPF) + return CONSTRAINT_BPF; + if (is_memcg_oom(oc)) { oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; @@ -448,7 +475,7 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) { /* one line summary of the oom killer context. */ pr_info("oom-kill:constraint=%s,nodemask=%*pbl", - oom_constraint_text[oc->constraint], + oom_constraint_text(oc), nodemask_pr_args(oc->nodemask)); cpuset_print_current_mems_allowed(); mem_cgroup_print_oom_context(oc->memcg, victim); @@ -458,9 +485,10 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) static void dump_header(struct oom_control *oc) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\noom_policy=%s\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, - current->signal->oom_score_adj); + current->signal->oom_score_adj, + oom_policy_name(oc)); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); @@ -1167,6 +1195,13 @@ bool out_of_memory(struct oom_control *oc) return true; } + /* + * Let bpf handle the OOM first. If it was able to free up some memory, + * bail out. Otherwise fall back to the kernel OOM killer. + */ + if (bpf_handle_oom(oc)) + return true; + select_bad_process(oc); /* Found nothing?!?! 
*/ if (!oc->chosen) { @@ -1270,3 +1305,153 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) return -ENOSYS; #endif /* CONFIG_MMU */ } + +#ifdef CONFIG_BPF_SYSCALL + +__bpf_kfunc_start_defs(); +/** + * bpf_oom_kill_process - Kill a process as OOM killer + * @oc: pointer to oom_control structure, describes OOM context + * @task: task to be killed + * @message__str: message to print in dmesg + * + * Kill a process in a way similar to the kernel OOM killer. + * This means dump the necessary information to dmesg, adjust memcg + * statistics, leverage the oom reaper, respect memory.oom.group etc. + * + * bpf_oom_kill_process() marks the forward progress by setting + * oc->bpf_memory_freed. If the progress was made, the bpf program + * is free to decide if the kernel oom killer should be invoked. + * Otherwise it's enforced, so that a bad bpf program can't + * deadlock the machine on memory. + */ +__bpf_kfunc int bpf_oom_kill_process(struct oom_control *oc, + struct task_struct *task, + const char *message__str) +{ + if (oom_unkillable_task(task)) + return -EPERM; + + /* paired with put_task_struct() in oom_kill_process() */ + task = tryget_task_struct(task); + if (!task) + return -EINVAL; + + oc->chosen = task; + + oom_kill_process(oc, message__str); + + oc->chosen = NULL; + oc->bpf_memory_freed = true; + + return 0; +} + +/** + * bpf_out_of_memory - declare Out Of Memory state and invoke OOM killer + * @memcg__nullable: memcg or NULL for system-wide OOMs + * @order: order of page which wasn't allocated + * @flags: flags + * @constraint_text__nullable: custom constraint description for the OOM report + * + * Declares the Out Of Memory state and invokes the OOM killer. + * + * OOM handlers are synchronized using the oom_lock mutex. If wait_on_oom_lock + * is true, the function will wait on it. Otherwise it bails out with -EBUSY + * if oom_lock is contended. + * + * Generally it's advised to pass wait_on_oom_lock=false for global OOMs + * and wait_on_oom_lock=true for memcg-scoped OOMs. + * + * Returns 1 if the forward progress was achieved and some memory was freed. + * Returns a negative value if an error occurred. + */ +__bpf_kfunc int bpf_out_of_memory(struct mem_cgroup *memcg__nullable, + int order, u64 flags, + const char *constraint_text__nullable) +{ + struct oom_control oc = { + .memcg = memcg__nullable, + .order = order, + .constraint = CONSTRAINT_BPF, + .bpf_constraint = constraint_text__nullable, + }; + int ret; + + if (flags & ~(BPF_OOM_FLAGS_LAST - 1)) + return -EINVAL; + + if (oc.order < 0 || oc.order > MAX_PAGE_ORDER) + return -EINVAL; + + if (flags & BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK) { + ret = mutex_lock_killable(&oom_lock); + if (ret) + return ret; + } else if (!mutex_trylock(&oom_lock)) + return -EBUSY; + + ret = out_of_memory(&oc); + + mutex_unlock(&oom_lock); + return ret; +} + +/** + * bpf_task_is_oom_victim - Check if the task has been marked as an OOM victim + * @task: task to check + * + * Returns true if the task has been previously selected by the OOM killer + * to be killed. It's expected that the task will be destroyed soon and some + * memory will be freed, so maybe no additional actions required. 
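+ *
+ * A typical use (illustrative) is to have a BPF OOM handler bail out early
+ * when a previously selected victim is still exiting, instead of killing
+ * another task.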
+ */ +__bpf_kfunc bool bpf_task_is_oom_victim(struct task_struct *task) +{ + return tsk_is_oom_victim(task); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_oom_kfuncs) +BTF_ID_FLAGS(func, bpf_oom_kill_process, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_out_of_memory, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_is_oom_victim, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_oom_kfuncs) + +BTF_SET_START(bpf_oom_declare_oom_kfuncs) +BTF_ID(func, bpf_out_of_memory) +BTF_SET_END(bpf_oom_declare_oom_kfuncs) + +extern struct bpf_struct_ops bpf_psi_bpf_ops; + +static int bpf_oom_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set_contains(&bpf_oom_declare_oom_kfuncs, kfunc_id)) + return 0; + + if (IS_ENABLED(CONFIG_PSI) && prog->aux->st_ops == &bpf_psi_bpf_ops) + return 0; + + return -EACCES; +} + +static const struct btf_kfunc_id_set bpf_oom_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_oom_kfuncs, + .filter = bpf_oom_kfunc_filter, +}; + +static int __init bpf_oom_init(void) +{ + int err; + + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &bpf_oom_kfunc_set); + if (err) + pr_warn("error while registering bpf oom kfuncs: %d", err); + + return err; +} +late_initcall(bpf_oom_init); + +#endif diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 339b197972374..4c8944f8d6ba5 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -883,6 +883,14 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, cgroup)) return libbpf_err(-EINVAL); break; + case BPF_STRUCT_OPS: + relative_fd = OPTS_GET(opts, cgroup.relative_fd, 0); + attr.link_create.cgroup.relative_fd = relative_fd; + attr.link_create.cgroup.expected_revision = + OPTS_GET(opts, cgroup.expected_revision, 0); + if (!OPTS_ZEROED(opts, cgroup)) + return libbpf_err(-EINVAL); + break; default: if (!OPTS_ZEROED(opts, flags)) return libbpf_err(-EINVAL); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b90574f39d1c7..be56a5dee5050 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -13196,12 +13196,19 @@ static int bpf_link__detach_struct_ops(struct bpf_link *link) return close(link->fd); } -struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +struct bpf_link *bpf_map__attach_struct_ops_opts(const struct bpf_map *map, + const struct bpf_struct_ops_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts); struct bpf_link_struct_ops *link; __u32 zero = 0; int err, fd; + if (!OPTS_VALID(opts, bpf_struct_ops_opts)) { + pr_warn("map '%s': invalid opts\n", map->name); + return libbpf_err_ptr(-EINVAL); + } + if (!bpf_map__is_struct_ops(map)) { pr_warn("map '%s': can't attach non-struct_ops map\n", map->name); return libbpf_err_ptr(-EINVAL); @@ -13237,7 +13244,9 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) return &link->link; } - fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + link_opts.cgroup.relative_fd = OPTS_GET(opts, relative_fd, 0); + + fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, &link_opts); if (fd < 0) { free(link); return libbpf_err_ptr(fd); @@ -13249,6 +13258,11 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) return &link->link; } +struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) +{ + return bpf_map__attach_struct_ops_opts(map, NULL); +} + /* * Swap the back struct_ops of a link with a new struct_ops map. 
 */
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 5118d0a90e243..dc84898715cfc 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -922,6 +922,20 @@ bpf_program__attach_cgroup_opts(const struct bpf_program *prog, int cgroup_fd,
 struct bpf_map;
 
 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
+
+struct bpf_struct_ops_opts {
+	/* size of this struct, for forward/backward compatibility */
+	size_t sz;
+	__u32 flags;
+	__u32 relative_fd;
+	__u64 expected_revision;
+	size_t :0;
+};
+#define bpf_struct_ops_opts__last_field expected_revision
+
+LIBBPF_API struct bpf_link *
+bpf_map__attach_struct_ops_opts(const struct bpf_map *map,
+				const struct bpf_struct_ops_opts *opts);
 LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
 
 struct bpf_iter_attach_opts {
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 8ed8749907d47..bc00089343ce4 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -451,4 +451,5 @@ LIBBPF_1.7.0 {
 	global:
 		bpf_map__set_exclusive_program;
 		bpf_map__exclusive_program;
+		bpf_map__attach_struct_ops_opts;
 } LIBBPF_1.6.0;
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c
index 20cede4db3cee..8fb02fe4c4aaa 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.c
+++ b/tools/testing/selftests/bpf/cgroup_helpers.c
@@ -126,6 +126,45 @@ int enable_controllers(const char *relative_path, const char *controllers)
 	return __enable_controllers(cgroup_path, controllers);
 }
 
+static size_t __read_cgroup_file(const char *cgroup_path, const char *file,
+				 char *buf, size_t size)
+{
+	char file_path[PATH_MAX + 1];
+	size_t ret;
+	int fd;
+
+	snprintf(file_path, sizeof(file_path), "%s/%s", cgroup_path, file);
+	fd = open(file_path, O_RDONLY);
+	if (fd < 0) {
+		log_err("Opening %s", file_path);
+		return -1;
+	}
+
+	ret = read(fd, buf, size);
+	close(fd);
+	return ret;
+}
+
+/**
+ * read_cgroup_file() - Read from a cgroup file
+ * @relative_path: The cgroup path, relative to the workdir
+ * @file: The name of the file in cgroupfs to read from
+ * @buf: Buffer to read the file contents into
+ * @size: Size of the buffer
+ *
+ * Read from a file in the given cgroup's directory.
+ *
+ * If successful, the number of read bytes is returned.
+ */ +size_t read_cgroup_file(const char *relative_path, const char *file, + char *buf, size_t size) +{ + char cgroup_path[PATH_MAX - 24]; + + format_cgroup_path(cgroup_path, relative_path); + return __read_cgroup_file(cgroup_path, file, buf, size); +} + static int __write_cgroup_file(const char *cgroup_path, const char *file, const char *buf) { diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h index 3857304be8741..9f9bb6b5d9928 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -11,6 +11,8 @@ /* cgroupv2 related */ int enable_controllers(const char *relative_path, const char *controllers); +size_t read_cgroup_file(const char *relative_path, const char *file, + char *buf, size_t size); int write_cgroup_file(const char *relative_path, const char *file, const char *buf); int write_cgroup_file_parent(const char *relative_path, const char *file, diff --git a/tools/testing/selftests/bpf/cgroup_iter_memcg.h b/tools/testing/selftests/bpf/cgroup_iter_memcg.h new file mode 100644 index 0000000000000..3f59b127943ba --- /dev/null +++ b/tools/testing/selftests/bpf/cgroup_iter_memcg.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef __CGROUP_ITER_MEMCG_H +#define __CGROUP_ITER_MEMCG_H + +struct memcg_query { + /* some node_stat_item's */ + unsigned long nr_anon_mapped; + unsigned long nr_shmem; + unsigned long nr_file_pages; + unsigned long nr_file_mapped; + /* some memcg_stat_item */ + unsigned long memcg_kmem; + /* some vm_event_item */ + unsigned long pgfault; +}; + +#endif /* __CGROUP_ITER_MEMCG_H */ diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 70b28c1e653ea..178c840c844bc 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -110,6 +110,7 @@ CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_FILTER=y CONFIG_NF_NAT=y CONFIG_PACKET=y +CONFIG_PSI=y CONFIG_RC_CORE=y CONFIG_SECURITY=y CONFIG_SECURITYFS=y diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c new file mode 100644 index 0000000000000..215e4c98c76f1 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include +#include +#include +#include "cgroup_helpers.h" +#include "cgroup_iter_memcg.h" +#include "cgroup_iter_memcg.skel.h" + +static int read_stats(struct bpf_link *link) +{ + int fd, ret = 0; + ssize_t bytes; + + fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_OK_FD(fd, "bpf_iter_create")) + return 1; + + /* + * Invoke iter program by reading from its fd. We're not expecting any + * data to be written by the bpf program so the result should be zero. + * Results will be read directly through the custom data section + * accessible through skel->data_query.memcg_query. + */ + bytes = read(fd, NULL, 0); + if (!ASSERT_EQ(bytes, 0, "read fd")) + ret = 1; + + close(fd); + return ret; +} + +static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg anon usage by mapping and writing + * to a new anon region. 
+ */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_anon_mapped, 0, "final anon mapped val"); + +cleanup: + munmap(map, len); +} + +static void test_file(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + char *path; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + path = "/tmp/test_cgroup_iter_memcg"; + + /* + * Increase memcg file usage by creating and writing + * to a mapped file. + */ + fd = open(path, O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "open fd")) + return; + if (!ASSERT_OK(ftruncate(fd, len), "ftruncate")) + goto cleanup_fd; + + map = mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file")) + goto cleanup_fd; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup_map; + + ASSERT_GT(memcg_query->nr_file_pages, 0, "final file value"); + ASSERT_GT(memcg_query->nr_file_mapped, 0, "final file mapped value"); + +cleanup_map: + munmap(map, len); +cleanup_fd: + close(fd); + unlink(path); +} + +static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + size_t len; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg shmem usage by creating and writing + * to a shmem object. + */ + fd = shm_open("/tmp_shmem", O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "shm_open")) + return; + + if (!ASSERT_OK(fallocate(fd, 0, 0, len), "fallocate")) + goto cleanup; + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_shmem, 0, "final shmem value"); + +cleanup: + close(fd); + shm_unlink("/tmp_shmem"); +} + +#define NR_PIPES 64 +static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + int fds[NR_PIPES][2], i; + + /* + * Increase kmem value by creating pipes which will allocate some + * kernel buffers. + */ + for (i = 0; i < NR_PIPES; i++) { + if (!ASSERT_OK(pipe(fds[i]), "pipe")) + goto cleanup; + } + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->memcg_kmem, 0, "kmem value"); + +cleanup: + for (i = 0; i < NR_PIPES; i++) { + close(fds[i][0]); + close(fds[i][1]); + } +} + +static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* Create region to use for triggering a page fault. */ + map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + /* Trigger page fault. 
*/ + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->pgfault, 0, "final pgfault val"); + +cleanup: + munmap(map, len); +} + +void test_cgroup_iter_memcg(void) +{ + char *cgroup_rel_path = "/cgroup_iter_memcg_test"; + struct cgroup_iter_memcg *skel; + struct bpf_link *link; + int cgroup_fd; + + cgroup_fd = cgroup_setup_and_join(cgroup_rel_path); + if (!ASSERT_OK_FD(cgroup_fd, "cgroup_setup_and_join")) + return; + + skel = cgroup_iter_memcg__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_iter_memcg__open_and_load")) + goto cleanup_cgroup_fd; + + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + union bpf_iter_link_info linfo = { + .cgroup.cgroup_fd = cgroup_fd, + .cgroup.order = BPF_CGROUP_ITER_SELF_ONLY, + }; + opts.link_info = &linfo; + opts.link_info_len = sizeof(linfo); + + link = bpf_program__attach_iter(skel->progs.cgroup_memcg_query, &opts); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto cleanup_skel; + + if (test__start_subtest("cgroup_iter_memcg__anon")) + test_anon(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__shmem")) + test_shmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__file")) + test_file(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__kmem")) + test_kmem(link, &skel->data_query->memcg_query); + if (test__start_subtest("cgroup_iter_memcg__pgfault")) + test_pgfault(link, &skel->data_query->memcg_query); + + bpf_link__destroy(link); +cleanup_skel: + cgroup_iter_memcg__destroy(skel); +cleanup_cgroup_fd: + close(cgroup_fd); + cleanup_cgroup_environment(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom.c b/tools/testing/selftests/bpf/prog_tests/test_oom.c new file mode 100644 index 0000000000000..6126d961aba3d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_oom.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +#include "cgroup_helpers.h" +#include "test_oom.skel.h" + +struct cgroup_desc { + const char *path; + int fd; + unsigned long long id; + int pid; + size_t target; + size_t max; + int oom_score_adj; + bool victim; +}; + +#define MB (1024 * 1024) +#define OOM_SCORE_ADJ_MIN (-1000) +#define OOM_SCORE_ADJ_MAX 1000 + +static struct cgroup_desc cgroups[] = { + { .path = "/oom_test", .max = 80 * MB}, + { .path = "/oom_test/cg1", .target = 10 * MB, + .oom_score_adj = OOM_SCORE_ADJ_MAX }, + { .path = "/oom_test/cg2", .target = 40 * MB, + .oom_score_adj = OOM_SCORE_ADJ_MIN }, + { .path = "/oom_test/cg3" }, + { .path = "/oom_test/cg3/cg4", .target = 30 * MB, + .victim = true }, + { .path = "/oom_test/cg3/cg5", .target = 20 * MB }, +}; + +static int spawn_task(struct cgroup_desc *desc) +{ + char *ptr; + int pid; + + pid = fork(); + if (pid < 0) + return pid; + + if (pid > 0) { + /* parent */ + desc->pid = pid; + return 0; + } + + /* child */ + if (desc->oom_score_adj) { + char buf[64]; + int fd = open("/proc/self/oom_score_adj", O_WRONLY); + + if (fd < 0) + return -1; + + snprintf(buf, sizeof(buf), "%d", desc->oom_score_adj); + write(fd, buf, sizeof(buf)); + close(fd); + } + + ptr = (char *)malloc(desc->target); + if (!ptr) + return -ENOMEM; + + memset(ptr, 'a', desc->target); + + while (1) + sleep(1000); + + return 0; +} + +static void setup_environment(void) +{ + int i, err; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + goto cleanup; + + for (i = 0; i < 
ARRAY_SIZE(cgroups); i++) { + cgroups[i].fd = create_and_get_cgroup(cgroups[i].path); + if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup")) + goto cleanup; + + cgroups[i].id = get_cgroup_id(cgroups[i].path); + if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id")) + goto cleanup; + + /* Freeze the top-level cgroup */ + if (i == 0) { + /* Freeze the top-level cgroup */ + err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1"); + if (!ASSERT_OK(err, "freeze cgroup")) + goto cleanup; + } + + /* Recursively enable the memory controller */ + if (!cgroups[i].target) { + + err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control", + "+memory"); + if (!ASSERT_OK(err, "enable memory controller")) + goto cleanup; + } + + /* Set memory.max */ + if (cgroups[i].max) { + char buf[256]; + + snprintf(buf, sizeof(buf), "%lu", cgroups[i].max); + err = write_cgroup_file(cgroups[i].path, "memory.max", buf); + if (!ASSERT_OK(err, "set memory.max")) + goto cleanup; + + snprintf(buf, sizeof(buf), "0"); + write_cgroup_file(cgroups[i].path, "memory.swap.max", buf); + + } + + /* Spawn tasks creating memory pressure */ + if (cgroups[i].target) { + char buf[256]; + + err = spawn_task(&cgroups[i]); + if (!ASSERT_OK(err, "spawn task")) + goto cleanup; + + snprintf(buf, sizeof(buf), "%d", cgroups[i].pid); + err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf); + if (!ASSERT_OK(err, "put child into a cgroup")) + goto cleanup; + } + } + + return; + +cleanup: + cleanup_cgroup_environment(); +} + +static int run_and_wait_for_oom(void) +{ + int ret = -1; + bool first = true; + char buf[4096] = {}; + size_t size; + + /* Unfreeze the top-level cgroup */ + ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0"); + if (!ASSERT_OK(ret, "freeze cgroup")) + return -1; + + for (;;) { + int i, status; + pid_t pid = wait(&status); + + if (pid == -1) { + if (errno == EINTR) + continue; + /* ECHILD */ + break; + } + + if (!first) + continue; + + first = false; + + /* Check which process was terminated first */ + for (i = 0; i < ARRAY_SIZE(cgroups); i++) { + if (!ASSERT_OK(cgroups[i].victim != + (pid == cgroups[i].pid), + "correct process was killed")) { + ret = -1; + break; + } + + if (!cgroups[i].victim) + continue; + + /* Check the memcg oom counter */ + size = read_cgroup_file(cgroups[i].path, + "memory.events", + buf, sizeof(buf)); + if (!ASSERT_OK(size <= 0, "read memory.events")) { + ret = -1; + break; + } + + if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL, + "oom_kill count check")) { + ret = -1; + break; + } + } + + /* Kill all remaining tasks */ + for (i = 0; i < ARRAY_SIZE(cgroups); i++) + if (cgroups[i].pid && cgroups[i].pid != pid) + kill(cgroups[i].pid, SIGKILL); + } + + return ret; +} + +void test_oom(void) +{ + DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts); + struct test_oom *skel; + struct bpf_link *link1, *link2; + int err = 0; + + setup_environment(); + + skel = test_oom__open_and_load(); + if (!skel) { + err = -errno; + CHECK_FAIL(err); + goto cleanup; + } + + opts.relative_fd = cgroups[0].fd; + link1 = bpf_map__attach_struct_ops_opts(skel->maps.test_bpf_oom, &opts); + if (!link1) { + err = -errno; + CHECK_FAIL(err); + goto cleanup; + } + + opts.relative_fd = 0; /* attach system-wide */ + link2 = bpf_map__attach_struct_ops_opts(skel->maps.test_bpf_oom, &opts); + if (!link2) { + err = -errno; + CHECK_FAIL(err); + goto cleanup; + } + + /* Unfreeze all child tasks and create the memory pressure */ + err = run_and_wait_for_oom(); + CHECK_FAIL(err); + +cleanup: + 
cleanup_cgroup_environment(); + test_oom__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_psi.c b/tools/testing/selftests/bpf/prog_tests/test_psi.c new file mode 100644 index 0000000000000..b294cea0a6fe2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_psi.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +#include "cgroup_helpers.h" +#include "test_psi.skel.h" + +enum psi_res { + PSI_IO, + PSI_MEM, + PSI_CPU, + PSI_IRQ, + NR_PSI_RESOURCES, +}; + +struct cgroup_desc { + const char *path; + unsigned long long id; + int pid; + int fd; + size_t target; + size_t high; + bool victim; +}; + +#define MB (1024 * 1024) + +static struct cgroup_desc cgroups[] = { + { .path = "/psi_test" }, + { .path = "/psi_test/cg1" }, + { .path = "/psi_test/cg2", .target = 500 * MB, + .high = 40 * MB, .victim = true }, +}; + +static int spawn_task(struct cgroup_desc *desc) +{ + char *ptr; + int pid; + + pid = fork(); + if (pid < 0) + return pid; + + if (pid > 0) { + /* parent */ + desc->pid = pid; + return 0; + } + + /* child */ + ptr = (char *)malloc(desc->target); + if (!ptr) + return -ENOMEM; + + memset(ptr, 'a', desc->target); + + while (1) + sleep(1000); + + return 0; +} + +static void setup_environment(void) +{ + int i, err; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + goto cleanup; + + for (i = 0; i < ARRAY_SIZE(cgroups); i++) { + cgroups[i].fd = create_and_get_cgroup(cgroups[i].path); + if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup")) + goto cleanup; + + cgroups[i].id = get_cgroup_id(cgroups[i].path); + if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id")) + goto cleanup; + + /* Freeze the top-level cgroup and enable the memory controller */ + if (i == 0) { + err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1"); + if (!ASSERT_OK(err, "freeze cgroup")) + goto cleanup; + + err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control", + "+memory"); + if (!ASSERT_OK(err, "enable memory controller")) + goto cleanup; + } + + /* Set memory.high */ + if (cgroups[i].high) { + char buf[256]; + + snprintf(buf, sizeof(buf), "%lu", cgroups[i].high); + err = write_cgroup_file(cgroups[i].path, "memory.high", buf); + if (!ASSERT_OK(err, "set memory.high")) + goto cleanup; + + snprintf(buf, sizeof(buf), "0"); + write_cgroup_file(cgroups[i].path, "memory.swap.max", buf); + } + + /* Spawn tasks creating memory pressure */ + if (cgroups[i].target) { + char buf[256]; + + err = spawn_task(&cgroups[i]); + if (!ASSERT_OK(err, "spawn task")) + goto cleanup; + + snprintf(buf, sizeof(buf), "%d", cgroups[i].pid); + err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf); + if (!ASSERT_OK(err, "put child into a cgroup")) + goto cleanup; + } + } + + return; + +cleanup: + cleanup_cgroup_environment(); +} + +static int run_and_wait_for_oom(void) +{ + int ret = -1; + bool first = true; + char buf[4096] = {}; + size_t size; + + /* Unfreeze the top-level cgroup */ + ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0"); + if (!ASSERT_OK(ret, "unfreeze cgroup")) + return -1; + + for (;;) { + int i, status; + pid_t pid = wait(&status); + + if (pid == -1) { + if (errno == EINTR) + continue; + /* ECHILD */ + break; + } + + if (!first) + continue; + first = false; + + /* Check which process was terminated first */ + for (i = 0; i < ARRAY_SIZE(cgroups); i++) { + if (!ASSERT_OK(cgroups[i].victim != + (pid == cgroups[i].pid), + "correct process was killed")) { + ret = -1; + break; + 
} + + if (!cgroups[i].victim) + continue; + + /* Check the memcg oom counter */ + size = read_cgroup_file(cgroups[i].path, "memory.events", + buf, sizeof(buf)); + if (!ASSERT_OK(size <= 0, "read memory.events")) { + ret = -1; + break; + } + + if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL, + "oom_kill count check")) { + ret = -1; + break; + } + } + + /* Kill all remaining tasks */ + for (i = 0; i < ARRAY_SIZE(cgroups); i++) + if (cgroups[i].pid && cgroups[i].pid != pid) + kill(cgroups[i].pid, SIGKILL); + } + + return ret; +} + +void test_psi(void) +{ + struct test_psi *skel; + u64 deleted_cgroup_id; + int new_cgroup_fd; + u64 new_cgroup_id; + int err; + + setup_environment(); + + skel = test_psi__open_and_load(); + err = libbpf_get_error(skel); + if (CHECK_FAIL(err)) + goto cleanup; + + skel->bss->deleted_cgroup_id = cgroups[1].id; + skel->bss->high_pressure_cgroup_id = cgroups[2].id; + + err = test_psi__attach(skel); + if (CHECK_FAIL(err)) + goto cleanup; + + /* Delete the first cgroup, it should trigger handle_cgroup_offline() */ + remove_cgroup(cgroups[1].path); + + new_cgroup_fd = create_and_get_cgroup("/psi_test_new"); + if (!ASSERT_GE(new_cgroup_fd, 0, "create_and_get_cgroup")) + goto cleanup; + + new_cgroup_id = get_cgroup_id("/psi_test_new"); + if (!ASSERT_GT(new_cgroup_id, 0, "get_cgroup_id")) + goto cleanup; + + /* Unfreeze all child tasks and create the memory pressure */ + err = run_and_wait_for_oom(); + CHECK_FAIL(err); + + /* Check the result of the handle_cgroup_offline() handler */ + deleted_cgroup_id = skel->bss->deleted_cgroup_id; + ASSERT_EQ(deleted_cgroup_id, cgroups[1].id, "deleted cgroup id"); + + /* Check the result of the handle_cgroup_online() handler */ + ASSERT_EQ(skel->bss->new_cgroup_id, new_cgroup_id, + "new cgroup id"); + +cleanup: + cleanup_cgroup_environment(); + test_psi__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c new file mode 100644 index 0000000000000..92db5fd11391d --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "cgroup_iter_memcg.h" + +char _license[] SEC("license") = "GPL"; + +/* The latest values read are stored here. 
*/ +struct memcg_query memcg_query SEC(".data.query"); + +SEC("iter.s/cgroup") +int cgroup_memcg_query(struct bpf_iter__cgroup *ctx) +{ + struct cgroup *cgrp = ctx->cgroup; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + if (!cgrp) + return 1; + + css = container_of(cgrp, struct cgroup_subsys_state, cgroup); + if (!css) + return 1; + + memcg = bpf_get_mem_cgroup(css); + if (!memcg) + return 1; + + bpf_mem_cgroup_flush_stats(memcg); + + memcg_query.nr_anon_mapped = bpf_mem_cgroup_page_state(memcg, NR_ANON_MAPPED); + memcg_query.nr_shmem = bpf_mem_cgroup_page_state(memcg, NR_SHMEM); + memcg_query.nr_file_pages = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES); + memcg_query.nr_file_mapped = bpf_mem_cgroup_page_state(memcg, NR_FILE_MAPPED); + memcg_query.memcg_kmem = bpf_mem_cgroup_page_state(memcg, MEMCG_KMEM); + memcg_query.pgfault = bpf_mem_cgroup_vm_events(memcg, PGFAULT); + + bpf_put_mem_cgroup(memcg); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_oom.c b/tools/testing/selftests/bpf/progs/test_oom.c new file mode 100644 index 0000000000000..352b522ae584c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_oom.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +#define OOM_SCORE_ADJ_MIN (-1000) + +static bool mem_cgroup_killable(struct mem_cgroup *memcg) +{ + struct task_struct *task; + bool ret = true; + + bpf_for_each(css_task, task, &memcg->css, CSS_TASK_ITER_PROCS) + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + return false; + + return ret; +} + +/* + * Find the largest leaf cgroup (ignoring page cache) without unkillable tasks + * and kill all belonging tasks. + */ +SEC("struct_ops.s/handle_out_of_memory") +int BPF_PROG(test_out_of_memory, struct oom_control *oc, struct bpf_oom_ctx *exec_ctx) +{ + struct task_struct *task; + struct mem_cgroup *root_memcg = oc->memcg; + struct mem_cgroup *memcg, *victim = NULL; + struct cgroup_subsys_state *css_pos; + unsigned long usage, max_usage = 0; + unsigned long pagecache = 0; + int ret = 0; + + /* Pass to the system-level bpf_oom ops */ + if (exec_ctx->cgroup_id) + return 0; + + if (root_memcg) + root_memcg = bpf_get_mem_cgroup(&root_memcg->css); + else + root_memcg = bpf_get_root_mem_cgroup(); + + if (!root_memcg) + return 0; + + bpf_rcu_read_lock(); + bpf_for_each(css, css_pos, &root_memcg->css, BPF_CGROUP_ITER_DESCENDANTS_POST) { + if (css_pos->cgroup->nr_descendants + css_pos->cgroup->nr_dying_descendants) + continue; + + memcg = bpf_get_mem_cgroup(css_pos); + if (!memcg) + continue; + + usage = bpf_mem_cgroup_usage(memcg); + pagecache = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES); + + if (usage > pagecache) + usage -= pagecache; + else + usage = 0; + + if ((usage > max_usage) && mem_cgroup_killable(memcg)) { + max_usage = usage; + if (victim) + bpf_put_mem_cgroup(victim); + victim = bpf_get_mem_cgroup(&memcg->css); + } + + bpf_put_mem_cgroup(memcg); + } + bpf_rcu_read_unlock(); + + if (!victim) + goto exit; + + bpf_for_each(css_task, task, &victim->css, CSS_TASK_ITER_PROCS) { + struct task_struct *t = bpf_task_acquire(task); + + if (t) { + /* + * If the task is already an OOM victim, it will + * quit soon and release some memory. 
+ */ + if (bpf_task_is_oom_victim(task)) { + bpf_task_release(t); + ret = 1; + break; + } + + bpf_oom_kill_process(oc, task, "bpf oom test"); + bpf_task_release(t); + ret = 1; + } + } + + bpf_put_mem_cgroup(victim); +exit: + bpf_put_mem_cgroup(root_memcg); + + return ret; +} + +SEC("struct_ops.s/handle_cgroup_offline") +int BPF_PROG(test_cgroup_offline, u64 cgroup_id, struct bpf_oom_ctx *exec_ctx) +{ + return 0; +} + +SEC(".struct_ops.link") +struct bpf_oom_ops test_bpf_oom = { + .name = "bpf_test_policy", + .handle_out_of_memory = (void *)test_out_of_memory, + .handle_cgroup_offline = (void *)test_cgroup_offline, +}; diff --git a/tools/testing/selftests/bpf/progs/test_psi.c b/tools/testing/selftests/bpf/progs/test_psi.c new file mode 100644 index 0000000000000..4e5cdb5242d1f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_psi.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +#define PSI_FULL 0x80000000 + +/* cgroup which will experience the high memory pressure */ +u64 high_pressure_cgroup_id; + +/* cgroup which will be deleted */ +u64 deleted_cgroup_id; + +/* cgroup which will be created */ +u64 new_cgroup_id; + +/* cgroup which was deleted */ +u64 deleted_cgroup_id; + +char constraint_name[] = "CONSTRAINT_BPF_PSI_MEM"; + +SEC("struct_ops.s/init") +int BPF_PROG(psi_init, struct bpf_psi *bpf_psi) +{ + int ret; + + ret = bpf_psi_create_trigger(bpf_psi, high_pressure_cgroup_id, + PSI_MEM | PSI_FULL, 100000, 1000000); + if (ret) + return ret; + + return bpf_psi_create_trigger(bpf_psi, deleted_cgroup_id, + PSI_IO, 100000, 1000000); +} + +SEC("struct_ops.s/handle_psi_event") +void BPF_PROG(handle_psi_event, struct psi_trigger *t) +{ + u64 cgroup_id = t->cgroup_id; + struct mem_cgroup *memcg; + struct cgroup *cgroup; + + cgroup = bpf_cgroup_from_id(cgroup_id); + if (!cgroup) + return; + + memcg = bpf_get_mem_cgroup(&cgroup->self); + if (!memcg) { + bpf_cgroup_release(cgroup); + return; + } + + bpf_out_of_memory(memcg, 0, BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK, + constraint_name); + + bpf_put_mem_cgroup(memcg); + bpf_cgroup_release(cgroup); +} + +SEC("struct_ops.s/handle_cgroup_online") +void BPF_PROG(handle_cgroup_online, u64 cgroup_id) +{ + new_cgroup_id = cgroup_id; +} + +SEC("struct_ops.s/handle_cgroup_offline") +void BPF_PROG(handle_cgroup_offline, u64 cgroup_id) +{ + deleted_cgroup_id = cgroup_id; +} + +SEC(".struct_ops.link") +struct bpf_psi_ops test_bpf_psi = { + .init = (void *)psi_init, + .handle_psi_event = (void *)handle_psi_event, + .handle_cgroup_online = (void *)handle_cgroup_online, + .handle_cgroup_offline = (void *)handle_cgroup_offline, +};
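
----
Usage sketch (illustrative only, not part of the patch): attaching a
bpf_oom_ops struct_ops map to a particular cgroup from userspace via the
new bpf_map__attach_struct_ops_opts() API. The object file name
("oom_policy.bpf.o"), the map name ("bpf_oom_policy") and the cgroup path
are hypothetical; error cleanup is trimmed for brevity.

	#include <fcntl.h>
	#include <bpf/libbpf.h>

	/* Attach a bpf_oom_ops policy to the cgroup at cgroup_path. */
	static struct bpf_link *attach_oom_policy(const char *cgroup_path)
	{
		DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts);
		struct bpf_object *obj;
		struct bpf_map *map;
		int cgroup_fd;

		/* hypothetical BPF object implementing struct bpf_oom_ops */
		obj = bpf_object__open_file("oom_policy.bpf.o", NULL);
		if (!obj || bpf_object__load(obj))
			return NULL;

		/* hypothetical SEC(".struct_ops.link") map name */
		map = bpf_object__find_map_by_name(obj, "bpf_oom_policy");
		if (!map)
			return NULL;

		cgroup_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);
		if (cgroup_fd < 0)
			return NULL;

		/* relative_fd == 0 would attach the policy system-wide instead */
		opts.relative_fd = cgroup_fd;
		return bpf_map__attach_struct_ops_opts(map, &opts);
	}

As with other bpf links, the returned link presumably has to be kept alive
(or pinned) for the policy to stay attached.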