
Commit 750a40d

sched_ext: Synchronize bypass state changes with rq lock
While the BPF scheduler is being unloaded, the following warning message sometimes triggers:

  NOHZ tick-stop error: local softirq work is pending, handler #80!!!

This is caused by the CPU entering idle while there are pending softirqs. The main culprit is the bypassing state assertion not being synchronized with rq operations.

As the BPF scheduler cannot be trusted in the disable path, the first step is entering bypass mode, in which the BPF scheduler is ignored and scheduling becomes global FIFO. This is implemented by making scx_ops_bypassing() return true. However, the transition isn't synchronized against anything, so it's possible for the enqueue and dispatch paths to have different ideas about whether bypass mode is on.

Make each rq track its own bypass state with SCX_RQ_BYPASSING, which is modified while the rq is locked.

This removes most of the NOHZ tick-stop messages but not all of them. I believe the stragglers are from the sched core bug where pick_task_scx() can be called without a preceding balance_scx(). Once that bug is fixed, we should verify that all occurrences of this error message are gone too.

v2: scx_enabled() test moved inside the for_each_possible_cpu() loop so that the per-cpu states are always synchronized with the global state.

Signed-off-by: Tejun Heo <[email protected]>
Reported-by: David Vernet <[email protected]>
1 parent 2d285d5 commit 750a40d
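
To make the locking rule concrete, here is a small userspace sketch (plain C with pthreads, not kernel code; the toy_rq type, queue count, and flag value are invented for illustration) of the pattern the patch adopts: the bypass bit lives in each queue and is only set, cleared, and tested while that queue's lock is held, so an enqueue or dispatch path that runs under the lock can never disagree with the path that flipped the mode.

/* Toy model of the per-queue bypass flag; not kernel code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_QUEUES	4
#define RQ_BYPASSING	(1u << 3)	/* stands in for SCX_RQ_BYPASSING */

struct toy_rq {
	pthread_mutex_t	lock;
	unsigned int	flags;
};

static struct toy_rq rqs[NR_QUEUES];

/* Caller must hold rq->lock; compare scx_rq_bypassing() in the diff below. */
static bool rq_bypassing(struct toy_rq *rq)
{
	return rq->flags & RQ_BYPASSING;
}

/* Flip the flag one queue at a time, each time under that queue's lock. */
static void set_bypass(bool bypass)
{
	for (int i = 0; i < NR_QUEUES; i++) {
		struct toy_rq *rq = &rqs[i];

		pthread_mutex_lock(&rq->lock);
		if (bypass)
			rq->flags |= RQ_BYPASSING;
		else
			rq->flags &= ~RQ_BYPASSING;
		pthread_mutex_unlock(&rq->lock);
	}
}

/*
 * An "enqueue" path: reads the flag under the same lock, so it always sees
 * a value consistent with whatever set_bypass() last did to this queue.
 */
static void enqueue_path(struct toy_rq *rq)
{
	pthread_mutex_lock(&rq->lock);
	if (rq_bypassing(rq))
		printf("bypassing: fall back to FIFO behaviour\n");
	else
		printf("normal: hand the task to the BPF scheduler\n");
	pthread_mutex_unlock(&rq->lock);
}

int main(void)
{
	for (int i = 0; i < NR_QUEUES; i++)
		pthread_mutex_init(&rqs[i].lock, NULL);

	enqueue_path(&rqs[0]);	/* normal */
	set_bypass(true);
	enqueue_path(&rqs[0]);	/* bypassing */
	return 0;
}

As in the patch, the transition makes no attempt to stop all queues at once; each rq simply observes the flag under its own lock, which is enough for enqueue and dispatch decisions that are themselves made under that lock.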

2 files changed: +38 −26 lines


kernel/sched/ext.c (+37 −26)
@@ -1411,9 +1411,9 @@ static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
         return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
 }
 
-static bool scx_ops_bypassing(void)
+static bool scx_rq_bypassing(struct rq *rq)
 {
-        return unlikely(atomic_read(&scx_ops_bypass_depth));
+        return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
 }
 
 /**
@@ -1948,7 +1948,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
         if (!scx_rq_online(rq))
                 goto local;
 
-        if (scx_ops_bypassing())
+        if (scx_rq_bypassing(rq))
                 goto global;
 
         if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2612,7 +2612,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
          * bypassing test.
          */
         if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-            prev->scx.slice && !scx_ops_bypassing()) {
+            prev->scx.slice && !scx_rq_bypassing(rq)) {
                 rq->scx.flags |= SCX_RQ_BAL_KEEP;
                 goto has_tasks;
         }
@@ -2625,7 +2625,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
         if (consume_dispatch_q(rq, &scx_dsq_global))
                 goto has_tasks;
 
-        if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
+        if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
                 goto no_tasks;
 
         dspc->rq = rq;
@@ -2671,7 +2671,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
          * %SCX_OPS_ENQ_LAST is in effect.
          */
         if ((prev->scx.flags & SCX_TASK_QUEUED) &&
-            (!static_branch_unlikely(&scx_ops_enq_last) || scx_ops_bypassing())) {
+            (!static_branch_unlikely(&scx_ops_enq_last) ||
+             scx_rq_bypassing(rq))) {
                 rq->scx.flags |= SCX_RQ_BAL_KEEP;
                 goto has_tasks;
         }
@@ -2863,7 +2864,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
          * forcing a different task. Leave it at the head of the local
          * DSQ.
          */
-        if (p->scx.slice && !scx_ops_bypassing()) {
+        if (p->scx.slice && !scx_rq_bypassing(rq)) {
                 dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
                 return;
         }
@@ -2928,7 +2929,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
                 return NULL;
 
         if (unlikely(!p->scx.slice)) {
-                if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
+                if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
                         printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
                                         p->comm, p->pid);
                         scx_warned_zero_slice = true;
@@ -2966,7 +2967,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
          * calling ops.core_sched_before(). Accesses are controlled by the
          * verifier.
          */
-        if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing())
+        if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
                 return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
                                               (struct task_struct *)a,
                                               (struct task_struct *)b);
@@ -3325,7 +3326,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
          * While disabling, always resched and refresh core-sched timestamp as
          * we can't trust the slice management or ops.core_sched_before().
          */
-        if (scx_ops_bypassing()) {
+        if (scx_rq_bypassing(rq)) {
                 curr->scx.slice = 0;
                 touch_core_sched(rq, curr);
         } else if (SCX_HAS_OP(tick)) {
@@ -3664,7 +3665,7 @@ bool scx_can_stop_tick(struct rq *rq)
 {
         struct task_struct *p = rq->curr;
 
-        if (scx_ops_bypassing())
+        if (scx_rq_bypassing(rq))
                 return false;
 
         if (p->sched_class != &ext_sched_class)
@@ -4256,17 +4257,9 @@ static void scx_ops_bypass(bool bypass)
                 return;
         }
 
-        /*
-         * We need to guarantee that no tasks are on the BPF scheduler while
-         * bypassing. Either we see enabled or the enable path sees the
-         * increased bypass_depth before moving tasks to SCX.
-         */
-        if (!scx_enabled())
-                return;
-
         /*
          * No task property is changing. We just need to make sure all currently
-         * queued tasks are re-queued according to the new scx_ops_bypassing()
+         * queued tasks are re-queued according to the new scx_rq_bypassing()
          * state. As an optimization, walk each rq's runnable_list instead of
          * the scx_tasks list.
          *
@@ -4280,6 +4273,24 @@ static void scx_ops_bypass(bool bypass)
 
                 rq_lock_irqsave(rq, &rf);
 
+                if (bypass) {
+                        WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
+                        rq->scx.flags |= SCX_RQ_BYPASSING;
+                } else {
+                        WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
+                        rq->scx.flags &= ~SCX_RQ_BYPASSING;
+                }
+
+                /*
+                 * We need to guarantee that no tasks are on the BPF scheduler
+                 * while bypassing. Either we see enabled or the enable path
+                 * sees scx_rq_bypassing() before moving tasks to SCX.
+                 */
+                if (!scx_enabled()) {
+                        rq_unlock_irqrestore(rq, &rf);
+                        continue;
+                }
+
                 /*
                  * The use of list_for_each_entry_safe_reverse() is required
                  * because each task is going to be removed from and added back
@@ -6397,17 +6408,17 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
         if (!ops_cpu_valid(cpu, NULL))
                 return;
 
+        local_irq_save(irq_flags);
+
+        this_rq = this_rq();
+
         /*
          * While bypassing for PM ops, IRQ handling may not be online which can
          * lead to irq_work_queue() malfunction such as infinite busy wait for
          * IRQ status update. Suppress kicking.
          */
-        if (scx_ops_bypassing())
-                return;
-
-        local_irq_save(irq_flags);
-
-        this_rq = this_rq();
+        if (scx_rq_bypassing(this_rq))
+                goto out;
 
         /*
          * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
kernel/sched/sched.h (+1)
@@ -750,6 +750,7 @@ enum scx_rq_flags {
         SCX_RQ_ONLINE           = 1 << 0,
         SCX_RQ_CAN_STOP_TICK    = 1 << 1,
         SCX_RQ_BAL_KEEP         = 1 << 2, /* balance decided to keep current */
+        SCX_RQ_BYPASSING        = 1 << 3,
 
         SCX_RQ_IN_WAKEUP        = 1 << 16,
         SCX_RQ_IN_BALANCE       = 1 << 17,
