From f1308b6737866d808c000a63844d87d0498c38b8 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 12 Sep 2024 12:21:47 -0400 Subject: [PATCH] netfilter: conntrack: serialize hash resizes and cleanups jira LE-1907 cve CVE-2021-47408 Rebuild_History Non-Buildable kernel-4.18.0-553.16.1.el8_10 commit-author Eric Dumazet commit e9edc188fc76499b0b9bd60364084037f6d03773 Empty-Commit: Cherry-Pick Conflicts during history rebuild. Will be included in final tarball splat. Ref for failed cherry-pick at: ciq/ciq_backports/kernel-4.18.0-553.16.1.el8_10/e9edc188.failed Syzbot was able to trigger the following warning [1] No repro found by syzbot yet but I was able to trigger similar issue by having 2 scripts running in parallel, changing conntrack hash sizes, and: for j in `seq 1 1000` ; do unshare -n /bin/true >/dev/null ; done It would take more than 5 minutes for net_namespace structures to be cleaned up. This is because nf_ct_iterate_cleanup() has to restart everytime a resize happened. By adding a mutex, we can serialize hash resizes and cleanups and also make get_next_corpse() faster by skipping over empty buckets. Even without resizes in the picture, this patch considerably speeds up network namespace dismantles. [1] INFO: task syz-executor.0:8312 can't die for more than 144 seconds. task:syz-executor.0 state:R running task stack:25672 pid: 8312 ppid: 6573 flags:0x00004006 Call Trace: context_switch kernel/sched/core.c:4955 [inline] __schedule+0x940/0x26f0 kernel/sched/core.c:6236 preempt_schedule_common+0x45/0xc0 kernel/sched/core.c:6408 preempt_schedule_thunk+0x16/0x18 arch/x86/entry/thunk_64.S:35 __local_bh_enable_ip+0x109/0x120 kernel/softirq.c:390 local_bh_enable include/linux/bottom_half.h:32 [inline] get_next_corpse net/netfilter/nf_conntrack_core.c:2252 [inline] nf_ct_iterate_cleanup+0x15a/0x450 net/netfilter/nf_conntrack_core.c:2275 nf_conntrack_cleanup_net_list+0x14c/0x4f0 net/netfilter/nf_conntrack_core.c:2469 ops_exit_list+0x10d/0x160 net/core/net_namespace.c:171 setup_net+0x639/0xa30 net/core/net_namespace.c:349 copy_net_ns+0x319/0x760 net/core/net_namespace.c:470 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 ksys_unshare+0x445/0x920 kernel/fork.c:3128 __do_sys_unshare kernel/fork.c:3202 [inline] __se_sys_unshare kernel/fork.c:3200 [inline] __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3200 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f63da68e739 RSP: 002b:00007f63d7c05188 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 RAX: ffffffffffffffda RBX: 00007f63da792f80 RCX: 00007f63da68e739 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000040000000 RBP: 00007f63da6e8cc4 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f63da792f80 R13: 00007fff50b75d3f R14: 00007f63d7c05300 R15: 0000000000022000 Showing all locks held in the system: 1 lock held by khungtaskd/27: #0: ffffffff8b980020 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260 kernel/locking/lockdep.c:6446 2 locks held by kworker/u4:2/153: #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic_long_set include/linux/atomic/atomic-long.h:41 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: atomic_long_set include/linux/atomic/atomic-instrumented.h:1198 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_data kernel/workqueue.c:634 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_pool_and_clear_pending kernel/workqueue.c:661 [inline] #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x896/0x1690 kernel/workqueue.c:2268 #1: ffffc9000140fdb0 ((kfence_timer).work){+.+.}-{0:0}, at: process_one_work+0x8ca/0x1690 kernel/workqueue.c:2272 1 lock held by systemd-udevd/2970: 1 lock held by in:imklog/6258: #0: ffff88807f970ff0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xe9/0x100 fs/file.c:990 3 locks held by kworker/1:6/8158: 1 lock held by syz-executor.0/8312: 2 locks held by kworker/u4:13/9320: 1 lock held by syz-executor.5/10178: 1 lock held by syz-executor.4/10217: Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso (cherry picked from commit e9edc188fc76499b0b9bd60364084037f6d03773) Signed-off-by: Jonathan Maple # Conflicts: # net/netfilter/nf_conntrack_core.c --- .../e9edc188.failed | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 ciq/ciq_backports/kernel-4.18.0-553.16.1.el8_10/e9edc188.failed diff --git a/ciq/ciq_backports/kernel-4.18.0-553.16.1.el8_10/e9edc188.failed b/ciq/ciq_backports/kernel-4.18.0-553.16.1.el8_10/e9edc188.failed new file mode 100644 index 000000000000..89f270a58ab6 --- /dev/null +++ b/ciq/ciq_backports/kernel-4.18.0-553.16.1.el8_10/e9edc188.failed @@ -0,0 +1,127 @@ +netfilter: conntrack: serialize hash resizes and cleanups + +jira LE-1907 +cve CVE-2021-47408 +Rebuild_History Non-Buildable kernel-4.18.0-553.16.1.el8_10 +commit-author Eric Dumazet +commit e9edc188fc76499b0b9bd60364084037f6d03773 +Empty-Commit: Cherry-Pick Conflicts during history rebuild. +Will be included in final tarball splat. Ref for failed cherry-pick at: +ciq/ciq_backports/kernel-4.18.0-553.16.1.el8_10/e9edc188.failed + +Syzbot was able to trigger the following warning [1] + +No repro found by syzbot yet but I was able to trigger similar issue +by having 2 scripts running in parallel, changing conntrack hash sizes, +and: + +for j in `seq 1 1000` ; do unshare -n /bin/true >/dev/null ; done + +It would take more than 5 minutes for net_namespace structures +to be cleaned up. + +This is because nf_ct_iterate_cleanup() has to restart everytime +a resize happened. + +By adding a mutex, we can serialize hash resizes and cleanups +and also make get_next_corpse() faster by skipping over empty +buckets. + +Even without resizes in the picture, this patch considerably +speeds up network namespace dismantles. + +[1] +INFO: task syz-executor.0:8312 can't die for more than 144 seconds. +task:syz-executor.0 state:R running task stack:25672 pid: 8312 ppid: 6573 flags:0x00004006 +Call Trace: + context_switch kernel/sched/core.c:4955 [inline] + __schedule+0x940/0x26f0 kernel/sched/core.c:6236 + preempt_schedule_common+0x45/0xc0 kernel/sched/core.c:6408 + preempt_schedule_thunk+0x16/0x18 arch/x86/entry/thunk_64.S:35 + __local_bh_enable_ip+0x109/0x120 kernel/softirq.c:390 + local_bh_enable include/linux/bottom_half.h:32 [inline] + get_next_corpse net/netfilter/nf_conntrack_core.c:2252 [inline] + nf_ct_iterate_cleanup+0x15a/0x450 net/netfilter/nf_conntrack_core.c:2275 + nf_conntrack_cleanup_net_list+0x14c/0x4f0 net/netfilter/nf_conntrack_core.c:2469 + ops_exit_list+0x10d/0x160 net/core/net_namespace.c:171 + setup_net+0x639/0xa30 net/core/net_namespace.c:349 + copy_net_ns+0x319/0x760 net/core/net_namespace.c:470 + create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 + unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 + ksys_unshare+0x445/0x920 kernel/fork.c:3128 + __do_sys_unshare kernel/fork.c:3202 [inline] + __se_sys_unshare kernel/fork.c:3200 [inline] + __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3200 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x44/0xae +RIP: 0033:0x7f63da68e739 +RSP: 002b:00007f63d7c05188 EFLAGS: 00000246 ORIG_RAX: 0000000000000110 +RAX: ffffffffffffffda RBX: 00007f63da792f80 RCX: 00007f63da68e739 +RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000040000000 +RBP: 00007f63da6e8cc4 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000246 R12: 00007f63da792f80 +R13: 00007fff50b75d3f R14: 00007f63d7c05300 R15: 0000000000022000 + +Showing all locks held in the system: +1 lock held by khungtaskd/27: + #0: ffffffff8b980020 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260 kernel/locking/lockdep.c:6446 +2 locks held by kworker/u4:2/153: + #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic64_set arch/x86/include/asm/atomic64_64.h:34 [inline] + #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: arch_atomic_long_set include/linux/atomic/atomic-long.h:41 [inline] + #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: atomic_long_set include/linux/atomic/atomic-instrumented.h:1198 [inline] + #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_data kernel/workqueue.c:634 [inline] + #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: set_work_pool_and_clear_pending kernel/workqueue.c:661 [inline] + #0: ffff888010c69138 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x896/0x1690 kernel/workqueue.c:2268 + #1: ffffc9000140fdb0 ((kfence_timer).work){+.+.}-{0:0}, at: process_one_work+0x8ca/0x1690 kernel/workqueue.c:2272 +1 lock held by systemd-udevd/2970: +1 lock held by in:imklog/6258: + #0: ffff88807f970ff0 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0xe9/0x100 fs/file.c:990 +3 locks held by kworker/1:6/8158: +1 lock held by syz-executor.0/8312: +2 locks held by kworker/u4:13/9320: +1 lock held by syz-executor.5/10178: +1 lock held by syz-executor.4/10217: + + Signed-off-by: Eric Dumazet + Reported-by: syzbot + Signed-off-by: Pablo Neira Ayuso +(cherry picked from commit e9edc188fc76499b0b9bd60364084037f6d03773) + Signed-off-by: Jonathan Maple + +# Conflicts: +# net/netfilter/nf_conntrack_core.c +diff --cc net/netfilter/nf_conntrack_core.c +index c6b0b1417a6e,770a63103c7a..000000000000 +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@@ -76,16 -71,17 +76,27 @@@ struct conntrack_gc_work + }; + + static __read_mostly struct kmem_cache *nf_conntrack_cachep; + -static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); + +static __read_mostly spinlock_t nf_conntrack_locks_all_lock; + +static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); + static __read_mostly bool nf_conntrack_locks_all; + +++<<<<<<< HEAD + +/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ + +#define GC_MAX_BUCKETS_DIV 128u + +/* upper bound of full table scan */ + +#define GC_MAX_SCAN_JIFFIES (16u * HZ) + +/* desired ratio of entries found to be expired */ + +#define GC_EVICT_RATIO 50u +++======= ++ /* serialize hash resizes and nf_ct_iterate_cleanup */ ++ static DEFINE_MUTEX(nf_conntrack_mutex); ++ ++ #define GC_SCAN_INTERVAL (120u * HZ) ++ #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) ++ ++ #define MIN_CHAINLEN 8u ++ #define MAX_CHAINLEN (32u - MIN_CHAINLEN) +++>>>>>>> e9edc188fc76 (netfilter: conntrack: serialize hash resizes and cleanups) + + static struct conntrack_gc_work conntrack_gc_work; + +* Unmerged path net/netfilter/nf_conntrack_core.c