LVS
lvs-devel
Google
 
Web LinuxVirtualServer.org

[PATCH net 3/3] ipvs: fix the spin_lock usage for RT build

To: Simon Horman <horms@xxxxxxxxxxxx>
Subject: [PATCH net 3/3] ipvs: fix the spin_lock usage for RT build
Cc: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>, Florian Westphal <fw@xxxxxxxxx>, lvs-devel@xxxxxxxxxxxxxxx, netfilter-devel@xxxxxxxxxxxxxxx
From: Julian Anastasov <ja@xxxxxx>
Date: Wed, 15 Apr 2026 23:02:16 +0300
syzbot reports for sleeping function called from invalid context [1].
The recently added code for resizable hash tables uses
hlist_bl bit locks in combination with spin_lock for
the connection fields (cp->lock).

Fix the following problems:

* avoid using spin_lock(&cp->lock) under locked bit lock
because it sleeps on PREEMPT_RT

* as the recent changes call ip_vs_conn_hash() only for newly
allocated connection, the spin_lock can be removed there because
the connection is still not linked to table and does not need
cp->lock protection.

* the lock can be removed also from ip_vs_conn_unlink() where we
are the last connection user.

* the last place that is fixed is ip_vs_conn_fill_cport()
where the locks can be reordered to follow the RT rules.

[1]:
BUG: sleeping function called from invalid context at 
kernel/locking/spinlock_rt.c:48
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 16, name: ktimers/0
preempt_count: 2, expected: 0
RCU nest depth: 3, expected: 3
8 locks held by ktimers/0/16:
 #0: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: 
__local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163
 #1: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: 
__local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163
 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: spin_lock 
include/linux/spinlock_rt.h:45 [inline]
 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: 
timer_base_lock_expiry kernel/time/timer.c:1502 [inline]
 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: 
__run_timer_base+0x120/0x9f0 kernel/time/timer.c:2384
 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire 
include/linux/rcupdate.h:300 [inline]
 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock 
include/linux/rcupdate.h:838 [inline]
 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __rt_spin_lock 
kernel/locking/spinlock_rt.c:50 [inline]
 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x1e0/0x400 
kernel/locking/spinlock_rt.c:57
 #4: ffffc90000157a80 ((&cp->timer)){+...}-{0:0}, at: call_timer_fn+0xd4/0x5e0 
kernel/time/timer.c:1745
 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire 
include/linux/rcupdate.h:300 [inline]
 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock 
include/linux/rcupdate.h:838 [inline]
 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_unlink 
net/netfilter/ipvs/ip_vs_conn.c:315 [inline]
 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: 
ip_vs_conn_expire+0x257/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260
 #6: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: 
__local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163
 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: spin_lock 
include/linux/spinlock_rt.h:45 [inline]
 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_unlink 
net/netfilter/ipvs/ip_vs_conn.c:324 [inline]
 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: 
ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260
Preemption disabled at:
[<ffffffff898a6358>] bit_spin_lock include/linux/bit_spinlock.h:38 [inline]
[<ffffffff898a6358>] hlist_bl_lock+0x18/0x110 include/linux/list_bl.h:149
CPU: 0 UID: 0 PID: 16 Comm: ktimers/0 Tainted: G        W    L      syzkaller 
#0 PREEMPT_{RT,(full)}
Tainted: [W]=WARN, [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
03/18/2026
Call Trace:
 <TASK>
 dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
 __might_resched+0x329/0x480 kernel/sched/core.c:9162
 __rt_spin_lock kernel/locking/spinlock_rt.c:48 [inline]
 rt_spin_lock+0xc2/0x400 kernel/locking/spinlock_rt.c:57
 spin_lock include/linux/spinlock_rt.h:45 [inline]
 ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline]
 ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260
 call_timer_fn+0x192/0x5e0 kernel/time/timer.c:1748
 expire_timers kernel/time/timer.c:1799 [inline]
 __run_timers kernel/time/timer.c:2374 [inline]
 __run_timer_base+0x6a3/0x9f0 kernel/time/timer.c:2386
 run_timer_base kernel/time/timer.c:2395 [inline]
 run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2405
 handle_softirqs+0x1de/0x6d0 kernel/softirq.c:622
 __do_softirq kernel/softirq.c:656 [inline]
 run_ktimerd+0x69/0x100 kernel/softirq.c:1151
 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160
 kthread+0x388/0x470 kernel/kthread.c:436
 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>

Reported-by: syzbot+504e778ddaecd36fdd17@xxxxxxxxxxxxxxxxxxxxxxxxx
Fixes: 2fa7cc9c7025 ("ipvs: switch to per-net connection table")
Signed-off-by: Julian Anastasov <ja@xxxxxx>
---
 net/netfilter/ipvs/ip_vs_conn.c | 49 ++++++++++++++-------------------
 1 file changed, 21 insertions(+), 28 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 84a4921a7865..cf19dc06c65d 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
                hash_key2 = hash_key;
                use2 = false;
        }
+
        conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
                      &head, &head2);
-       spin_lock(&cp->lock);
-
-       if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
-               cp->flags |= IP_VS_CONN_F_HASHED;
-               WRITE_ONCE(cp->hn0.hash_key, hash_key);
-               WRITE_ONCE(cp->hn1.hash_key, hash_key2);
-               refcount_inc(&cp->refcnt);
-               hlist_bl_add_head_rcu(&cp->hn0.node, head);
-               if (use2)
-                       hlist_bl_add_head_rcu(&cp->hn1.node, head2);
-               ret = 1;
-       } else {
-               pr_err("%s(): request for already hashed, called from %pS\n",
-                      __func__, __builtin_return_address(0));
-               ret = 0;
-       }
 
-       spin_unlock(&cp->lock);
+       cp->flags |= IP_VS_CONN_F_HASHED;
+       WRITE_ONCE(cp->hn0.hash_key, hash_key);
+       WRITE_ONCE(cp->hn1.hash_key, hash_key2);
+       refcount_inc(&cp->refcnt);
+       hlist_bl_add_head_rcu(&cp->hn0.node, head);
+       if (use2)
+               hlist_bl_add_head_rcu(&cp->hn1.node, head2);
+
        conn_tab_unlock(head, head2);
+       ret = 1;
 
        /* Schedule resizing if load increases */
        if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
@@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 
        conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
                      &head, &head2);
-       spin_lock(&cp->lock);
 
        if (cp->flags & IP_VS_CONN_F_HASHED) {
                /* Decrease refcnt and unlink conn only if we are last user */
@@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
                }
        }
 
-       spin_unlock(&cp->lock);
        conn_tab_unlock(head, head2);
 
        rcu_read_unlock();
@@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 
cport)
        struct ip_vs_conn_hnode *hn;
        u32 hash_key, hash_key_new;
        struct ip_vs_conn_param p;
+       bool changed = false;
        int ntbl;
        int dir;
 
@@ -709,9 +701,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 
cport)
                goto retry;
        }
 
-       spin_lock(&cp->lock);
-       if ((cp->flags & IP_VS_CONN_F_NO_CPORT) &&
-           (cp->flags & IP_VS_CONN_F_HASHED)) {
+       if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
                /* We do not recalc hash_key_r under lock, we assume the
                 * parameters in cp do not change, i.e. cport is
                 * the only possible change.
@@ -726,19 +716,22 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 
cport)
                        hlist_bl_del_rcu(&hn->node);
                        hlist_bl_add_head_rcu(&hn->node, head_new);
                }
-               if (!dir) {
-                       atomic_dec(&ipvs->no_cport_conns[af_id]);
-                       cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
-                       cp->cport = cport;
-               }
+               if (!dir)
+                       changed = true;
        }
-       spin_unlock(&cp->lock);
 
        if (head != head2)
                hlist_bl_unlock(head2);
        hlist_bl_unlock(head);
        write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
        preempt_enable_nested();
+       if (changed) {
+               atomic_dec(&ipvs->no_cport_conns[af_id]);
+               spin_lock(&cp->lock);
+               cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+               cp->cport = cport;
+               spin_unlock(&cp->lock);
+       }
        spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l);
        if (dir--)
                goto next_dir;
-- 
2.53.0




<Prev in Thread] Current Thread [Next in Thread>