LVS
lvs-devel
Google
 
Web LinuxVirtualServer.org

[PATCH RFC nf 2/2] ipvs: adjust double hashing when fwd method changes

To: tt roxy <roxy520tt@xxxxxxxxx>
Subject: [PATCH RFC nf 2/2] ipvs: adjust double hashing when fwd method changes
Cc: Ren Wei <n05ec@xxxxxxxxxx>, lvs-devel@xxxxxxxxxxxxxxx, netfilter-devel@xxxxxxxxxxxxxxx, yuantan098@xxxxxxxxx, yifanwucs@xxxxxxxxx, tomapufckgml@xxxxxxxxx, bird@xxxxxxxxxx
From: Julian Anastasov <ja@xxxxxx>
Date: Tue, 30 Jun 2026 22:36:46 +0300
Synced conns can be created with one forwarding method
and later updated with different one after the dest
server is configured. This needs adjusting the hashing
for node hn1 because only MASQ supports double hashing.

Modify conn_tab_lock() to support seeking for hash node
hn0 together with adding for hn1. By this way we can
safely modify the forwarding method and hn1.hash_key
under bucket lock for the first node hn0. The forwarding
method is also protected by cp->lock as it is part of
cp->flags.

Reported-by: Zhiling Zou <roxy520tt@xxxxxxxxx>
Link: 
https://lore.kernel.org/lvs-devel/1b914f41d725bc064c9ba9830dc8169329737270.1782540466.git.roxy520tt@xxxxxxxxx/
Fixes: f20c73b0460d ("ipvs: use more keys for connection hashing")
Signed-off-by: Julian Anastasov <ja@xxxxxx>
---
 net/netfilter/ipvs/ip_vs_conn.c | 168 +++++++++++++++++++++++++-------
 1 file changed, 134 insertions(+), 34 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 805ca1fc3bc8..53a7de3a9f2b 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -70,25 +70,47 @@ static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
  * bucket or hash table
  * - hash table resize works like rehash but always rehashes into new table
  * - bit lock on bucket serializes all operations that modify the chain
+ * - on resize, bucket from the old table is locked before bucket from the
+ * new table
  * - cp->lock protects conn fields like cp->flags, cp->dest
  */
 
-/* Lock conn_tab bucket for conn hash/unhash, not for rehash */
+/**
+  * conn_tab_lock - Lock conn_tab buckets for conn hash/unhash, not for rehash
+  * @t:                hash table for hn0
+  * @t2:       hash table for hn1
+  * @cp:       connection
+  * @hash_key: hash key for hn0
+  * @hash_key2:        hash key for hn1
+  * @use2:     using hn1 (double hashing) based on the forwarding method
+  * @new_hash: mode for hn0, hash node (true) or seek node (false)
+  * @new_hash2:        mode for hn1, hash node (true) or seek node (false)
+  * @head_ret: returned head for hn0
+  * @head2_ret:        returned head for hn1
+  *
+  * We support 3 modes:
+  * - seek mode for both nodes, used for unhashing
+  * - hash mode for both nodes, used for hashing
+  * - seek hn0 and hash hn1, used when forwarding method is changed
+  */
 static __always_inline void
-conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key,
-             u32 hash_key2, bool use2, bool new_hash,
-             struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret)
+conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_rht *t2, struct ip_vs_conn *cp,
+             u32 hash_key, u32 hash_key2, bool use2, bool new_hash,
+             bool new_hash2, struct hlist_bl_head **head_ret,
+             struct hlist_bl_head **head2_ret)
 {
        struct hlist_bl_head *head, *head2;
        u32 hash_key_new, hash_key_new2;
-       struct ip_vs_rht *t2 = t;
        u32 idx, idx2;
 
        idx = hash_key & t->mask;
-       if (use2)
-               idx2 = hash_key2 & t->mask;
-       else
-               idx2 = idx;
+       idx2 = hash_key2 & t->mask;
+       /* Advance idx2 when new_hash is not set but hash_key2
+        * is for new table
+        */
+       if (new_hash2 && use2 && t != t2)
+               idx2 |= IP_VS_RHT_TABLE_ID_MASK;
+
        if (!new_hash) {
                /* We need to lock the bucket in the right table */
 
@@ -103,24 +125,19 @@ conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, 
u32 hash_key,
                        idx = hash_key & t->mask;
                        idx |= IP_VS_RHT_TABLE_ID_MASK;
                }
-               if (use2) {
-                       if (!ip_vs_rht_same_table(t2, hash_key2)) {
-                               /* It is already moved to new table */
-                               t2 = rcu_dereference(t2->new_tbl);
-                               idx2 = hash_key2 & t2->mask;
-                               idx2 |= IP_VS_RHT_TABLE_ID_MASK;
-                       }
-               } else {
-                       idx2 = idx;
-               }
+       }
+       if (use2 && !new_hash2 && !ip_vs_rht_same_table(t2, hash_key2)) {
+               /* It is already moved to new table */
+               t2 = rcu_dereference(t2->new_tbl);
+               idx2 = hash_key2 & t2->mask;
+               idx2 |= IP_VS_RHT_TABLE_ID_MASK;
        }
 
+       if (!use2)
+               idx2 = idx;
        head = t->buckets + (hash_key & t->mask);
        head2 = use2 ? t2->buckets + (hash_key2 & t2->mask) : head;
 
-       local_bh_disable();
-       /* Do not touch seqcount, this is a safe operation */
-
        if (idx <= idx2) {
                hlist_bl_lock(head);
                if (head != head2)
@@ -130,16 +147,22 @@ conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, 
u32 hash_key,
                hlist_bl_lock(head);
        }
        if (!new_hash) {
+               bool changed;
+
                /* Ensure hash_key is read under lock */
                hash_key_new = READ_ONCE(cp->hn0.hash_key);
-               hash_key_new2 = READ_ONCE(cp->hn1.hash_key);
+               changed = hash_key != hash_key_new;
+               if (use2 && !new_hash2) {
+                       hash_key_new2 = READ_ONCE(cp->hn1.hash_key);
+                       changed |= hash_key2 != hash_key_new2;
+               } else {
+                       hash_key_new2 = hash_key2;
+               }
                /* Hash changed ? */
-               if (hash_key != hash_key_new ||
-                   (hash_key2 != hash_key_new2 && use2)) {
+               if (changed) {
                        if (head != head2)
                                hlist_bl_unlock(head2);
                        hlist_bl_unlock(head);
-                       local_bh_enable();
                        hash_key = hash_key_new;
                        hash_key2 = hash_key_new2;
                        goto retry;
@@ -155,7 +178,6 @@ static inline void conn_tab_unlock(struct hlist_bl_head 
*head,
        if (head != head2)
                hlist_bl_unlock(head2);
        hlist_bl_unlock(head);
-       local_bh_enable();
 }
 
 static void ip_vs_conn_expire(struct timer_list *t);
@@ -268,8 +290,9 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
                use2 = false;
        }
 
-       conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
-                     &head, &head2);
+       local_bh_disable();
+       conn_tab_lock(t, t, cp, hash_key, hash_key2, use2, true /* new_hash */,
+                     true /* new_hash2 */, &head, &head2);
 
        cp->flags |= IP_VS_CONN_F_HASHED;
        WRITE_ONCE(cp->hn0.hash_key, hash_key);
@@ -280,6 +303,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
                hlist_bl_add_head_rcu(&cp->hn1.node, head2);
 
        conn_tab_unlock(head, head2);
+       local_bh_enable();
        ret = 1;
 
        /* Schedule resizing if load increases */
@@ -306,18 +330,20 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn 
*cp)
                return refcount_dec_if_one(&cp->refcnt);
 
        rcu_read_lock();
+       local_bh_disable();
 
        t = rcu_dereference(ipvs->conn_tab);
        hash_key = READ_ONCE(cp->hn0.hash_key);
        hash_key2 = READ_ONCE(cp->hn1.hash_key);
        use2 = ip_vs_conn_use_hash2(cp);
 
-       conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
-                     &head, &head2);
+       conn_tab_lock(t, t, cp, hash_key, hash_key2, use2, false /* new_hash */,
+                     false /* new_hash2 */, &head, &head2);
 
        if (cp->flags & IP_VS_CONN_F_HASHED) {
                /* Decrease refcnt and unlink conn only if we are last user */
-               if (refcount_dec_if_one(&cp->refcnt)) {
+               if (use2 == ip_vs_conn_use_hash2(cp) &&
+                   refcount_dec_if_one(&cp->refcnt)) {
                        hlist_bl_del_rcu(&cp->hn0.node);
                        if (use2)
                                hlist_bl_del_rcu(&cp->hn1.node);
@@ -328,6 +354,7 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
 
        conn_tab_unlock(head, head2);
 
+       local_bh_enable();
        rcu_read_unlock();
 
        return ret;
@@ -686,6 +713,16 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 
cport)
        /* Protect the cp->flags modification */
        spin_lock_bh(&cp->lock);
 
+       /* Recheck the forwarding method under lock */
+       if (use2 != ip_vs_conn_use_hash2(cp)) {
+               spin_unlock_bh(&cp->lock);
+               use2 = !use2;
+               if (dir) {
+                       dir--;
+                       goto next_dir;
+               }
+       }
+
        /* Lock seqcount only for the old bucket, even if we are on new table
         * because it affects the del operation, not the adding.
         */
@@ -752,6 +789,60 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 
cport)
                goto next_dir;
 }
 
+/* Change forwarding method for hashed conn */
+static void ip_vs_conn_change_fwd_mask(struct ip_vs_conn *cp, u32 new_flags)
+{
+       struct netns_ipvs *ipvs = cp->ipvs;
+       struct hlist_bl_head *head, *head2;
+       u32 hash2, hash_key, hash_key2;
+       struct ip_vs_rht *t, *t2;
+
+       /* See ip_vs_conn_use_hash2() for reference */
+       if ((cp->flags & IP_VS_CONN_F_TEMPLATE) ||
+           /* No change in double hashing ? */
+           !((cp->flags ^ new_flags) & IP_VS_CONN_F_FWD_MASK)) {
+               cp->flags = new_flags;
+               return;
+       }
+       t = rcu_dereference(ipvs->conn_tab);
+       if (ip_vs_conn_use_hash2(cp)) {
+               /* Stop double hashing */
+               hash_key = READ_ONCE(cp->hn0.hash_key);
+               hash_key2 = READ_ONCE(cp->hn1.hash_key);
+
+               conn_tab_lock(t, t, cp, hash_key, hash_key2, true /* use2 */,
+                             false /* new_hash */, false /* new_hash2 */,
+                             &head, &head2);
+
+               /* Keep both hash keys in same table */
+               hash_key = READ_ONCE(cp->hn0.hash_key);
+               WRITE_ONCE(cp->hn1.hash_key, hash_key);
+               hlist_bl_del_rcu(&cp->hn1.node);
+               cp->flags = new_flags;
+
+               conn_tab_unlock(head, head2);
+       } else {
+               /* Start double hashing */
+
+               hash_key = READ_ONCE(cp->hn0.hash_key);
+
+               t2 = rcu_dereference(t->new_tbl);
+               hash2 = ip_vs_conn_hashkey_conn(t2, cp, true);
+               hash_key2 = ip_vs_rht_build_hash_key(t2, hash2);
+
+               /* Change the forwarding method under locked hn0 */
+               conn_tab_lock(t, t2, cp, hash_key, hash_key2, true /* use2 */,
+                             false /* new_hash */, true /* new_hash2 */,
+                             &head, &head2);
+
+               WRITE_ONCE(cp->hn1.hash_key, hash_key2);
+               cp->flags = new_flags;
+               hlist_bl_add_head_rcu(&cp->hn1.node, head2);
+
+               conn_tab_unlock(head, head2);
+       }
+}
+
 /* Get default load factor to map conn_count/u_thresh to t->size */
 static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs)
 {
@@ -1024,9 +1115,18 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest 
*dest)
                        conn_flags &= ~IP_VS_CONN_F_INACTIVE;
                /* connections inherit forwarding method from dest */
                flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
+               flags |= conn_flags;
+               /* Changing forwarding method for hashed conn can
+                * happen only under locks
+                */
+               if (cp->flags & IP_VS_CONN_F_HASHED)
+                       ip_vs_conn_change_fwd_mask(cp, flags);
+               else
+                       cp->flags = flags;
+       } else {
+               flags |= conn_flags;
+               cp->flags = flags;
        }
-       flags |= conn_flags;
-       cp->flags = flags;
        cp->dest = dest;
 
        IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
-- 
2.54.0




<Prev in Thread] Current Thread [Next in Thread>