LVS
lvs-devel
Google
 
Web LinuxVirtualServer.org

[PATCH nf] ipvs: avoid possible loop in ip_vs_dst_event on resizing

To: Simon Horman <horms@xxxxxxxxxxxx>
Subject: [PATCH nf] ipvs: avoid possible loop in ip_vs_dst_event on resizing
Cc: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>, Florian Westphal <fw@xxxxxxxxx>, lvs-devel@xxxxxxxxxxxxxxx, netfilter-devel@xxxxxxxxxxxxxxx
From: Julian Anastasov <ja@xxxxxx>
Date: Wed, 6 May 2026 21:44:21 +0300
Sashiko points out that an unprivileged user can frequently
call ip_vs_flush() or ip_vs_del_service() to trigger
svc_table_changes updates that can lead to an infinite loop
in ip_vs_dst_event(). This can also happen if the user
triggers frequent table resizing without deleting all
services.

One way to solve it is to hold svc_resize_sem in
ip_vs_dst_event() but this can block the dev notifier
during the whole resizing process.

Instead, use a new rw_semaphore, svc_replace_sem, to protect
the svc_table replacement, which is a short code section.
Then hold svc_replace_sem in ip_vs_dst_event() to serialize
with replacing the svc_table. This way, changes in
svc_table_changes can happen only when all services are
removed and all dev references are dropped, which allows us
to exit the loop.

Link: 
https://sashiko.dev/#/patchset/20260505001648.360569-1-pablo%40netfilter.org
Fixes: 840aac3d900d ("ipvs: use resizable hash table for services")
Signed-off-by: Julian Anastasov <ja@xxxxxx>
---
 include/net/ip_vs.h            |  3 +-
 net/netfilter/ipvs/ip_vs_ctl.c | 52 ++++++++++++++++++++++++----------
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 02762ce73a0c..a02e569813d2 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1186,8 +1186,9 @@ struct netns_ipvs {
        struct timer_list       dest_trash_timer; /* expiration timer */
        struct mutex            service_mutex;    /* service reconfig */
        struct rw_semaphore     svc_resize_sem;   /* svc_table resizing */
+       struct rw_semaphore     svc_replace_sem;  /* svc_table replace */
        struct delayed_work     svc_resize_work;  /* resize svc_table */
-       atomic_t                svc_table_changes;/* ++ on new table */
+       atomic_t                svc_table_changes;/* ++ on table changes */
        /* Service counters */
        atomic_t                num_services[IP_VS_AF_MAX];   /* Services */
        atomic_t                fwm_services[IP_VS_AF_MAX];   /* Services */
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c7c7f6a7a9f6..0c1f409f41ba 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -327,13 +327,17 @@ ip_vs_use_count_dec(void)
 /* Service hashing:
  * Operation                   Locking order
  * ---------------------------------------------------------------------------
- * add table                   service_mutex, svc_resize_sem(W)
+ * add table                   service_mutex
+ * replace table               service_mutex, svc_resize_sem(W),
+ *                             svc_replace_sem(W)
  * del table                   service_mutex
  * move between tables         svc_resize_sem(W), seqcount_t(W), bit lock
  * add/del service             service_mutex, bit lock
  * find service                        RCU, seqcount_t(R)
  * walk services(blocking)     service_mutex, svc_resize_sem(R)
  * walk services(non-blocking) RCU, seqcount_t(R)
+ * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R)
+ * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R)
  *
  * - new tables are linked/unlinked under service_mutex and svc_resize_sem
  * - new table is linked on resizing and all operations can run in parallel
@@ -346,9 +350,14 @@ ip_vs_use_count_dec(void)
  * services are moved to new table
  * - move operations may disturb readers: find operation will not miss entries
  * but walkers may see same entry twice if they are forced to retry chains
+ * or to walk the newly attached second table
  * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold
  * service_mutex to disallow new tables to be installed or to check
  * svc_table_changes and repeat the RCU read section if new table is installed
+ * - walkers may serialize with the whole resizing process (svc_resize_sem)
+ * to prevent seeing same service twice or just with the svc_table
+ * replace (svc_replace_sem) when we can see entries twice but we
+ * prefer to run concurrently with the rehashing.
  */
 
 /*
@@ -764,8 +773,12 @@ static void svc_resize_work_handler(struct work_struct 
*work)
                        goto same_bucket;
        }
 
-       /* Tables can be switched only under service_mutex */
-       while (!mutex_trylock(&ipvs->service_mutex)) {
+       /* Tables can be switched only under service_mutex, svc_replace_sem */
+       for (;;) {
+               down_write(&ipvs->svc_replace_sem);
+               if (mutex_trylock(&ipvs->service_mutex))
+                       break;
+               up_write(&ipvs->svc_replace_sem);
                cond_resched();
                if (!READ_ONCE(ipvs->enable) ||
                    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
@@ -773,7 +786,7 @@ static void svc_resize_work_handler(struct work_struct 
*work)
        }
        if (!READ_ONCE(ipvs->enable) ||
            test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
-               goto unlock_m;
+               goto unlock_repl;
 
        rcu_assign_pointer(ipvs->svc_table, t_new);
        /* Inform readers that new table is installed */
@@ -781,6 +794,9 @@ static void svc_resize_work_handler(struct work_struct 
*work)
        atomic_inc(&ipvs->svc_table_changes);
        t_free = t;
 
+unlock_repl:
+       up_write(&ipvs->svc_replace_sem);
+
 unlock_m:
        mutex_unlock(&ipvs->service_mutex);
 
@@ -2184,17 +2200,21 @@ static int ip_vs_dst_event(struct notifier_block *this, 
unsigned long event,
        struct ip_vs_service *svc;
        struct hlist_bl_node *e;
        struct ip_vs_dest *dest;
-       int old_gen, new_gen;
+       int old_gen;
 
        if (event != NETDEV_DOWN || !ipvs)
                return NOTIFY_DONE;
        IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
 
+       /* Allow concurrent rehashing on resize but to avoid loop
+        * serialize with installing the new table.
+        */
+       down_read(&ipvs->svc_replace_sem);
+
        old_gen = atomic_read(&ipvs->svc_table_changes);
 
        rcu_read_lock();
 
-repeat:
        smp_rmb(); /* ipvs->svc_table and svc_table_changes */
        ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
                hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
@@ -2207,17 +2227,17 @@ static int ip_vs_dst_event(struct notifier_block *this, 
unsigned long event,
                }
                resched_score++;
                if (resched_score >= 100) {
-                       resched_score = 0;
                        cond_resched_rcu();
-                       new_gen = atomic_read(&ipvs->svc_table_changes);
-                       /* New table installed ? */
-                       if (old_gen != new_gen) {
-                               old_gen = new_gen;
-                               goto repeat;
-                       }
+                       /* Flushed? So no more dev refs */
+                       if (atomic_read(&ipvs->svc_table_changes) != old_gen)
+                               goto done;
+                       resched_score = 0;
                }
        }
+
+done:
        rcu_read_unlock();
+       up_read(&ipvs->svc_replace_sem);
 
        return NOTIFY_DONE;
 }
@@ -3503,7 +3523,7 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs,
        int ret = 0;
 
        lockdep_assert_held(&ipvs->svc_resize_sem);
-       /* All service modifications are disabled, go ahead */
+       /* All svc_table modifications are disabled, go ahead */
        ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
                hlist_bl_for_each_entry(svc, e, head, s_list) {
                        /* Only expose IPv4 entries to old interface */
@@ -3687,7 +3707,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user 
*user, int *len)
                        pr_err("length: %u != %zu\n", *len, size);
                        return -EINVAL;
                }
-               /* Protect against table resizer moving the entries.
+               /* Prevent modifications to the list with services.
                 * Try reverse locking, so that we do not hold the mutex
                 * while waiting for semaphore.
                 */
@@ -4029,6 +4049,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
        int start = cb->args[0];
        int idx = 0;
 
+       /* Make sure we do not see same service twice during resize */
        down_read(&ipvs->svc_resize_sem);
        rcu_read_lock();
        ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
@@ -5072,6 +5093,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs 
*ipvs)
        /* Initialize service_mutex, svc_table per netns */
        __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", 
&__ipvs_service_key);
        init_rwsem(&ipvs->svc_resize_sem);
+       init_rwsem(&ipvs->svc_replace_sem);
        INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
        atomic_set(&ipvs->svc_table_changes, 0);
        RCU_INIT_POINTER(ipvs->svc_table, NULL);
-- 
2.53.0




<Prev in Thread] Current Thread [Next in Thread>