|
Sashiko points out that an unprivileged user can frequently
call ip_vs_flush() or ip_vs_del_service() to trigger
svc_table_changes updates that can lead to an infinite loop
in ip_vs_dst_event(). This can also happen if the user
triggers frequent table resizing without deleting all
services.
One way to solve it is to hold svc_resize_sem in
ip_vs_dst_event(), but this can block the dev notifier
during the whole resizing process.
Instead, use a new rw_semaphore, svc_replace_sem, to protect
the svc_table replacement, which is a short code section.
Then hold svc_replace_sem in ip_vs_dst_event() to serialize
with the svc_table replacement. This way, svc_table_changes
can change only when all services are removed and all dev
references are dropped, which allows us to exit the loop.
Link:
https://sashiko.dev/#/patchset/20260505001648.360569-1-pablo%40netfilter.org
Fixes: 840aac3d900d ("ipvs: use resizable hash table for services")
Signed-off-by: Julian Anastasov <ja@xxxxxx>
---
include/net/ip_vs.h | 3 +-
net/netfilter/ipvs/ip_vs_ctl.c | 52 ++++++++++++++++++++++++----------
2 files changed, 39 insertions(+), 16 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 02762ce73a0c..a02e569813d2 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1186,8 +1186,9 @@ struct netns_ipvs {
struct timer_list dest_trash_timer; /* expiration timer */
struct mutex service_mutex; /* service reconfig */
struct rw_semaphore svc_resize_sem; /* svc_table resizing */
+ struct rw_semaphore svc_replace_sem; /* svc_table replace */
struct delayed_work svc_resize_work; /* resize svc_table */
- atomic_t svc_table_changes;/* ++ on new table */
+ atomic_t svc_table_changes;/* ++ on table changes */
/* Service counters */
atomic_t num_services[IP_VS_AF_MAX]; /* Services */
atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c7c7f6a7a9f6..0c1f409f41ba 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -327,13 +327,17 @@ ip_vs_use_count_dec(void)
/* Service hashing:
* Operation Locking order
* ---------------------------------------------------------------------------
- * add table service_mutex, svc_resize_sem(W)
+ * add table service_mutex
+ * replace table service_mutex, svc_resize_sem(W),
+ * svc_replace_sem(W)
* del table service_mutex
* move between tables svc_resize_sem(W), seqcount_t(W), bit lock
* add/del service service_mutex, bit lock
* find service RCU, seqcount_t(R)
* walk services(blocking) service_mutex, svc_resize_sem(R)
* walk services(non-blocking) RCU, seqcount_t(R)
+ * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R)
+ * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R)
*
* - new tables are linked/unlinked under service_mutex and svc_resize_sem
* - new table is linked on resizing and all operations can run in parallel
@@ -346,9 +350,14 @@ ip_vs_use_count_dec(void)
* services are moved to new table
* - move operations may disturb readers: find operation will not miss entries
* but walkers may see same entry twice if they are forced to retry chains
+ * or to walk the newly attached second table
* - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold
* service_mutex to disallow new tables to be installed or to check
* svc_table_changes and repeat the RCU read section if new table is installed
+ * - walkers may serialize with the whole resizing process (svc_resize_sem)
+ * to prevent seeing same service twice or just with the svc_table
+ * replace (svc_replace_sem) when we can see entries twice but we
+ * prefer to run concurrently with the rehashing.
*/
/*
@@ -764,8 +773,12 @@ static void svc_resize_work_handler(struct work_struct
*work)
goto same_bucket;
}
- /* Tables can be switched only under service_mutex */
- while (!mutex_trylock(&ipvs->service_mutex)) {
+ /* Tables can be switched only under service_mutex, svc_replace_sem */
+ for (;;) {
+ down_write(&ipvs->svc_replace_sem);
+ if (mutex_trylock(&ipvs->service_mutex))
+ break;
+ up_write(&ipvs->svc_replace_sem);
cond_resched();
if (!READ_ONCE(ipvs->enable) ||
test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
@@ -773,7 +786,7 @@ static void svc_resize_work_handler(struct work_struct
*work)
}
if (!READ_ONCE(ipvs->enable) ||
test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
- goto unlock_m;
+ goto unlock_repl;
rcu_assign_pointer(ipvs->svc_table, t_new);
/* Inform readers that new table is installed */
@@ -781,6 +794,9 @@ static void svc_resize_work_handler(struct work_struct
*work)
atomic_inc(&ipvs->svc_table_changes);
t_free = t;
+unlock_repl:
+ up_write(&ipvs->svc_replace_sem);
+
unlock_m:
mutex_unlock(&ipvs->service_mutex);
@@ -2184,17 +2200,21 @@ static int ip_vs_dst_event(struct notifier_block *this,
unsigned long event,
struct ip_vs_service *svc;
struct hlist_bl_node *e;
struct ip_vs_dest *dest;
- int old_gen, new_gen;
+ int old_gen;
if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
+ /* Allow concurrent rehashing on resize but to avoid loop
+ * serialize with installing the new table.
+ */
+ down_read(&ipvs->svc_replace_sem);
+
old_gen = atomic_read(&ipvs->svc_table_changes);
rcu_read_lock();
-repeat:
smp_rmb(); /* ipvs->svc_table and svc_table_changes */
ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
@@ -2207,17 +2227,17 @@ static int ip_vs_dst_event(struct notifier_block *this,
unsigned long event,
}
resched_score++;
if (resched_score >= 100) {
- resched_score = 0;
cond_resched_rcu();
- new_gen = atomic_read(&ipvs->svc_table_changes);
- /* New table installed ? */
- if (old_gen != new_gen) {
- old_gen = new_gen;
- goto repeat;
- }
+ /* Flushed? So no more dev refs */
+ if (atomic_read(&ipvs->svc_table_changes) != old_gen)
+ goto done;
+ resched_score = 0;
}
}
+
+done:
rcu_read_unlock();
+ up_read(&ipvs->svc_replace_sem);
return NOTIFY_DONE;
}
@@ -3503,7 +3523,7 @@ __ip_vs_get_service_entries(struct netns_ipvs *ipvs,
int ret = 0;
lockdep_assert_held(&ipvs->svc_resize_sem);
- /* All service modifications are disabled, go ahead */
+ /* All svc_table modifications are disabled, go ahead */
ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
hlist_bl_for_each_entry(svc, e, head, s_list) {
/* Only expose IPv4 entries to old interface */
@@ -3687,7 +3707,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user
*user, int *len)
pr_err("length: %u != %zu\n", *len, size);
return -EINVAL;
}
- /* Protect against table resizer moving the entries.
+ /* Prevent modifications to the list with services.
* Try reverse locking, so that we do not hold the mutex
* while waiting for semaphore.
*/
@@ -4029,6 +4049,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
int start = cb->args[0];
int idx = 0;
+ /* Make sure we do not see same service twice during resize */
down_read(&ipvs->svc_resize_sem);
rcu_read_lock();
ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
@@ -5072,6 +5093,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs
*ipvs)
/* Initialize service_mutex, svc_table per netns */
__mutex_init(&ipvs->service_mutex, "ipvs->service_mutex",
&__ipvs_service_key);
init_rwsem(&ipvs->svc_resize_sem);
+ init_rwsem(&ipvs->svc_replace_sem);
INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
atomic_set(&ipvs->svc_table_changes, 0);
RCU_INIT_POINTER(ipvs->svc_table, NULL);
--
2.53.0
|