|
Sashiko reports races and a possible crash around
the usage of est_cpulist_valid and sysctl_est_cpulist.
The problem is that we do not lock est_mutex in some
places, which can lead to wrong write ordering and,
as a result, problems when calling cpumask_weight()
and cpumask_empty().
Fix them by moving the est_max_threads read/write under
locked est_mutex. Do the same for one ip_vs_est_reload_start()
call to protect the cpumask_empty() usage of sysctl_est_cpulist.
To remove the chance of deadlock while stopping the
estimation kthreads, use the service_mutex to walk the
array with kthreads and hold it during kthread_stop().
OTOH, est_mutex is needed only while starting the
kthreads, not for stopping. Reorganize the code in
kthread 0 to use mutex_trylock() for the service_mutex
to ensure kthread_should_stop() is not true while we
attempt to lock the mutex.
Link:
https://sashiko.dev/#/patchset/20260331165015.2777765-1-longman%40redhat.com
Link: https://sashiko.dev/#/patchset/20260420171308.87192-1-ja%40ssi.bg
Fixes: f0be83d54217 ("ipvs: add est_cpulist and est_nice sysctl vars")
Signed-off-by: Julian Anastasov <ja@xxxxxx>
---
net/netfilter/ipvs/ip_vs_ctl.c | 11 +++++--
net/netfilter/ipvs/ip_vs_est.c | 59 ++++++++++++++++++++++------------
2 files changed, 47 insertions(+), 23 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index caec516856e9..fda166aca4fb 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -250,7 +250,7 @@ static void est_reload_work_handler(struct work_struct
*work)
int genid;
int id;
- mutex_lock(&ipvs->est_mutex);
+ mutex_lock(&ipvs->service_mutex);
genid = atomic_read(&ipvs->est_genid);
for (id = 0; id < ipvs->est_kt_count; id++) {
struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
@@ -263,12 +263,14 @@ static void est_reload_work_handler(struct work_struct
*work)
/* New config ? Stop kthread tasks */
if (genid != genid_done)
ip_vs_est_kthread_stop(kd);
+ mutex_lock(&ipvs->est_mutex);
if (!kd->task && !ip_vs_est_stopped(ipvs)) {
/* Do not start kthreads above 0 in calc phase */
if ((!id || !ipvs->est_calc_phase) &&
ip_vs_est_kthread_start(ipvs, kd) < 0)
repeat = true;
}
+ mutex_unlock(&ipvs->est_mutex);
}
atomic_set(&ipvs->est_genid_done, genid);
@@ -278,7 +280,7 @@ static void est_reload_work_handler(struct work_struct
*work)
delay);
unlock:
- mutex_unlock(&ipvs->est_mutex);
+ mutex_unlock(&ipvs->service_mutex);
}
static int get_conn_tab_size(struct netns_ipvs *ipvs)
@@ -1812,11 +1814,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct
ip_vs_service_user_kern *u,
*svc_p = svc;
if (!READ_ONCE(ipvs->enable)) {
+ mutex_lock(&ipvs->est_mutex);
+
/* Now there is a service - full throttle */
WRITE_ONCE(ipvs->enable, 1);
+ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
+
/* Start estimation for first time */
ip_vs_est_reload_start(ipvs);
+ mutex_unlock(&ipvs->est_mutex);
}
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 433ba3cab58c..216e3c354125 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -68,6 +68,10 @@
and the limit of estimators per kthread
- est_add_ktid: ktid where to add new ests, can point to empty slot where
we should add kt data
+ - data protected by service_mutex: est_temp_list, est_add_ktid,
+ est_kt_count, est_kt_arr, est_genid_done, kd->task
+ - data protected by est_mutex: est_genid, est_max_threads,
sysctl_est_cpulist,
+ est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation
*/
static struct lock_class_key __ipvs_est_key;
@@ -229,6 +233,8 @@ static int ip_vs_estimation_kthread(void *data)
/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
+ lockdep_assert_held(&ipvs->est_mutex);
+
/* Ignore reloads before first service is added */
if (!READ_ONCE(ipvs->enable))
return;
@@ -304,12 +310,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
void *arr = NULL;
int i;
- if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
- READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
- return -EINVAL;
-
mutex_lock(&ipvs->est_mutex);
+ /* Allow kt 0 data to be created before the services are added
+ * and limit the kthreads when services are present.
+ */
+ if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
+ READ_ONCE(ipvs->enable) && ipvs->est_max_threads) {
+ ret = -EINVAL;
+ goto out;
+ }
+
for (i = 0; i < id; i++) {
if (!ipvs->est_kt_arr[i])
break;
@@ -485,9 +496,6 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct
ip_vs_stats *stats)
struct ip_vs_estimator *est = &stats->est;
int ret;
- if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
- ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
-
est->ktid = -1;
est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */
@@ -561,7 +569,6 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct
ip_vs_stats *stats)
}
if (ktid > 0) {
- mutex_lock(&ipvs->est_mutex);
ip_vs_est_kthread_destroy(kd);
ipvs->est_kt_arr[ktid] = NULL;
if (ktid == ipvs->est_kt_count - 1) {
@@ -570,7 +577,6 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct
ip_vs_stats *stats)
!ipvs->est_kt_arr[ipvs->est_kt_count - 1])
ipvs->est_kt_count--;
}
- mutex_unlock(&ipvs->est_mutex);
/* This slot is now empty, prefer another available kt slot */
if (ktid == ipvs->est_add_ktid)
@@ -582,13 +588,11 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct
ip_vs_stats *stats)
if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
kd = ipvs->est_kt_arr[0];
if (!kd || !kd->est_count) {
- mutex_lock(&ipvs->est_mutex);
if (kd) {
ip_vs_est_kthread_destroy(kd);
ipvs->est_kt_arr[0] = NULL;
}
ipvs->est_kt_count--;
- mutex_unlock(&ipvs->est_mutex);
ipvs->est_add_ktid = 0;
}
}
@@ -602,13 +606,17 @@ static void ip_vs_est_drain_temp_list(struct netns_ipvs
*ipvs)
while (1) {
int max = 16;
- mutex_lock(&ipvs->service_mutex);
+ while (!mutex_trylock(&ipvs->service_mutex)) {
+ if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
+ return;
+ cond_resched();
+ }
while (max-- > 0) {
est = hlist_entry_safe(ipvs->est_temp_list.first,
struct ip_vs_estimator, list);
if (est) {
- if (kthread_should_stop())
+ if (!READ_ONCE(ipvs->enable))
goto unlock;
hlist_del_init(&est->list);
if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
@@ -647,7 +655,11 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs,
int *chain_max)
u64 val;
INIT_HLIST_HEAD(&chain);
- mutex_lock(&ipvs->service_mutex);
+ while (!mutex_trylock(&ipvs->service_mutex)) {
+ if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
+ return 0;
+ cond_resched();
+ }
kd = ipvs->est_kt_arr[0];
mutex_unlock(&ipvs->service_mutex);
s = kd ? kd->calc_stats : NULL;
@@ -748,22 +760,24 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
if (!ip_vs_est_calc_limits(ipvs, &chain_max))
return;
- mutex_lock(&ipvs->service_mutex);
+ while (!mutex_trylock(&ipvs->service_mutex)) {
+ if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
+ return;
+ cond_resched();
+ }
/* Stop all other tasks, so that we can immediately move the
* estimators to est_temp_list without RCU grace period
*/
- mutex_lock(&ipvs->est_mutex);
for (id = 1; id < ipvs->est_kt_count; id++) {
/* netns clean up started, abort */
- if (!READ_ONCE(ipvs->enable))
- goto unlock2;
+ if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
+ goto unlock;
kd = ipvs->est_kt_arr[id];
if (!kd)
continue;
ip_vs_est_kthread_stop(kd);
}
- mutex_unlock(&ipvs->est_mutex);
/* Move all estimators to est_temp_list but carefully,
* all estimators and kthread data can be released while
@@ -817,7 +831,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
*/
mutex_unlock(&ipvs->service_mutex);
cond_resched();
- mutex_lock(&ipvs->service_mutex);
+ while (!mutex_trylock(&ipvs->service_mutex)) {
+ if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
+ return;
+ cond_resched();
+ }
/* Current kt released ? */
if (id >= ipvs->est_kt_count)
@@ -889,7 +907,6 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
if (genid == atomic_read(&ipvs->est_genid))
ipvs->est_calc_phase = 0;
-unlock2:
mutex_unlock(&ipvs->est_mutex);
unlock:
--
2.53.0
|