LVS
lvs-devel
Google
 
Web LinuxVirtualServer.org

Re: [PATCHv2 net] ipvs: fix races around est_mutex and est_cpulist

To: Simon Horman <horms@xxxxxxxxxxxx>
Subject: Re: [PATCHv2 net] ipvs: fix races around est_mutex and est_cpulist
Cc: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>, Florian Westphal <fw@xxxxxxxxx>, lvs-devel@xxxxxxxxxxxxxxx, netfilter-devel@xxxxxxxxxxxxxxx
From: Julian Anastasov <ja@xxxxxx>
Date: Thu, 23 Apr 2026 13:05:00 +0300 (EEST)
        Hello,

On Wed, 22 Apr 2026, Julian Anastasov wrote:

> Sashiko reports for races and possible crash around
> the usage of est_cpulist_valid and sysctl_est_cpulist.
> The problem is that we do not lock est_mutex in some
> places which can lead to wrong write ordering and
> as result problems when calling cpumask_weight()
> and cpumask_empty().
> 
> Fix them by moving the est_max_threads read/write under
> locked est_mutex. Do the same for one ip_vs_est_reload_start()
> call to protect the cpumask_empty() usage of sysctl_est_cpulist.
> 
> To remove the chance of deadlock while stopping the
> estimation kthreads, use the service_mutex to walk the
> array with kthreads and hold it during kthread_stop().
> OTOH, est_mutex is needed only while starting the
> kthreads, not for stopping. Reorganize the code in
> kthread 0 to use mutex_trylock() for the service_mutex
> to ensure kthread_should_stop() is not true while we
> attempt to lock the mutex.
> 
> Link: 
> https://sashiko.dev/#/patchset/20260331165015.2777765-1-longman%40redhat.com
> Link: https://sashiko.dev/#/patchset/20260420171308.87192-1-ja%40ssi.bg
> Fixes: f0be83d54217 ("ipvs: add est_cpulist and est_nice sysctl vars")
> Signed-off-by: Julian Anastasov <ja@xxxxxx>

        According to Sashiko, this patch needs more
work. There is a way to change how kthread 0 is stopped.
I'll send a new patch version soon...

pw-bot: changes-requested

> ---
>  net/netfilter/ipvs/ip_vs_ctl.c | 11 +++++--
>  net/netfilter/ipvs/ip_vs_est.c | 59 ++++++++++++++++++++++------------
>  2 files changed, 47 insertions(+), 23 deletions(-)
> 
> diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
> index caec516856e9..fda166aca4fb 100644
> --- a/net/netfilter/ipvs/ip_vs_ctl.c
> +++ b/net/netfilter/ipvs/ip_vs_ctl.c
> @@ -250,7 +250,7 @@ static void est_reload_work_handler(struct work_struct 
> *work)
>       int genid;
>       int id;
>  
> -     mutex_lock(&ipvs->est_mutex);
> +     mutex_lock(&ipvs->service_mutex);
>       genid = atomic_read(&ipvs->est_genid);
>       for (id = 0; id < ipvs->est_kt_count; id++) {
>               struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
> @@ -263,12 +263,14 @@ static void est_reload_work_handler(struct work_struct 
> *work)
>               /* New config ? Stop kthread tasks */
>               if (genid != genid_done)
>                       ip_vs_est_kthread_stop(kd);
> +             mutex_lock(&ipvs->est_mutex);
>               if (!kd->task && !ip_vs_est_stopped(ipvs)) {
>                       /* Do not start kthreads above 0 in calc phase */
>                       if ((!id || !ipvs->est_calc_phase) &&
>                           ip_vs_est_kthread_start(ipvs, kd) < 0)
>                               repeat = true;
>               }
> +             mutex_unlock(&ipvs->est_mutex);
>       }
>  
>       atomic_set(&ipvs->est_genid_done, genid);
> @@ -278,7 +280,7 @@ static void est_reload_work_handler(struct work_struct 
> *work)
>                                  delay);
>  
>  unlock:
> -     mutex_unlock(&ipvs->est_mutex);
> +     mutex_unlock(&ipvs->service_mutex);
>  }
>  
>  static int get_conn_tab_size(struct netns_ipvs *ipvs)
> @@ -1812,11 +1814,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct 
> ip_vs_service_user_kern *u,
>       *svc_p = svc;
>  
>       if (!READ_ONCE(ipvs->enable)) {
> +             mutex_lock(&ipvs->est_mutex);
> +
>               /* Now there is a service - full throttle */
>               WRITE_ONCE(ipvs->enable, 1);
>  
> +             ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
> +
>               /* Start estimation for first time */
>               ip_vs_est_reload_start(ipvs);
> +             mutex_unlock(&ipvs->est_mutex);
>       }
>  
>       return 0;
> diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
> index 433ba3cab58c..216e3c354125 100644
> --- a/net/netfilter/ipvs/ip_vs_est.c
> +++ b/net/netfilter/ipvs/ip_vs_est.c
> @@ -68,6 +68,10 @@
>      and the limit of estimators per kthread
>    - est_add_ktid: ktid where to add new ests, can point to empty slot where
>      we should add kt data
> +  - data protected by service_mutex: est_temp_list, est_add_ktid,
> +    est_kt_count, est_kt_arr, est_genid_done, kd->task
> +  - data protected by est_mutex: est_genid, est_max_threads, 
> sysctl_est_cpulist,
> +    est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation
>   */
>  
>  static struct lock_class_key __ipvs_est_key;
> @@ -229,6 +233,8 @@ static int ip_vs_estimation_kthread(void *data)
>  /* Schedule stop/start for kthread tasks */
>  void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
>  {
> +     lockdep_assert_held(&ipvs->est_mutex);
> +
>       /* Ignore reloads before first service is added */
>       if (!READ_ONCE(ipvs->enable))
>               return;
> @@ -304,12 +310,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs 
> *ipvs)
>       void *arr = NULL;
>       int i;
>  
> -     if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
> -         READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
> -             return -EINVAL;
> -
>       mutex_lock(&ipvs->est_mutex);
>  
> +     /* Allow kt 0 data to be created before the services are added
> +      * and limit the kthreads when services are present.
> +      */
> +     if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
> +         READ_ONCE(ipvs->enable) && ipvs->est_max_threads) {
> +             ret = -EINVAL;
> +             goto out;
> +     }
> +
>       for (i = 0; i < id; i++) {
>               if (!ipvs->est_kt_arr[i])
>                       break;
> @@ -485,9 +496,6 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct 
> ip_vs_stats *stats)
>       struct ip_vs_estimator *est = &stats->est;
>       int ret;
>  
> -     if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
> -             ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
> -
>       est->ktid = -1;
>       est->ktrow = IPVS_EST_NTICKS - 1;       /* Initial delay */
>  
> @@ -561,7 +569,6 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct 
> ip_vs_stats *stats)
>       }
>  
>       if (ktid > 0) {
> -             mutex_lock(&ipvs->est_mutex);
>               ip_vs_est_kthread_destroy(kd);
>               ipvs->est_kt_arr[ktid] = NULL;
>               if (ktid == ipvs->est_kt_count - 1) {
> @@ -570,7 +577,6 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct 
> ip_vs_stats *stats)
>                              !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
>                               ipvs->est_kt_count--;
>               }
> -             mutex_unlock(&ipvs->est_mutex);
>  
>               /* This slot is now empty, prefer another available kt slot */
>               if (ktid == ipvs->est_add_ktid)
> @@ -582,13 +588,11 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, 
> struct ip_vs_stats *stats)
>       if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
>               kd = ipvs->est_kt_arr[0];
>               if (!kd || !kd->est_count) {
> -                     mutex_lock(&ipvs->est_mutex);
>                       if (kd) {
>                               ip_vs_est_kthread_destroy(kd);
>                               ipvs->est_kt_arr[0] = NULL;
>                       }
>                       ipvs->est_kt_count--;
> -                     mutex_unlock(&ipvs->est_mutex);
>                       ipvs->est_add_ktid = 0;
>               }
>       }
> @@ -602,13 +606,17 @@ static void ip_vs_est_drain_temp_list(struct netns_ipvs 
> *ipvs)
>       while (1) {
>               int max = 16;
>  
> -             mutex_lock(&ipvs->service_mutex);
> +             while (!mutex_trylock(&ipvs->service_mutex)) {
> +                     if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
> +                             return;
> +                     cond_resched();
> +             }
>  
>               while (max-- > 0) {
>                       est = hlist_entry_safe(ipvs->est_temp_list.first,
>                                              struct ip_vs_estimator, list);
>                       if (est) {
> -                             if (kthread_should_stop())
> +                             if (!READ_ONCE(ipvs->enable))
>                                       goto unlock;
>                               hlist_del_init(&est->list);
>                               if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
> @@ -647,7 +655,11 @@ static int ip_vs_est_calc_limits(struct netns_ipvs 
> *ipvs, int *chain_max)
>       u64 val;
>  
>       INIT_HLIST_HEAD(&chain);
> -     mutex_lock(&ipvs->service_mutex);
> +     while (!mutex_trylock(&ipvs->service_mutex)) {
> +             if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
> +                     return 0;
> +             cond_resched();
> +     }
>       kd = ipvs->est_kt_arr[0];
>       mutex_unlock(&ipvs->service_mutex);
>       s = kd ? kd->calc_stats : NULL;
> @@ -748,22 +760,24 @@ static void ip_vs_est_calc_phase(struct netns_ipvs 
> *ipvs)
>       if (!ip_vs_est_calc_limits(ipvs, &chain_max))
>               return;
>  
> -     mutex_lock(&ipvs->service_mutex);
> +     while (!mutex_trylock(&ipvs->service_mutex)) {
> +             if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
> +                     return;
> +             cond_resched();
> +     }
>  
>       /* Stop all other tasks, so that we can immediately move the
>        * estimators to est_temp_list without RCU grace period
>        */
> -     mutex_lock(&ipvs->est_mutex);
>       for (id = 1; id < ipvs->est_kt_count; id++) {
>               /* netns clean up started, abort */
> -             if (!READ_ONCE(ipvs->enable))
> -                     goto unlock2;
> +             if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
> +                     goto unlock;
>               kd = ipvs->est_kt_arr[id];
>               if (!kd)
>                       continue;
>               ip_vs_est_kthread_stop(kd);
>       }
> -     mutex_unlock(&ipvs->est_mutex);
>  
>       /* Move all estimators to est_temp_list but carefully,
>        * all estimators and kthread data can be released while
> @@ -817,7 +831,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
>                */
>               mutex_unlock(&ipvs->service_mutex);
>               cond_resched();
> -             mutex_lock(&ipvs->service_mutex);
> +             while (!mutex_trylock(&ipvs->service_mutex)) {
> +                     if (kthread_should_stop() || !READ_ONCE(ipvs->enable))
> +                             return;
> +                     cond_resched();
> +             }
>  
>               /* Current kt released ? */
>               if (id >= ipvs->est_kt_count)
> @@ -889,7 +907,6 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
>       if (genid == atomic_read(&ipvs->est_genid))
>               ipvs->est_calc_phase = 0;
>  
> -unlock2:
>       mutex_unlock(&ipvs->est_mutex);
>  
>  unlock:
> -- 
> 2.53.0

Regards

--
Julian Anastasov <ja@xxxxxx>



<Prev in Thread] Current Thread [Next in Thread>