Re: [RFC] IPVS: Convert connection table lock over to RCU

To: Simon Horman <horms@xxxxxxxxxxxx>
Subject: Re: [RFC] IPVS: Convert connection table lock over to RCU
Cc: netdev@xxxxxxxxxxxxxxx, lvs-devel@xxxxxxxxxxxxxxx, Wensong Zhang <wensong@xxxxxxxxxxxx>, Julian Anastasov <ja@xxxxxx>, Patrick McHardy <kaber@xxxxxxxxx>, "David S. Miller" <davem@xxxxxxxxxxxxx>
From: Eric Dumazet <eric.dumazet@xxxxxxxxx>
Date: Fri, 26 Feb 2010 14:57:18 +0100
On Friday, 26 February 2010 at 14:00 +1100, Simon Horman wrote:
> Signed-off-by: Simon Horman <horms@xxxxxxxxxxxx>
> 
> --- 
> 
> This seems to be a fairly clean conversion to me, but it is my first
> journey into the world of RCU, so I would appreciate a careful review.
> 
> I have deliberately introduced some noise into this patch by renaming
> some global variables and functions, in order to clearly highlight
> the changes at the call-sites.
> 
> The table of 16 locks (4 bits) used for the connection table seems
> somewhat arbitrary to me; this patch intentionally leaves it as is.
> 
> Index: net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c
> ===================================================================
> --- net-next-2.6.orig/net/netfilter/ipvs/ip_vs_conn.c 2010-02-26 10:42:16.000000000 +1100
> +++ net-next-2.6/net/netfilter/ipvs/ip_vs_conn.c      2010-02-26 10:52:32.000000000 +1100
> @@ -35,6 +35,8 @@
>  #include <linux/seq_file.h>
>  #include <linux/jhash.h>
>  #include <linux/random.h>
> +#include <linux/spinlock.h>
> +#include <linux/rculist.h>
>  
>  #include <net/net_namespace.h>
>  #include <net/ip_vs.h>
> @@ -75,57 +77,37 @@ static unsigned int ip_vs_conn_rnd;
>  /*
>   *  Fine locking granularity for big connection hash table
>   */
> -#define CT_LOCKARRAY_BITS  4
> -#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
> -#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
> +#define CT_MUTEX_BITS  4
> +#define CT_MUTEX_SIZE  (1<<CT_MUTEX_BITS)
> +#define CT_MUTEX_MASK  (CT_MUTEX_SIZE-1)
>  
> -struct ip_vs_aligned_lock
> +struct ip_vs_aligned_spinlock
>  {
> -     rwlock_t        l;
> +     spinlock_t      l;
>  } __attribute__((__aligned__(SMP_CACHE_BYTES)));
>  
> -/* lock array for conn table */
> -static struct ip_vs_aligned_lock
> -__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
> +/* mutex array for connection table */
> +static struct ip_vs_aligned_spinlock
> +__ip_vs_conntbl_mutex[CT_MUTEX_SIZE] __cacheline_aligned;
>  
> -static inline void ct_read_lock(unsigned key)
> +static inline void ct_mutex_lock(unsigned key)
>  {
> -     read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> +     spin_lock(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
>  }
>  
> -static inline void ct_read_unlock(unsigned key)
> +static inline void ct_mutex_unlock(unsigned key)
>  {
> -     read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> +     spin_unlock(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
>  }
>  
> -static inline void ct_write_lock(unsigned key)
> +static inline void ct_mutex_lock_bh(unsigned key)
>  {
> -     write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> +     spin_lock_bh(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
>  }
>  
> -static inline void ct_write_unlock(unsigned key)
> +static inline void ct_mutex_unlock_bh(unsigned key)
>  {
> -     write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_read_lock_bh(unsigned key)
> -{
> -     read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_read_unlock_bh(unsigned key)
> -{
> -     read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_write_lock_bh(unsigned key)
> -{
> -     write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> -}
> -
> -static inline void ct_write_unlock_bh(unsigned key)
> -{
> -     write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
> +     spin_unlock_bh(&__ip_vs_conntbl_mutex[key&CT_MUTEX_MASK].l);
>  }
>  
> 
> @@ -155,27 +137,27 @@ static unsigned int ip_vs_conn_hashkey(i
>  static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
>  {
>       unsigned hash;
> -     int ret;
>  
>       /* Hash by protocol, client address and port */
>       hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
>  
> -     ct_write_lock(hash);
> +     ct_mutex_lock(hash);
>  
>       if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
> -             list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
> +             list_add_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
>               cp->flags |= IP_VS_CONN_F_HASHED;
>               atomic_inc(&cp->refcnt);
> -             ret = 1;
> -     } else {
> -             pr_err("%s(): request for already hashed, called from %pF\n",
> -                    __func__, __builtin_return_address(0));
> -             ret = 0;
> +             ct_mutex_unlock(hash);
> +             synchronize_rcu();

Why is synchronize_rcu() necessary here?

When adding a new item to a list, you don't need any RCU grace period.
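
For illustration, the insert path could simply publish the entry and
return; a minimal sketch based on the hunk above, reusing the patch's
ct_mutex_* helpers:

	/* Sketch only: insertion needs no grace period.
	 * list_add_rcu() contains the memory barrier that orders the
	 * initialisation of *cp before the pointer is published, so a
	 * concurrent RCU reader sees either the old list head or the
	 * fully initialised entry, never a half-built one.
	 */
	ct_mutex_lock(hash);
	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
		list_add_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
		cp->flags |= IP_VS_CONN_F_HASHED;
		atomic_inc(&cp->refcnt);
		ct_mutex_unlock(hash);
		return 1;
	}
	ct_mutex_unlock(hash);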

> +             return 1;
>       }
>  
> -     ct_write_unlock(hash);
> +     ct_mutex_unlock(hash);
>  
> -     return ret;
> +     pr_err("%s(): request for already hashed, called from %pF\n",
> +            __func__, __builtin_return_address(0));
> +
> +     return 0;
>  }
>  
> 
> @@ -186,24 +168,24 @@ static inline int ip_vs_conn_hash(struct
>  static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
>  {
>       unsigned hash;
> -     int ret;
>  
>       /* unhash it and decrease its reference counter */
>       hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport);
>  
> -     ct_write_lock(hash);
> +     ct_mutex_lock(hash);
>  
>       if (cp->flags & IP_VS_CONN_F_HASHED) {
> -             list_del(&cp->c_list);
> +             list_del_rcu(&cp->c_list);
>               cp->flags &= ~IP_VS_CONN_F_HASHED;
>               atomic_dec(&cp->refcnt);
> -             ret = 1;
> -     } else
> -             ret = 0;
> +             ct_mutex_unlock(hash);
> +             synchronize_rcu();

Are you sure we can afford a synchronize_rcu() call here?

This is a very slow primitive, and I bet it is not acceptable for the
IPVS use case.
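
A common non-blocking alternative is to defer the free with call_rcu().
This is a sketch only, not what the patch does: it assumes struct
ip_vs_conn gains a struct rcu_head member, and it reuses the
ip_vs_conn_cachep slab cache from this file:

	/* Hypothetical callback, run once a grace period has elapsed. */
	static void ip_vs_conn_rcu_free(struct rcu_head *head)
	{
		struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
						     rcu_head);

		kmem_cache_free(ip_vs_conn_cachep, cp);
	}

	static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
	{
		unsigned hash = ip_vs_conn_hashkey(cp->af, cp->protocol,
						   &cp->caddr, cp->cport);
		int ret = 0;

		/* Unhashing never blocks: readers still walking the
		 * chain keep seeing a valid entry until the grace
		 * period ends.
		 */
		ct_mutex_lock(hash);
		if (cp->flags & IP_VS_CONN_F_HASHED) {
			list_del_rcu(&cp->c_list);
			cp->flags &= ~IP_VS_CONN_F_HASHED;
			atomic_dec(&cp->refcnt);
			ret = 1;
		}
		ct_mutex_unlock(hash);

		/* ...and where the last reference is dropped:
		 * call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
		 */
		return ret;
	}

call_rcu() queues the callback and returns immediately, so the packet
path never waits for a grace period.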

> +             return 1;
> +     }
>  
> -     ct_write_unlock(hash);
> +     ct_mutex_unlock(hash);
>  
> -     return ret;
> +     return 0;
>  }
>  
> 
> @@ -222,9 +204,9 @@ static inline struct ip_vs_conn *__ip_vs
>  
>       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
>  
> -     ct_read_lock(hash);
> +     rcu_read_lock();
>  
> -     list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +     list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
>               if (cp->af == af &&
>                   ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
>                   ip_vs_addr_equal(af, d_addr, &cp->vaddr) &&
> @@ -233,12 +215,12 @@ static inline struct ip_vs_conn *__ip_vs
>                   protocol == cp->protocol) {
>                       /* HIT */
>                       atomic_inc(&cp->refcnt);
> -                     ct_read_unlock(hash);
> +                     rcu_read_unlock();
>                       return cp;
>               }
>       }
>  
> -     ct_read_unlock(hash);
> +     rcu_read_unlock();
>  
>       return NULL;
>  }
> @@ -273,9 +255,9 @@ struct ip_vs_conn *ip_vs_ct_in_get
>  
>       hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port);
>  
> -     ct_read_lock(hash);
> +     rcu_read_lock();
>  
> -     list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +     list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
>               if (cp->af == af &&
>                   ip_vs_addr_equal(af, s_addr, &cp->caddr) &&
>                   /* protocol should only be IPPROTO_IP if
> @@ -293,7 +275,7 @@ struct ip_vs_conn *ip_vs_ct_in_get
>       cp = NULL;
>  
>    out:
> -     ct_read_unlock(hash);
> +     rcu_read_unlock();
>  
>       IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
>                     ip_vs_proto_name(protocol),
> @@ -322,9 +304,9 @@ struct ip_vs_conn *ip_vs_conn_out_get
>        */
>       hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port);
>  
> -     ct_read_lock(hash);
> +     rcu_read_lock();
>  
> -     list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
> +     list_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
>               if (cp->af == af &&
>                   ip_vs_addr_equal(af, d_addr, &cp->caddr) &&
>                   ip_vs_addr_equal(af, s_addr, &cp->daddr) &&
> @@ -337,7 +319,7 @@ struct ip_vs_conn *ip_vs_conn_out_get
>               }
>       }
>  
> -     ct_read_unlock(hash);
> +     rcu_read_unlock();
>  
>       IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
>                     ip_vs_proto_name(protocol),
> @@ -776,14 +758,16 @@ static void *ip_vs_conn_array(struct seq
>       struct ip_vs_conn *cp;
>  
>       for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
> -             ct_read_lock_bh(idx);
> -             list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> +             rcu_read_lock_bh();
> +             list_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
>                       if (pos-- == 0) {
>                               seq->private = &ip_vs_conn_tab[idx];
> +                             /* N.B: no rcu_read_unlock_bh() here
> +                              *      Seems really horrible :-( */
>                               return cp;
>                       }
>               }
> -             ct_read_unlock_bh(idx);
> +             rcu_read_unlock_bh();
>       }
>  
>       return NULL;
> @@ -807,19 +791,22 @@ static void *ip_vs_conn_seq_next(struct
>  
>       /* more on same hash chain? */
>       if ((e = cp->c_list.next) != l)
> -             return list_entry(e, struct ip_vs_conn, c_list);
> +             return list_entry_rcu(e, struct ip_vs_conn, c_list);
>  
>       idx = l - ip_vs_conn_tab;
> -     ct_read_unlock_bh(idx);
> +     rcu_read_unlock_bh();
>  
>       while (++idx < ip_vs_conn_tab_size) {
> -             ct_read_lock_bh(idx);
> -             list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
> +             rcu_read_lock_bh();
> +             list_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
>                       seq->private = &ip_vs_conn_tab[idx];
> +                     /* N.B: no rcu_read_unlock_bh() here
> +                      *      Seems really horrible :-( */

... if you add a comment, please explain why you need to keep the RCU read lock held,
... or don't add a comment, since this construct is quite common.
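
If the comment stays, something along these lines would state the
invariant (suggested wording only):

	/* Return with the RCU read lock still held: the entry is only
	 * protected inside the read-side critical section, and the
	 * lock is released later, in ip_vs_conn_seq_next() or
	 * ip_vs_conn_seq_stop().
	 */
	return cp;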



