LVS
lvs-devel
Google
 
Web LinuxVirtualServer.org

Re: [PATCH] ipvs: Fix race conditions in lblcr scheduler

To: Sven Wegener <sven.wegener@xxxxxxxxxxx>
Subject: Re: [PATCH] ipvs: Fix race conditions in lblcr scheduler
Cc: netdev@xxxxxxxxxxxxxxx, lvs-devel@xxxxxxxxxxxxxxx, wensong@xxxxxxxxxxxx, ja@xxxxxx
From: Simon Horman <horms@xxxxxxxxxxxx>
Date: Tue, 19 Aug 2008 09:32:45 +1000
On Mon, Aug 18, 2008 at 12:53:05AM +0200, Sven Wegener wrote:
> We can't access the cache entry outside of our critical read-locked region,
> because someone may free that entry. Also getting an entry under read lock,
> then locking for write and trying to delete that entry looks fishy, but should
> be no problem here, because we're only comparing a pointer. Also there is no
> need for our own rwlock, there is already one in the service structure for use
> in the schedulers.

Hi Sven,

This looks good to me. Just a few minor comments inline.

> Signed-off-by: Sven Wegener <sven.wegener@xxxxxxxxxxx>
> ---
>  net/ipv4/ipvs/ip_vs_lblcr.c |  229 
> +++++++++++++++++++++----------------------
>  1 files changed, 114 insertions(+), 115 deletions(-)
> 
> diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
> index f1c8450..96bfdc2 100644
> --- a/net/ipv4/ipvs/ip_vs_lblcr.c
> +++ b/net/ipv4/ipvs/ip_vs_lblcr.c
> @@ -106,7 +106,7 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct 
> ip_vs_dest *dest)
>                       return NULL;
>       }
>  
> -     e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
> +     e = kmalloc(sizeof(*e), GFP_ATOMIC);

I'd prefer spelling out struct ip_vs_dest_list rather than using *e.
Ditto for *tbl below.

>       if (e == NULL) {
>               IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
>               return NULL;
> @@ -116,11 +116,9 @@ ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct 
> ip_vs_dest *dest)
>       e->dest = dest;
>  
>       /* link it to the list */
> -     write_lock(&set->lock);
>       e->next = set->list;
>       set->list = e;
>       atomic_inc(&set->size);
> -     write_unlock(&set->lock);
>  
>       set->lastmod = jiffies;
>       return e;
> @@ -131,7 +129,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct 
> ip_vs_dest *dest)
>  {
>       struct ip_vs_dest_list *e, **ep;
>  
> -     write_lock(&set->lock);
>       for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
>               if (e->dest == dest) {
>                       /* HIT */
> @@ -144,7 +141,6 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct 
> ip_vs_dest *dest)
>               }
>               ep = &e->next;
>       }
> -     write_unlock(&set->lock);
>  }
>  
>  static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
> @@ -174,7 +170,6 @@ static inline struct ip_vs_dest 
> *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
>       if (set == NULL)
>               return NULL;
>  
> -     read_lock(&set->lock);
>       /* select the first destination server, whose weight > 0 */
>       for (e=set->list; e!=NULL; e=e->next) {
>               least = e->dest;
> @@ -188,7 +183,6 @@ static inline struct ip_vs_dest 
> *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
>                       goto nextstage;
>               }
>       }
> -     read_unlock(&set->lock);
>       return NULL;
>  
>       /* find the destination with the weighted least load */
> @@ -207,7 +201,6 @@ static inline struct ip_vs_dest 
> *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
>                       loh = doh;
>               }
>       }
> -     read_unlock(&set->lock);
>  
>       IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
>                 "activeconns %d refcnt %d weight %d overhead %d\n",
> @@ -229,7 +222,6 @@ static inline struct ip_vs_dest 
> *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
>       if (set == NULL)
>               return NULL;
>  
> -     read_lock(&set->lock);
>       /* select the first destination server, whose weight > 0 */
>       for (e=set->list; e!=NULL; e=e->next) {
>               most = e->dest;
> @@ -239,7 +231,6 @@ static inline struct ip_vs_dest 
> *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
>                       goto nextstage;
>               }
>       }
> -     read_unlock(&set->lock);
>       return NULL;
>  
>       /* find the destination with the weighted most load */
> @@ -256,7 +247,6 @@ static inline struct ip_vs_dest 
> *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
>                       moh = doh;
>               }
>       }
> -     read_unlock(&set->lock);
>  
>       IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
>                 "activeconns %d refcnt %d weight %d overhead %d\n",
> @@ -284,7 +274,6 @@ struct ip_vs_lblcr_entry {
>   *      IPVS lblcr hash table
>   */
>  struct ip_vs_lblcr_table {
> -     rwlock_t                lock;           /* lock for this table */
>       struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
>       atomic_t                entries;        /* number of entries */
>       int                     max_size;       /* maximum size of entries */
> @@ -311,32 +300,6 @@ static ctl_table vs_vars_table[] = {
>  
>  static struct ctl_table_header * sysctl_header;
>  
> -/*
> - *      new/free a ip_vs_lblcr_entry, which is a mapping of a destination
> - *      IP address to a server.
> - */
> -static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr)
> -{
> -     struct ip_vs_lblcr_entry *en;
> -
> -     en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
> -     if (en == NULL) {
> -             IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
> -             return NULL;
> -     }
> -
> -     INIT_LIST_HEAD(&en->list);
> -     en->addr = daddr;
> -
> -     /* initilize its dest set */
> -     atomic_set(&(en->set.size), 0);
> -     en->set.list = NULL;
> -     rwlock_init(&en->set.lock);
> -
> -     return en;
> -}
> -
> -
>  static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
>  {
>       list_del(&en->list);
> @@ -358,55 +321,68 @@ static inline unsigned ip_vs_lblcr_hashkey(__be32 addr)
>   *   Hash an entry in the ip_vs_lblcr_table.
>   *   returns bool success.
>   */
> -static int
> +static void
>  ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
>  {
> -     unsigned hash;
> -
> -     if (!list_empty(&en->list)) {
> -             IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
> -                       "called from %p\n", __builtin_return_address(0));
> -             return 0;
> -     }
> +     unsigned hash = ip_vs_lblcr_hashkey(en->addr);
>  
> -     /*
> -      *      Hash by destination IP address
> -      */
> -     hash = ip_vs_lblcr_hashkey(en->addr);
> -
> -     write_lock(&tbl->lock);
>       list_add(&en->list, &tbl->bucket[hash]);
>       atomic_inc(&tbl->entries);
> -     write_unlock(&tbl->lock);
> -
> -     return 1;
>  }
>  
>  
>  /*
> - *  Get ip_vs_lblcr_entry associated with supplied parameters.
> + *  Get ip_vs_lblcr_entry associated with supplied parameters. Called under
> + *  read lock.
>   */
>  static inline struct ip_vs_lblcr_entry *
>  ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr)
>  {
> -     unsigned hash;
> +     unsigned hash = ip_vs_lblcr_hashkey(addr);
>       struct ip_vs_lblcr_entry *en;
>  
> -     hash = ip_vs_lblcr_hashkey(addr);
> +     list_for_each_entry(en, &tbl->bucket[hash], list)
> +             if (en->addr == addr)
> +                     return en;
> +
> +     return NULL;
> +}
>  
> -     read_lock(&tbl->lock);
>  
> -     list_for_each_entry(en, &tbl->bucket[hash], list) {
> -             if (en->addr == addr) {
> -                     /* HIT */
> -                     read_unlock(&tbl->lock);
> -                     return en;
> +/*
> + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
> + * IP address to a server. Called under write lock.
> + */
> +static inline struct ip_vs_lblcr_entry *
> +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl,  __be32 daddr,
> +             struct ip_vs_dest *dest)
> +{
> +     struct ip_vs_lblcr_entry *en;
> +
> +     en = ip_vs_lblcr_get(tbl, daddr);
> +     if (!en) {
> +             en = kmalloc(sizeof(*en), GFP_ATOMIC);
> +             if (!en) {
> +                     IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
> +                     return NULL;
>               }
> +
> +             en->addr = daddr;
> +             en->lastuse = jiffies;
> +
> +             /* initilize its dest set */
> +             atomic_set(&(en->set.size), 0);
> +             en->set.list = NULL;
> +             rwlock_init(&en->set.lock);
> +
> +             ip_vs_lblcr_hash(tbl, en);
>       }
>  
> -     read_unlock(&tbl->lock);
> +     write_lock(&en->set.lock);
> +     ip_vs_dest_set_insert(&en->set, dest);
> +     write_unlock(&en->set.lock);
>  
> -     return NULL;
> +     return en;
>  }
>  
>  
> @@ -418,19 +394,18 @@ static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table 
> *tbl)
>       int i;
>       struct ip_vs_lblcr_entry *en, *nxt;
>  
> +     /* No locking required, only called during cleanup. */
>       for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
> -             write_lock(&tbl->lock);
>               list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
>                       ip_vs_lblcr_free(en);
> -                     atomic_dec(&tbl->entries);
>               }
> -             write_unlock(&tbl->lock);
>       }
>  }
>  
>  
> -static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
> +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
>  {
> +     struct ip_vs_lblcr_table *tbl = svc->sched_data;
>       unsigned long now = jiffies;
>       int i, j;
>       struct ip_vs_lblcr_entry *en, *nxt;
> @@ -438,7 +413,7 @@ static inline void ip_vs_lblcr_full_check(struct 
> ip_vs_lblcr_table *tbl)
>       for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
>               j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
>  
> -             write_lock(&tbl->lock);
> +             write_lock(&svc->sched_lock);
>               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
>                       if 
> (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
>                                      now))
> @@ -447,7 +422,7 @@ static inline void ip_vs_lblcr_full_check(struct 
> ip_vs_lblcr_table *tbl)
>                       ip_vs_lblcr_free(en);
>                       atomic_dec(&tbl->entries);
>               }
> -             write_unlock(&tbl->lock);
> +             write_unlock(&svc->sched_lock);
>       }
>       tbl->rover = j;
>  }
> @@ -466,17 +441,16 @@ static inline void ip_vs_lblcr_full_check(struct 
> ip_vs_lblcr_table *tbl)
>   */
>  static void ip_vs_lblcr_check_expire(unsigned long data)
>  {
> -     struct ip_vs_lblcr_table *tbl;
> +     struct ip_vs_service *svc = (struct ip_vs_service *) data;
> +     struct ip_vs_lblcr_table *tbl = svc->sched_data;
>       unsigned long now = jiffies;
>       int goal;
>       int i, j;
>       struct ip_vs_lblcr_entry *en, *nxt;
>  
> -     tbl = (struct ip_vs_lblcr_table *)data;
> -
>       if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
>               /* do full expiration check */
> -             ip_vs_lblcr_full_check(tbl);
> +             ip_vs_lblcr_full_check(svc);
>               tbl->counter = 1;
>               goto out;
>       }
> @@ -493,7 +467,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
>       for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
>               j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
>  
> -             write_lock(&tbl->lock);
> +             write_lock(&svc->sched_lock);
>               list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
>                       if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
>                               continue;
> @@ -502,7 +476,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
>                       atomic_dec(&tbl->entries);
>                       goal--;
>               }
> -             write_unlock(&tbl->lock);
> +             write_unlock(&svc->sched_lock);
>               if (goal <= 0)
>                       break;
>       }
> @@ -520,15 +494,14 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service 
> *svc)
>       /*
>        *    Allocate the ip_vs_lblcr_table for this service
>        */
> -     tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
> +     tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC);
>       if (tbl == NULL) {
>               IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
>               return -ENOMEM;
>       }
>       svc->sched_data = tbl;
>       IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
> -               "current service\n",
> -               sizeof(struct ip_vs_lblcr_table));
> +               "current service\n", sizeof(*tbl));
>  
>       /*
>        *    Initialize the hash buckets
> @@ -536,7 +509,6 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
>       for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
>               INIT_LIST_HEAD(&tbl->bucket[i]);
>       }
> -     rwlock_init(&tbl->lock);
>       tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
>       tbl->rover = 0;
>       tbl->counter = 1;
> @@ -545,9 +517,8 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
>        *    Hook periodic timer for garbage collection
>        */
>       setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
> -                     (unsigned long)tbl);
> -     tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
> -     add_timer(&tbl->periodic_timer);
> +                     (unsigned long)svc);
> +     mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
>  
>       return 0;
>  }
> @@ -564,9 +535,9 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
>       ip_vs_lblcr_flush(tbl);
>  
>       /* release the table itself */
> -     kfree(svc->sched_data);
> +     kfree(tbl);
>       IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
> -               sizeof(struct ip_vs_lblcr_table));
> +               sizeof(*tbl));
>  
>       return 0;
>  }
> @@ -663,50 +634,78 @@ is_overloaded(struct ip_vs_dest *dest, struct 
> ip_vs_service *svc)
>  static struct ip_vs_dest *
>  ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
>  {
> -     struct ip_vs_dest *dest;
> -     struct ip_vs_lblcr_table *tbl;
> -     struct ip_vs_lblcr_entry *en;
> +     struct ip_vs_lblcr_table *tbl = svc->sched_data;
>       struct iphdr *iph = ip_hdr(skb);
> +     struct ip_vs_dest *dest = NULL;
> +     struct ip_vs_lblcr_entry *en;
>  
>       IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
>  
> -     tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
> +     /* First look in our cache */
> +     read_lock(&svc->sched_lock);
>       en = ip_vs_lblcr_get(tbl, iph->daddr);
> -     if (en == NULL) {
> -             dest = __ip_vs_lblcr_schedule(svc, iph);
> -             if (dest == NULL) {
> -                     IP_VS_DBG(1, "no destination available\n");
> -                     return NULL;
> -             }
> -             en = ip_vs_lblcr_new(iph->daddr);
> -             if (en == NULL) {
> -                     return NULL;
> -             }
> -             ip_vs_dest_set_insert(&en->set, dest);
> -             ip_vs_lblcr_hash(tbl, en);
> -     } else {
> +     if (en) {
> +             /* We only hold a read lock, but this is atomic */
> +             en->lastuse = jiffies;
> +
> +             /* Get the least loaded destination */
> +             read_lock(&en->set.lock);
>               dest = ip_vs_dest_set_min(&en->set);
> -             if (!dest || is_overloaded(dest, svc)) {
> -                     dest = __ip_vs_lblcr_schedule(svc, iph);
> -                     if (dest == NULL) {
> -                             IP_VS_DBG(1, "no destination available\n");
> -                             return NULL;
> -                     }
> -                     ip_vs_dest_set_insert(&en->set, dest);
> -             }
> +             read_unlock(&en->set.lock);
> +
> +             /* More than one destination + enough time passed by, cleanup */
>               if (atomic_read(&en->set.size) > 1 &&
> -                 jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
> +                             time_after(jiffies, en->set.lastmod +
> +                             sysctl_ip_vs_lblcr_expiration)) {
>                       struct ip_vs_dest *m;
> +
> +                     write_lock(&en->set.lock);
>                       m = ip_vs_dest_set_max(&en->set);
>                       if (m)
>                               ip_vs_dest_set_erase(&en->set, m);
> +                     write_unlock(&en->set.lock);
> +             }
> +
> +             /* If the destination is not overloaded, use it */
> +             if (dest && !is_overloaded(dest, svc)) {
> +                     read_unlock(&svc->sched_lock);
> +                     goto out;
> +             }
> +
> +             /* The cache entry is invalid, time to schedule */
> +             dest = __ip_vs_lblcr_schedule(svc, iph);
> +             if (!dest) {
> +                     IP_VS_DBG(1, "no destination available\n");
> +                     read_unlock(&svc->sched_lock);
> +                     return NULL;
>               }
> +
> +             /* Update our cache entry */
> +             write_lock(&en->set.lock);
> +             ip_vs_dest_set_insert(&en->set, dest);
> +             write_unlock(&en->set.lock);
> +     }
> +     read_unlock(&svc->sched_lock);
> +
> +     if (dest)
> +             goto out;
> +
> +     /* No cache entry, time to schedule */
> +     dest = __ip_vs_lblcr_schedule(svc, iph);
> +     if (!dest) {
> +             IP_VS_DBG(1, "no destination available\n");
> +             return NULL;
>       }
> -     en->lastuse = jiffies;
>  
> +     /* If we fail to create a cache entry, we'll just use the valid dest */
> +     write_lock(&svc->sched_lock);
> +     ip_vs_lblcr_new(tbl, iph->daddr, dest);
> +     write_unlock(&svc->sched_lock);
> +
> +out:
>       IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
>                 "--> server %u.%u.%u.%u:%d\n",
> -               NIPQUAD(en->addr),
> +               NIPQUAD(iph->addr),

Minor problem: iph->addr does not exist — this should be iph->daddr.

>                 NIPQUAD(dest->addr),
>                 ntohs(dest->port));
>  
> --
> To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

<Prev in Thread] Current Thread [Next in Thread>