LVS
lvs-devel
Google
 
Web LinuxVirtualServer.org

[PATCHv2 ipvs-next 3/3] netfilter: ipvs: Refactor Maglev hashing schedul

To: lvs-devel@xxxxxxxxxxxxxxx
Subject: [PATCHv2 ipvs-next 3/3] netfilter: ipvs: Refactor Maglev hashing scheduler
Cc: Julian Anastasov <ja@xxxxxx>, Simon Horman <horms@xxxxxxxxxxxx>, Wensong Zhang <wensong@xxxxxxxxxxxx>
From: Inju Song <inju.song@xxxxxxxxxxxxx>
Date: Sun, 25 Feb 2018 15:44:00 +0900
  1) Change table size to primes array

  2) Add to check dest number > IP_VS_MH_TAB_SIZE

  3) Replace IP_VS_MH_LOOKUP_SIZE with IP_VS_MH_TAB_SIZE

  4) Replace large permutation array with dest_setup

  5) Add gcd, rshift and calculate ds->turns

Signed-off-by: Inju Song <inju.song@xxxxxxxxxxxxx>
---
 net/netfilter/ipvs/ip_vs_mh.c | 291 ++++++++++++++++++++++++++----------------
 1 file changed, 178 insertions(+), 113 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index d1bf50a..c868778 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -5,12 +5,13 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/vmalloc.h>
 #include <linux/skbuff.h>
 
 #include <net/ip_vs.h>
 
 #include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
 
 #define IP_VS_SVC_F_SCHED_MH_FALLBACK  IP_VS_SVC_F_SCHED1 /* MH fallback */
 #define IP_VS_SVC_F_SCHED_MH_PORT      IP_VS_SVC_F_SCHED2 /* MH use port */
@@ -19,23 +20,36 @@ struct ip_vs_mh_lookup {
        struct ip_vs_dest __rcu *dest;  /* real server (cache) */
 };
 
-/* for IPVS MH entry hash table */
-#ifndef CONFIG_IP_VS_MH_TAB_BITS
-#define CONFIG_IP_VS_MH_TAB_BITS        8
+struct ip_vs_mh_dest_setup {
+       int     offset; /* starting offset */
+       int     skip;   /* skip */
+       int     perm;   /* next_offset */
+       int     turns;  /* weight / gcd() */
+};
+
+/* Available prime numbers for MH table */
+static int primes[] = {251, 509, 1021, 2039, 4093,
+                      8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX      12
 #endif
-#define IP_VS_MH_TAB_BITS               CONFIG_IP_VS_MH_TAB_BITS
-#define IP_VS_MH_TAB_SIZE               BIT(IP_VS_MH_TAB_BITS)
-#define IP_VS_MH_LOOKUP_SIZE           65537 /* Must be prime number */
+#define IP_VS_MH_TAB_BITS              (CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX             (CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE               primes[IP_VS_MH_TAB_INDEX]
 
 struct ip_vs_mh_state {
-       struct rcu_head         rcu_head;
-       struct ip_vs_mh_lookup  lookup[IP_VS_MH_LOOKUP_SIZE];
-       unsigned int            **permutation;
-       hsiphash_key_t          hash1, hash2;
+       struct rcu_head                 rcu_head;
+       struct ip_vs_mh_lookup          *lookup;
+       struct ip_vs_mh_dest_setup      *dest_setup;
+       hsiphash_key_t                  hash1, hash2;
+       int                             gcd;
+       int                             rshift;
 };
 
-static inline void
-ip_vs_mh_generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2)
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+                                       hsiphash_key_t *hash2)
 {
        hash1->key[0] = 2654435761UL;
        hash1->key[1] = 2654435761UL;
@@ -68,13 +82,13 @@ static inline bool is_unavailable(struct ip_vs_dest *dest)
        return hsiphash(&v, sizeof(v), key);
 }
 
-static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
-                                    struct ip_vs_service *svc)
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+                             struct ip_vs_service *svc)
 {
-       int i, j;
        struct list_head *p;
+       struct ip_vs_mh_dest_setup *ds;
        struct ip_vs_dest *dest;
-       unsigned int offset, skip;
+       int lw;
        bool empty;
 
        p = &svc->destinations;
@@ -82,56 +96,50 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state 
*s,
        if (empty)
                return 0;
 
-       /* extending permutation table to 2d arrays */
-       for (i = 1; i < svc->num_dests; i++)
-               s->permutation[i] = s->permutation[i - 1] +
-                                   IP_VS_MH_LOOKUP_SIZE;
+       /* If gcd is smaller then 1, all last_weights of dest are zero.
+        * so skip permutation for the dests.
+        */
+       if (s->gcd < 1)
+               return 0;
 
-       i = 0;
+       /* Set dest_setup for the dests permutation */
+       ds = &s->dest_setup[0];
        while ((p = p->next) != &svc->destinations) {
                dest = list_entry(p, struct ip_vs_dest, n_list);
-               offset = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port,
-                                         &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
-               skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
-                                       dest->port, &s->hash2, 0) %
-                                       (IP_VS_MH_LOOKUP_SIZE - 1) + 1;
-
-               for (j = 0; j < IP_VS_MH_LOOKUP_SIZE; j++) {
-                       s->permutation[i][j] = (offset + (j * skip)) %
-                                               IP_VS_MH_LOOKUP_SIZE;
-               }
-               i++;
+
+               ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+                                             dest->port, &s->hash1, 0) %
+                                             IP_VS_MH_TAB_SIZE;
+               ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+                                           dest->port, &s->hash2, 0) %
+                                           (IP_VS_MH_TAB_SIZE - 1) + 1;
+               ds->perm = ds->offset % IP_VS_MH_TAB_SIZE;
+
+               lw = atomic_read(&dest->last_weight);
+               ds->turns = (lw > 1) ? lw / s->gcd : lw;
+               ds++;
        }
 
        return 0;
 }
 
-static inline int
-ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+                            struct ip_vs_service *svc)
 {
-       int i, ret, dw_count;
-       unsigned int *next;
-       unsigned int **pmt;
+       int i, n;
        struct ip_vs_mh_lookup *entry, *l;
        struct list_head *p;
        struct ip_vs_dest *dest;
-       unsigned int n, c;
+       struct ip_vs_mh_dest_setup *ds;
+       int c, rs, dt_count;
        bool empty;
 
-       ret = 0;
-       next = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(unsigned int),
-                      GFP_KERNEL);
-       if (!next)
-               return -ENOMEM;
-
-       entry = kcalloc(IP_VS_MH_LOOKUP_SIZE, sizeof(*entry),
+       entry = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(*entry),
                        GFP_KERNEL);
-       if (!entry) {
-               ret = -ENOMEM;
-               goto err_alloc;
-       }
+       if (!entry)
+               return -ENOMEM;
 
-       for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++)
+       for (i = 0; i < IP_VS_MH_TAB_SIZE; i++)
                RCU_INIT_POINTER(entry[i].dest, NULL);
 
        p = &svc->destinations;
@@ -139,47 +147,61 @@ static inline int ip_vs_mh_permutate(struct 
ip_vs_mh_state *s,
        if (empty)
                goto out;
 
+       /* If gcd is smaller then 1, the weight is zero for all dests.
+        * so skip population for MH table and fills the table with NULLs.
+        */
+       if (s->gcd < 1)
+               goto out;
+
        n = 0;
-       dw_count = 0;
-       pmt = s->permutation;
-       while (n < IP_VS_MH_LOOKUP_SIZE) {
+       rs = s->rshift;
+       dt_count = 0;
+       while (n < IP_VS_MH_TAB_SIZE) {
                if (p == &svc->destinations)
                        p = p->next;
 
-               i = 0;
+               ds = &s->dest_setup[0];
                while (p != &svc->destinations) {
-                       c = pmt[i][next[i]];
+                       /* ignore added server with zero weight */
+                       if (ds->turns < 1) {
+                               p = p->next;
+                               ds++;
+                               continue;
+                       }
 
+                       c = ds->perm;
                        while (entry[c].dest) {
-                               next[i] = next[i] + 1;
-                               c = pmt[i][next[i]];
+                               /* Add skip, mod IP_VS_MH_TAB_SIZE */
+                               ds->perm += ds->skip;
+                               if (ds->perm >= IP_VS_MH_TAB_SIZE)
+                                       ds->perm -= IP_VS_MH_TAB_SIZE;
+                               c = ds->perm;
                        }
 
                        dest = list_entry(p, struct ip_vs_dest, n_list);
                        RCU_INIT_POINTER(entry[c].dest, dest);
 
-                       next[i] = next[i] + 1;
-                       n++;
-                       if (n == IP_VS_MH_LOOKUP_SIZE)
-                               break;
+                       if (++n == IP_VS_MH_TAB_SIZE)
+                               goto out;
 
-                       if (++dw_count >= atomic_read(&dest->weight)) {
+                       if (++dt_count >= (ds->turns >> rs)) {
+                               dt_count = 0;
                                p = p->next;
-                               dw_count = 0;
-                               i++;
+                               ds++;
                        }
                }
        }
 
 out:
        l = &s->lookup[0];
-       for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
-               dest = rcu_dereference_protected(entry[i].dest, 1);
+       for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
+               dest = rcu_dereference_protected(l->dest, 1);
                if (dest)
                        ip_vs_dest_put(dest);
                if (empty) {
                        RCU_INIT_POINTER(l->dest, NULL);
                } else {
+                       dest = rcu_dereference_protected(entry[i].dest, 1);
                        ip_vs_dest_hold(dest);
                        RCU_INIT_POINTER(l->dest, dest);
 
@@ -193,9 +215,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state 
*s,
        }
 
        kfree(entry);
-err_alloc:
-       kfree(next);
-       return ret;
+       return 0;
 }
 
 /* Get ip_vs_dest associated with supplied parameters. */
@@ -204,14 +224,13 @@ static inline int ip_vs_mh_permutate(struct 
ip_vs_mh_state *s,
             const union nf_inet_addr *addr, __be16 port)
 {
        unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
-                                            % IP_VS_MH_LOOKUP_SIZE;
+                                            % IP_VS_MH_TAB_SIZE;
        struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
 
        return (!dest || is_unavailable(dest)) ? NULL : dest;
 }
 
-/* As ip_vs_mh_get, but with fallback if selected server is unavailable
- */
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
 static inline struct ip_vs_dest *
 ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
                      const union nf_inet_addr *addr, __be16 port)
@@ -222,7 +241,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state 
*s,
 
        /* first try the dest it's supposed to go to */
        ihash = ip_vs_mh_hashkey(svc->af, addr, port,
-                                &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
+                                &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
        dest = rcu_dereference(s->lookup[ihash].dest);
        if (!dest)
                return NULL;
@@ -235,10 +254,10 @@ static inline int ip_vs_mh_permutate(struct 
ip_vs_mh_state *s,
        /* if the original dest is unavailable, loop around the table
         * starting from ihash to find a new dest
         */
-       for (offset = 0; offset < IP_VS_MH_LOOKUP_SIZE; offset++) {
-               roffset = (offset + ihash) % IP_VS_MH_LOOKUP_SIZE;
+       for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+               roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
                hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
-                                       roffset) % IP_VS_MH_LOOKUP_SIZE;
+                                       roffset) % IP_VS_MH_TAB_SIZE;
                dest = rcu_dereference(s->lookup[hash].dest);
                if (!dest)
                        break;
@@ -261,7 +280,7 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
        struct ip_vs_dest *dest;
 
        l = &s->lookup[0];
-       for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
+       for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
                dest = rcu_dereference_protected(l->dest, 1);
                if (dest) {
                        ip_vs_dest_put(dest);
@@ -271,46 +290,85 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
        }
 }
 
-/* Assign all the hash buckets of the specified table with the service.
- */
-static int
-ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
+/* Assign all the hash buckets of the specified table with the service. */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+                            struct ip_vs_service *svc)
 {
        int ret;
 
-       /* ip_vs_mh_reassign is responsible for assigning
-        * and releasing s->permutation table
-        */
-       s->permutation = vzalloc(IP_VS_MH_TAB_SIZE * sizeof(unsigned int *));
-       if (!s->permutation)
-               return -ENOMEM;
+       if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+               return -EINVAL;
 
-       s->permutation[0] = vzalloc(IP_VS_MH_TAB_SIZE * IP_VS_MH_LOOKUP_SIZE *
-                                   sizeof(unsigned int));
-       if (!s->permutation[0]) {
-               ret = -ENOMEM;
-               goto err_alloc;
-       }
+       s->dest_setup = kcalloc(IP_VS_MH_TAB_SIZE,
+                               sizeof(struct ip_vs_mh_dest_setup),
+                               GFP_KERNEL);
+       if (!s->dest_setup)
+               return -ENOMEM;
 
-       ret = ip_vs_mh_permutate(s, svc);
-       if (ret < 0)
-               goto err_out;
+       ip_vs_mh_permutate(s, svc);
 
        ret = ip_vs_mh_populate(s, svc);
        if (ret < 0)
-               goto err_out;
+               goto out;
 
        IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%d\n",
                      IP_VS_DBG_ADDR(svc->af, &svc->addr),
                      ntohs(svc->port));
 
-err_out:
-       vfree(s->permutation[0]);
-err_alloc:
-       vfree(s->permutation);
+out:
+       kfree(s->dest_setup);
        return ret;
 }
 
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+       struct ip_vs_dest *dest;
+       int weight;
+       int g = 0;
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               weight = atomic_read(&dest->last_weight);
+               if (weight > 0) {
+                       if (g > 0)
+                               g = gcd(weight, g);
+                       else
+                               g = weight;
+               }
+       }
+       return g;
+}
+
+/* To avoid assigning huge weight for the MH table,
+ * calculate shift value with gcd.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+       struct ip_vs_dest *dest;
+       int new_weight, weight = 0;
+       int mw, shift;
+
+       /* If gcd is smaller then 1, the weight is zero for all dests.
+        * skip calculation rshift.
+        */
+       if (gcd < 1)
+               return 0;
+
+       list_for_each_entry(dest, &svc->destinations, n_list) {
+               new_weight = atomic_read(&dest->last_weight);
+               if (new_weight > weight)
+                       weight = new_weight;
+       }
+
+       /* Because gcd is greater than zero,
+        * the maximum weight and gcd are always greater than zero
+        */
+       mw = weight / gcd;
+
+       /* shift = occupied bits of weight/gcd - MH highest bits */
+       shift = fls(mw) - IP_VS_MH_TAB_BITS;
+       return (shift >= 0) ? shift : 0;
+}
+
 static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
 {
        struct ip_vs_mh_state *s;
@@ -320,18 +378,24 @@ static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
        if (!s)
                return -ENOMEM;
 
-       svc->sched_data = s;
+       s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+                           GFP_KERNEL);
+       if (!s->lookup) {
+               kfree_rcu(s, rcu_head);
+               return -ENOMEM;
+       }
+
+       generate_hash_secret(&s->hash1, &s->hash2);
+       s->gcd = ip_vs_mh_gcd_weight(svc);
+       s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
 
+       svc->sched_data = s;
        IP_VS_DBG(6,
                  "MH lookup table (memory=%zdbytes) allocated for current 
service\n",
-                 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
-
-       ip_vs_mh_generate_hash_secret(&s->hash1, &s->hash2);
+                 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
 
        /* permutate & populate with current dests */
-       ip_vs_mh_reassign(s, svc);
-
-       return 0;
+       return ip_vs_mh_reassign(s, svc);
 }
 
 static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
@@ -344,7 +408,7 @@ static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
        /* release the table itself */
        kfree_rcu(s, rcu_head);
        IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
-                 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
+                 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
 }
 
 static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
@@ -352,10 +416,11 @@ static int ip_vs_mh_dest_changed(struct ip_vs_service 
*svc,
 {
        struct ip_vs_mh_state *s = svc->sched_data;
 
-       /* assign the hash buckets with the updated service */
-       ip_vs_mh_reassign(s, svc);
+       s->gcd = ip_vs_mh_gcd_weight(svc);
+       s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
 
-       return 0;
+       /* assign the hash buckets with the updated service */
+       return ip_vs_mh_reassign(s, svc);
 }
 
 /* Helper function to get port number */
-- 
1.8.3.1


-- 
Inju Song
NAVER Corporation

--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

<Prev in Thread] Current Thread [Next in Thread>