1) Change table size to primes array
2) Add to check dest number > IP_VS_MH_TAB_SIZE
3) Replace IP_VS_MH_LOOKUP_SIZE with IP_VS_MH_TAB_SIZE
4) Replace large permutation array with dest_setup
5) Add gcd, rshift and calculate ds->turns
Signed-off-by: Inju Song <inju.song@xxxxxxxxxxxxx>
---
net/netfilter/ipvs/ip_vs_mh.c | 291 ++++++++++++++++++++++++++----------------
1 file changed, 178 insertions(+), 113 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
index d1bf50a..c868778 100644
--- a/net/netfilter/ipvs/ip_vs_mh.c
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -5,12 +5,13 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
-#include <linux/vmalloc.h>
#include <linux/skbuff.h>
#include <net/ip_vs.h>
#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */
#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
@@ -19,23 +20,36 @@ struct ip_vs_mh_lookup {
struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
-/* for IPVS MH entry hash table */
-#ifndef CONFIG_IP_VS_MH_TAB_BITS
-#define CONFIG_IP_VS_MH_TAB_BITS 8
+struct ip_vs_mh_dest_setup {
+ int offset; /* starting offset */
+ int skip; /* skip */
+ int perm; /* next_offset */
+ int turns; /* weight / gcd() */
+};
+
+/* Available prime numbers for MH table */
+static int primes[] = {251, 509, 1021, 2039, 4093,
+ 8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX 12
#endif
-#define IP_VS_MH_TAB_BITS CONFIG_IP_VS_MH_TAB_BITS
-#define IP_VS_MH_TAB_SIZE BIT(IP_VS_MH_TAB_BITS)
-#define IP_VS_MH_LOOKUP_SIZE 65537 /* Must be prime number */
+#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX]
struct ip_vs_mh_state {
- struct rcu_head rcu_head;
- struct ip_vs_mh_lookup lookup[IP_VS_MH_LOOKUP_SIZE];
- unsigned int **permutation;
- hsiphash_key_t hash1, hash2;
+ struct rcu_head rcu_head;
+ struct ip_vs_mh_lookup *lookup;
+ struct ip_vs_mh_dest_setup *dest_setup;
+ hsiphash_key_t hash1, hash2;
+ int gcd;
+ int rshift;
};
-static inline void
-ip_vs_mh_generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2)
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+ hsiphash_key_t *hash2)
{
hash1->key[0] = 2654435761UL;
hash1->key[1] = 2654435761UL;
@@ -68,13 +82,13 @@ static inline bool is_unavailable(struct ip_vs_dest *dest)
return hsiphash(&v, sizeof(v), key);
}
-static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
- struct ip_vs_service *svc)
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
{
- int i, j;
struct list_head *p;
+ struct ip_vs_mh_dest_setup *ds;
struct ip_vs_dest *dest;
- unsigned int offset, skip;
+ int lw;
bool empty;
p = &svc->destinations;
@@ -82,56 +96,50 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state
*s,
if (empty)
return 0;
- /* extending permutation table to 2d arrays */
- for (i = 1; i < svc->num_dests; i++)
- s->permutation[i] = s->permutation[i - 1] +
- IP_VS_MH_LOOKUP_SIZE;
+ /* If gcd is smaller then 1, all last_weights of dest are zero.
+ * so skip permutation for the dests.
+ */
+ if (s->gcd < 1)
+ return 0;
- i = 0;
+ /* Set dest_setup for the dests permutation */
+ ds = &s->dest_setup[0];
while ((p = p->next) != &svc->destinations) {
dest = list_entry(p, struct ip_vs_dest, n_list);
- offset = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port,
- &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
- skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
- dest->port, &s->hash2, 0) %
- (IP_VS_MH_LOOKUP_SIZE - 1) + 1;
-
- for (j = 0; j < IP_VS_MH_LOOKUP_SIZE; j++) {
- s->permutation[i][j] = (offset + (j * skip)) %
- IP_VS_MH_LOOKUP_SIZE;
- }
- i++;
+
+ ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+ dest->port, &s->hash1, 0) %
+ IP_VS_MH_TAB_SIZE;
+ ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+ dest->port, &s->hash2, 0) %
+ (IP_VS_MH_TAB_SIZE - 1) + 1;
+ ds->perm = ds->offset % IP_VS_MH_TAB_SIZE;
+
+ lw = atomic_read(&dest->last_weight);
+ ds->turns = (lw > 1) ? lw / s->gcd : lw;
+ ds++;
}
return 0;
}
-static inline int
-ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
{
- int i, ret, dw_count;
- unsigned int *next;
- unsigned int **pmt;
+ int i, n;
struct ip_vs_mh_lookup *entry, *l;
struct list_head *p;
struct ip_vs_dest *dest;
- unsigned int n, c;
+ struct ip_vs_mh_dest_setup *ds;
+ int c, rs, dt_count;
bool empty;
- ret = 0;
- next = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(unsigned int),
- GFP_KERNEL);
- if (!next)
- return -ENOMEM;
-
- entry = kcalloc(IP_VS_MH_LOOKUP_SIZE, sizeof(*entry),
+ entry = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(*entry),
GFP_KERNEL);
- if (!entry) {
- ret = -ENOMEM;
- goto err_alloc;
- }
+ if (!entry)
+ return -ENOMEM;
- for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++)
+ for (i = 0; i < IP_VS_MH_TAB_SIZE; i++)
RCU_INIT_POINTER(entry[i].dest, NULL);
p = &svc->destinations;
@@ -139,47 +147,61 @@ static inline int ip_vs_mh_permutate(struct
ip_vs_mh_state *s,
if (empty)
goto out;
+ /* If gcd is smaller then 1, the weight is zero for all dests.
+ * so skip population for MH table and fills the table with NULLs.
+ */
+ if (s->gcd < 1)
+ goto out;
+
n = 0;
- dw_count = 0;
- pmt = s->permutation;
- while (n < IP_VS_MH_LOOKUP_SIZE) {
+ rs = s->rshift;
+ dt_count = 0;
+ while (n < IP_VS_MH_TAB_SIZE) {
if (p == &svc->destinations)
p = p->next;
- i = 0;
+ ds = &s->dest_setup[0];
while (p != &svc->destinations) {
- c = pmt[i][next[i]];
+ /* ignore added server with zero weight */
+ if (ds->turns < 1) {
+ p = p->next;
+ ds++;
+ continue;
+ }
+ c = ds->perm;
while (entry[c].dest) {
- next[i] = next[i] + 1;
- c = pmt[i][next[i]];
+ /* Add skip, mod IP_VS_MH_TAB_SIZE */
+ ds->perm += ds->skip;
+ if (ds->perm >= IP_VS_MH_TAB_SIZE)
+ ds->perm -= IP_VS_MH_TAB_SIZE;
+ c = ds->perm;
}
dest = list_entry(p, struct ip_vs_dest, n_list);
RCU_INIT_POINTER(entry[c].dest, dest);
- next[i] = next[i] + 1;
- n++;
- if (n == IP_VS_MH_LOOKUP_SIZE)
- break;
+ if (++n == IP_VS_MH_TAB_SIZE)
+ goto out;
- if (++dw_count >= atomic_read(&dest->weight)) {
+ if (++dt_count >= (ds->turns >> rs)) {
+ dt_count = 0;
p = p->next;
- dw_count = 0;
- i++;
+ ds++;
}
}
}
out:
l = &s->lookup[0];
- for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
- dest = rcu_dereference_protected(entry[i].dest, 1);
+ for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(l->dest, 1);
if (dest)
ip_vs_dest_put(dest);
if (empty) {
RCU_INIT_POINTER(l->dest, NULL);
} else {
+ dest = rcu_dereference_protected(entry[i].dest, 1);
ip_vs_dest_hold(dest);
RCU_INIT_POINTER(l->dest, dest);
@@ -193,9 +215,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state
*s,
}
kfree(entry);
-err_alloc:
- kfree(next);
- return ret;
+ return 0;
}
/* Get ip_vs_dest associated with supplied parameters. */
@@ -204,14 +224,13 @@ static inline int ip_vs_mh_permutate(struct
ip_vs_mh_state *s,
const union nf_inet_addr *addr, __be16 port)
{
unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
- % IP_VS_MH_LOOKUP_SIZE;
+ % IP_VS_MH_TAB_SIZE;
struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
return (!dest || is_unavailable(dest)) ? NULL : dest;
}
-/* As ip_vs_mh_get, but with fallback if selected server is unavailable
- */
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
static inline struct ip_vs_dest *
ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
const union nf_inet_addr *addr, __be16 port)
@@ -222,7 +241,7 @@ static inline int ip_vs_mh_permutate(struct ip_vs_mh_state
*s,
/* first try the dest it's supposed to go to */
ihash = ip_vs_mh_hashkey(svc->af, addr, port,
- &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
+ &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
dest = rcu_dereference(s->lookup[ihash].dest);
if (!dest)
return NULL;
@@ -235,10 +254,10 @@ static inline int ip_vs_mh_permutate(struct
ip_vs_mh_state *s,
/* if the original dest is unavailable, loop around the table
* starting from ihash to find a new dest
*/
- for (offset = 0; offset < IP_VS_MH_LOOKUP_SIZE; offset++) {
- roffset = (offset + ihash) % IP_VS_MH_LOOKUP_SIZE;
+ for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+ roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
- roffset) % IP_VS_MH_LOOKUP_SIZE;
+ roffset) % IP_VS_MH_TAB_SIZE;
dest = rcu_dereference(s->lookup[hash].dest);
if (!dest)
break;
@@ -261,7 +280,7 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
struct ip_vs_dest *dest;
l = &s->lookup[0];
- for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
+ for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
dest = rcu_dereference_protected(l->dest, 1);
if (dest) {
ip_vs_dest_put(dest);
@@ -271,46 +290,85 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
}
}
-/* Assign all the hash buckets of the specified table with the service.
- */
-static int
-ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
+/* Assign all the hash buckets of the specified table with the service. */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
{
int ret;
- /* ip_vs_mh_reassign is responsible for assigning
- * and releasing s->permutation table
- */
- s->permutation = vzalloc(IP_VS_MH_TAB_SIZE * sizeof(unsigned int *));
- if (!s->permutation)
- return -ENOMEM;
+ if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+ return -EINVAL;
- s->permutation[0] = vzalloc(IP_VS_MH_TAB_SIZE * IP_VS_MH_LOOKUP_SIZE *
- sizeof(unsigned int));
- if (!s->permutation[0]) {
- ret = -ENOMEM;
- goto err_alloc;
- }
+ s->dest_setup = kcalloc(IP_VS_MH_TAB_SIZE,
+ sizeof(struct ip_vs_mh_dest_setup),
+ GFP_KERNEL);
+ if (!s->dest_setup)
+ return -ENOMEM;
- ret = ip_vs_mh_permutate(s, svc);
- if (ret < 0)
- goto err_out;
+ ip_vs_mh_permutate(s, svc);
ret = ip_vs_mh_populate(s, svc);
if (ret < 0)
- goto err_out;
+ goto out;
IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%d\n",
IP_VS_DBG_ADDR(svc->af, &svc->addr),
ntohs(svc->port));
-err_out:
- vfree(s->permutation[0]);
-err_alloc:
- vfree(s->permutation);
+out:
+ kfree(s->dest_setup);
return ret;
}
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int weight;
+ int g = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ weight = atomic_read(&dest->last_weight);
+ if (weight > 0) {
+ if (g > 0)
+ g = gcd(weight, g);
+ else
+ g = weight;
+ }
+ }
+ return g;
+}
+
+/* To avoid assigning huge weight for the MH table,
+ * calculate shift value with gcd.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+ struct ip_vs_dest *dest;
+ int new_weight, weight = 0;
+ int mw, shift;
+
+ /* If gcd is smaller then 1, the weight is zero for all dests.
+ * skip calculation rshift.
+ */
+ if (gcd < 1)
+ return 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ new_weight = atomic_read(&dest->last_weight);
+ if (new_weight > weight)
+ weight = new_weight;
+ }
+
+ /* Because gcd is greater than zero,
+ * the maximum weight and gcd are always greater than zero
+ */
+ mw = weight / gcd;
+
+ /* shift = occupied bits of weight/gcd - MH highest bits */
+ shift = fls(mw) - IP_VS_MH_TAB_BITS;
+ return (shift >= 0) ? shift : 0;
+}
+
static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
{
struct ip_vs_mh_state *s;
@@ -320,18 +378,24 @@ static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
if (!s)
return -ENOMEM;
- svc->sched_data = s;
+ s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+ GFP_KERNEL);
+ if (!s->lookup) {
+ kfree_rcu(s, rcu_head);
+ return -ENOMEM;
+ }
+
+ generate_hash_secret(&s->hash1, &s->hash2);
+ s->gcd = ip_vs_mh_gcd_weight(svc);
+ s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+ svc->sched_data = s;
IP_VS_DBG(6,
"MH lookup table (memory=%zdbytes) allocated for current
service\n",
- sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
-
- ip_vs_mh_generate_hash_secret(&s->hash1, &s->hash2);
+ sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
/* permutate & populate with current dests */
- ip_vs_mh_reassign(s, svc);
-
- return 0;
+ return ip_vs_mh_reassign(s, svc);
}
static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
@@ -344,7 +408,7 @@ static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
/* release the table itself */
kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
- sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
+ sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
}
static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
@@ -352,10 +416,11 @@ static int ip_vs_mh_dest_changed(struct ip_vs_service
*svc,
{
struct ip_vs_mh_state *s = svc->sched_data;
- /* assign the hash buckets with the updated service */
- ip_vs_mh_reassign(s, svc);
+ s->gcd = ip_vs_mh_gcd_weight(svc);
+ s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
- return 0;
+ /* assign the hash buckets with the updated service */
+ return ip_vs_mh_reassign(s, svc);
}
/* Helper function to get port number */
--
1.8.3.1
--
Inju Song
NAVER Corporation
--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
|