Hello,
On Sun, 25 Feb 2018, Inju Song wrote:
> 1) Change table size to primes array
>
> 2) Add to check dest number > IP_VS_MH_TAB_SIZE
>
> 3) Replace IP_VS_MH_LOOKUP_SIZE with IP_VS_MH_TAB_SIZE
>
> 4) Replace large permutation array with dest_setup
>
> 5) Add gcd, rshift and calculate ds->turns
Makefile and Kconfig should be a last patch, otherwise
on bisecting we will fail to compile.
> Signed-off-by: Inju Song <inju.song@xxxxxxxxxxxxx>
> ---
> net/netfilter/ipvs/ip_vs_mh.c | 291
> ++++++++++++++++++++++++++----------------
> 1 file changed, 178 insertions(+), 113 deletions(-)
>
> diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
> index d1bf50a..c868778 100644
> --- a/net/netfilter/ipvs/ip_vs_mh.c
> +++ b/net/netfilter/ipvs/ip_vs_mh.c
There must be license info at the beginning.
> @@ -5,12 +5,13 @@
> #include <linux/slab.h>
> #include <linux/module.h>
> #include <linux/kernel.h>
> -#include <linux/vmalloc.h>
> #include <linux/skbuff.h>
>
> #include <net/ip_vs.h>
>
> #include <linux/siphash.h>
> +#include <linux/bitops.h>
> +#include <linux/gcd.h>
>
> #define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH
> fallback */
> #define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
> @@ -19,23 +20,36 @@ struct ip_vs_mh_lookup {
> struct ip_vs_dest __rcu *dest; /* real server (cache) */
> };
>
> -/* for IPVS MH entry hash table */
> -#ifndef CONFIG_IP_VS_MH_TAB_BITS
> -#define CONFIG_IP_VS_MH_TAB_BITS 8
> +struct ip_vs_mh_dest_setup {
> + int offset; /* starting offset */
> + int skip; /* skip */
> + int perm; /* next_offset */
> + int turns; /* weight / gcd() */
> +};
> +
> +/* Available prime numbers for MH table */
> +static int primes[] = {251, 509, 1021, 2039, 4093,
> + 8191, 16381, 32749, 65521, 131071};
> +
> +/* For IPVS MH entry hash table */
> +#ifndef CONFIG_IP_VS_MH_TAB_INDEX
> +#define CONFIG_IP_VS_MH_TAB_INDEX 12
> #endif
> -#define IP_VS_MH_TAB_BITS CONFIG_IP_VS_MH_TAB_BITS
> -#define IP_VS_MH_TAB_SIZE BIT(IP_VS_MH_TAB_BITS)
> -#define IP_VS_MH_LOOKUP_SIZE 65537 /* Must be prime number */
> +#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2)
> +#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8)
> +#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX]
>
> struct ip_vs_mh_state {
> - struct rcu_head rcu_head;
> - struct ip_vs_mh_lookup lookup[IP_VS_MH_LOOKUP_SIZE];
> - unsigned int **permutation;
> - hsiphash_key_t hash1, hash2;
> + struct rcu_head rcu_head;
> + struct ip_vs_mh_lookup *lookup;
> + struct ip_vs_mh_dest_setup *dest_setup;
> + hsiphash_key_t hash1, hash2;
> + int gcd;
> + int rshift;
> };
>
> -static inline void
> -ip_vs_mh_generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2)
> +static inline void generate_hash_secret(hsiphash_key_t *hash1,
> + hsiphash_key_t *hash2)
> {
> hash1->key[0] = 2654435761UL;
> hash1->key[1] = 2654435761UL;
> @@ -68,13 +82,13 @@ static inline bool is_unavailable(struct ip_vs_dest *dest)
> return hsiphash(&v, sizeof(v), key);
> }
>
> -static inline int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
> - struct ip_vs_service *svc)
> +static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
> + struct ip_vs_service *svc)
> {
> - int i, j;
> struct list_head *p;
> + struct ip_vs_mh_dest_setup *ds;
> struct ip_vs_dest *dest;
> - unsigned int offset, skip;
> + int lw;
> bool empty;
>
> p = &svc->destinations;
> @@ -82,56 +96,50 @@ static inline int ip_vs_mh_permutate(struct
> ip_vs_mh_state *s,
> if (empty)
> return 0;
Above empty and list_empty can be removed, we can
rely on the gcd < 1 check.
>
> - /* extending permutation table to 2d arrays */
> - for (i = 1; i < svc->num_dests; i++)
> - s->permutation[i] = s->permutation[i - 1] +
> - IP_VS_MH_LOOKUP_SIZE;
> + /* If gcd is smaller then 1, all last_weights of dest are zero.
> + * so skip permutation for the dests.
> + */
> + if (s->gcd < 1)
> + return 0;
>
> - i = 0;
> + /* Set dest_setup for the dests permutation */
> + ds = &s->dest_setup[0];
> while ((p = p->next) != &svc->destinations) {
> dest = list_entry(p, struct ip_vs_dest, n_list);
> - offset = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port,
> - &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
> - skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
> - dest->port, &s->hash2, 0) %
> - (IP_VS_MH_LOOKUP_SIZE - 1) + 1;
> -
> - for (j = 0; j < IP_VS_MH_LOOKUP_SIZE; j++) {
> - s->permutation[i][j] = (offset + (j * skip)) %
> - IP_VS_MH_LOOKUP_SIZE;
> - }
> - i++;
> +
> + ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
> + dest->port, &s->hash1, 0) %
> + IP_VS_MH_TAB_SIZE;
> + ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
> + dest->port, &s->hash2, 0) %
> + (IP_VS_MH_TAB_SIZE - 1) + 1;
> + ds->perm = ds->offset % IP_VS_MH_TAB_SIZE;
offset is already < IP_VS_MH_TAB_SIZE, no need to mod.
> +
> + lw = atomic_read(&dest->last_weight);
> + ds->turns = (lw > 1) ? lw / s->gcd : lw;
May be we can move the rshift to the calcs here?
ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
or optimized in different way but we must be sure
that rshift will not clear turns when lw > 0.
> + ds++;
> }
>
> return 0;
> }
>
> -static inline int
> -ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
> +static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
> + struct ip_vs_service *svc)
> {
> - int i, ret, dw_count;
> - unsigned int *next;
> - unsigned int **pmt;
> + int i, n;
> struct ip_vs_mh_lookup *entry, *l;
> struct list_head *p;
> struct ip_vs_dest *dest;
> - unsigned int n, c;
> + struct ip_vs_mh_dest_setup *ds;
> + int c, rs, dt_count;
> bool empty;
>
> - ret = 0;
> - next = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(unsigned int),
> - GFP_KERNEL);
> - if (!next)
> - return -ENOMEM;
> -
> - entry = kcalloc(IP_VS_MH_LOOKUP_SIZE, sizeof(*entry),
> + entry = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(*entry),
> GFP_KERNEL);
We can avoid this allocation if s->gcd < 1 because
entry is unused in this case.
> - if (!entry) {
> - ret = -ENOMEM;
> - goto err_alloc;
> - }
> + if (!entry)
> + return -ENOMEM;
>
> - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++)
> + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++)
> RCU_INIT_POINTER(entry[i].dest, NULL);
>
> p = &svc->destinations;
> @@ -139,47 +147,61 @@ static inline int ip_vs_mh_permutate(struct
> ip_vs_mh_state *s,
> if (empty)
> goto out;
We can avoid above list_empty and to use
empty = (s->gcd < 1) instead. Or may be 'empty' var is not
need at all if we use bitmap and populate dests one by one,
see below...
>
> + /* If gcd is smaller then 1, the weight is zero for all dests.
> + * so skip population for MH table and fills the table with NULLs.
> + */
> + if (s->gcd < 1)
> + goto out;
if (s->gcd < 1)
goto reset;
> +
> n = 0;
> - dw_count = 0;
> - pmt = s->permutation;
> - while (n < IP_VS_MH_LOOKUP_SIZE) {
> + rs = s->rshift;
> + dt_count = 0;
> + while (n < IP_VS_MH_TAB_SIZE) {
> if (p == &svc->destinations)
> p = p->next;
>
> - i = 0;
> + ds = &s->dest_setup[0];
> while (p != &svc->destinations) {
> - c = pmt[i][next[i]];
> + /* ignore added server with zero weight */
> + if (ds->turns < 1) {
> + p = p->next;
> + ds++;
> + continue;
> + }
>
> + c = ds->perm;
> while (entry[c].dest) {
> - next[i] = next[i] + 1;
> - c = pmt[i][next[i]];
> + /* Add skip, mod IP_VS_MH_TAB_SIZE */
> + ds->perm += ds->skip;
> + if (ds->perm >= IP_VS_MH_TAB_SIZE)
> + ds->perm -= IP_VS_MH_TAB_SIZE;
> + c = ds->perm;
> }
>
> dest = list_entry(p, struct ip_vs_dest, n_list);
> RCU_INIT_POINTER(entry[c].dest, dest);
entry is not visible to RCU readers, so normal access
is enough. It can be even a bitmap to reduce the memory usage
32 or 64 times, eg. we can use 'table' instead of 'entry':
unsigned long *table;
table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
sizeof(unsigned long));
then 'while (entry[c].dest)' becomes 'while (test_bit(c, table))',
'entry[c].dest = dest;' becomes '__set_bit(c, table);'
And we can directly populate the entry:
dest = rcu_dereference_protected(s->lookup[c].dest, 1);
dest2 = list_entry(p, struct ip_vs_dest, n_list);
if (dest != dest2) {
if (dest)
ip_vs_dest_put(dest);
ip_vs_dest_hold(dest2);
RCU_INIT_POINTER(s->lookup[c].dest, dest2);
}
>
> - next[i] = next[i] + 1;
> - n++;
> - if (n == IP_VS_MH_LOOKUP_SIZE)
> - break;
> + if (++n == IP_VS_MH_TAB_SIZE)
> + goto out;
>
> - if (++dw_count >= atomic_read(&dest->weight)) {
> + if (++dt_count >= (ds->turns >> rs)) {
turns can be already rshift-ed in ip_vs_mh_permutate.
> + dt_count = 0;
> p = p->next;
> - dw_count = 0;
> - i++;
> + ds++;
> }
> }
> }
Here we should jump to kfree(table):
goto out;
>
> out:
out: becomes reset:
This loop will remain for the 's->gcd < 1' case only:
> l = &s->lookup[0];
> - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
> - dest = rcu_dereference_protected(entry[i].dest, 1);
> + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
> + dest = rcu_dereference_protected(l->dest, 1);
> if (dest)
> ip_vs_dest_put(dest);
> if (empty) {
> RCU_INIT_POINTER(l->dest, NULL);
> } else {
This block will be removed:
> + dest = rcu_dereference_protected(entry[i].dest, 1);
> ip_vs_dest_hold(dest);
> RCU_INIT_POINTER(l->dest, dest);
>
> @@ -193,9 +215,7 @@ static inline int ip_vs_mh_permutate(struct
> ip_vs_mh_state *s,
> }
>
out: label comes here to kfree bitmap 'table'.
> kfree(entry);
> -err_alloc:
> - kfree(next);
> - return ret;
> + return 0;
> }
>
> /* Get ip_vs_dest associated with supplied parameters. */
> @@ -204,14 +224,13 @@ static inline int ip_vs_mh_permutate(struct
> ip_vs_mh_state *s,
> const union nf_inet_addr *addr, __be16 port)
> {
> unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
> - % IP_VS_MH_LOOKUP_SIZE;
> + % IP_VS_MH_TAB_SIZE;
> struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
>
> return (!dest || is_unavailable(dest)) ? NULL : dest;
> }
>
> -/* As ip_vs_mh_get, but with fallback if selected server is unavailable
> - */
> +/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
> static inline struct ip_vs_dest *
> ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
> const union nf_inet_addr *addr, __be16 port)
> @@ -222,7 +241,7 @@ static inline int ip_vs_mh_permutate(struct
> ip_vs_mh_state *s,
>
> /* first try the dest it's supposed to go to */
> ihash = ip_vs_mh_hashkey(svc->af, addr, port,
> - &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
> + &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
> dest = rcu_dereference(s->lookup[ihash].dest);
> if (!dest)
> return NULL;
> @@ -235,10 +254,10 @@ static inline int ip_vs_mh_permutate(struct
> ip_vs_mh_state *s,
> /* if the original dest is unavailable, loop around the table
> * starting from ihash to find a new dest
> */
> - for (offset = 0; offset < IP_VS_MH_LOOKUP_SIZE; offset++) {
> - roffset = (offset + ihash) % IP_VS_MH_LOOKUP_SIZE;
> + for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
> + roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
> hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
> - roffset) % IP_VS_MH_LOOKUP_SIZE;
> + roffset) % IP_VS_MH_TAB_SIZE;
> dest = rcu_dereference(s->lookup[hash].dest);
> if (!dest)
> break;
> @@ -261,7 +280,7 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
> struct ip_vs_dest *dest;
>
> l = &s->lookup[0];
> - for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
> + for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
> dest = rcu_dereference_protected(l->dest, 1);
> if (dest) {
> ip_vs_dest_put(dest);
> @@ -271,46 +290,85 @@ static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
> }
> }
>
> -/* Assign all the hash buckets of the specified table with the service.
> - */
> -static int
> -ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
> +/* Assign all the hash buckets of the specified table with the service. */
> +static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
> + struct ip_vs_service *svc)
> {
> int ret;
>
> - /* ip_vs_mh_reassign is responsible for assigning
> - * and releasing s->permutation table
> - */
> - s->permutation = vzalloc(IP_VS_MH_TAB_SIZE * sizeof(unsigned int *));
> - if (!s->permutation)
> - return -ENOMEM;
> + if (svc->num_dests > IP_VS_MH_TAB_SIZE)
> + return -EINVAL;
>
> - s->permutation[0] = vzalloc(IP_VS_MH_TAB_SIZE * IP_VS_MH_LOOKUP_SIZE *
> - sizeof(unsigned int));
> - if (!s->permutation[0]) {
> - ret = -ENOMEM;
> - goto err_alloc;
> - }
> + s->dest_setup = kcalloc(IP_VS_MH_TAB_SIZE,
> + sizeof(struct ip_vs_mh_dest_setup),
> + GFP_KERNEL);
For s->dest_setup we need to allocate svc->num_dests entries,
IP_VS_MH_TAB_SIZE is too large. But if num_dests=0 we should avoid
the allocation, then after kfree(s->dest_setup) we should use
s->dest_setup = NULL to avoid double free.
> + if (!s->dest_setup)
> + return -ENOMEM;
>
> - ret = ip_vs_mh_permutate(s, svc);
> - if (ret < 0)
> - goto err_out;
> + ip_vs_mh_permutate(s, svc);
>
> ret = ip_vs_mh_populate(s, svc);
> if (ret < 0)
> - goto err_out;
> + goto out;
>
> IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%d\n",
> IP_VS_DBG_ADDR(svc->af, &svc->addr),
> ntohs(svc->port));
>
> -err_out:
> - vfree(s->permutation[0]);
> -err_alloc:
> - vfree(s->permutation);
> +out:
> + kfree(s->dest_setup);
> return ret;
> }
>
> +static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
> +{
> + struct ip_vs_dest *dest;
> + int weight;
> + int g = 0;
> +
> + list_for_each_entry(dest, &svc->destinations, n_list) {
> + weight = atomic_read(&dest->last_weight);
> + if (weight > 0) {
> + if (g > 0)
> + g = gcd(weight, g);
> + else
> + g = weight;
> + }
> + }
> + return g;
> +}
> +
> +/* To avoid assigning huge weight for the MH table,
> + * calculate shift value with gcd.
> + */
> +static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
> +{
> + struct ip_vs_dest *dest;
> + int new_weight, weight = 0;
> + int mw, shift;
> +
> + /* If gcd is smaller then 1, the weight is zero for all dests.
> + * skip calculation rshift.
> + */
> + if (gcd < 1)
> + return 0;
> +
> + list_for_each_entry(dest, &svc->destinations, n_list) {
> + new_weight = atomic_read(&dest->last_weight);
> + if (new_weight > weight)
> + weight = new_weight;
> + }
> +
> + /* Because gcd is greater than zero,
> + * the maximum weight and gcd are always greater than zero
> + */
> + mw = weight / gcd;
> +
> + /* shift = occupied bits of weight/gcd - MH highest bits */
> + shift = fls(mw) - IP_VS_MH_TAB_BITS;
> + return (shift >= 0) ? shift : 0;
> +}
> +
> static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
> {
> struct ip_vs_mh_state *s;
> @@ -320,18 +378,24 @@ static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
> if (!s)
> return -ENOMEM;
>
> - svc->sched_data = s;
> + s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
> + GFP_KERNEL);
> + if (!s->lookup) {
> + kfree_rcu(s, rcu_head);
kfree(s) is ok here.
> + return -ENOMEM;
> + }
> +
> + generate_hash_secret(&s->hash1, &s->hash2);
> + s->gcd = ip_vs_mh_gcd_weight(svc);
> + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
>
> + svc->sched_data = s;
> IP_VS_DBG(6,
> "MH lookup table (memory=%zdbytes) allocated for current
> service\n",
> - sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
> -
> - ip_vs_mh_generate_hash_secret(&s->hash1, &s->hash2);
> + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
>
> /* permutate & populate with current dests */
> - ip_vs_mh_reassign(s, svc);
> -
> - return 0;
> + return ip_vs_mh_reassign(s, svc);
> }
>
> static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
> @@ -344,7 +408,7 @@ static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
> /* release the table itself */
> kfree_rcu(s, rcu_head);
s->lookup is not freed, we should free both
s and s->lookup in new RCU callback. It will need a change in
ip_vs_mh_cleanup: synchronize_rcu() -> rcu_barrier(), so that
module unload to wait for the callback to complete.
See ip_vs_lblcr.c how call_rcu() is called to schedule
callback. We will need:
static void ip_vs_mh_state_free(struct rcu_head *head)
{
struct ip_vs_mh_state *s;
s = container_of(head, struct ip_vs_mh_state, rcu_head);
kfree(s->lookup);
kfree(s);
}
> IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
> - sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
> + sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
> }
>
> static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
> @@ -352,10 +416,11 @@ static int ip_vs_mh_dest_changed(struct ip_vs_service
> *svc,
> {
> struct ip_vs_mh_state *s = svc->sched_data;
>
> - /* assign the hash buckets with the updated service */
> - ip_vs_mh_reassign(s, svc);
> + s->gcd = ip_vs_mh_gcd_weight(svc);
> + s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
>
> - return 0;
> + /* assign the hash buckets with the updated service */
> + return ip_vs_mh_reassign(s, svc);
> }
>
> /* Helper function to get port number */
> --
> 1.8.3.1
Regards
--
Julian Anastasov <ja@xxxxxx>
--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
|