Implements Google's Maglev hashing algorithm as an IPVS scheduler.
It provides consistent hashing with two special properties regarding
disruption and load balancing:
1) minimal disruption: when the set of destinations changes,
a connection will likely be sent to the same destination
as it was before.
2) load balancing: each destination will receive an almost
equal number of connections.
See also, for details, section [3.4 Consistent Hashing] in
https://static.googleusercontent.com/media/research.google.com/ko//pubs/archive/44824.pdf
Signed-off-by: Inju Song <inju.song@xxxxxxxxxxxxx>
---
net/netfilter/ipvs/ip_vs_mh.c | 402 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 402 insertions(+)
create mode 100644 net/netfilter/ipvs/ip_vs_mh.c
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
new file mode 100644
index 0000000..534a9f5
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -0,0 +1,402 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <linux/siphash.h>
+
+#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */
+#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
+
+/* One slot of the Maglev lookup table: a cached pointer to a real
+ * server, read under RCU by the scheduling fast path.
+ */
+struct ip_vs_mh_lookup {
+	struct ip_vs_dest __rcu	*dest;	/* real server (cache) */
+};
+
+/* for IPVS MH entry hash table */
+#define IP_VS_MH_LOOKUP_SIZE	65537 /* Must be prime number */
+
+/* Per-service scheduler state: the populated lookup table plus the two
+ * hsiphash keys used to derive each destination's offset/skip values.
+ * NOTE(review): the embedded table is ~512KB with 8-byte pointers; it
+ * is allocated with kzalloc() in ip_vs_mh_init_svc() — confirm an
+ * allocation of this size is acceptable there.
+ */
+struct ip_vs_mh_state {
+	struct rcu_head		rcu_head;
+	struct ip_vs_mh_lookup	lookup[IP_VS_MH_LOOKUP_SIZE];
+	hsiphash_key_t		hash1, hash2;
+};
+
+/* Initialize the two hsiphash keys with fixed constants.  Using fixed
+ * (not random) keys makes the resulting lookup table deterministic, so
+ * every node computes the same table for the same destination set.
+ * NOTE(review): if table predictability across nodes is not required,
+ * consider get_random_bytes() instead — confirm the intent.
+ */
+static inline void
+ip_vs_mh_generate_hash_secret(hsiphash_key_t *hash1, hsiphash_key_t *hash2)
+{
+	hash1->key[0] = 2654435761UL;
+	hash1->key[1] = 2654435761UL;
+
+	hash2->key[0] = 2654446892UL;
+	hash2->key[1] = 2654446892UL;
+}
+
+/* True when a real server must not receive new connections:
+ * zero/negative weight, or flagged as overloaded.
+ */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+	if (atomic_read(&dest->weight) <= 0)
+		return true;
+	return (dest->flags & IP_VS_DEST_F_OVERLOAD) != 0;
+}
+
+/* Hash (address, port, offset) into an unsigned int with hsiphash.
+ * IPv6 addresses are first folded down to 32 bits by XOR.
+ */
+static inline unsigned int
+ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
+		 __be16 port, hsiphash_key_t *key, unsigned int offset)
+{
+	__be32 folded = addr->ip;
+	unsigned int v;
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		folded = addr->ip6[0] ^ addr->ip6[1] ^
+			 addr->ip6[2] ^ addr->ip6[3];
+#endif
+	v = offset + ntohs(port) + ntohl(folded);
+	return hsiphash(&v, sizeof(v), key);
+}
+
+/* Build the Maglev permutation table: one row per destination, each a
+ * permutation of [0, IP_VS_MH_LOOKUP_SIZE) derived from the dest's
+ * offset/skip hashes.  Returns NULL on allocation failure.
+ */
+static inline unsigned int **
+ip_vs_mh_permutate(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
+{
+	int i, j;
+	unsigned int **permutation;
+	struct ip_vs_dest *dest;
+	unsigned int offset, skip;
+	int dcnt;
+
+	dcnt = svc->num_dests;
+	permutation = kcalloc(dcnt, sizeof(unsigned int *), GFP_KERNEL);
+	if (!permutation)
+		return NULL;
+
+	/* single backing array, row pointers stride into it */
+	permutation[0] = kcalloc(dcnt * IP_VS_MH_LOOKUP_SIZE,
+				 sizeof(unsigned int), GFP_KERNEL);
+	if (!permutation[0]) {
+		kfree(permutation);
+		return NULL;
+	}
+	for (i = 1; i < dcnt; i++)
+		permutation[i] = permutation[i - 1] + IP_VS_MH_LOOKUP_SIZE;
+
+	i = 0;
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		/* offset picks the starting slot, skip the stride; skip is
+		 * in [1, SIZE-1], and SIZE is prime, so each row visits
+		 * every slot exactly once.
+		 */
+		offset = ip_vs_mh_hashkey(svc->af, &dest->addr, dest->port,
+					  &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
+		skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+					dest->port, &s->hash2, 0) %
+					(IP_VS_MH_LOOKUP_SIZE - 1) + 1;
+
+		for (j = 0; j < IP_VS_MH_LOOKUP_SIZE; j++) {
+			permutation[i][j] = (offset + (j * skip)) %
+					    IP_VS_MH_LOOKUP_SIZE;
+		}
+		i++;
+	}
+
+	return permutation;
+}
+
+/* Fill the service lookup table from the permutation rows using the
+ * Maglev population loop: destinations take turns claiming the next
+ * free slot of their own permutation until the table is full, which
+ * gives each dest an almost equal share of slots.
+ * Returns 0 on success or -ENOMEM on allocation failure (no dest
+ * references are taken before the allocations succeed).
+ */
+static inline int
+ip_vs_mh_populate(struct ip_vs_mh_state *s, struct ip_vs_service *svc,
+		  unsigned int **permutation)
+{
+	int i;
+	unsigned int *next;
+	struct ip_vs_mh_lookup *entry, *l;
+	struct list_head *p;
+	struct ip_vs_dest *dest;
+	int dcnt;
+	unsigned int n, c;
+
+	dcnt = svc->num_dests;
+	next = kcalloc(dcnt, sizeof(unsigned int), GFP_KERNEL);
+	if (!next)
+		return -ENOMEM;
+
+	entry = kcalloc(IP_VS_MH_LOOKUP_SIZE, sizeof(*entry),
+			GFP_KERNEL);
+	if (!entry) {
+		kfree(next);
+		return -ENOMEM;
+	}
+	/* kcalloc zeroes the slots; RCU_INIT_POINTER keeps sparse happy */
+	for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++)
+		RCU_INIT_POINTER(entry[i].dest, NULL);
+
+	n = 0;
+	while (n < IP_VS_MH_LOOKUP_SIZE) {
+		/* walk the dest list in lockstep with the rows */
+		p = &svc->destinations;
+		for (i = 0; i < dcnt; i++) {
+			p = p->next;
+			c = permutation[i][next[i]];
+
+			/* advance past slots already claimed */
+			while (entry[c].dest) {
+				next[i] = next[i] + 1;
+				c = permutation[i][next[i]];
+			}
+
+			dest = list_entry(p, struct ip_vs_dest, n_list);
+			RCU_INIT_POINTER(entry[c].dest, dest);
+
+			next[i] = next[i] + 1;
+			n++;
+			if (n == IP_VS_MH_LOOKUP_SIZE)
+				break;
+		}
+	}
+
+	/* commit the scratch table into the live one, taking a reference
+	 * per slot; readers see each slot update atomically via RCU
+	 */
+	l = &s->lookup[0];
+	for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
+		dest = rcu_dereference_protected(entry[i].dest, 1);
+		ip_vs_dest_hold(dest);
+		RCU_INIT_POINTER(l->dest, dest);
+
+		IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
+			      i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      atomic_read(&dest->weight));
+
+		RCU_INIT_POINTER(entry[i].dest, NULL);
+		l++;
+	}
+	kfree(next);
+	kfree(entry);
+
+	return 0;
+}
+
+/* Look up the dest for (addr, port); NULL if absent or unavailable. */
+static inline struct ip_vs_dest *
+ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+	     const union nf_inet_addr *addr, __be16 port)
+{
+	struct ip_vs_dest *dest;
+	unsigned int hash;
+
+	hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0) %
+	       IP_VS_MH_LOOKUP_SIZE;
+	dest = rcu_dereference(s->lookup[hash].dest);
+	if (!dest || is_unavailable(dest))
+		return NULL;
+	return dest;
+}
+
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable.
+ * Probes alternative table slots (rehashing with increasing offsets
+ * starting at the initial slot) until an available dest is found, an
+ * empty slot is hit, or the whole table has been tried.
+ */
+static inline struct ip_vs_dest *
+ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+		      const union nf_inet_addr *addr, __be16 port)
+{
+	unsigned int offset, roffset;
+	unsigned int hash, ihash;
+	struct ip_vs_dest *dest;
+
+	/* first try the dest it's supposed to go to */
+	ihash = ip_vs_mh_hashkey(svc->af, addr, port,
+				 &s->hash1, 0) % IP_VS_MH_LOOKUP_SIZE;
+	dest = rcu_dereference(s->lookup[ihash].dest);
+	if (!dest)
+		return NULL;
+	if (!is_unavailable(dest))
+		return dest;
+
+	IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%d, reselecting",
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+	/* if the original dest is unavailable, loop around the table
+	 * starting from ihash to find a new dest
+	 */
+	for (offset = 0; offset < IP_VS_MH_LOOKUP_SIZE; offset++) {
+		roffset = (offset + ihash) % IP_VS_MH_LOOKUP_SIZE;
+		hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
+					roffset) % IP_VS_MH_LOOKUP_SIZE;
+		dest = rcu_dereference(s->lookup[hash].dest);
+		if (!dest)
+			break;
+		if (!is_unavailable(dest))
+			return dest;
+		IP_VS_DBG_BUF(6,
+			      "MH: selected unavailable server %s:%d (offset %d), reselecting",
+			      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+			      ntohs(dest->port), roffset);
+	}
+
+	return NULL;
+}
+
+/* Release every cached dest reference and clear the lookup table.
+ * Caller must hold the service update side (not the RCU read side).
+ */
+static void ip_vs_mh_flush(struct ip_vs_mh_state *s)
+{
+	struct ip_vs_dest *dest;
+	int i;
+
+	for (i = 0; i < IP_VS_MH_LOOKUP_SIZE; i++) {
+		dest = rcu_dereference_protected(s->lookup[i].dest, 1);
+		if (!dest)
+			continue;
+		ip_vs_dest_put(dest);
+		RCU_INIT_POINTER(s->lookup[i].dest, NULL);
+	}
+}
+
+/* Assign all the hash buckets of the specified table with the service.
+ * Returns 0 on success (including the empty-dest case) or a negative
+ * errno on allocation failure; on failure the table is left flushed.
+ */
+static int
+ip_vs_mh_reassign(struct ip_vs_mh_state *s, struct ip_vs_service *svc)
+{
+	unsigned int **permutation;
+	int ret;
+
+	/* flush all the hash entry before assigning mh entry */
+	ip_vs_mh_flush(s);
+
+	/* if destination number is zero, skip mh assign */
+	if (svc->num_dests <= 0)
+		return 0;
+
+	permutation = ip_vs_mh_permutate(s, svc);
+	if (!permutation)
+		return -ENOMEM;
+
+	ret = ip_vs_mh_populate(s, svc, permutation);
+
+	kfree(permutation[0]);
+	kfree(permutation);
+
+	return ret;
+}
+
+/* Scheduler init_service callback: allocate per-service MH state,
+ * seed the hash keys, and build the initial lookup table.
+ * Returns 0 on success or a negative errno.
+ */
+static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_mh_state *s;
+	int ret;
+
+	/* allocate the MH table for this service */
+	s = kzalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	svc->sched_data = s;
+	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) allocated for current service\n",
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
+
+	ip_vs_mh_generate_hash_secret(&s->hash1, &s->hash2);
+
+	/* permutate & populate with current dests */
+	ret = ip_vs_mh_reassign(s, svc);
+	if (ret < 0) {
+		/* no dest refs are held on failure, but flush for safety */
+		ip_vs_mh_flush(s);
+		svc->sched_data = NULL;
+		kfree(s);
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Scheduler done_service callback: drop all dest references and free
+ * the per-service state after an RCU grace period.
+ */
+static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_mh_state *s = svc->sched_data;
+
+	/* release every cached dest reference first */
+	ip_vs_mh_flush(s);
+
+	IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
+		  sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_LOOKUP_SIZE);
+
+	/* defer freeing the state until readers are done */
+	kfree_rcu(s, rcu_head);
+}
+
+/* add_dest/del_dest/upd_dest callback: any change in the dest set
+ * requires rebuilding the whole Maglev lookup table.
+ */
+static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
+				 struct ip_vs_dest *dest)
+{
+	return ip_vs_mh_reassign(svc->sched_data, svc);
+}
+
+/* Extract the transport port used for hashing; 0 for protocols
+ * without one (or when the header cannot be read).
+ */
+static inline __be16
+ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+	__be16 _ports[2], *ports;
+
+	/* Only TCP, UDP and SCTP carry ports, and all three keep the
+	 * source/dest port pair at the same header offset.
+	 */
+	if (iph->protocol != IPPROTO_TCP &&
+	    iph->protocol != IPPROTO_UDP &&
+	    iph->protocol != IPPROTO_SCTP)
+		return 0;
+
+	/* ICMP payloads only guarantee the first 8 bytes of the embedded
+	 * header, which is enough to read the port pair.
+	 */
+	ports = skb_header_pointer(skb, iph->len, sizeof(_ports), &_ports);
+	if (unlikely(!ports))
+		return 0;
+
+	return ip_vs_iph_inverse(iph) ? ports[1] : ports[0];
+}
+
+/* Maglev Hashing scheduling: hash the source (or dest, when inverse)
+ * address — optionally with the port — into the lookup table.
+ */
+static struct ip_vs_dest *
+ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+		  struct ip_vs_iphdr *iph)
+{
+	struct ip_vs_mh_state *s = svc->sched_data;
+	const union nf_inet_addr *hash_addr;
+	struct ip_vs_dest *dest;
+	__be16 port = 0;
+
+	hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
+
+	IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
+		port = ip_vs_mh_get_port(skb, iph);
+
+	if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
+		dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
+	else
+		dest = ip_vs_mh_get(svc, s, hash_addr, port);
+
+	if (!dest) {
+		ip_vs_scheduler_err(svc, "no destination available");
+		return NULL;
+	}
+
+	IP_VS_DBG_BUF(6, "MH: source IP address %s:%d --> server %s:%d\n",
+		      IP_VS_DBG_ADDR(svc->af, hash_addr),
+		      ntohs(port),
+		      IP_VS_DBG_ADDR(dest->af, &dest->addr),
+		      ntohs(dest->port));
+
+	return dest;
+}
+
+/* IPVS MH Scheduler structure: registered as "mh".  Every dest-set
+ * change triggers a full table rebuild via ip_vs_mh_dest_changed.
+ */
+static struct ip_vs_scheduler ip_vs_mh_scheduler = {
+	.name =			"mh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.n_list	 =		LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
+	.init_service =		ip_vs_mh_init_svc,
+	.done_service =		ip_vs_mh_done_svc,
+	.add_dest =		ip_vs_mh_dest_changed,
+	.del_dest =		ip_vs_mh_dest_changed,
+	.upd_dest =		ip_vs_mh_dest_changed,
+	.schedule =		ip_vs_mh_schedule,
+};
+
+/* Module init: register the "mh" scheduler with the IPVS core. */
+static int __init ip_vs_mh_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
+}
+
+/* Module exit: unregister, then wait for in-flight RCU readers that
+ * may still reference the scheduler before the module text goes away.
+ */
+static void __exit ip_vs_mh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
+	synchronize_rcu();
+}
+
+module_init(ip_vs_mh_init);
+module_exit(ip_vs_mh_cleanup);
+MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Inju Song <inju.song@xxxxxxxxxxxxx>");
--
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
|