Hello,
In order to achieve an atomic failover from a real server pool with all
services quiesced to a spillover pool, one needs to instrument the
kernel. This problem occurs when you have an application that cannot
properly deal with short but slashdot-like traffic spikes and you limit
the destination of the services with the per real server thresholds
available in 2.6.x kernels (and, since I've backported and enhanced it,
also in 2.4.x).
The hprio scheduler works like the RR scheduler, but it simply selects
the server with the highest weight which doesn't have the
IP_VS_DEST_F_OVERLOAD flag set. The modus operandi from user space is to
set up the service session pool _and_ also add the overflow or spillover
pool, but of course with a lower weight.
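For illustration, the configuration shown below could be built roughly
like this; the -x/-y options for the upper and lower thresholds are an
assumption based on how mainline ipvsadm exposes them, so check the
patched ipvsadm-2.4 for the exact flags:

lb-lb0-phys:~# ipvsadm-2.4 -A -t 112.2.13.20:80 -s hprio -p 150
lb-lb0-phys:~# ipvsadm-2.4 -a -t 112.2.13.20:80 -r 112.2.13.21:80 -g -w 5 -x 50 -y 20
lb-lb0-phys:~# ipvsadm-2.4 -a -t 112.2.13.20:80 -r 112.2.13.22:80 -g -w 5 -x 50 -y 20
lb-lb0-phys:~# ipvsadm-2.4 -a -t 112.2.13.20:80 -r 112.2.13.23:80 -g -w 5 -x 50 -y 20
lb-lb0-phys:~# ipvsadm-2.4 -a -t 112.2.13.20:80 -r 112.2.13.13:80 -g -w 1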
lb-lb0-phys:~# ipvsadm-2.4 -L -n; echo; ipvsadm-2.4 -L -n --thresholds
IP Virtual Server version 1.0.12 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port           Forward Weight ActiveConn InActConn
TCP  112.2.13.20:80 hprio persistent 150
  -> 112.2.13.13:80               Route   1      0          0
  -> 112.2.13.23:80               Route   5      0          0
  -> 112.2.13.22:80               Route   5      0          0
  -> 112.2.13.21:80               Route   5      0          0

IP Virtual Server version 1.0.12 (size=4096)
Prot LocalAddress:Port            Uthreshold Lthreshold ActiveConn InActConn
  -> RemoteAddress:Port
TCP  112.2.13.20:80 hprio persistent 150
  -> 112.2.13.13:80               0          0          0          0
  -> 112.2.13.23:80               50         20         0          0
  -> 112.2.13.22:80               50         20         0          0
  -> 112.2.13.21:80               50         20         0          0
As can easily be seen, ~21, ~22 and ~23 are in the service pool with an
upper threshold of 50 and a weight of 5, while ~13 is the only spillover
server, with weight 1 and no threshold limitation. Once the service
bound to ~20 is "quiesced", meaning that all RS in the service pool are
quiesced, the hprio scheduler will automatically and atomically
(initiated from ip_vs_bind_dest) switch to the overflow server pool, aka
~13.
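Note that because the service is persistent, the spike has to come from
enough distinct client addresses to drive all three pool members to
their upper threshold of 50; once that happens, new clients land on ~13
only. The switch is easy to observe live, e.g. with:

lb-lb0-phys:~# watch -n1 'ipvsadm-2.4 -L -n --thresholds'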
The current implementation of my original 2.2.x patch in the 2.6.x
kernel is, in my view, unfinished and can hardly be used in production
for sites with heaps of page views and business logic in the
application servers.
Threshold limitation patch and user space patch follow.
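For completeness, a rough build-and-load sequence on a patched 2.4.31
tree would be something like the following, assuming the Makefile hunk
(not shown here) wires ip_vs_hprio.o into net/ipv4/ipvs:

lb-lb0-phys:~# make -C /usr/src/linux-2.4.31 modules SUBDIRS=net/ipv4/ipvs
lb-lb0-phys:~# insmod /usr/src/linux-2.4.31/net/ipv4/ipvs/ip_vs_hprio.o
lb-lb0-phys:~# ipvsadm-2.4 -E -t 112.2.13.20:80 -s hprio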
Please discuss,
Roberto Nibali, ratz
ps.: No, I'm not back yet, just implementing missing stuff for a big
customer of ours ;).
--
-------------------------------------------------------------
addr://Kasinostrasse 30, CH-5001 Aarau tel://++41 62 823 9355
http://www.terreactive.com fax://++41 62 823 9356
-------------------------------------------------------------
terreActive AG Wir sichern Ihren Erfolg
-------------------------------------------------------------
diff -X dontdiff -Nur linux-2.4.31-orig/net/ipv4/ipvs/ip_vs_hprio.c linux-2.4.31-pab2/net/ipv4/ipvs/ip_vs_hprio.c
--- linux-2.4.31-orig/net/ipv4/ipvs/ip_vs_hprio.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.4.31-pab2/net/ipv4/ipvs/ip_vs_hprio.c 2005-10-26 13:18:12.000000000 +0200
@@ -0,0 +1,144 @@
+/*
+ * IPVS: Weighted Round-Robin High Priority Scheduling module
+ *
+ * Version: 0.0.3, 2005/10/26
+ *
+ * Authors: Roberto Nibali <ratz@xxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Algorithm:
+ * It's based on the rr scheduler, but it only takes the highest
+ * weight into account and discards all other servers which do
+ * not have the highest weight.
+ *
+ * This can be used to simulate an atomic session LIP for peak
+ * loads where the user space application doesn't have the chance
+ * to react atomically.
+ *
+ * Changes:
+ * 0.0.2
+ * - Removed wrr related code for list traversal: would crash the
+ * whole kernel otherwise
+ * 0.0.3 - Ported to kernel 2.4
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <net/ip_vs.h>
+
+static int ip_vs_hprio_init_svc(struct ip_vs_service *svc) {
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+static int ip_vs_hprio_done_svc(struct ip_vs_service *svc) {
+ return 0;
+}
+
+static int ip_vs_hprio_update_svc(struct ip_vs_service *svc) {
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+/*
+ * Get the maximum weight of the non-overloaded service destinations.
+ */
+static int ip_vs_hprio_max_weight(struct ip_vs_service *svc) {
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest;
+ int weight = 0;
+
+ l = &svc->destinations;
+ for (e = l->next; e != l; e = e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ IP_VS_DBG(1, " weight: %d\n", atomic_read(&dest->weight));
+ if (atomic_read(&dest->weight) > weight &&
+ !ip_vs_is_overloaded(dest)) {
+ weight = atomic_read(&dest->weight);
+ }
+ }
+ IP_VS_DBG(1, "max weight: %d\n", weight);
+ return weight;
+}
+
+
+/*
+ * Weighted Round-Robin Highest Priority Scheduling
+ */
+static struct ip_vs_dest* ip_vs_hprio_schedule(struct ip_vs_service *svc,
+ struct iphdr *iph) {
+ register struct list_head *p, *q;
+ struct ip_vs_dest *dest;
+ int max_weight;
+
+ IP_VS_DBG(1, "ip_vs_hprio_schedule(): Scheduling...\n");
+
+ write_lock(&svc->sched_lock);
+ max_weight = ip_vs_hprio_max_weight(svc);
+ p = (struct list_head *)svc->sched_data;
+ p = p->next;
+ q = p;
+ do {
+ if (q == &svc->destinations) {
+ q = q->next;
+ continue;
+ }
+ dest = list_entry(q, struct ip_vs_dest, n_list);
+ /*
+  * The check for an overloaded destination is done in the
+  * ip_vs_hprio_max_weight() function. With that we don't need
+  * to quiesce destination servers by setting their weights to 0.
+  */
+ if (atomic_read(&dest->weight) > 0 &&
+ atomic_read(&dest->weight) == max_weight) {
+ goto out;
+ }
+ q = q->next;
+ } while (q != p);
+ write_unlock(&svc->sched_lock);
+ return NULL;
+
+ out:
+ svc->sched_data = q;
+ write_unlock(&svc->sched_lock);
+ IP_VS_DBG(3, "HPRIO: server %d.%d.%d.%d:%d "
+ "activeconns %d refcnt %d weight %d\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+ return dest;
+}
+
+static struct ip_vs_scheduler ip_vs_hprio_scheduler = {
+ {0}, /* n_list */
+ "hprio", /* name */
+ ATOMIC_INIT(0), /* refcnt */
+ THIS_MODULE, /* this module */
+ ip_vs_hprio_init_svc, /* service initializer */
+ ip_vs_hprio_done_svc, /* service done */
+ ip_vs_hprio_update_svc, /* service updater */
+ ip_vs_hprio_schedule, /* select a server from the destination list */
+};
+
+static int __init ip_vs_hprio_init(void) {
+ IP_VS_INFO("HPRIO scheduling module loaded.\n");
+ INIT_LIST_HEAD(&ip_vs_hprio_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_hprio_scheduler);
+}
+
+static void __exit ip_vs_hprio_cleanup(void) {
+ unregister_ip_vs_scheduler(&ip_vs_hprio_scheduler);
+}
+
+module_init(ip_vs_hprio_init);
+module_exit(ip_vs_hprio_cleanup);
+MODULE_LICENSE("GPL");
+
+