Re: [PATCH] wlib scheduler (fwd)

To: Wensong Zhang <wensong@xxxxxxxxxxxx>
Subject: Re: [PATCH] wlib scheduler (fwd)
Cc: lvs-users@xxxxxxxxxxxxxxxxxxxxxx
From: Chris Caputo <ccaputo@xxxxxxx>
Date: Wed, 26 Jan 2005 09:07:49 -0800 (PST)
On Thu, 27 Jan 2005, Wensong Zhang wrote:
> On Tue, 25 Jan 2005, Chris Caputo wrote:
> > On Wed, 26 Jan 2005, Wensong Zhang wrote:
> >> I see that recording the current pointer in svc->sched_data is probably
> >> not necessary, because the scheduler iterates over the servers from
> >> beginning to end every time.
> >
> > Hi.  This is done so that at startup (when there are ties) round-robin
> > scheduling occurs rather than sending initial connections to the same
> > server and creating an imbalance from the start.  I'd really prefer to
> > keep it in.  Please let me know if I am misunderstanding something.
> 
> You probably don't need to worry too much about imbalance at startup; 
> any slight imbalance among the servers will direct a new connection to 
> the least-loaded server. It's impossible that all the connections are 
> assigned to one server while the load (traffic/weight) of all the 
> servers stays equal.

Hi.  Through experimentation I have observed that rr is definitely needed
at startup (or when resuming from idle) for my example use of this
scheduler (incoming newsfeeds), because of their nature (see the short
sketch after this list):

   - when a virtual server starts up, the first news server to connect may
     open 10 to 20 connections to the virtual address at the same time
     (i.e., in less than the 2-second estimator interval).  Without rr, all
     of those connections will be directed to the same real server,
     defeating the purpose of the scheduler and resulting in an imbalance
     that may take days (due to the next bullet) to rectify.

   - incoming newsfeed TCP streams can have a very long lifetime (days is
     not uncommon), so it is important to balance them from the start,
     because it may take a long time before the scheduling of new
     connections can correct any imbalance.

Please let me know if this doesn't have you convinced.  :-)

By the way, here are the patches with the byterate and packetrate
schedulers separated into different files.  They are also available at:

   http://www.caputo.com/foss/ipvsadm-1.24-wlibp.patch
   http://www.caputo.com/foss/lvs_wlibp-2.6.10.patch
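
As an aside, the selection rule both schedulers implement is: pick the
real server with the lowest rate/weight ratio, never selecting a
weight-0 (quiesced) server, and compare by cross-multiplication so no
division is needed (dr * lwgt < lr * dwgt is equivalent to
dr / dwgt < lr / lwgt when every weight is positive).  Here is a rough
user-space sketch of just that comparison (illustrative only, not part
of the patches; the server names and numbers are made up):

/* A weight-100 box pulling 5 MB/s (50000 per weight unit) versus a
 * weight-1000 box pulling 40 MB/s (40000 per weight unit): the second
 * is less loaded relative to its weight and should win. */
#include <stdio.h>
#include <stdint.h>

struct rs {
        const char *name;   /* hypothetical real servers */
        uint32_t    inbps;  /* estimated incoming byterate */
        int         weight;
};

int main(void)
{
        struct rs servers[] = {
                { "rs-a",  5u * 1000 * 1000,  100 },
                { "rs-b", 40u * 1000 * 1000, 1000 },
        };
        struct rs *least = NULL;

        for (size_t i = 0; i < sizeof(servers) / sizeof(servers[0]); i++) {
                struct rs *d = &servers[i];

                if (d->weight <= 0)
                        continue;   /* weight 0 means quiesced: never selected */
                if (!least ||
                    (uint64_t)d->inbps * least->weight <
                    (uint64_t)least->inbps * d->weight)
                        least = d;
        }

        if (least)
                printf("schedule new connection to %s\n", least->name); /* rs-b */
        return 0;
}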

Thank you,
Chris

--- patch against ipvsadm-1.24 ---

diff -upr ipvsadm-1.24/SCHEDULERS ipvsadm-1.24-wlibp/SCHEDULERS
--- ipvsadm-1.24/SCHEDULERS     2003-05-10 03:05:26.000000000 +0000
+++ ipvsadm-1.24-wlibp/SCHEDULERS       2005-01-26 16:49:48.826566318 +0000
@@ -1 +1 @@
-rr|wrr|lc|wlc|lblc|lblcr|dh|sh|sed|nq
+rr|wrr|lc|wlc|lblc|lblcr|dh|sh|sed|nq|wlib|wlip
diff -upr ipvsadm-1.24/ipvsadm.8 ipvsadm-1.24-wlibp/ipvsadm.8
--- ipvsadm-1.24/ipvsadm.8      2003-07-05 05:32:38.000000000 +0000
+++ ipvsadm-1.24-wlibp/ipvsadm.8        2005-01-26 16:50:57.724104324 +0000
@@ -255,6 +255,14 @@ fixed service rate (weight) of the ith s
 \fBnq\fR - Never Queue: assigns an incoming job to an idle server if
 there is, instead of waiting for a fast one; if all the servers are
 busy, it adopts the Shortest Expected Delay policy to assign the job.
+.sp
+\fBwlib\fR - Weighted Least Incoming Byterate: directs network
+connections to the real server with the least incoming byterate
+normalized by the server weight.
+.sp
+\fBwlip\fR - Weighted Least Incoming Packetrate: directs network
+connections to the real server with the least incoming packetrate
+normalized by the server weight.
 .TP
 .B -p, --persistent [\fItimeout\fP]
 Specify that a virtual service is persistent. If this option is

--- patch against linux kernel 2.6.10 ---

diff -upr -X dontdiff linux-2.6.10-stock/net/ipv4/ipvs/Kconfig linux/net/ipv4/ipvs/Kconfig
--- linux-2.6.10-stock/net/ipv4/ipvs/Kconfig    2005-01-10 03:57:58.000000000 +0000
+++ linux/net/ipv4/ipvs/Kconfig 2005-01-26 16:30:25.099281162 +0000
@@ -224,6 +224,28 @@ config     IP_VS_NQ
          If you want to compile it in kernel, say Y. To compile it as a
          module, choose M here. If unsure, say N.
 
+config IP_VS_WLIB
+       tristate "weighted least incoming byterate scheduling"
+        depends on IP_VS
+       ---help---
+         The weighted least incoming byterate scheduling algorithm directs
+         network connections to the server with the least incoming byterate
+         normalized by the server weight.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
+config IP_VS_WLIP
+       tristate "weighted least incoming packetrate scheduling"
+        depends on IP_VS
+       ---help---
+         The weighted least incoming packetrate scheduling algorithm directs
+         network connections to the server with the least incoming packetrate
+         normalized by the server weight.
+
+         If you want to compile it in kernel, say Y. To compile it as a
+         module, choose M here. If unsure, say N.
+
 comment 'IPVS application helper'
        depends on IP_VS
 
diff -upr -X dontdiff linux-2.6.10-stock/net/ipv4/ipvs/Makefile linux/net/ipv4/ipvs/Makefile
--- linux-2.6.10-stock/net/ipv4/ipvs/Makefile   2005-01-10 03:58:08.000000000 +0000
+++ linux/net/ipv4/ipvs/Makefile        2005-01-26 16:29:26.448449619 +0000
@@ -29,6 +29,8 @@ obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_WLIB) += ip_vs_wlib.o
+obj-$(CONFIG_IP_VS_WLIP) += ip_vs_wlip.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff -upr -X dontdiff linux-2.6.10-stock/net/ipv4/ipvs/ip_vs_wlib.c linux/net/ipv4/ipvs/ip_vs_wlib.c
--- linux-2.6.10-stock/net/ipv4/ipvs/ip_vs_wlib.c       2005-01-10 03:59:54.000000000 +0000
+++ linux/net/ipv4/ipvs/ip_vs_wlib.c    2005-01-26 16:47:41.250238601 +0000
@@ -0,0 +1,158 @@
+/*
+ * IPVS:        Weighted Least Incoming Byterate Scheduling module
+ *
+ * Version:     ip_vs_wlib.c 1.02 2005/01/26 ccaputo
+ *
+ * Authors:     Chris Caputo <ccaputo@xxxxxxx> based on code by:
+ *
+ *                  Wensong Zhang <wensong@xxxxxxxxxxxxxxxxxxxxxx>
+ *                  Peter Kese <peter.kese@xxxxxx>
+ *                  Julian Anastasov <ja@xxxxxx>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Chris Caputo: Based code on ip_vs_wlc.c ip_vs_rr.c.
+ *
+ */
+
+/*
+ * The WLIB algorithm uses the results of the estimator's inbps
+ * calculations to determine which real server has the lowest incoming
+ * byterate.
+ *
+ * Real server weight is factored into the calculation.  An example way to
+ * use this is if you have one server that can handle 100 Mbps of input and
+ * another that can handle 1 Gbps you could set the weights to be 100 and 1000
+ * respectively.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_wlib_init_svc(struct ip_vs_service *svc)
+{
+       svc->sched_data = &svc->destinations;
+       return 0;
+}
+
+
+static int
+ip_vs_wlib_done_svc(struct ip_vs_service *svc)
+{
+       return 0;
+}
+
+
+static int
+ip_vs_wlib_update_svc(struct ip_vs_service *svc)
+{
+       svc->sched_data = &svc->destinations;
+       return 0;
+}
+
+
+/*
+ *     Weighted Least Incoming Byterate scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlib_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct list_head *p, *q;
+       struct ip_vs_dest *dest, *least = NULL;
+       u32 dr, lr = -1;
+       int dwgt, lwgt = 0;
+
+       IP_VS_DBG(6, "ip_vs_wlib_schedule(): Scheduling...\n");
+
+       /*
+        * We calculate the load of each dest server as follows:
+        *        (dest inbps rate) / dest->weight
+        *
+        * The comparison of dr*lwgt < lr*dwgt is equivalent to that of
+        * dr/dwgt < lr/lwgt if every weight is larger than zero.
+        *
+        * A server with weight=0 is quiesced and will not receive any
+        * new connections.
+        *
+        * In case of ties, highest weight is winner.  And if that still makes
+        * for a tie, round robin is used (which is why we remember our last
+        * starting location in the linked list).
+        */
+
+       write_lock(&svc->sched_lock);
+       p = (struct list_head *)svc->sched_data;
+       p = p->next;
+       q = p;
+       do {
+               /* skip list head */
+               if (q == &svc->destinations) {
+                       q = q->next;
+                       continue;
+               }
+
+               dest = list_entry(q, struct ip_vs_dest, n_list);
+               dwgt = atomic_read(&dest->weight);
+               if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && dwgt > 0) {
+                       spin_lock(&dest->stats.lock);
+                       dr = dest->stats.inbps;
+                       spin_unlock(&dest->stats.lock);
+
+                       if (least == NULL ||
+                               (u64)dr * (u64)lwgt < (u64)lr * (u64)dwgt ||
+                               (dr == lr && dwgt > lwgt)) {
+                               least = dest;
+                               lr = dr;
+                               lwgt = dwgt;
+                               svc->sched_data = q;
+                       }
+               }
+               q = q->next;
+       } while (q != p);
+       write_unlock(&svc->sched_lock);
+
+       if (least != NULL)
+               IP_VS_DBG(6, "WLIB: server %u.%u.%u.%u:%u "
+                         "activeconns %d refcnt %d weight %d\n",
+                         NIPQUAD(least->addr), ntohs(least->port),
+                         atomic_read(&least->activeconns),
+                         atomic_read(&least->refcnt),
+                         atomic_read(&least->weight));
+
+       return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlib_scheduler =
+{
+       .name =                 "wlib",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .init_service =         ip_vs_wlib_init_svc,
+       .done_service =         ip_vs_wlib_done_svc,
+       .update_service =       ip_vs_wlib_update_svc,
+       .schedule =             ip_vs_wlib_schedule,
+};
+
+
+static int __init ip_vs_wlib_init(void)
+{
+       INIT_LIST_HEAD(&ip_vs_wlib_scheduler.n_list);
+       return register_ip_vs_scheduler(&ip_vs_wlib_scheduler);
+}
+
+static void __exit ip_vs_wlib_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_wlib_scheduler);
+}
+
+module_init(ip_vs_wlib_init);
+module_exit(ip_vs_wlib_cleanup);
+MODULE_LICENSE("GPL");
diff -upr -X dontdiff linux-2.6.10-stock/net/ipv4/ipvs/ip_vs_wlip.c linux/net/ipv4/ipvs/ip_vs_wlip.c
--- linux-2.6.10-stock/net/ipv4/ipvs/ip_vs_wlip.c       2005-01-26 16:56:40.422068155 +0000
+++ linux/net/ipv4/ipvs/ip_vs_wlip.c    2005-01-26 16:45:55.765211729 +0000
@@ -0,0 +1,158 @@
+/*
+ * IPVS:        Weighted Least Incoming Packetrate Scheduling module
+ *
+ * Version:     ip_vs_wlip.c 1.00 2005/01/26 ccaputo
+ *
+ * Authors:     Chris Caputo <ccaputo@xxxxxxx> based on code by:
+ *
+ *                  Wensong Zhang <wensong@xxxxxxxxxxxxxxxxxxxxxx>
+ *                  Peter Kese <peter.kese@xxxxxx>
+ *                  Julian Anastasov <ja@xxxxxx>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Chris Caputo: Based code on ip_vs_wlc.c ip_vs_rr.c.
+ *
+ */
+
+/*
+ * The WLIP algorithm uses the results of the estimator's inpps
+ * calculations to determine which real server has the lowest incoming
+ * packetrate.
+ *
+ * Real server weight is factored into the calculation.  An example way to
+ * use this is if you have one server that can handle 10 Kpps of input and
+ * another that can handle 100 Kpps you could set the weights to be 10 and 100
+ * respectively.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_wlip_init_svc(struct ip_vs_service *svc)
+{
+       svc->sched_data = &svc->destinations;
+       return 0;
+}
+
+
+static int
+ip_vs_wlip_done_svc(struct ip_vs_service *svc)
+{
+       return 0;
+}
+
+
+static int
+ip_vs_wlip_update_svc(struct ip_vs_service *svc)
+{
+       svc->sched_data = &svc->destinations;
+       return 0;
+}
+
+
+/*
+ *     Weighted Least Incoming Packetrate scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlip_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+       struct list_head *p, *q;
+       struct ip_vs_dest *dest, *least = NULL;
+       u32 dr, lr = -1;
+       int dwgt, lwgt = 0;
+
+       IP_VS_DBG(6, "ip_vs_wlip_schedule(): Scheduling...\n");
+
+       /*
+        * We calculate the load of each dest server as follows:
+        *        (dest inpps rate) / dest->weight
+        *
+        * The comparison of dr*lwgt < lr*dwgt is equivalent to that of
+        * dr/dwgt < lr/lwgt if every weight is larger than zero.
+        *
+        * A server with weight=0 is quiesced and will not receive any
+        * new connections.
+        *
+        * In case of ties, highest weight is winner.  And if that still makes
+        * for a tie, round robin is used (which is why we remember our last
+        * starting location in the linked list).
+        */
+
+       write_lock(&svc->sched_lock);
+       p = (struct list_head *)svc->sched_data;
+       p = p->next;
+       q = p;
+       do {
+               /* skip list head */
+               if (q == &svc->destinations) {
+                       q = q->next;
+                       continue;
+               }
+
+               dest = list_entry(q, struct ip_vs_dest, n_list);
+               dwgt = atomic_read(&dest->weight);
+               if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && dwgt > 0) {
+                       spin_lock(&dest->stats.lock);
+                       dr = dest->stats.inpps;
+                       spin_unlock(&dest->stats.lock);
+
+                       if (least == NULL ||
+                               (u64)dr * (u64)lwgt < (u64)lr * (u64)dwgt ||
+                               (dr == lr && dwgt > lwgt)) {
+                               least = dest;
+                               lr = dr;
+                               lwgt = dwgt;
+                               svc->sched_data = q;
+                       }
+               }
+               q = q->next;
+       } while (q != p);
+       write_unlock(&svc->sched_lock);
+
+       if (least != NULL)
+               IP_VS_DBG(6, "WLIP: server %u.%u.%u.%u:%u "
+                         "activeconns %d refcnt %d weight %d\n",
+                         NIPQUAD(least->addr), ntohs(least->port),
+                         atomic_read(&least->activeconns),
+                         atomic_read(&least->refcnt),
+                         atomic_read(&least->weight));
+
+       return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlip_scheduler =
+{
+       .name =                 "wlip",
+       .refcnt =               ATOMIC_INIT(0),
+       .module =               THIS_MODULE,
+       .init_service =         ip_vs_wlip_init_svc,
+       .done_service =         ip_vs_wlip_done_svc,
+       .update_service =       ip_vs_wlip_update_svc,
+       .schedule =             ip_vs_wlip_schedule,
+};
+
+
+static int __init ip_vs_wlip_init(void)
+{
+       INIT_LIST_HEAD(&ip_vs_wlip_scheduler.n_list);
+       return register_ip_vs_scheduler(&ip_vs_wlip_scheduler);
+}
+
+static void __exit ip_vs_wlip_cleanup(void)
+{
+       unregister_ip_vs_scheduler(&ip_vs_wlip_scheduler);
+}
+
+module_init(ip_vs_wlip_init);
+module_exit(ip_vs_wlip_cleanup);
+MODULE_LICENSE("GPL");

