LVS
lvs-users
Google
 
Web LinuxVirtualServer.org

[PATCH-2.4 1/2] Threshold Limitation & Overflow Server Pool

To: lvs-users@xxxxxxxxxxxxxxxxxxxxxx
Subject: [PATCH-2.4 1/2] Threshold Limitation & Overflow Server Pool
Cc: Horms <horms@xxxxxxxxxxxx>
Cc: Julian Anastasov <ja@xxxxxx>
From: Roberto Nibali <ratz@xxxxxx>
Date: Thu, 17 Nov 2005 19:13:16 +0100
Hi,

The kernel part as described in

[PATCH-2.4 0/2] Threshold Limitation & Overflow Server Pool

Regards,
Roberto Nibali
-- 
-------------------------------------------------------------
addr://Kasinostrasse 30, CH-5001 Aarau tel://++41 62 823 9355
http://www.terreactive.com             fax://++41 62 823 9356
-------------------------------------------------------------
terreActive AG                       Wir sichern Ihren Erfolg
-------------------------------------------------------------
diff -X dontdiff -Nur linux-2.4.32-orig/include/net/ip_vs.h 
linux-2.4.32-pab2/include/net/ip_vs.h
--- linux-2.4.32-orig/include/net/ip_vs.h       2005-10-27 17:17:13 +0200
+++ linux-2.4.32-pab2/include/net/ip_vs.h       2005-11-17 13:29:13 +0100
@@ -19,11 +19,16 @@
  */
 #define IP_VS_SVC_F_PERSISTENT        0x0001    /* persistent port */
 #define IP_VS_SVC_F_HASHED            0x0002    /* hashed entry */
+#define IP_VS_SVC_F_OVERLOAD          0x0004    /* service overloaded */
 
 /*
  *      Destination Server Flags
  */
 #define IP_VS_DEST_F_AVAILABLE        0x0001    /* Available tag */
+#define IP_VS_DEST_F_OVERLOAD         0x0002    /* server is overloaded */
+#define IP_VS_DEST_F_OVERFLOW         0x0004    /* RS is overflow server */
+#define IP_VS_DEST_F_PERSISTENT       0x0008    /* RS is overflow server
+                                                  and has persistency set */
 
 /*
  *      IPVS sync daemon states
@@ -113,8 +118,11 @@
        /* destination specific options */
        u_int32_t       daddr;          /* destination address */
        u_int16_t       dport;
-       unsigned        conn_flags;     /* destination flags */
+       unsigned        conn_flags;     /* connection flags */
+       unsigned        dest_flags;     /* destination flags */
        int             weight;         /* destination weight */
+       u_int32_t       u_threshold;    /* upper threshold */
+       u_int32_t       l_threshold;    /* lower threshold */
 };
 
 
@@ -165,6 +173,8 @@
 
        /* number of real servers */
        unsigned int    num_dests;
+       /* amount of available real servers (IP_VS_DEST_F_OVERLOAD not set) */
+       unsigned int    avail_dests;
 
        /* statistics */
        struct ip_vs_stats_user stats;
@@ -173,10 +183,14 @@
 struct ip_vs_dest_user {
        u_int32_t       addr;           /* destination address */
        u_int16_t       port;
-       unsigned        flags;          /* destination flags */
+       unsigned        flags;          /* connection flags */
+       unsigned        dest_flags;     /* destination flags */
        int             weight;         /* destination weight */
        u_int32_t       activeconns;    /* active connections */
        u_int32_t       inactconns;     /* inactive connections */
+       u_int32_t       u_threshold;    /* upper threshold */
+       u_int32_t       l_threshold;    /* lower threshold */
+       u_int32_t       persistconns;   /* persistent connections */
 
        /* statistics */
        struct ip_vs_stats_user stats;
@@ -192,6 +206,8 @@
 
        /* number of real servers */
        unsigned int    num_dests;
+       /* amount of available real servers (IP_VS_DEST_F_OVERLOAD not set) */
+       unsigned int    avail_dests;
 
        /* the real servers */
        struct ip_vs_dest_user entrytable[0];
@@ -464,6 +480,7 @@
 
        struct list_head        destinations;  /* real server d-linked list */
        __u32                   num_dests;     /* number of servers */
+       atomic_t                avail_dests;   /* available real servers */
        struct ip_vs_stats      stats;         /* statistics for the service */
 
        /* for scheduling */
@@ -486,11 +503,16 @@
        unsigned                flags;    /* dest status flags */
        atomic_t                weight;   /* server weight */
        atomic_t                conn_flags;     /* flags to copy to conn */
-       atomic_t                activeconns;    /* active connections */
-       atomic_t                inactconns;     /* inactive connections */
        atomic_t                refcnt;         /* reference counter */
        struct ip_vs_stats      stats;          /* statistics */
 
+       /* connection counters and thresholds */
+       atomic_t                activeconns;    /* active connections */
+       atomic_t                inactconns;     /* inactive connections */
+       atomic_t                persistconns;   /* persistent connections */
+       __u32                   u_threshold;    /* upper threshold */
+       __u32                   l_threshold;    /* lower threshold */
+
        /* for destination cache */
        spinlock_t              dst_lock;       /* lock dst_cache */
        struct dst_entry        *dst_cache;     /* destination cache entry */
@@ -935,6 +957,112 @@
        return 0;
 }
 
+/* State table for scheduler selection of dest from list
+   -----------------------------------------------------
+
+   Basically the schedulers walk through the list of real servers
+   (dests) of a service when a new connection entry is made and
+   then select a free and best (in the sense of the chosen scheduler)
+   destination to bind a connection to. In former days this was
+   rather straight forward since so long as a dest was not quiesced
+   the first found dest could be used. With the arrival of threshold
+   limitation per destination and a distinction of real server
+   functionality into POOL server (servicing normally) and SPILLOVER
+   server (normally used as a last resort to an overloaded service)
+   however, the selection of the free and best dest is a tiny bit
+   more complicated. The following state table gives the basic idea
+   behind the implementation in the schedulers.
+
+
+   z: dest overloaded
+   y: service overloaded
+   x: dest is overflow server
+   w: dest is quiesced
+   h: HIT (dest gets selected), 'x' means that this
+      combination is not possible
+   r: reason (number) for skipping dest
+
+
+      || z | y | x | w || h || r |
+   ---++---+---+---+---++---++---+
+   a0 || 0 | 0 | 0 | 0 || 1 ||   | <---
+   ---++---+---+---+---++---++---+
+   a1 || 0 | 0 | 0 | 1 || 0 || 4 |
+   ---++---+---+---+---++---++---+
+   a2 || 0 | 0 | 1 | 0 || 0 || 3 |
+   ---++---+---+---+---++---++---+
+   a3 || 0 | 0 | 1 | 1 || 0 || 3 |
+   ---++---+---+---+---++---++---+
+   a4 || 0 | 1 | 0 | 0 || x ||   |
+   ---++---+---+---+---++---++---+
+   a5 || 0 | 1 | 0 | 1 || x ||   |
+   ---++---+---+---+---++---++---+
+   a6 || 0 | 1 | 1 | 0 || 1 ||   | <---
+   ---++---+---+---+---++---++---+
+   a7 || 0 | 1 | 1 | 1 || 0 ||   |
+   ---++---+---+---+---++---++---+
+   a8 || 1 | 0 | 0 | 0 || 0 || 1 |
+   ---++---+---+---+---++---++---+
+   a9 || 1 | 0 | 0 | 1 || 0 || 1 |
+   ---++---+---+---+---++---++---+
+   aa || 1 | 0 | 1 | 0 || 0 || 1 |
+   ---++---+---+---+---++---++---+
+   ab || 1 | 0 | 1 | 1 || 0 || 1 |
+   ---++---+---+---+---++---++---+
+   ac || 1 | 1 | 0 | 0 || 0 || 1 |
+   ---++---+---+---+---++---++---+
+   ad || 1 | 1 | 0 | 1 || 0 || 1 |
+   ---++---+---+---+---++---++---+
+   ae || 1 | 1 | 1 | 0 || 0 || 1 |
+   ---++---+---+---+---++---++---+
+   af || 1 | 1 | 1 | 1 || 0 || 1 |
+   ---++---+---+---+---++---++---+
+
+   Thus: h = a0 || a6
+           = (!z && !y && !x && !w) || (!z && y && x && !w)
+           = !z && !w && ((!y && !x) || (y && x))
+           = !z && !w && (x == y)
+
+   Since (!z && !w) needs to be evaluated fewer times than (x == y),
+   we can write the whole term as:
+
+   h   = !z && !w ? (x == y) : 0
+   or: = !(z || w) ? (x == y) : 0
+
+
+ */
+
+/* Real server overloaded? OLD, USE THE ONES BELOW */
+static inline int ip_vs_is_overloaded(struct ip_vs_dest *dest) {
+       return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/* Real server overloaded? */
+static inline int ipvs_dest_overloaded(struct ip_vs_dest *dest) {
+       return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/* Real server is overflow server? */
+static inline int ipvs_dest_overflow(struct ip_vs_dest *dest) {
+       return dest->flags & IP_VS_DEST_F_OVERFLOW;
+}
+
+/* Service overloaded? */
+static inline int ipvs_svc_overloaded(struct ip_vs_dest *dest) {
+       return dest->svc->flags & IP_VS_SVC_F_OVERLOAD;
+}
+
+/* Real server quiesced (weight == 0)? */
+static inline int ipvs_dest_quiesced(struct ip_vs_dest *dest) {
+       return atomic_read(&dest->weight) == 0;
+}
+
+/* Is destination ok to be selected? Read above to understand the logic */
+static inline int ipvs_dest_select(struct ip_vs_dest *dest) {
+       return (!(ipvs_dest_overloaded(dest) || ipvs_dest_quiesced(dest)) ?
+               (ipvs_svc_overloaded(dest) == ipvs_dest_overflow(dest)) : 0);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _IP_VS_H */
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/Config.in 
linux-2.4.32-pab2/net/ipv4/ipvs/Config.in
--- linux-2.4.32-orig/net/ipv4/ipvs/Config.in   2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/Config.in   2005-10-27 17:22:03 +0200
@@ -19,6 +19,7 @@
   dep_tristate '  source hashing scheduling' CONFIG_IP_VS_SH $CONFIG_IP_VS
   dep_tristate '  shortest expected delay scheduling' CONFIG_IP_VS_SED 
$CONFIG_IP_VS
   dep_tristate '  never queue scheduling' CONFIG_IP_VS_NQ $CONFIG_IP_VS
+  dep_tristate '  highest weight round-robin scheduling' CONFIG_IP_VS_HPRIO 
$CONFIG_IP_VS
   comment 'IPVS application helper'
   dep_tristate '  FTP protocol helper' CONFIG_IP_VS_FTP $CONFIG_IP_VS
 fi
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/Makefile 
linux-2.4.32-pab2/net/ipv4/ipvs/Makefile
--- linux-2.4.32-orig/net/ipv4/ipvs/Makefile    2003-11-28 19:26:21 +0100
+++ linux-2.4.32-pab2/net/ipv4/ipvs/Makefile    2005-10-27 17:22:03 +0200
@@ -33,6 +33,7 @@
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_HPRIO) += ip_vs_hprio.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_conn.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_conn.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_conn.c        2005-10-27 17:20:58 
+0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_conn.c        2005-11-08 08:26:01 
+0100
@@ -21,6 +21,7 @@
  * and others. Many code here is taken from IP MASQ code of kernel 2.2.
  *
  * Changes:
+ *     Roberto Nibali, ratz: backported per RS threshold limitation from 2.5.x
  *
  */
 
@@ -233,7 +234,7 @@
        if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
                cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
 
-       IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+       IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
                  ip_vs_proto_name(protocol),
                  NIPQUAD(s_addr), ntohs(s_port),
                  NIPQUAD(d_addr), ntohs(d_port),
@@ -268,7 +269,7 @@
 out:
        ct_read_unlock(hash);
 
-       IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d 
%s\n",
+       IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d 
%s\n",
                ip_vs_proto_name(protocol),
                NIPQUAD(s_addr), ntohs(s_port),
                NIPQUAD(d_addr), ntohs(d_port),
@@ -335,7 +336,8 @@
 
 
 /*
- *     Timeout table[state]
+ *     Timeout table[state] changes
+ *             [IP_VS_S_FIN_WAIT]      =       2*60*HZ,
  */
 struct ip_vs_timeout_table vs_timeout_table = {
        ATOMIC_INIT(0), /* refcnt */
@@ -345,7 +347,7 @@
                [IP_VS_S_ESTABLISHED]   =       15*60*HZ,
                [IP_VS_S_SYN_SENT]      =       2*60*HZ,
                [IP_VS_S_SYN_RECV]      =       1*60*HZ,
-               [IP_VS_S_FIN_WAIT]      =       2*60*HZ,
+               [IP_VS_S_FIN_WAIT]      =       10*HZ,
                [IP_VS_S_TIME_WAIT]     =       2*60*HZ,
                [IP_VS_S_CLOSE]         =       10*HZ,
                [IP_VS_S_CLOSE_WAIT]    =       60*HZ,
@@ -561,7 +563,7 @@
                struct ip_vs_dest *dest = cp->dest;
 
                IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
-                         "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+                         "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
                          ip_vs_proto_name(cp->protocol),
                          (state_off==VS_STATE_OUTPUT)?"output ":"input ",
                          th->syn? 'S' : '.',
@@ -1077,6 +1079,11 @@
        }
 }
 
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+       return atomic_read(&dest->activeconns)
+               + atomic_read(&dest->inactconns);
+}
 
 /*
  *  Bind a connection entry with a virtual service destination
@@ -1096,8 +1103,9 @@
        cp->flags |= atomic_read(&dest->conn_flags);
        cp->dest = dest;
 
-       IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-                 "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
+       IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+                 "d:%u.%u.%u.%u:%d fwd:%c s:%s conn->flg:%X conn->refcnt:%d "
+                 "dest->refcnt:%d\n",
                  ip_vs_proto_name(cp->protocol),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1105,6 +1113,31 @@
                  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
                  cp->flags, atomic_read(&cp->refcnt),
                  atomic_read(&dest->refcnt));
+
+       /* Update connection counters */
+       if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+               /* It is a normal connection, so increase the inactive
+                  connection counter because it is in TCP SYNRECV
+                  state (inactive) or other protocol inactive state */
+               atomic_inc(&dest->inactconns);
+       } else {
+               /* It is a persistent connection/template, so increase
+                  the persistent connection counter */
+               atomic_inc(&dest->persistconns);
+       }
+
+       IP_VS_DBG(3, "Bind-dest: Threshold handling: avail_dests=%d\n",
+                       atomic_read(&dest->svc->avail_dests));
+       if (dest->u_threshold != 0
+           && !(dest->flags & IP_VS_DEST_F_OVERLOAD)
+           && ip_vs_dest_totalconns(dest) >= dest->u_threshold) {
+               dest->flags |= IP_VS_DEST_F_OVERLOAD;
+               if (!(dest->svc->flags & IP_VS_SVC_F_OVERLOAD)
+                   && atomic_dec_and_test(&dest->svc->avail_dests)) {
+                       /* All RS for this service are overloaded */
+                       dest->svc->flags |= IP_VS_SVC_F_OVERLOAD;
+               }
+       }
 }
 
 
@@ -1120,9 +1153,9 @@
        if (!dest)
                return;
 
-       IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
+       IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d "
                  "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
-                 "s:%s flg:%X cnt:%d destcnt:%d\n",
+                 "s:%s flg:%X conn->refcnt:%d dest->refcnt:%d\n",
                  ip_vs_proto_name(cp->protocol),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1131,16 +1164,39 @@
                  cp->flags, atomic_read(&cp->refcnt),
                  atomic_read(&dest->refcnt));
 
-       /*
-        * Decrease the inactconns or activeconns counter
-        * if it is not a connection template
-        */
-       if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
-               if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+       /* Update the connection counters */
+       if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+               /* It is a normal connection, so decrease the inactconns
+                  or activeconns counter */
+               if (cp->flags & IP_VS_CONN_F_INACTIVE) {
                        atomic_dec(&dest->inactconns);
                } else {
                        atomic_dec(&dest->activeconns);
                }
+       } else {
+               /* It is a persistent connection/template, so decrease
+                  the persistent connection counter */
+               atomic_dec(&dest->persistconns);
+       }
+
+       IP_VS_DBG(3, "Unbind-dest: Threshold handling: avail_dests=%d\n",
+                       atomic_read(&dest->svc->avail_dests));
+       if (dest->l_threshold != 0) {
+               /* This implies that the upper threshold is != 0 as well */
+               if ((dest->flags & IP_VS_DEST_F_OVERLOAD)
+                   && ip_vs_dest_totalconns(dest) <= dest->l_threshold) {
+                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+                       atomic_inc(&dest->svc->avail_dests);
+                       dest->svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+               }
+       } else {
+               /* We drop in here if the upper threshold is != 0 and the
+                  lower threshold is == 0. */
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD) {
+                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+                       atomic_inc(&dest->svc->avail_dests);
+                       dest->svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+               }
        }
 
        /*
@@ -1187,12 +1243,7 @@
                                ip_vs_conn_hash(ct);
                        }
                }
-
-               /*
-                * Simply decrease the refcnt of the template,
-                * don't restart its timer.
-                */
-               atomic_dec(&ct->refcnt);
+               __ip_vs_conn_put(ct);
                return 0;
        }
        return 1;
@@ -1270,7 +1321,7 @@
        ip_vs_conn_hash(cp);
 
   expire_later:
-       IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+       IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn.n_control=%d\n",
                  atomic_read(&cp->refcnt)-1,
                  atomic_read(&cp->n_control));
 
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_core.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_core.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_core.c        2005-10-27 17:17:13 
+0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_core.c        2005-11-16 16:24:31 
+0100
@@ -210,25 +210,32 @@
                         * vaddr,vport,daddr,dport> for non-ftp service,
                         * and <protocol,caddr,0,vaddr,0,daddr,0>
                         * for ftp service.
+                        *
+                        * If the destination is not in template mode (either
+                        * because it is running in overflow mode or because the
+                        * service is not persistent), fall back to not creating
+                        * a persistent template entry.
+                        *
                         */
-                       if (svc->port != FTPPORT)
-                               ct = ip_vs_conn_new(iph->protocol,
-                                                   snet, 0,
-                                                   iph->daddr, portp[1],
-                                                   dest->addr, dest->port,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       else
-                               ct = ip_vs_conn_new(iph->protocol,
-                                                   snet, 0,
-                                                   iph->daddr, 0,
-                                                   dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       if (ct == NULL)
-                               return NULL;
+                       if (dest->flags & IP_VS_DEST_F_PERSISTENT) {
+                               if (svc->port != FTPPORT)
+                                       ct = ip_vs_conn_new(iph->protocol,
+                                                           snet, 0,
+                                                           iph->daddr, 
portp[1],
+                                                           dest->addr, 
dest->port,
+                                                           
IP_VS_CONN_F_TEMPLATE,
+                                                           dest);
+                               else
+                                       ct = ip_vs_conn_new(iph->protocol,
+                                                           snet, 0,
+                                                           iph->daddr, 0,
+                                                           dest->addr, 0,
+                                                           
IP_VS_CONN_F_TEMPLATE,
+                                                           dest);
+                               if (ct == NULL)
+                                       return NULL;
 
-                       ct->timeout = svc->timeout;
+                               ct->timeout = svc->timeout;
+                       }
                } else {
                        /* set destination with the found template */
                        dest = ct->dest;
@@ -265,24 +272,26 @@
                        /*
                         * Create a template according to the service
                         */
-                       if (svc->fwmark)
-                               ct = ip_vs_conn_new(IPPROTO_IP,
-                                                   snet, 0,
-                                                   htonl(svc->fwmark), 0,
-                                                   dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       else
-                               ct = ip_vs_conn_new(iph->protocol,
-                                                   snet, 0,
-                                                   iph->daddr, 0,
-                                                   dest->addr, 0,
-                                                   IP_VS_CONN_F_TEMPLATE,
-                                                   dest);
-                       if (ct == NULL)
-                               return NULL;
+                       if (dest->flags & IP_VS_DEST_F_PERSISTENT) {
+                               if (svc->fwmark)
+                                       ct = ip_vs_conn_new(IPPROTO_IP,
+                                                           snet, 0,
+                                                           htonl(svc->fwmark), 
0,
+                                                           dest->addr, 0,
+                                                           
IP_VS_CONN_F_TEMPLATE,
+                                                           dest);
+                               else
+                                       ct = ip_vs_conn_new(iph->protocol,
+                                                           snet, 0,
+                                                           iph->daddr, 0,
+                                                           dest->addr, 0,
+                                                           
IP_VS_CONN_F_TEMPLATE,
+                                                           dest);
+                               if (ct == NULL)
+                                       return NULL;
 
-                       ct->timeout = svc->timeout;
+                               ct->timeout = svc->timeout;
+                       }
                } else {
                        /* set destination with the found template */
                        dest = ct->dest;
@@ -291,32 +300,45 @@
        }
 
        /*
-        *    Create a new connection according to the template
+        * Create a new connection according to the template
+        * Handle SPILLOVER pool servers that have no persistency.
+        * Note that we are in ip_vs_sched_persist(), but in case
+        * IP_VS_DEST_F_PERSISTENT is not set, we have not created a
+        * connection template which needs to be bound to the
+        * control connection. Keep that in mind when debugging
+        * this code, as it looks wrong at first. --ratz
         */
-       cp = ip_vs_conn_new(iph->protocol,
-                           iph->saddr, portp[0],
-                           iph->daddr, portp[1],
-                           dest->addr, dport,
-                           0,
-                           dest);
-       if (cp == NULL) {
+       if (dest->flags & IP_VS_DEST_F_PERSISTENT) {
+               cp = ip_vs_conn_new(iph->protocol,
+                                   iph->saddr, portp[0],
+                                   iph->daddr, portp[1],
+                                   dest->addr, dport,
+                                   0,
+                                   dest);
+               if (cp == NULL) {
+                       ip_vs_conn_put(ct);
+                       return NULL;
+               }
+
+               /*
+                *    Add its control
+                */
+               ip_vs_control_add(cp, ct);
                ip_vs_conn_put(ct);
-               return NULL;
+       } else {
+               /* handle non-persistent destinations */
+               cp = ip_vs_conn_new(iph->protocol,
+                                   iph->saddr, portp[0],
+                                   iph->daddr, portp[1],
+                                   dest->addr, dport,
+                                   0,
+                                   dest);
+               if (cp == NULL) {
+                       /* We don't need to invalidate ct here */
+                       return NULL;
+               }
        }
 
-       /*
-        *    Increase the inactive connection counter
-        *    because it is in Syn-Received
-        *    state (inactive) when the connection is created.
-        */
-       atomic_inc(&dest->inactconns);
-
-       /*
-        *    Add its control
-        */
-       ip_vs_control_add(cp, ct);
-
-       ip_vs_conn_put(ct);
        return cp;
 }
 
@@ -369,14 +391,8 @@
        if (cp == NULL)
                return NULL;
 
-       /*
-        *    Increase the inactive connection counter because it is in
-        *    Syn-Received state (inactive) when the connection is created.
-        */
-       atomic_inc(&dest->inactconns);
-
        IP_VS_DBG(6, "Schedule fwd:%c s:%s c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
-                 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+                 "d:%u.%u.%u.%u:%u flg:%X conn->refcnt:%d\n",
                  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1111,11 +1127,10 @@
                if (sysctl_ip_vs_expire_nodest_conn) {
                        /* try to expire the connection immediately */
                        ip_vs_conn_expire_now(cp);
-               } else {
-                       /* don't restart its timer, and silently
-                          drop the packet. */
-                       __ip_vs_conn_put(cp);
                }
+               /* don't restart its timer, and silently
+                  drop the packet. */
+               __ip_vs_conn_put(cp);
                return NF_DROP;
        }
 
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_ctl.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_ctl.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_ctl.c 2005-06-01 02:56:56 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_ctl.c 2005-11-17 13:50:26 +0100
@@ -17,6 +17,9 @@
  *              2 of the License, or (at your option) any later version.
  *
  * Changes:
+ *          Roberto Nibali, ratz: Implemented per RS threshold limitation based
+ *                                on 2.5.x code and former design in 2.2.x.
+ *                                Implemented pool and spillover architecture.
  *
  */
 
@@ -428,7 +431,7 @@
   out:
        read_unlock(&__ip_vs_svc_lock);
 
-       IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+       IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
                  fwmark, ip_vs_proto_name(protocol),
                  NIPQUAD(vaddr), ntohs(vport),
                  svc?"hit":"not hit");
@@ -590,7 +593,7 @@
        for (e=l->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
                IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
-                         "refcnt=%d\n",
+                         "dest->refcnt=%d\n",
                          dest->vfwmark,
                          NIPQUAD(dest->addr), ntohs(dest->port),
                          atomic_read(&dest->refcnt));
@@ -711,8 +714,26 @@
                }
        }
 
-       /* set the dest status flags */
+       /* set the dest status flags: overflow and persistency */
+       if (ur->dest_flags & IP_VS_DEST_F_OVERFLOW) {
+               dest->flags |= IP_VS_DEST_F_OVERFLOW;
+       }
+       if (ur->dest_flags & IP_VS_DEST_F_PERSISTENT) {
+               dest->flags |= IP_VS_DEST_F_PERSISTENT;
+       }
        dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+       if (ur->u_threshold == 0 || ur->u_threshold > dest->u_threshold) {
+               dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+               svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+       }
+       dest->u_threshold = ur->u_threshold;
+       dest->l_threshold = ur->l_threshold;
+       IP_VS_DBG(3, "__ip_vs_update_dest(): Added/updated RS [%s] with 
settings: "
+                    "addr = %u.%u.%u.%u, persistency = %d\n",
+                    (dest->flags & IP_VS_DEST_F_OVERFLOW) ? "SPILLOVER" : 
"POOL",
+                    NIPQUAD(ur->daddr),
+                    (dest->flags & IP_VS_DEST_F_PERSISTENT) ? 1 : 0);
 }
 
 
@@ -746,9 +767,12 @@
        dest->vfwmark = svc->fwmark;
        dest->addr = ur->daddr;
        dest->port = ur->dport;
+       /* should this be atomic set? */
+       dest->flags = ur->dest_flags;
 
        atomic_set(&dest->activeconns, 0);
        atomic_set(&dest->inactconns, 0);
+       atomic_set(&dest->persistconns, 0);
        atomic_set(&dest->refcnt, 0);
 
        INIT_LIST_HEAD(&dest->d_list);
@@ -796,7 +820,7 @@
        dest = ip_vs_trash_get_dest(svc, daddr, dport);
        if (dest != NULL) {
                IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
-                         "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+                         "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
                          NIPQUAD(daddr), ntohs(dport),
                          atomic_read(&dest->refcnt),
                          dest->vfwmark,
@@ -820,6 +844,9 @@
 
                list_add(&dest->n_list, &svc->destinations);
                svc->num_dests++;
+               if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+                       atomic_inc(&svc->avail_dests);
+               }
 
                /* call the update_service function of its scheduler */
                svc->scheduler->update_service(svc);
@@ -850,6 +877,9 @@
 
        list_add(&dest->n_list, &svc->destinations);
        svc->num_dests++;
+       if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+               atomic_inc(&svc->avail_dests);
+       }
 
        /* call the update_service function of its scheduler */
        svc->scheduler->update_service(svc);
@@ -935,7 +965,7 @@
                atomic_dec(&dest->svc->refcnt);
                kfree(dest);
        } else {
-               IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+               IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, dest->refcnt=%d\n",
                          NIPQUAD(dest->addr), ntohs(dest->port),
                          atomic_read(&dest->refcnt));
                list_add(&dest->n_list, &ip_vs_dest_trash);
@@ -958,6 +988,9 @@
         */
        list_del(&dest->n_list);
        svc->num_dests--;
+       if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+               atomic_dec(&svc->avail_dests);
+       }
        if (svcupd) {
                /*
                 *  Call the update_service function of its scheduler
@@ -1848,6 +1881,7 @@
                        entry.timeout = svc->timeout / HZ;
                        entry.netmask = svc->netmask;
                        entry.num_dests = svc->num_dests;
+                       entry.avail_dests = atomic_read(&svc->avail_dests);
                        __ip_vs_copy_stats(&entry.stats, &svc->stats);
                        if (copy_to_user(&uptr->entrytable[count],
                                         &entry, sizeof(entry))) {
@@ -1873,6 +1907,7 @@
                        entry.timeout = svc->timeout / HZ;
                        entry.netmask = svc->netmask;
                        entry.num_dests = svc->num_dests;
+                       entry.avail_dests = atomic_read(&svc->avail_dests);
                        __ip_vs_copy_stats(&entry.stats, &svc->stats);
                        if (copy_to_user(&uptr->entrytable[count],
                                         &entry, sizeof(entry))) {
@@ -1912,9 +1947,13 @@
                        entry.addr = dest->addr;
                        entry.port = dest->port;
                        entry.flags = atomic_read(&dest->conn_flags);
+                       entry.dest_flags = dest->flags;
                        entry.weight = atomic_read(&dest->weight);
+                       entry.u_threshold = dest->u_threshold;
+                       entry.l_threshold = dest->l_threshold;
                        entry.activeconns = atomic_read(&dest->activeconns);
                        entry.inactconns = atomic_read(&dest->inactconns);
+                       entry.persistconns = atomic_read(&dest->persistconns);
                        __ip_vs_copy_stats(&entry.stats, &dest->stats);
                        if (copy_to_user(&uptr->entrytable[count],
                                         &entry, sizeof(entry))) {
@@ -2028,6 +2067,7 @@
                        get.timeout = svc->timeout / HZ;
                        get.netmask = svc->netmask;
                        get.num_dests = svc->num_dests;
+                       get.avail_dests = atomic_read(&svc->avail_dests);
                        __ip_vs_copy_stats(&get.stats, &svc->stats);
                        if (copy_to_user(user, &get, *len) != 0)
                                ret = -EFAULT;
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_dh.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_dh.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_dh.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_dh.c  2005-11-17 16:33:14 +0100
@@ -103,7 +103,6 @@
                        dest = list_entry(p, struct ip_vs_dest, n_list);
                        atomic_inc(&dest->refcnt);
                        b->dest = dest;
-
                        p = p->next;
                }
                b++;
@@ -185,19 +184,6 @@
 
 
 /*
- *      If the number of active connections is twice larger than its weight,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
-       if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)*2) {
-               return 1;
-       }
-       return 0;
-}
-
-
-/*
  *      Destination hashing scheduling
  */
 static struct ip_vs_dest *
@@ -213,7 +199,7 @@
        if (!dest
            || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
            || atomic_read(&dest->weight) <= 0
-           || is_overloaded(dest)) {
+           || ip_vs_is_overloaded(dest)) {
                return NULL;
        }
 
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_hprio.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_hprio.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_hprio.c       1970-01-01 01:00:00 
+0100
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_hprio.c       2005-10-27 17:21:57 
+0200
@@ -0,0 +1,144 @@
+/*
+ * IPVS:        Weighted Round-Robin High Priority Scheduling module
+ *
+ * Version:     0.0.3, 2005/10/26
+ *
+ * Authors:     Roberto Nibali <ratz@xxxxxx>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Algorithm:
+ *             It's based on the rr scheduler, but it only schedules among
+ *             the servers with the highest weight and discards all other
+ *             servers which do not have the highest weight.
+ *
+ *             This can be used to simulate an atomic session LIP for peak
+ *             loads where the user space application doesn't have the chance
+ *             to react atomically.
+ *
+ * Changes:
+ *     0.0.2
+ *             - Removed wrr related code for list traversal: would crash the
+ *               whole kernel otherwise
+ *     0.0.3   - Ported to kernel 2.4
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <net/ip_vs.h>
+
+static int ip_vs_hprio_init_svc(struct ip_vs_service *svc) {
+        svc->sched_data = &svc->destinations;
+        return 0;
+}
+
/*
 *	No per-service state was allocated in init_svc, so there is
 *	nothing to release here.
 */
static int ip_vs_hprio_done_svc(struct ip_vs_service *svc)
{
	return 0;
}
+
+static int ip_vs_hprio_update_svc(struct ip_vs_service *svc) {
+        svc->sched_data = &svc->destinations;
+        return 0;
+}
+
+
+/*
+ *    Get the maximum weight of the service destinations.
+ */
+static int ip_vs_hprio_max_weight(struct ip_vs_service *svc) {
+        register struct list_head *l, *e;
+        struct ip_vs_dest *dest;
+        int weight = 0;
+
+        l = &svc->destinations;
+        for (e = l->next; e != l; e = e->next) {
+                dest = list_entry(e, struct ip_vs_dest, n_list);
+               IP_VS_DBG(1, "    weight: %d\n", atomic_read(&dest->weight));
+                if (atomic_read(&dest->weight) > weight &&
+                               !ip_vs_is_overloaded(dest)) {
+                        weight = atomic_read(&dest->weight);
+               }
+        }
+        IP_VS_DBG(1, "max weight: %d\n", weight);
+        return weight;
+}
+
+        
+/*
+ *    Weighted Round-Robin Highest Priority Scheduling
+ */
+static struct ip_vs_dest* ip_vs_hprio_schedule(struct ip_vs_service *svc,
+                                             struct iphdr *iph) {
+       register struct list_head *p, *q;
+        struct ip_vs_dest *dest;
+       int max_weight;
+
+        IP_VS_DBG(1, "ip_vs_hprio_schedule(): Scheduling...\n");
+
+       write_lock(&svc->sched_lock);
+       max_weight = ip_vs_hprio_max_weight(svc);
+        p = (struct list_head *)svc->sched_data;
+        p = p->next;
+        q = p;
+        do {
+                if (q == &svc->destinations) {
+                        q = q->next;
+                        continue;
+                }
+                dest = list_entry(q, struct ip_vs_dest, n_list);
+               /*
+                  The check for an overloaded destination is done in the
+                  ip_vs_hprio_max_weight() function. With that we don't need
+                  to quiesce destination servers by setting their weights to 0
+                */
+                if (atomic_read(&dest->weight) > 0 &&
+                               atomic_read(&dest->weight) == max_weight) {
+                        goto out;
+               }
+                q = q->next;
+        } while (q != p);
+       write_unlock(&svc->sched_lock);
+        return NULL;
+
+  out:
+        svc->sched_data = q;
+       write_unlock(&svc->sched_lock);
+        IP_VS_DBG(3, "HPRIO: server %d.%d.%d.%d:%d "
+                  "activeconns %d refcnt %d weight %d\n",
+                  NIPQUAD(dest->addr), ntohs(dest->port),
+                  atomic_read(&dest->activeconns),
+                  atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+        return dest;
+}
+
+static struct ip_vs_scheduler ip_vs_hprio_scheduler = {
+       {0},                      /* n_list */
+       "hprio",                  /* name */
+       ATOMIC_INIT(0),           /* refcnt */
+       THIS_MODULE,              /* this module */
+       ip_vs_hprio_init_svc,     /* service initializer */
+       ip_vs_hprio_done_svc,     /* service done */
+       ip_vs_hprio_update_svc,   /* service updater */
+       ip_vs_hprio_schedule,     /* select a server from the destination list 
*/
+};
+
+static int __init ip_vs_hprio_init(void) {
+       IP_VS_INFO("HPRIO scheduling module loaded.\n");
+        INIT_LIST_HEAD(&ip_vs_hprio_scheduler.n_list);
+       return register_ip_vs_scheduler(&ip_vs_hprio_scheduler);
+}
+
+static void __exit ip_vs_hprio_cleanup(void) {
+       unregister_ip_vs_scheduler(&ip_vs_hprio_scheduler);
+}
+
+module_init(ip_vs_hprio_init);
+module_exit(ip_vs_hprio_cleanup);
+MODULE_LICENSE("GPL");
+
+
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblc.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblc.c        2004-04-14 15:05:41 
+0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblc.c        2005-11-17 17:06:58 
+0100
@@ -207,33 +207,6 @@
 }
 
 
-#if 0000
-/*
- *     Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
- *     returns bool success.
- */
-static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
-                            struct ip_vs_lblc_entry *en)
-{
-       if (list_empty(&en->list)) {
-               IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
-                         "called from %p\n", __builtin_return_address(0));
-               return 0;
-       }
-
-       /*
-        * Remove it from the table
-        */
-       write_lock(&tbl->lock);
-       list_del(&en->list);
-       INIT_LIST_HEAD(&en->list);
-       write_unlock(&tbl->lock);
-
-       return 1;
-}
-#endif
-
-
 /*
  *  Get ip_vs_lblc_entry associated with supplied parameters.
  */
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lc.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lc.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lc.c  2005-11-17 14:06:20 +0100
@@ -79,10 +79,10 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry (e, struct ip_vs_dest, n_list);
-               if (atomic_read(&least->weight) > 0) {
-                       loh = ip_vs_lc_dest_overhead(least);
-                       goto nextstage;
-               }
+               if (!ipvs_dest_select(least))
+                       continue;
+               loh = ip_vs_lc_dest_overhead(least);
+               goto nextstage;
        }
        return NULL;
 
@@ -92,7 +92,7 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
-               if (atomic_read(&dest->weight) == 0)
+               if (!ipvs_dest_select(dest))
                        continue;
                doh = ip_vs_lc_dest_overhead(dest);
                if (doh < loh) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_rr.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_rr.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_rr.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_rr.c  2005-11-17 13:53:14 +0100
@@ -68,7 +68,7 @@
                        continue;
                }
                dest = list_entry(q, struct ip_vs_dest, n_list);
-               if (atomic_read(&dest->weight) > 0)
+               if (ipvs_dest_select(dest))
                        /* HIT */
                        goto out;
                q = q->next;
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sed.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sed.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sed.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sed.c 2005-11-17 16:39:36 +0100
@@ -103,6 +103,8 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry(e, struct ip_vs_dest, n_list);
+               if (!ipvs_dest_select(least))
+                       continue;
                if (atomic_read(&least->weight) > 0) {
                        loh = ip_vs_sed_dest_overhead(least);
                        goto nextstage;
@@ -116,6 +118,8 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
+               if (!ipvs_dest_select(dest))
+                       continue;
                doh = ip_vs_sed_dest_overhead(dest);
                if (loh * atomic_read(&dest->weight) >
                    doh * atomic_read(&least->weight)) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sh.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sh.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sh.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sh.c  2005-11-17 16:32:47 +0100
@@ -182,19 +182,6 @@
 
 
 /*
- *      If the number of active connections is twice larger than its weight,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
-       if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)*2) {
-               return 1;
-       }
-       return 0;
-}
-
-
-/*
  *      Source Hashing scheduling
  */
 static struct ip_vs_dest *
@@ -210,7 +197,7 @@
        if (!dest
            || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
            || atomic_read(&dest->weight) <= 0
-           || is_overloaded(dest)) {
+           || ip_vs_is_overloaded(dest)) {
                return NULL;
        }
 
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wlc.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wlc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wlc.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wlc.c 2005-11-17 14:05:43 +0100
@@ -91,10 +91,10 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry(e, struct ip_vs_dest, n_list);
-               if (atomic_read(&least->weight) > 0) {
-                       loh = ip_vs_wlc_dest_overhead(least);
-                       goto nextstage;
-               }
+               if (!ipvs_dest_select(least))
+                       continue;
+               loh = ip_vs_wlc_dest_overhead(least);
+               goto nextstage;
        }
        return NULL;
 
@@ -104,7 +104,8 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
-
+               if (!ipvs_dest_select(dest))
+                       continue;
                doh = ip_vs_wlc_dest_overhead(dest);
                if (loh * atomic_read(&dest->weight) >
                    doh * atomic_read(&least->weight)) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wrr.c 
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wrr.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wrr.c 2005-04-04 03:42:20 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wrr.c 2005-11-17 14:08:01 +0100
@@ -154,14 +154,16 @@
 {
        struct ip_vs_dest *dest;
        struct ip_vs_wrr_mark *mark = svc->sched_data;
+       struct list_head *p;
 
        IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
 
        /*
-        * This loop will always terminate, because 0<mark->cw<max_weight,
+        * This loop will always terminate, because 0 < mark->cw < max_weight,
         * and at least one server has its weight equal to max_weight.
         */
        write_lock(&svc->sched_lock);
+       p = mark->cl;
        while (1) {
                if (mark->cl == &svc->destinations) {
                        /* it is at the head of the destination list */
@@ -187,17 +189,29 @@
                                        return NULL;
                                }
                        }
-               }
-               else mark->cl = mark->cl->next;
+               } else
+                       mark->cl = mark->cl->next;
 
                if (mark->cl != &svc->destinations) {
                        /* not at the head of the list */
                        dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
-                       if (atomic_read(&dest->weight) >= mark->cw) {
+                       if (ipvs_dest_select(dest)
+                           && atomic_read(&dest->weight) >= mark->cw) {
                                write_unlock(&svc->sched_lock);
                                break;
                        }
                }
+
+               if (mark->cl == p) {
+                       /*
+                          We're back to the start and no dest has been found.
+                          That is only possible if all dests are OVERLOADED.
+                          This while loop in ip_vs_wrr_schedule() should be
+                          adapted to match the 2.6.x kernel version, using goto.
+                        */
+                       write_unlock(&svc->sched_lock);
+                       return NULL;
+               }
        }
 
        IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
@@ -206,8 +220,7 @@
                  atomic_read(&dest->activeconns),
                  atomic_read(&dest->refcnt),
                  atomic_read(&dest->weight));
-
-       return  dest;
+       return dest;
 }
 
 




<Prev in Thread] Current Thread [Next in Thread>
  • [PATCH-2.4 1/2] Threshold Limitation & Overflow Server Pool, Roberto Nibali <=