To: "LinuxVirtualServer.org users mailing list." <lvs-users@xxxxxxxxxxxxxxxxxxxxxx>
Subject: Re: [PATCH 2.4] add per real server threshold limitation against ipvsadm-1.21-11
From: Roberto Nibali <ratz@xxxxxx>
Date: Fri, 04 Nov 2005 11:14:04 +0100
Hi,

This is the second drop of the patch, mucho improved. Horms, I'm halfway
through implementing the session limitation server pool. I'd like to use
our idea instead of the hprio scheduler one day; however, we never really
finished that code back then :).

> o added support for threshold limitation in 2.4.x kernels
> o string_to_number() should use unsigned int
> o POPT_ARGFLAG_OPTIONAL is unfortunately still not available on
>   all Linux systems, so drop it.
>   - Actually either drop it or rewrite the stuff without the popt
>     dependency, which would also make it easier to extend
>   - Or rewrite the stuff from scratch with netlink support (also needs
>     kernel part support for netlink, as discussed with Horms)
> o Change 65535 to UINT16_MAX
> o Add support for LDFLAGS passing so I can link the thing statically

o added more output, like persistent connection count and flag status
o added '-o' flag to add an OVERFLOW/SPILLOVER server
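
For reference, the service shown below could be set up along these lines.
This is only a sketch: the -x/-y options for the upper/lower thresholds
are an assumption borrowed from the 2.6.x ipvsadm syntax; only the -o
spillover flag is actually new in this drop:

# ipvsadm-2.4 -A -t 212.243.13.20:80 -s hprio -p 90
# ipvsadm-2.4 -a -t 212.243.13.20:80 -r 212.243.13.23:80 -g -w 5 -x 10 -y 7
# ipvsadm-2.4 -a -t 212.243.13.20:80 -r 212.243.13.21:80 -g -w 5 -x 10 -y 7
# ipvsadm-2.4 -a -t 212.243.13.20:80 -r 212.243.13.13:80 -g -w 1 -o
# ipvsadm-2.4 -a -t 212.243.13.20:80 -r 212.243.13.14:80 -g -w 1 -o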

A typical output with 2 RS and 2 spillover servers looks as follows:

# ipvsadm-2.4 -L -n --thresholds

IP Virtual Server version 1.0.12 (size=4096)
Prot LocalAddress:Port            Forward Weight Uthreshold Lthreshold ActiveConn InActConn  PersConn   RS-Usage   RS-Status
  -> RemoteAddress:Port
TCP  212.243.13.20:80 hprio persistent 90 mask 255.255.255.255
avail_dests: 2
  -> 212.243.13.13:80             Route   1      0          0          0          0          0          SPILLOVER  ONLINE
  -> 212.243.13.14:80             Route   1      0          0          0          0          0          SPILLOVER  ONLINE
  -> 212.243.13.23:80             Route   5      10         7          0          0          0          POOL       ONLINE
  -> 212.243.13.21:80             Route   5      10         7          0          0          0          POOL       ONLINE

The overload situation looks as follows (the transition is atomic inside
the kernel):

IP Virtual Server version 1.0.12 (size=4096)
Prot LocalAddress:Port            Forward Weight Uthreshold Lthreshold ActiveConn InActConn  PersConn   RS-Usage   RS-Status
  -> RemoteAddress:Port
TCP  212.243.13.20:80 hprio persistent 90 mask 255.255.255.255
avail_dests: 0 OVERLOAD
  -> 212.243.13.13:80             Route   1      0          0          4          9          17         SPILLOVER  ONLINE
  -> 212.243.13.14:80             Route   1      0          0          4          8          16         SPILLOVER  ONLINE
  -> 212.243.13.23:80             Route   5      10         7          2          21         41         POOL       OVERLOADED
  -> 212.243.13.21:80             Route   5      10         7          2          21         41         POOL       OVERLOADED
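
To make the Uthreshold/Lthreshold hysteresis above concrete: a RS gets
flagged OVERLOADED once activeconns + inactconns reaches the upper
threshold, and the flag is only cleared again once the total has drained
down to the lower threshold. A minimal single-threaded user space model
of the bind/unbind logic (the kernel patch below does the same thing with
atomics and under locks):

#include <stdio.h>

#define IP_VS_DEST_F_OVERLOAD 0x0002            /* same bit as in the patch */

struct dest {
        unsigned flags;
        unsigned u_threshold;                   /* 10 for the POOL RS above */
        unsigned l_threshold;                   /* 7 for the POOL RS above */
        int activeconns, inactconns;
};

/* bind side: flag the RS once it reaches the upper threshold
   (a threshold of 0 means "no limit", as in the patch) */
static void on_new_conn(struct dest *d)
{
        d->inactconns++;        /* new conns start inactive (SYN_RECV) */
        if (d->u_threshold != 0 &&
            d->activeconns + d->inactconns >= d->u_threshold)
                d->flags |= IP_VS_DEST_F_OVERLOAD;
}

/* unbind side: unflag only after draining down to the lower threshold */
static void on_conn_gone(struct dest *d)
{
        d->inactconns--;        /* the demo only ever uses inactive conns */
        if (d->l_threshold != 0 &&
            d->activeconns + d->inactconns <= d->l_threshold)
                d->flags &= ~IP_VS_DEST_F_OVERLOAD;
}

int main(void)
{
        struct dest d = { 0, 10, 7, 0, 0 };
        int i;

        for (i = 0; i < 12; i++)
                on_new_conn(&d);        /* crosses 10: OVERLOAD gets set */
        printf("12 conns: overloaded=%d\n", !!(d.flags & IP_VS_DEST_F_OVERLOAD));
        for (i = 0; i < 4; i++)
                on_conn_gone(&d);       /* down to 8, still above 7 */
        printf(" 8 conns: overloaded=%d\n", !!(d.flags & IP_VS_DEST_F_OVERLOAD));
        on_conn_gone(&d);               /* 7 == l_threshold: flag cleared */
        printf(" 7 conns: overloaded=%d\n", !!(d.flags & IP_VS_DEST_F_OVERLOAD));
        return 0;
}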

What do you think about this? I will need to have it finished by Monday.
Right now the hprio scheduler selects RS from the POOL servers until they
are all OVERLOADED or quiesced; only then does it select from the
SPILLOVER servers.
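
To illustrate that policy, here is a stand-alone user space sketch of the
selection (with the weights from the example above; the real scheduler in
the patch walks svc->destinations under sched_lock instead of an array):

#include <stdio.h>

#define IP_VS_DEST_F_OVERLOAD 0x0002

struct dest {
        const char *name;
        int weight;
        unsigned flags;
};

static int is_overloaded(const struct dest *d)
{
        return d->flags & IP_VS_DEST_F_OVERLOAD;
}

/* highest weight among the non-overloaded dests,
   mirrors ip_vs_hprio_max_weight() */
static int max_weight(const struct dest *tab, int n)
{
        int i, w = 0;

        for (i = 0; i < n; i++)
                if (!is_overloaded(&tab[i]) && tab[i].weight > w)
                        w = tab[i].weight;
        return w;
}

/* round-robin over the dests whose weight equals the current maximum;
   because the SPILLOVER RS run at weight 1 and the POOL RS at weight 5,
   the spillover RS only become eligible once all pool RS are OVERLOADED */
static const struct dest *hprio_pick(const struct dest *tab, int n, int *rr)
{
        int w = max_weight(tab, n), i;

        if (w == 0)
                return NULL;    /* everything overloaded or quiesced */
        for (i = 0; i < n; i++) {
                const struct dest *d = &tab[(*rr + i) % n];
                if (!is_overloaded(d) && d->weight == w) {
                        *rr = (*rr + i + 1) % n;
                        return d;
                }
        }
        return NULL;
}

int main(void)
{
        struct dest tab[] = {
                { "212.243.13.23:80 (POOL)",      5, 0 },
                { "212.243.13.21:80 (POOL)",      5, 0 },
                { "212.243.13.13:80 (SPILLOVER)", 1, 0 },
                { "212.243.13.14:80 (SPILLOVER)", 1, 0 },
        };
        int rr = 0, i;

        for (i = 0; i < 2; i++)         /* the pool RS get all the traffic */
                printf("-> %s\n", hprio_pick(tab, 4, &rr)->name);

        tab[0].flags = tab[1].flags = IP_VS_DEST_F_OVERLOAD;
        for (i = 0; i < 2; i++)         /* now only the spillover RS */
                printf("-> %s\n", hprio_pick(tab, 4, &rr)->name);
        return 0;
}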

Best regards,
Roberto Nibali, ratz
-- 
-------------------------------------------------------------
addr://Kasinostrasse 30, CH-5001 Aarau tel://++41 62 823 9355
http://www.terreactive.com             fax://++41 62 823 9356
-------------------------------------------------------------
terreActive AG                       Wir sichern Ihren Erfolg
-------------------------------------------------------------
diff -X dontdiff -Nur linux-2.4.32-orig/include/net/ip_vs.h linux-2.4.32-pab2/include/net/ip_vs.h
--- linux-2.4.32-orig/include/net/ip_vs.h       2005-10-27 17:17:13 +0200
+++ linux-2.4.32-pab2/include/net/ip_vs.h       2005-11-03 15:18:26 +0100
@@ -19,11 +19,16 @@
  */
 #define IP_VS_SVC_F_PERSISTENT        0x0001    /* persistent port */
 #define IP_VS_SVC_F_HASHED            0x0002    /* hashed entry */
+#define IP_VS_SVC_F_OVERLOAD          0x0004    /* service overloaded */
 
 /*
  *      Destination Server Flags
  */
 #define IP_VS_DEST_F_AVAILABLE        0x0001    /* Available tag */
+#define IP_VS_DEST_F_OVERLOAD         0x0002    /* server is overloaded */
+#define IP_VS_DEST_F_OVERFLOW         0x0004    /* RS is overflow server */
+#define IP_VS_DEST_F_PERSISTENT       0x0008    /* RS is an overflow server
+                                                  and has persistence set */
 
 /*
  *      IPVS sync daemon states
@@ -113,8 +118,11 @@
        /* destination specific options */
        u_int32_t       daddr;          /* destination address */
        u_int16_t       dport;
-       unsigned        conn_flags;     /* destination flags */
+       unsigned        conn_flags;     /* connection flags */
+       unsigned        dest_flags;     /* destination flags */
        int             weight;         /* destination weight */
+       u_int32_t       u_threshold;    /* upper threshold */
+       u_int32_t       l_threshold;    /* lower threshold */
 };
 
 
@@ -165,6 +173,8 @@
 
        /* number of real servers */
        unsigned int    num_dests;
+       /* amount of available real servers (IP_VS_DEST_F_OVERLOAD not set) */
+       unsigned int    avail_dests;
 
        /* statistics */
        struct ip_vs_stats_user stats;
@@ -173,10 +183,14 @@
 struct ip_vs_dest_user {
        u_int32_t       addr;           /* destination address */
        u_int16_t       port;
-       unsigned        flags;          /* destination flags */
+       unsigned        flags;          /* connection flags */
+       unsigned        dest_flags;     /* destination flags */
        int             weight;         /* destination weight */
        u_int32_t       activeconns;    /* active connections */
        u_int32_t       inactconns;     /* inactive connections */
+       u_int32_t       u_threshold;    /* upper threshold */
+       u_int32_t       l_threshold;    /* lower threshold */
+       u_int32_t       persistconns;   /* persistent connections */
 
        /* statistics */
        struct ip_vs_stats_user stats;
@@ -192,6 +206,8 @@
 
        /* number of real servers */
        unsigned int    num_dests;
+       /* amount of available real servers (IP_VS_DEST_F_OVERLOAD not set) */
+       unsigned int    avail_dests;
 
        /* the real servers */
        struct ip_vs_dest_user entrytable[0];
@@ -464,6 +480,7 @@
 
        struct list_head        destinations;  /* real server d-linked list */
        __u32                   num_dests;     /* number of servers */
+       atomic_t                avail_dests;   /* available real servers */
        struct ip_vs_stats      stats;         /* statistics for the service */
 
        /* for scheduling */
@@ -486,11 +503,16 @@
        unsigned                flags;    /* dest status flags */
        atomic_t                weight;   /* server weight */
        atomic_t                conn_flags;     /* flags to copy to conn */
-       atomic_t                activeconns;    /* active connections */
-       atomic_t                inactconns;     /* inactive connections */
        atomic_t                refcnt;         /* reference counter */
        struct ip_vs_stats      stats;          /* statistics */
 
+       /* connection counters and thresholds */
+       atomic_t                activeconns;    /* active connections */
+       atomic_t                inactconns;     /* inactive connections */
+       atomic_t                persistconns;   /* persistent connections */
+       __u32                   u_threshold;    /* upper threshold */
+       __u32                   l_threshold;    /* lower threshold */
+
        /* for destination cache */
        spinlock_t              dst_lock;       /* lock dst_cache */
        struct dst_entry        *dst_cache;     /* destination cache entry */
@@ -935,6 +957,13 @@
        return 0;
 }
 
+/*
+ *     Server overloaded? 
+ */
+static inline int ip_vs_is_overloaded(struct ip_vs_dest *dest) {
+       return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _IP_VS_H */
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/Config.in linux-2.4.32-pab2/net/ipv4/ipvs/Config.in
--- linux-2.4.32-orig/net/ipv4/ipvs/Config.in   2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/Config.in   2005-10-27 17:22:03 +0200
@@ -19,6 +19,7 @@
   dep_tristate '  source hashing scheduling' CONFIG_IP_VS_SH $CONFIG_IP_VS
  dep_tristate '  shortest expected delay scheduling' CONFIG_IP_VS_SED $CONFIG_IP_VS
   dep_tristate '  never queue scheduling' CONFIG_IP_VS_NQ $CONFIG_IP_VS
+  dep_tristate '  highest weight round-robin scheduling' CONFIG_IP_VS_HPRIO $CONFIG_IP_VS
   comment 'IPVS application helper'
   dep_tristate '  FTP protocol helper' CONFIG_IP_VS_FTP $CONFIG_IP_VS
 fi
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/Makefile linux-2.4.32-pab2/net/ipv4/ipvs/Makefile
--- linux-2.4.32-orig/net/ipv4/ipvs/Makefile    2003-11-28 19:26:21 +0100
+++ linux-2.4.32-pab2/net/ipv4/ipvs/Makefile    2005-10-27 17:22:03 +0200
@@ -33,6 +33,7 @@
 obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
 obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
 obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_HPRIO) += ip_vs_hprio.o
 
 # IPVS application helpers
 obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_conn.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_conn.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_conn.c        2005-10-27 17:20:58 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_conn.c        2005-11-03 15:53:30 +0100
@@ -21,6 +21,7 @@
  * and others. Many code here is taken from IP MASQ code of kernel 2.2.
  *
  * Changes:
+ *     Roberto Nibali, ratz: backported per RS threshold limitation from 2.5.x
  *
  */
 
@@ -233,7 +234,7 @@
        if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
                cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
 
-       IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+       IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
                  ip_vs_proto_name(protocol),
                  NIPQUAD(s_addr), ntohs(s_port),
                  NIPQUAD(d_addr), ntohs(d_port),
@@ -268,7 +269,7 @@
 out:
        ct_read_unlock(hash);
 
-       IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+       IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
                ip_vs_proto_name(protocol),
                NIPQUAD(s_addr), ntohs(s_port),
                NIPQUAD(d_addr), ntohs(d_port),
@@ -335,7 +336,8 @@
 
 
 /*
- *     Timeout table[state]
+ *     Timeout table[state] (FIN_WAIT changed from the stock value of
+ *             [IP_VS_S_FIN_WAIT]      =       2*60*HZ)
  */
 struct ip_vs_timeout_table vs_timeout_table = {
        ATOMIC_INIT(0), /* refcnt */
@@ -345,7 +347,7 @@
                [IP_VS_S_ESTABLISHED]   =       15*60*HZ,
                [IP_VS_S_SYN_SENT]      =       2*60*HZ,
                [IP_VS_S_SYN_RECV]      =       1*60*HZ,
-               [IP_VS_S_FIN_WAIT]      =       2*60*HZ,
+               [IP_VS_S_FIN_WAIT]      =       10*HZ,
                [IP_VS_S_TIME_WAIT]     =       2*60*HZ,
                [IP_VS_S_CLOSE]         =       10*HZ,
                [IP_VS_S_CLOSE_WAIT]    =       60*HZ,
@@ -561,7 +563,7 @@
                struct ip_vs_dest *dest = cp->dest;
 
                IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
-                         "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+                         "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
                          ip_vs_proto_name(cp->protocol),
                          (state_off==VS_STATE_OUTPUT)?"output ":"input ",
                          th->syn? 'S' : '.',
@@ -1077,6 +1079,11 @@
        }
 }
 
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+       return atomic_read(&dest->activeconns)
+               + atomic_read(&dest->inactconns);
+}
 
 /*
  *  Bind a connection entry with a virtual service destination
@@ -1096,8 +1103,9 @@
        cp->flags |= atomic_read(&dest->conn_flags);
        cp->dest = dest;
 
-       IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
-                 "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
+       IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+                 "d:%u.%u.%u.%u:%d fwd:%c s:%s conn->flg:%X conn->refcnt:%d "
+                 "dest->refcnt:%d\n",
                  ip_vs_proto_name(cp->protocol),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1105,6 +1113,29 @@
                  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
                  cp->flags, atomic_read(&cp->refcnt),
                  atomic_read(&dest->refcnt));
+
+       /* Update connection counters */
+       if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+               /* It is a normal connection, so increase the inactive
+                  connection counter because it is in TCP SYNRECV
+                  state (inactive) or another protocol's inactive state */
+               atomic_inc(&dest->inactconns);
+       } else {
+               /* It is a persistent connection/template, so increase
+                  the persistent connection counter */
+               atomic_inc(&dest->persistconns);
+       }
+
+       IP_VS_DBG(3, "Bind-dest: Threshold handling: avail_dests=%d\n",
+                       atomic_read(&dest->svc->avail_dests));
+       if (dest->u_threshold != 0 &&
+           ip_vs_dest_totalconns(dest) >= dest->u_threshold) {
+               dest->flags |= IP_VS_DEST_F_OVERLOAD;
+               if (atomic_dec_and_test(&dest->svc->avail_dests)) {
+                       /* All RS for this service are overloaded */
+                       dest->svc->flags |= IP_VS_SVC_F_OVERLOAD;
+               }
+       }
 }
 
 
@@ -1120,9 +1151,9 @@
        if (!dest)
                return;
 
-       IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
+       IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d "
                  "v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
-                 "s:%s flg:%X cnt:%d destcnt:%d\n",
+                 "s:%s flg:%X conn->refcnt:%d dest->refcnt:%d\n",
                  ip_vs_proto_name(cp->protocol),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1131,16 +1162,38 @@
                  cp->flags, atomic_read(&cp->refcnt),
                  atomic_read(&dest->refcnt));
 
-       /*
-        * Decrease the inactconns or activeconns counter
-        * if it is not a connection template
-        */
-       if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
-               if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+       /* Update the connection counters */
+       if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+               /* It is a normal connection, so decrease the inactconns
+                  or activeconns counter */
+               if (cp->flags & IP_VS_CONN_F_INACTIVE) {
                        atomic_dec(&dest->inactconns);
                } else {
                        atomic_dec(&dest->activeconns);
                }
+       } else {
+               /* It is a persistent connection/template, so decrease
+                  the persistent connection counter */
+               atomic_dec(&dest->persistconns);
+       }
+
+       IP_VS_DBG(3, "Unbind-dest: Threshold handling: avail_dests=%d\n",
+                       atomic_read(&dest->svc->avail_dests));
+       if (dest->l_threshold != 0) {
+               /* This implies that the upper threshold is != 0 as well */
+               if (ip_vs_dest_totalconns(dest) <= dest->l_threshold) {
+                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+                       atomic_inc(&dest->svc->avail_dests);
+                       dest->svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+               }
+       } else {
+               /* We drop in here if the upper threshold is != 0 and the
+                  lower threshold is ==0. */
+               if (dest->flags & IP_VS_DEST_F_OVERLOAD) {
+                       dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+                       atomic_inc(&dest->svc->avail_dests);
+                       dest->svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+               }
        }
 
        /*
@@ -1187,12 +1240,7 @@
                                ip_vs_conn_hash(ct);
                        }
                }
-
-               /*
-                * Simply decrease the refcnt of the template,
-                * don't restart its timer.
-                */
-               atomic_dec(&ct->refcnt);
+               __ip_vs_conn_put(ct);
                return 0;
        }
        return 1;
@@ -1270,7 +1318,7 @@
        ip_vs_conn_hash(cp);
 
   expire_later:
-       IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+       IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn.n_control=%d\n",
                  atomic_read(&cp->refcnt)-1,
                  atomic_read(&cp->n_control));
 
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_core.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_core.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_core.c        2005-10-27 17:17:13 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_core.c        2005-11-03 09:21:51 +0100
@@ -305,13 +305,6 @@
        }
 
        /*
-        *    Increase the inactive connection counter
-        *    because it is in Syn-Received
-        *    state (inactive) when the connection is created.
-        */
-       atomic_inc(&dest->inactconns);
-
-       /*
         *    Add its control
         */
        ip_vs_control_add(cp, ct);
@@ -369,14 +362,8 @@
        if (cp == NULL)
                return NULL;
 
-       /*
-        *    Increase the inactive connection counter because it is in
-        *    Syn-Received state (inactive) when the connection is created.
-        */
-       atomic_inc(&dest->inactconns);
-
        IP_VS_DBG(6, "Schedule fwd:%c s:%s c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
-                 "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+                 "d:%u.%u.%u.%u:%u flg:%X conn->refcnt:%d\n",
                  ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
                  NIPQUAD(cp->caddr), ntohs(cp->cport),
                  NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1111,11 +1098,10 @@
                if (sysctl_ip_vs_expire_nodest_conn) {
                        /* try to expire the connection immediately */
                        ip_vs_conn_expire_now(cp);
-               } else {
-                       /* don't restart its timer, and silently
-                          drop the packet. */
-                       __ip_vs_conn_put(cp);
                }
+               /* don't restart its timer, and silently
+                  drop the packet. */
+               __ip_vs_conn_put(cp);
                return NF_DROP;
        }
 
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_ctl.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_ctl.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_ctl.c 2005-06-01 02:56:56 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_ctl.c 2005-11-03 15:58:33 +0100
@@ -17,6 +17,8 @@
  *              2 of the License, or (at your option) any later version.
  *
  * Changes:
+ *     Roberto Nibali, ratz: Implemented per RS threshold limitation based
+ *                           on 2.5.x code and former design in 2.2.x.
  *
  */
 
@@ -428,7 +430,7 @@
   out:
        read_unlock(&__ip_vs_svc_lock);
 
-       IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+       IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
                  fwmark, ip_vs_proto_name(protocol),
                  NIPQUAD(vaddr), ntohs(vport),
                  svc?"hit":"not hit");
@@ -590,7 +592,7 @@
        for (e=l->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
                IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
-                         "refcnt=%d\n",
+                         "dest->refcnt=%d\n",
                          dest->vfwmark,
                          NIPQUAD(dest->addr), ntohs(dest->port),
                          atomic_read(&dest->refcnt));
@@ -712,7 +714,17 @@
        }
 
        /* set the dest status flags */
+       if (ur->dest_flags & IP_VS_DEST_F_OVERFLOW) {
+               dest->flags |= IP_VS_DEST_F_OVERFLOW;
+       }
        dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+       if (ur->u_threshold == 0 || ur->u_threshold > dest->u_threshold) {
+               dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+               svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+       }
+       dest->u_threshold = ur->u_threshold;
+       dest->l_threshold = ur->l_threshold;
 }
 
 
@@ -746,9 +758,12 @@
        dest->vfwmark = svc->fwmark;
        dest->addr = ur->daddr;
        dest->port = ur->dport;
+       /* should this be atomic set? */
+       dest->flags = ur->dest_flags;
 
        atomic_set(&dest->activeconns, 0);
        atomic_set(&dest->inactconns, 0);
+       atomic_set(&dest->persistconns, 0);
        atomic_set(&dest->refcnt, 0);
 
        INIT_LIST_HEAD(&dest->d_list);
@@ -796,7 +811,7 @@
        dest = ip_vs_trash_get_dest(svc, daddr, dport);
        if (dest != NULL) {
                IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
-                         "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+                         "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
                          NIPQUAD(daddr), ntohs(dport),
                          atomic_read(&dest->refcnt),
                          dest->vfwmark,
@@ -820,6 +835,9 @@
 
                list_add(&dest->n_list, &svc->destinations);
                svc->num_dests++;
+               if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+                       atomic_inc(&svc->avail_dests);
+               }
 
                /* call the update_service function of its scheduler */
                svc->scheduler->update_service(svc);
@@ -850,6 +868,9 @@
 
        list_add(&dest->n_list, &svc->destinations);
        svc->num_dests++;
+       if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+               atomic_inc(&svc->avail_dests);
+       }
 
        /* call the update_service function of its scheduler */
        svc->scheduler->update_service(svc);
@@ -935,7 +956,7 @@
                atomic_dec(&dest->svc->refcnt);
                kfree(dest);
        } else {
-               IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+               IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, dest->refcnt=%d\n",
                          NIPQUAD(dest->addr), ntohs(dest->port),
                          atomic_read(&dest->refcnt));
                list_add(&dest->n_list, &ip_vs_dest_trash);
@@ -958,6 +979,9 @@
         */
        list_del(&dest->n_list);
        svc->num_dests--;
+       if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+               atomic_dec(&svc->avail_dests);
+       }
        if (svcupd) {
                /*
                 *  Call the update_service function of its scheduler
@@ -1848,6 +1872,7 @@
                        entry.timeout = svc->timeout / HZ;
                        entry.netmask = svc->netmask;
                        entry.num_dests = svc->num_dests;
+                       entry.avail_dests = atomic_read(&svc->avail_dests);
                        __ip_vs_copy_stats(&entry.stats, &svc->stats);
                        if (copy_to_user(&uptr->entrytable[count],
                                         &entry, sizeof(entry))) {
@@ -1873,6 +1898,7 @@
                        entry.timeout = svc->timeout / HZ;
                        entry.netmask = svc->netmask;
                        entry.num_dests = svc->num_dests;
+                       entry.avail_dests = atomic_read(&svc->avail_dests);
                        __ip_vs_copy_stats(&entry.stats, &svc->stats);
                        if (copy_to_user(&uptr->entrytable[count],
                                         &entry, sizeof(entry))) {
@@ -1912,9 +1938,13 @@
                        entry.addr = dest->addr;
                        entry.port = dest->port;
                        entry.flags = atomic_read(&dest->conn_flags);
+                       entry.dest_flags = dest->flags;
                        entry.weight = atomic_read(&dest->weight);
+                       entry.u_threshold = dest->u_threshold;
+                       entry.l_threshold = dest->l_threshold;
                        entry.activeconns = atomic_read(&dest->activeconns);
                        entry.inactconns = atomic_read(&dest->inactconns);
+                       entry.persistconns = atomic_read(&dest->persistconns);
                        __ip_vs_copy_stats(&entry.stats, &dest->stats);
                        if (copy_to_user(&uptr->entrytable[count],
                                         &entry, sizeof(entry))) {
@@ -2028,6 +2058,7 @@
                        get.timeout = svc->timeout / HZ;
                        get.netmask = svc->netmask;
                        get.num_dests = svc->num_dests;
+                       get.avail_dests = atomic_read(&svc->avail_dests);
                        __ip_vs_copy_stats(&get.stats, &svc->stats);
                        if (copy_to_user(user, &get, *len) != 0)
                                ret = -EFAULT;
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_dh.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_dh.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_dh.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_dh.c  2005-10-27 17:22:03 +0200
@@ -185,19 +185,6 @@
 
 
 /*
- *      If the number of active connections is twice larger than its weight,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
-       if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)*2) {
-               return 1;
-       }
-       return 0;
-}
-
-
-/*
  *      Destination hashing scheduling
  */
 static struct ip_vs_dest *
@@ -213,7 +200,7 @@
        if (!dest
            || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
            || atomic_read(&dest->weight) <= 0
-           || is_overloaded(dest)) {
+           || ip_vs_is_overloaded(dest)) {
                return NULL;
        }
 
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_hprio.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_hprio.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_hprio.c       1970-01-01 01:00:00 +0100
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_hprio.c       2005-10-27 17:21:57 +0200
@@ -0,0 +1,144 @@
+/*
+ * IPVS:        Weighted Round-Robin High Priority Scheduling module
+ *
+ * Version:     0.0.3, 2005/10/26
+ *
+ * Authors:     Roberto Nibali <ratz@xxxxxx>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Algorithm:
+ *             It is based on the rr scheduler, but it only schedules the
+ *             servers carrying the highest weight and skips all servers
+ *             with a lower weight.
+ *
+ *             This can be used to simulate an atomic session limitation for
+ *             peak loads where the user space application doesn't get the
+ *             chance to react atomically.
+ *
+ * Changes:
+ *     0.0.2
+ *             - Removed wrr related code for list traversal: would crash the
+ *               whole kernel otherwise
+ *     0.0.3   - Ported to kernel 2.4
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <net/ip_vs.h>
+
+static int ip_vs_hprio_init_svc(struct ip_vs_service *svc) {
+        svc->sched_data = &svc->destinations;
+        return 0;
+}
+
+static int ip_vs_hprio_done_svc(struct ip_vs_service *svc) {
+        return 0;
+}
+
+static int ip_vs_hprio_update_svc(struct ip_vs_service *svc) {
+        svc->sched_data = &svc->destinations;
+        return 0;
+}
+
+
+/*
+ *    Get the maximum weight of the service destinations.
+ */
+static int ip_vs_hprio_max_weight(struct ip_vs_service *svc) {
+        register struct list_head *l, *e;
+        struct ip_vs_dest *dest;
+        int weight = 0;
+
+        l = &svc->destinations;
+        for (e = l->next; e != l; e = e->next) {
+                dest = list_entry(e, struct ip_vs_dest, n_list);
+               IP_VS_DBG(1, "    weight: %d\n", atomic_read(&dest->weight));
+                if (atomic_read(&dest->weight) > weight &&
+                               !ip_vs_is_overloaded(dest)) {
+                        weight = atomic_read(&dest->weight);
+               }
+        }
+        IP_VS_DBG(1, "max weight: %d\n", weight);
+        return weight;
+}
+
+        
+/*
+ *    Weighted Round-Robin Highest Priority Scheduling
+ */
+static struct ip_vs_dest* ip_vs_hprio_schedule(struct ip_vs_service *svc,
+                                             struct iphdr *iph) {
+       register struct list_head *p, *q;
+        struct ip_vs_dest *dest;
+       int max_weight;
+
+        IP_VS_DBG(1, "ip_vs_hprio_schedule(): Scheduling...\n");
+
+       write_lock(&svc->sched_lock);
+       max_weight = ip_vs_hprio_max_weight(svc);
+        p = (struct list_head *)svc->sched_data;
+        p = p->next;
+        q = p;
+        do {
+                if (q == &svc->destinations) {
+                        q = q->next;
+                        continue;
+                }
+                dest = list_entry(q, struct ip_vs_dest, n_list);
+               /*
+                  Overloaded destinations are skipped here as well as in
+                  ip_vs_hprio_max_weight(), so we don't need to quiesce
+                  destination servers by setting their weights to 0
+                */
+                if (max_weight > 0 && !ip_vs_is_overloaded(dest) &&
+                               atomic_read(&dest->weight) == max_weight) {
+                        goto out;
+               }
+                q = q->next;
+        } while (q != p);
+       write_unlock(&svc->sched_lock);
+        return NULL;
+
+  out:
+        svc->sched_data = q;
+       write_unlock(&svc->sched_lock);
+        IP_VS_DBG(3, "HPRIO: server %u.%u.%u.%u:%u "
+                  "activeconns %d refcnt %d weight %d\n",
+                  NIPQUAD(dest->addr), ntohs(dest->port),
+                  atomic_read(&dest->activeconns),
+                  atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+        return dest;
+}
+
+static struct ip_vs_scheduler ip_vs_hprio_scheduler = {
+       {0},                      /* n_list */
+       "hprio",                  /* name */
+       ATOMIC_INIT(0),           /* refcnt */
+       THIS_MODULE,              /* this module */
+       ip_vs_hprio_init_svc,     /* service initializer */
+       ip_vs_hprio_done_svc,     /* service done */
+       ip_vs_hprio_update_svc,   /* service updater */
+       ip_vs_hprio_schedule,     /* select a server from the destination list */
+};
+
+static int __init ip_vs_hprio_init(void) {
+       IP_VS_INFO("HPRIO scheduling module loaded.\n");
+        INIT_LIST_HEAD(&ip_vs_hprio_scheduler.n_list);
+       return register_ip_vs_scheduler(&ip_vs_hprio_scheduler);
+}
+
+static void __exit ip_vs_hprio_cleanup(void) {
+       unregister_ip_vs_scheduler(&ip_vs_hprio_scheduler);
+}
+
+module_init(ip_vs_hprio_init);
+module_exit(ip_vs_hprio_cleanup);
+MODULE_LICENSE("GPL");
+
+
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblc.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblc.c        2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblc.c        2005-10-27 17:22:03 +0200
@@ -473,6 +473,8 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(least))
+                       continue;
                if (atomic_read(&least->weight) > 0) {
                        loh = atomic_read(&least->activeconns) * 50
                                + atomic_read(&least->inactconns);
@@ -487,6 +489,8 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(dest))
+                       continue;
                doh = atomic_read(&dest->activeconns) * 50
                        + atomic_read(&dest->inactconns);
                if (loh * atomic_read(&dest->weight) >
@@ -558,7 +562,8 @@
                ip_vs_lblc_hash(tbl, en);
        } else {
                dest = en->dest;
                if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
+                   || ip_vs_is_overloaded(dest)
                    || atomic_read(&dest->weight) <= 0
                    || is_overloaded(dest, svc)) {
                        dest = __ip_vs_wlc_schedule(svc, iph);
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblcr.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblcr.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblcr.c       2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblcr.c       2005-10-27 17:22:03 +0200
@@ -178,6 +178,8 @@
        /* select the first destination server, whose weight > 0 */
        for (e=set->list; e!=NULL; e=e->next) {
                least = e->dest;
+               if (ip_vs_is_overloaded(least))
+                       continue;
                if ((atomic_read(&least->weight) > 0)
                    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
                        loh = atomic_read(&least->activeconns) * 50
@@ -192,6 +194,8 @@
   nextstage:
        for (e=e->next; e!=NULL; e=e->next) {
                dest = e->dest;
+               if (ip_vs_is_overloaded(dest))
+                       continue;
                doh = atomic_read(&dest->activeconns) * 50
                        + atomic_read(&dest->inactconns);
                if ((loh * atomic_read(&dest->weight) >
@@ -723,6 +727,8 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(least))
+                       continue;
                if (atomic_read(&least->weight) > 0) {
                        loh = atomic_read(&least->activeconns) * 50
                                + atomic_read(&least->inactconns);
@@ -737,6 +743,8 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(dest))
+                       continue;
                doh = atomic_read(&dest->activeconns) * 50
                        + atomic_read(&dest->inactconns);
                if (loh * atomic_read(&dest->weight) >
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lc.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lc.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lc.c  2005-10-27 17:22:03 +0200
@@ -79,6 +79,8 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry (e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(least))
+                       continue;
                if (atomic_read(&least->weight) > 0) {
                        loh = ip_vs_lc_dest_overhead(least);
                        goto nextstage;
@@ -92,7 +94,8 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
-               if (atomic_read(&dest->weight) == 0)
+               if (ip_vs_is_overloaded(dest)
+                   || atomic_read(&dest->weight) == 0)
                        continue;
                doh = ip_vs_lc_dest_overhead(dest);
                if (doh < loh) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_nq.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_nq.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_nq.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_nq.c  2005-10-27 17:22:03 +0200
@@ -99,6 +99,8 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(least))
+                       continue;
                if (atomic_read(&least->weight) > 0) {
                        loh = ip_vs_nq_dest_overhead(least);
 
@@ -117,6 +119,8 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(dest))
+                       continue;
                doh = ip_vs_nq_dest_overhead(dest);
 
                /* return the server directly if it is idle */
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_rr.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_rr.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_rr.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_rr.c  2005-10-27 17:22:03 +0200
@@ -68,7 +68,8 @@
                        continue;
                }
                dest = list_entry(q, struct ip_vs_dest, n_list);
-               if (atomic_read(&dest->weight) > 0)
+               if (!ip_vs_is_overloaded(dest)
+                   && atomic_read(&dest->weight) > 0)
                        /* HIT */
                        goto out;
                q = q->next;
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sed.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sed.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sed.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sed.c 2005-10-27 17:22:03 +0200
@@ -103,6 +103,8 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(least))
+                       continue;
                if (atomic_read(&least->weight) > 0) {
                        loh = ip_vs_sed_dest_overhead(least);
                        goto nextstage;
@@ -116,6 +118,8 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(dest))
+                       continue;
                doh = ip_vs_sed_dest_overhead(dest);
                if (loh * atomic_read(&dest->weight) >
                    doh * atomic_read(&least->weight)) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sh.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sh.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sh.c  2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sh.c  2005-10-27 17:22:03 +0200
@@ -182,19 +182,6 @@
 
 
 /*
- *      If the number of active connections is twice larger than its weight,
- *      consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
-       if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)*2) {
-               return 1;
-       }
-       return 0;
-}
-
-
-/*
  *      Source Hashing scheduling
  */
 static struct ip_vs_dest *
@@ -210,7 +197,7 @@
        if (!dest
            || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
            || atomic_read(&dest->weight) <= 0
-           || is_overloaded(dest)) {
+           || ip_vs_is_overloaded(dest)) {
                return NULL;
        }
 
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wlc.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wlc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wlc.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wlc.c 2005-10-27 17:22:03 +0200
@@ -91,6 +91,8 @@
        l = &svc->destinations;
        for (e=l->next; e!=l; e=e->next) {
                least = list_entry(e, struct ip_vs_dest, n_list);
+               if (ip_vs_is_overloaded(least))
+                       continue;
                if (atomic_read(&least->weight) > 0) {
                        loh = ip_vs_wlc_dest_overhead(least);
                        goto nextstage;
@@ -104,7 +106,8 @@
   nextstage:
        for (e=e->next; e!=l; e=e->next) {
                dest = list_entry(e, struct ip_vs_dest, n_list);
-
+               if (ip_vs_is_overloaded(dest))
+                       continue;
                doh = ip_vs_wlc_dest_overhead(dest);
                if (loh * atomic_read(&dest->weight) >
                    doh * atomic_read(&least->weight)) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wrr.c linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wrr.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wrr.c 2005-04-04 03:42:20 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wrr.c 2005-10-27 17:22:03 +0200
@@ -154,14 +154,16 @@
 {
        struct ip_vs_dest *dest;
        struct ip_vs_wrr_mark *mark = svc->sched_data;
+       struct list_head *p;
 
        IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
 
        /*
-        * This loop will always terminate, because 0<mark->cw<max_weight,
+        * This loop will always terminate, because 0 < mark->cw < max_weight,
         * and at least one server has its weight equal to max_weight.
         */
        write_lock(&svc->sched_lock);
+       p = mark->cl;
        while (1) {
                if (mark->cl == &svc->destinations) {
                        /* it is at the head of the destination list */
@@ -187,17 +189,29 @@
                                        return NULL;
                                }
                        }
-               }
-               else mark->cl = mark->cl->next;
+               } else
+                       mark->cl = mark->cl->next;
 
                if (mark->cl != &svc->destinations) {
                        /* not at the head of the list */
                        dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
-                       if (atomic_read(&dest->weight) >= mark->cw) {
+                       if (!ip_vs_is_overloaded(dest)
+                           && atomic_read(&dest->weight) >= mark->cw) {
                                write_unlock(&svc->sched_lock);
                                break;
                        }
                }
+
+               if (mark->cl == p) {
+                       /*
+                          We are back at the start and no dest has been found;
+                          this is only possible if all dests are OVERLOADED.
+                          This while loop in ip_vs_wrr_schedule() should be
+                          adapted to match the 2.6.x kernel function, using goto.
+                        */
+                       write_unlock(&svc->sched_lock);
+                       return NULL;
+               }
        }
 
        IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
@@ -206,8 +220,7 @@
                  atomic_read(&dest->activeconns),
                  atomic_read(&dest->refcnt),
                  atomic_read(&dest->weight));
-
-       return  dest;
+       return dest;
 }
 
 



