Hi,
This is the second drop of the patch, mucho improved. Horms, I'm halfway
through implementing the session limitation server pool. I'd like to use
our idea instead of the hprio scheduler one day, however we didn't
really finish that code back then :).
> o added support for threshold limitation in 2.4.x kernels
> o string_to_number() should use unsigned int
> o POPT_ARGFLAG_OPTIONAL is unfortunately still not available on
> all Linux systems, so drop it.
> - Actually either drop it or rewrite the stuff without the popt
> dependency, which would also make it easier to extend
> - Or rewrite the stuff from scratch with netlink support (also needs
> kernel part support for netlink, as discussed with Horms)
> o Change 65535 to UINT16_MAX
> o Add support for LDFLAGS passing so I can link the thing statically
o added more output, like persistent connection count and flag status
o added '-o' flag to add an OVERFLOW/SPILLOVER server
A typical output with 2 RS and 2 spillover server looks as follows:
#ipvsadm-2.4 -L -n --thresholds
IP Virtual Server version 1.0.12 (size=4096)
Prot LocalAddress:Port Forward Weight Uthreshold Lthreshold
ActiveConn InActConn PersConn RS-Usage RS-Status
-> RemoteAddress:Port
TCP 212.243.13.20:80 hprio persistent 90 mask 255.255.255.255
avail_dests: 2
-> 212.243.13.13:80 Route 1 0 0 0
0 0 SPILLOVER ONLINE
-> 212.243.13.14:80 Route 1 0 0 0
0 0 SPILLOVER ONLINE
-> 212.243.13.23:80 Route 5 10 7 0
0 0 POOL ONLINE
-> 212.243.13.21:80 Route 5 10 7 0
0 0 POOL ONLINE
The overload situation looks as follows (and is kernel-wise atomic):
IP Virtual Server version 1.0.12 (size=4096)
Prot LocalAddress:Port Forward Weight Uthreshold Lthreshold
ActiveConn InActConn PersConn RS-Usage RS-Status
-> RemoteAddress:Port
TCP 212.243.13.20:80 hprio persistent 90 mask 255.255.255.255
avail_dests: 0 OVERLOAD
-> 212.243.13.13:80 Route 1 0 0 4
9 17 SPILLOVER ONLINE
-> 212.243.13.14:80 Route 1 0 0 4
8 16 SPILLOVER ONLINE
-> 212.243.13.23:80 Route 5 10 7 2
21 41 POOL OVERLOADED
-> 212.243.13.21:80 Route 5 10 7 2
21 41 POOL OVERLOADED
What do you think about this? I will need to have it finished by Monday.
Right now the hprio scheduler selects the RS from the pool until they
are OVERLOADED or quiesced. Then it will select the SPILLOVER pool.
Best regards,
Roberto Nibali, ratz
--
-------------------------------------------------------------
addr://Kasinostrasse 30, CH-5001 Aarau tel://++41 62 823 9355
http://www.terreactive.com fax://++41 62 823 9356
-------------------------------------------------------------
terreActive AG Wir sichern Ihren Erfolg
-------------------------------------------------------------
diff -X dontdiff -Nur linux-2.4.32-orig/include/net/ip_vs.h
linux-2.4.32-pab2/include/net/ip_vs.h
--- linux-2.4.32-orig/include/net/ip_vs.h 2005-10-27 17:17:13 +0200
+++ linux-2.4.32-pab2/include/net/ip_vs.h 2005-11-03 15:18:26 +0100
@@ -19,11 +19,16 @@
*/
#define IP_VS_SVC_F_PERSISTENT 0x0001 /* persistent port */
#define IP_VS_SVC_F_HASHED 0x0002 /* hashed entry */
+#define IP_VS_SVC_F_OVERLOAD 0x0004 /* service overloaded */
/*
* Destination Server Flags
*/
#define IP_VS_DEST_F_AVAILABLE 0x0001 /* Available tag */
+#define IP_VS_DEST_F_OVERLOAD 0x0002 /* server is overloaded */
+#define IP_VS_DEST_F_OVERFLOW 0x0004 /* RS is overflow server */
+#define IP_VS_DEST_F_PERSISTENT 0x0008 /* RS is overflow server
+ and has persistency set */
/*
* IPVS sync daemon states
@@ -113,8 +118,11 @@
/* destination specific options */
u_int32_t daddr; /* destination address */
u_int16_t dport;
- unsigned conn_flags; /* destination flags */
+ unsigned conn_flags; /* connection flags */
+ unsigned dest_flags; /* destination flags */
int weight; /* destination weight */
+ u_int32_t u_threshold; /* upper threshold */
+ u_int32_t l_threshold; /* lower threshold */
};
@@ -165,6 +173,8 @@
/* number of real servers */
unsigned int num_dests;
+ /* amount of available real servers (IP_VS_DEST_F_OVERLOAD not set) */
+ unsigned int avail_dests;
/* statistics */
struct ip_vs_stats_user stats;
@@ -173,10 +183,14 @@
struct ip_vs_dest_user {
u_int32_t addr; /* destination address */
u_int16_t port;
- unsigned flags; /* destination flags */
+ unsigned flags; /* connection flags */
+ unsigned dest_flags; /* destination flags */
int weight; /* destination weight */
u_int32_t activeconns; /* active connections */
u_int32_t inactconns; /* inactive connections */
+ u_int32_t u_threshold; /* upper threshold */
+ u_int32_t l_threshold; /* lower threshold */
+ u_int32_t persistconns; /* persistent connections */
/* statistics */
struct ip_vs_stats_user stats;
@@ -192,6 +206,8 @@
/* number of real servers */
unsigned int num_dests;
+ /* amount of available real servers (IP_VS_DEST_F_OVERLOAD not set) */
+ unsigned int avail_dests;
/* the real servers */
struct ip_vs_dest_user entrytable[0];
@@ -464,6 +480,7 @@
struct list_head destinations; /* real server d-linked list */
__u32 num_dests; /* number of servers */
+ atomic_t avail_dests; /* available real servers */
struct ip_vs_stats stats; /* statistics for the service */
/* for scheduling */
@@ -486,11 +503,16 @@
unsigned flags; /* dest status flags */
atomic_t weight; /* server weight */
atomic_t conn_flags; /* flags to copy to conn */
- atomic_t activeconns; /* active connections */
- atomic_t inactconns; /* inactive connections */
atomic_t refcnt; /* reference counter */
struct ip_vs_stats stats; /* statistics */
+ /* connection counters and thresholds */
+ atomic_t activeconns; /* active connections */
+ atomic_t inactconns; /* inactive connections */
+ atomic_t persistconns; /* persistent connections */
+ __u32 u_threshold; /* upper threshold */
+ __u32 l_threshold; /* lower threshold */
+
/* for destination cache */
spinlock_t dst_lock; /* lock dst_cache */
struct dst_entry *dst_cache; /* destination cache entry */
@@ -935,6 +957,13 @@
return 0;
}
+/*
+ * Server overloaded?
+ */
+static inline int ip_vs_is_overloaded(struct ip_vs_dest *dest) {
+ return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
#endif /* __KERNEL__ */
#endif /* _IP_VS_H */
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/Config.in
linux-2.4.32-pab2/net/ipv4/ipvs/Config.in
--- linux-2.4.32-orig/net/ipv4/ipvs/Config.in 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/Config.in 2005-10-27 17:22:03 +0200
@@ -19,6 +19,7 @@
dep_tristate ' source hashing scheduling' CONFIG_IP_VS_SH $CONFIG_IP_VS
dep_tristate ' shortest expected delay scheduling' CONFIG_IP_VS_SED
$CONFIG_IP_VS
dep_tristate ' never queue scheduling' CONFIG_IP_VS_NQ $CONFIG_IP_VS
+ dep_tristate ' highest weight round-robin scheduling' CONFIG_IP_VS_HPRIO
$CONFIG_IP_VS
comment 'IPVS application helper'
dep_tristate ' FTP protocol helper' CONFIG_IP_VS_FTP $CONFIG_IP_VS
fi
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/Makefile
linux-2.4.32-pab2/net/ipv4/ipvs/Makefile
--- linux-2.4.32-orig/net/ipv4/ipvs/Makefile 2003-11-28 19:26:21 +0100
+++ linux-2.4.32-pab2/net/ipv4/ipvs/Makefile 2005-10-27 17:22:03 +0200
@@ -33,6 +33,7 @@
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+obj-$(CONFIG_IP_VS_HPRIO) += ip_vs_hprio.o
# IPVS application helpers
obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_conn.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_conn.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_conn.c 2005-10-27 17:20:58
+0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_conn.c 2005-11-03 15:53:30
+0100
@@ -21,6 +21,7 @@
* and others. Many code here is taken from IP MASQ code of kernel 2.2.
*
* Changes:
+ * Roberto Nibali, ratz: backported per RS threshold limitation from 2.5.x
*
*/
@@ -233,7 +234,7 @@
if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
- IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+ IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
@@ -268,7 +269,7 @@
out:
ct_read_unlock(hash);
- IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d
%s\n",
+ IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d
%s\n",
ip_vs_proto_name(protocol),
NIPQUAD(s_addr), ntohs(s_port),
NIPQUAD(d_addr), ntohs(d_port),
@@ -335,7 +336,8 @@
/*
- * Timeout table[state]
+ * Timeout table[state] changes
+ * [IP_VS_S_FIN_WAIT] = 2*60*HZ,
*/
struct ip_vs_timeout_table vs_timeout_table = {
ATOMIC_INIT(0), /* refcnt */
@@ -345,7 +347,7 @@
[IP_VS_S_ESTABLISHED] = 15*60*HZ,
[IP_VS_S_SYN_SENT] = 2*60*HZ,
[IP_VS_S_SYN_RECV] = 1*60*HZ,
- [IP_VS_S_FIN_WAIT] = 2*60*HZ,
+ [IP_VS_S_FIN_WAIT] = 10*HZ,
[IP_VS_S_TIME_WAIT] = 2*60*HZ,
[IP_VS_S_CLOSE] = 10*HZ,
[IP_VS_S_CLOSE_WAIT] = 60*HZ,
@@ -561,7 +563,7 @@
struct ip_vs_dest *dest = cp->dest;
IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
- "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+ "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n",
ip_vs_proto_name(cp->protocol),
(state_off==VS_STATE_OUTPUT)?"output ":"input ",
th->syn? 'S' : '.',
@@ -1077,6 +1079,11 @@
}
}
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->activeconns)
+ + atomic_read(&dest->inactconns);
+}
/*
* Bind a connection entry with a virtual service destination
@@ -1096,8 +1103,9 @@
cp->flags |= atomic_read(&dest->conn_flags);
cp->dest = dest;
- IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
- "d:%u.%u.%u.%u:%d fwd:%c s:%s flg:%X cnt:%d destcnt:%d\n",
+ IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+ "d:%u.%u.%u.%u:%d fwd:%c s:%s conn->flg:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1105,6 +1113,29 @@
ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
cp->flags, atomic_read(&cp->refcnt),
atomic_read(&dest->refcnt));
+
+ /* Update connection counters */
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so increase the inactive
+ connection counter because it is in TCP SYNRECV
+ state (inactive) or other protocol inactive state */
+ atomic_inc(&dest->inactconns);
+ } else {
+ /* It is a persistent connection/template, so increase
+ the persistent connection counter */
+ atomic_inc(&dest->persistconns);
+ }
+
+ IP_VS_DBG(3, "Bind-dest: Threshold handling: avail_dests=%d\n",
+ atomic_read(&dest->svc->avail_dests));
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && dest->u_threshold != 0 &&
+ ip_vs_dest_totalconns(dest) >= dest->u_threshold) {
+ dest->flags |= IP_VS_DEST_F_OVERLOAD;
+ if (atomic_dec_and_test(&dest->svc->avail_dests)) {
+ /* All RS for this service are overloaded */
+ dest->svc->flags |= IP_VS_SVC_F_OVERLOAD;
+ }
+ }
}
@@ -1120,9 +1151,9 @@
if (!dest)
return;
- IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d "
+ IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d "
"v:%u.%u.%u.%u:%d d:%u.%u.%u.%u:%d fwd:%c "
- "s:%s flg:%X cnt:%d destcnt:%d\n",
+ "s:%s flg:%X conn->refcnt:%d dest->refcnt:%d\n",
ip_vs_proto_name(cp->protocol),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1131,16 +1162,38 @@
cp->flags, atomic_read(&cp->refcnt),
atomic_read(&dest->refcnt));
- /*
- * Decrease the inactconns or activeconns counter
- * if it is not a connection template
- */
- if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
- if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ /* Update the connection counters */
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so decrease the inactconns
+ or activeconns counter */
+ if (cp->flags & IP_VS_CONN_F_INACTIVE) {
atomic_dec(&dest->inactconns);
} else {
atomic_dec(&dest->activeconns);
}
+ } else {
+ /* It is a persistent connection/template, so decrease
+ the persistent connection counter */
+ atomic_dec(&dest->persistconns);
+ }
+
+ IP_VS_DBG(3, "Unbind-dest: Threshold handling: avail_dests=%d\n",
+ atomic_read(&dest->svc->avail_dests));
+ if (dest->l_threshold != 0) {
+ /* This implies that the upper threshold is != 0 as well */
+ if ((dest->flags & IP_VS_DEST_F_OVERLOAD) && ip_vs_dest_totalconns(dest) <= dest->l_threshold) {
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ atomic_inc(&dest->svc->avail_dests);
+ dest->svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+ }
+ } else {
+ /* We drop in here if the upper threshold is != 0 and the
+ lower threshold is ==0. */
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD) {
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ atomic_inc(&dest->svc->avail_dests);
+ dest->svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+ }
}
/*
@@ -1187,12 +1240,7 @@
ip_vs_conn_hash(ct);
}
}
-
- /*
- * Simply decrease the refcnt of the template,
- * don't restart its timer.
- */
- atomic_dec(&ct->refcnt);
+ __ip_vs_conn_put(ct);
return 0;
}
return 1;
@@ -1270,7 +1318,7 @@
ip_vs_conn_hash(cp);
expire_later:
- IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+ IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn.n_control=%d\n",
atomic_read(&cp->refcnt)-1,
atomic_read(&cp->n_control));
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_core.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_core.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_core.c 2005-10-27 17:17:13
+0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_core.c 2005-11-03 09:21:51
+0100
@@ -305,13 +305,6 @@
}
/*
- * Increase the inactive connection counter
- * because it is in Syn-Received
- * state (inactive) when the connection is created.
- */
- atomic_inc(&dest->inactconns);
-
- /*
* Add its control
*/
ip_vs_control_add(cp, ct);
@@ -369,14 +362,8 @@
if (cp == NULL)
return NULL;
- /*
- * Increase the inactive connection counter because it is in
- * Syn-Received state (inactive) when the connection is created.
- */
- atomic_inc(&dest->inactconns);
-
IP_VS_DBG(6, "Schedule fwd:%c s:%s c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
- "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+ "d:%u.%u.%u.%u:%u flg:%X conn->refcnt:%d\n",
ip_vs_fwd_tag(cp), ip_vs_state_name(cp->state),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
@@ -1111,11 +1098,10 @@
if (sysctl_ip_vs_expire_nodest_conn) {
/* try to expire the connection immediately */
ip_vs_conn_expire_now(cp);
- } else {
- /* don't restart its timer, and silently
- drop the packet. */
- __ip_vs_conn_put(cp);
}
+ /* don't restart its timer, and silently
+ drop the packet. */
+ __ip_vs_conn_put(cp);
return NF_DROP;
}
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_ctl.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_ctl.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_ctl.c 2005-06-01 02:56:56 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_ctl.c 2005-11-03 15:58:33 +0100
@@ -17,6 +17,8 @@
* 2 of the License, or (at your option) any later version.
*
* Changes:
+ * Roberto Nibali, ratz: Implemented per RS threshold limitation based
+ * on 2.5.x code and former design in 2.2.x.
*
*/
@@ -428,7 +430,7 @@
out:
read_unlock(&__ip_vs_svc_lock);
- IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+ IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
NIPQUAD(vaddr), ntohs(vport),
svc?"hit":"not hit");
@@ -590,7 +592,7 @@
for (e=l->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
- "refcnt=%d\n",
+ "dest->refcnt=%d\n",
dest->vfwmark,
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
@@ -712,7 +714,17 @@
}
/* set the dest status flags */
+ if (ur->dest_flags & IP_VS_DEST_F_OVERFLOW) {
+ dest->flags |= IP_VS_DEST_F_OVERFLOW;
+ }
dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+ if ((dest->flags & IP_VS_DEST_F_OVERLOAD) && (ur->u_threshold == 0 || ur->u_threshold > dest->u_threshold)) {
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ atomic_inc(&svc->avail_dests); svc->flags &= ~IP_VS_SVC_F_OVERLOAD;
+ }
+ dest->u_threshold = ur->u_threshold;
+ dest->l_threshold = ur->l_threshold;
}
@@ -746,9 +758,12 @@
dest->vfwmark = svc->fwmark;
dest->addr = ur->daddr;
dest->port = ur->dport;
+ /* should this be atomic set? */
+ dest->flags = ur->dest_flags;
atomic_set(&dest->activeconns, 0);
atomic_set(&dest->inactconns, 0);
+ atomic_set(&dest->persistconns, 0);
atomic_set(&dest->refcnt, 0);
INIT_LIST_HEAD(&dest->d_list);
@@ -796,7 +811,7 @@
dest = ip_vs_trash_get_dest(svc, daddr, dport);
if (dest != NULL) {
IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
- "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+ "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
NIPQUAD(daddr), ntohs(dport),
atomic_read(&dest->refcnt),
dest->vfwmark,
@@ -820,6 +835,9 @@
list_add(&dest->n_list, &svc->destinations);
svc->num_dests++;
+ if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+ atomic_inc(&svc->avail_dests);
+ }
/* call the update_service function of its scheduler */
svc->scheduler->update_service(svc);
@@ -850,6 +868,9 @@
list_add(&dest->n_list, &svc->destinations);
svc->num_dests++;
+ if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+ atomic_inc(&svc->avail_dests);
+ }
/* call the update_service function of its scheduler */
svc->scheduler->update_service(svc);
@@ -935,7 +956,7 @@
atomic_dec(&dest->svc->refcnt);
kfree(dest);
} else {
- IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash,
refcnt=%d\n",
+ IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash,
dest->refcnt=%d\n",
NIPQUAD(dest->addr), ntohs(dest->port),
atomic_read(&dest->refcnt));
list_add(&dest->n_list, &ip_vs_dest_trash);
@@ -958,6 +979,9 @@
*/
list_del(&dest->n_list);
svc->num_dests--;
+ if (!(dest->flags & IP_VS_DEST_F_OVERFLOW)) {
+ atomic_dec(&svc->avail_dests);
+ }
if (svcupd) {
/*
* Call the update_service function of its scheduler
@@ -1848,6 +1872,7 @@
entry.timeout = svc->timeout / HZ;
entry.netmask = svc->netmask;
entry.num_dests = svc->num_dests;
+ entry.avail_dests = atomic_read(&svc->avail_dests);
__ip_vs_copy_stats(&entry.stats, &svc->stats);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
@@ -1873,6 +1898,7 @@
entry.timeout = svc->timeout / HZ;
entry.netmask = svc->netmask;
entry.num_dests = svc->num_dests;
+ entry.avail_dests = atomic_read(&svc->avail_dests);
__ip_vs_copy_stats(&entry.stats, &svc->stats);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
@@ -1912,9 +1938,13 @@
entry.addr = dest->addr;
entry.port = dest->port;
entry.flags = atomic_read(&dest->conn_flags);
+ entry.dest_flags = dest->flags;
entry.weight = atomic_read(&dest->weight);
+ entry.u_threshold = dest->u_threshold;
+ entry.l_threshold = dest->l_threshold;
entry.activeconns = atomic_read(&dest->activeconns);
entry.inactconns = atomic_read(&dest->inactconns);
+ entry.persistconns = atomic_read(&dest->persistconns);
__ip_vs_copy_stats(&entry.stats, &dest->stats);
if (copy_to_user(&uptr->entrytable[count],
&entry, sizeof(entry))) {
@@ -2028,6 +2058,7 @@
get.timeout = svc->timeout / HZ;
get.netmask = svc->netmask;
get.num_dests = svc->num_dests;
+ get.avail_dests = atomic_read(&svc->avail_dests);
__ip_vs_copy_stats(&get.stats, &svc->stats);
if (copy_to_user(user, &get, *len) != 0)
ret = -EFAULT;
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_dh.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_dh.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_dh.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_dh.c 2005-10-27 17:22:03 +0200
@@ -185,19 +185,6 @@
/*
- * If the number of active connections is twice larger than its weight,
- * consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
- if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)*2) {
- return 1;
- }
- return 0;
-}
-
-
-/*
* Destination hashing scheduling
*/
static struct ip_vs_dest *
@@ -213,7 +200,7 @@
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
- || is_overloaded(dest)) {
+ || ip_vs_is_overloaded(dest)) {
return NULL;
}
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_hprio.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_hprio.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_hprio.c 1970-01-01 01:00:00
+0100
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_hprio.c 2005-10-27 17:21:57
+0200
@@ -0,0 +1,144 @@
+/*
+ * IPVS: Weighted Round-Robin High Priority Scheduling module
+ *
+ * Version: 0.0.3, 2005/10/26
+ *
+ * Authors: Roberto Nibali <ratz@xxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Algorithm:
+ * It's based on the rr scheduler but it only takes the highest
+ * weight and then discards all other servers which have not
+ * the highest weight.
+ *
+ * This can be used to simulate an atomic session LIP for peak
+ * loads where the user space application doesn't have the chance
+ * to react atomically.
+ *
+ * Changes:
+ * 0.0.2
+ * - Removed wrr related code for list traversal: would crash the
+ * whole kernel otherwise
+ * 0.0.3 - Ported to kernel 2.4
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <net/ip_vs.h>
+
+static int ip_vs_hprio_init_svc(struct ip_vs_service *svc) {
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+static int ip_vs_hprio_done_svc(struct ip_vs_service *svc) {
+ return 0;
+}
+
+static int ip_vs_hprio_update_svc(struct ip_vs_service *svc) {
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+/*
+ * Get the maximum weight of the service destinations.
+ */
+static int ip_vs_hprio_max_weight(struct ip_vs_service *svc) {
+ register struct list_head *l, *e;
+ struct ip_vs_dest *dest;
+ int weight = 0;
+
+ l = &svc->destinations;
+ for (e = l->next; e != l; e = e->next) {
+ dest = list_entry(e, struct ip_vs_dest, n_list);
+ IP_VS_DBG(1, " weight: %d\n", atomic_read(&dest->weight));
+ if (atomic_read(&dest->weight) > weight &&
+ !ip_vs_is_overloaded(dest)) {
+ weight = atomic_read(&dest->weight);
+ }
+ }
+ IP_VS_DBG(1, "max weight: %d\n", weight);
+ return weight;
+}
+
+
+/*
+ * Weighted Round-Robin Highest Priority Scheduling
+ */
+static struct ip_vs_dest* ip_vs_hprio_schedule(struct ip_vs_service *svc,
+ struct iphdr *iph) {
+ register struct list_head *p, *q;
+ struct ip_vs_dest *dest;
+ int max_weight;
+
+ IP_VS_DBG(1, "ip_vs_hprio_schedule(): Scheduling...\n");
+
+ write_lock(&svc->sched_lock);
+ max_weight = ip_vs_hprio_max_weight(svc);
+ p = (struct list_head *)svc->sched_data;
+ p = p->next;
+ q = p;
+ do {
+ if (q == &svc->destinations) {
+ q = q->next;
+ continue;
+ }
+ dest = list_entry(q, struct ip_vs_dest, n_list);
+ /*
+ The check for an overloaded destination is done in the
+ ip_vs_hprio_max_weight() function. With that we don't need
+ to quiesce destination servers by setting their weights to 0
+ */
+ if (atomic_read(&dest->weight) > 0 &&
+ atomic_read(&dest->weight) == max_weight) {
+ goto out;
+ }
+ q = q->next;
+ } while (q != p);
+ write_unlock(&svc->sched_lock);
+ return NULL;
+
+ out:
+ svc->sched_data = q;
+ write_unlock(&svc->sched_lock);
+ IP_VS_DBG(3, "HPRIO: server %u.%u.%u.%u:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+ return dest;
+}
+
+static struct ip_vs_scheduler ip_vs_hprio_scheduler = {
+ {0}, /* n_list */
+ "hprio", /* name */
+ ATOMIC_INIT(0), /* refcnt */
+ THIS_MODULE, /* this module */
+ ip_vs_hprio_init_svc, /* service initializer */
+ ip_vs_hprio_done_svc, /* service done */
+ ip_vs_hprio_update_svc, /* service updater */
+ ip_vs_hprio_schedule, /* select a server from the destination list
*/
+};
+
+static int __init ip_vs_hprio_init(void) {
+ IP_VS_INFO("HPRIO scheduling module loaded.\n");
+ INIT_LIST_HEAD(&ip_vs_hprio_scheduler.n_list);
+ return register_ip_vs_scheduler(&ip_vs_hprio_scheduler);
+}
+
+static void __exit ip_vs_hprio_cleanup(void) {
+ unregister_ip_vs_scheduler(&ip_vs_hprio_scheduler);
+}
+
+module_init(ip_vs_hprio_init);
+module_exit(ip_vs_hprio_cleanup);
+MODULE_LICENSE("GPL");
+
+
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblc.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblc.c 2004-04-14 15:05:41
+0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblc.c 2005-10-27 17:22:03
+0200
@@ -473,6 +473,8 @@
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(least))
+ continue;
if (atomic_read(&least->weight) > 0) {
loh = atomic_read(&least->activeconns) * 50
+ atomic_read(&least->inactconns);
@@ -487,6 +489,8 @@
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(dest))
+ continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if (loh * atomic_read(&dest->weight) >
@@ -558,7 +562,7 @@
ip_vs_lblc_hash(tbl, en);
} else {
dest = en->dest;
- if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) || ip_vs_is_overloaded(dest)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest, svc)) {
dest = __ip_vs_wlc_schedule(svc, iph);
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblcr.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblcr.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lblcr.c 2004-04-14 15:05:41
+0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lblcr.c 2005-10-27 17:22:03
+0200
@@ -178,6 +178,8 @@
/* select the first destination server, whose weight > 0 */
for (e=set->list; e!=NULL; e=e->next) {
least = e->dest;
+ if (ip_vs_is_overloaded(least))
+ continue;
if ((atomic_read(&least->weight) > 0)
&& (least->flags & IP_VS_DEST_F_AVAILABLE)) {
loh = atomic_read(&least->activeconns) * 50
@@ -192,6 +194,8 @@
nextstage:
for (e=e->next; e!=NULL; e=e->next) {
dest = e->dest;
+ if (ip_vs_is_overloaded(dest))
+ continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if ((loh * atomic_read(&dest->weight) >
@@ -723,6 +727,8 @@
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(least))
+ continue;
if (atomic_read(&least->weight) > 0) {
loh = atomic_read(&least->activeconns) * 50
+ atomic_read(&least->inactconns);
@@ -737,6 +743,8 @@
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(dest))
+ continue;
doh = atomic_read(&dest->activeconns) * 50
+ atomic_read(&dest->inactconns);
if (loh * atomic_read(&dest->weight) >
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lc.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_lc.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_lc.c 2005-10-27 17:22:03 +0200
@@ -79,6 +79,8 @@
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry (e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(least))
+ continue;
if (atomic_read(&least->weight) > 0) {
loh = ip_vs_lc_dest_overhead(least);
goto nextstage;
@@ -92,7 +94,8 @@
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
- if (atomic_read(&dest->weight) == 0)
+ if (ip_vs_is_overloaded(dest)
+ || atomic_read(&dest->weight) == 0)
continue;
doh = ip_vs_lc_dest_overhead(dest);
if (doh < loh) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_nq.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_nq.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_nq.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_nq.c 2005-10-27 17:22:03 +0200
@@ -99,6 +99,8 @@
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(least))
+ continue;
if (atomic_read(&least->weight) > 0) {
loh = ip_vs_nq_dest_overhead(least);
@@ -117,6 +119,8 @@
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(dest))
+ continue;
doh = ip_vs_nq_dest_overhead(dest);
/* return the server directly if it is idle */
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_rr.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_rr.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_rr.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_rr.c 2005-10-27 17:22:03 +0200
@@ -68,7 +68,8 @@
continue;
}
dest = list_entry(q, struct ip_vs_dest, n_list);
- if (atomic_read(&dest->weight) > 0)
+ if (!ip_vs_is_overloaded(dest)
+ && atomic_read(&dest->weight) > 0)
/* HIT */
goto out;
q = q->next;
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sed.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sed.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sed.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sed.c 2005-10-27 17:22:03 +0200
@@ -103,6 +103,8 @@
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(least))
+ continue;
if (atomic_read(&least->weight) > 0) {
loh = ip_vs_sed_dest_overhead(least);
goto nextstage;
@@ -116,6 +118,8 @@
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(dest))
+ continue;
doh = ip_vs_sed_dest_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sh.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sh.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_sh.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_sh.c 2005-10-27 17:22:03 +0200
@@ -182,19 +182,6 @@
/*
- * If the number of active connections is twice larger than its weight,
- * consider that the server is overloaded here.
- */
-static inline int is_overloaded(struct ip_vs_dest *dest)
-{
- if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)*2) {
- return 1;
- }
- return 0;
-}
-
-
-/*
* Source Hashing scheduling
*/
static struct ip_vs_dest *
@@ -210,7 +197,7 @@
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
- || is_overloaded(dest)) {
+ || ip_vs_is_overloaded(dest)) {
return NULL;
}
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wlc.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wlc.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wlc.c 2004-04-14 15:05:41 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wlc.c 2005-10-27 17:22:03 +0200
@@ -91,6 +91,8 @@
l = &svc->destinations;
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
+ if (ip_vs_is_overloaded(least))
+ continue;
if (atomic_read(&least->weight) > 0) {
loh = ip_vs_wlc_dest_overhead(least);
goto nextstage;
@@ -104,7 +106,8 @@
nextstage:
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
-
+ if (ip_vs_is_overloaded(dest))
+ continue;
doh = ip_vs_wlc_dest_overhead(dest);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
diff -X dontdiff -Nur linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wrr.c
linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wrr.c
--- linux-2.4.32-orig/net/ipv4/ipvs/ip_vs_wrr.c 2005-04-04 03:42:20 +0200
+++ linux-2.4.32-pab2/net/ipv4/ipvs/ip_vs_wrr.c 2005-10-27 17:22:03 +0200
@@ -154,14 +154,16 @@
{
struct ip_vs_dest *dest;
struct ip_vs_wrr_mark *mark = svc->sched_data;
+ struct list_head *p;
IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
/*
- * This loop will always terminate, because 0<mark->cw<max_weight,
+ * This loop will always terminate, because 0 < mark->cw < max_weight,
* and at least one server has its weight equal to max_weight.
*/
write_lock(&svc->sched_lock);
+ p = mark->cl;
while (1) {
if (mark->cl == &svc->destinations) {
/* it is at the head of the destination list */
@@ -187,17 +189,29 @@
return NULL;
}
}
- }
- else mark->cl = mark->cl->next;
+ } else
+ mark->cl = mark->cl->next;
if (mark->cl != &svc->destinations) {
/* not at the head of the list */
dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
- if (atomic_read(&dest->weight) >= mark->cw) {
+ if (!ip_vs_is_overloaded(dest)
+ && atomic_read(&dest->weight) >= mark->cw) {
write_unlock(&svc->sched_lock);
break;
}
}
+
+ if (mark->cl == p) {
+ /*
+ We're back to the start and no dest has been found.
+ It's only possible if all dests are OVERLOADED. This
+ whole ip_vs_wrr_schedule section should be adapted
+ to match the 2.6.x kernel function, using goto.
+ */
+ write_unlock(&svc->sched_lock);
+ return NULL;
+ }
}
IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
@@ -206,8 +220,7 @@
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt),
atomic_read(&dest->weight));
-
- return dest;
+ return dest;
}
|