Hi,
below is a patch that implements slow start for the WLC scheduler.
This is designed to address the problem where a real server is added
to the pool and soon inundated with connections. This is sometimes
referred to as the thundering herd problem and was recently
the topic of a thread on this list "Easing a Server into Rotation".
http://marc.theaimsgroup.com/?l=linux-virtual-server&m=107591805721441&w=2
The patch has two basic parts.
ip_vs_ctl.c:
When the weight of a real server is modified (or a real server is added)
set the IP_VS_DEST_F_WEIGHT_INC or IP_VS_DEST_F_WEIGHT_DEC flag
as appropriate and put the size of the change in dest.slow_start_data
This information is intended to act as hints for scheduler
modules to implement slow start. The scheduler modules may
completely ignore this information without any side effects.
ip_vs_wlc.c:
If IP_VS_DEST_F_WEIGHT_DEC is set then the flag is zeroed -
slow start does not come into effect for weight decreases.
If IP_VS_DEST_F_WEIGHT_INC is set then a handicap is calculated.
The flag is then zeroed.
The handicap is stored in dest.slow_start_data, along with a scaling
factor to allow gradual decay which is stored in dest.slow_start_data2.
The handicap effectively makes the real server appear to have
more connections than it does, thus decreasing the number of connections
that the wlc scheduler will allocate to it. This handicap is decayed
over time.
Limited debugging information is available by setting
/proc/sys/net/ipv4/vs/debug_level to 1 (or greater).
This will show the size of the handicap when it is calculated
and show a message when the handicap is fully decayed.
--
Horms
--- linux-2.4/include/net/ip_vs.h 2003-11-29 03:26:21.000000000 +0900
+++ linux-2.4.new/include/net/ip_vs.h 2004-02-23 14:59:35.000000000 +0900
@@ -26,6 +26,18 @@
#define IP_VS_DEST_F_AVAILABLE 0x0001 /* Available tag */
/*
+ * Advisory flags for slow start
+ * The absolute value size of the weight change will be stored
+ * in dest->slow_start_data.
+ * The flag and slow_start_data may be used and modified by the scheduler
+ * to effect slow start
+ */
+#define IP_VS_DEST_F_WEIGHT_INC 0x0002 /* Weight has been increased */
+#define IP_VS_DEST_F_WEIGHT_DEC 0x0004 /* Weight has been decreased */
+#define IP_VS_DEST_F_WEIGHT_MASK \
+ (IP_VS_DEST_F_WEIGHT_INC|IP_VS_DEST_F_WEIGHT_DEC)
+
+/*
* IPVS sync daemon states
*/
#define IP_VS_STATE_NONE 0 /* daemon is stopped */
@@ -317,6 +329,7 @@ enum {
NET_IPV4_VS_EXPIRE_NODEST_CONN=23,
NET_IPV4_VS_SYNC_THRESHOLD=24,
NET_IPV4_VS_NAT_ICMP_SEND=25,
+ NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE=26,
NET_IPV4_VS_LAST
};
@@ -498,6 +511,10 @@ struct ip_vs_dest {
__u32 vaddr; /* IP address for virtual service */
__u16 vport; /* port number for the service */
__u32 vfwmark; /* firewall mark of the service */
+
+ /* for slow start */
+ atomic_t slow_start_data;
+ atomic_t slow_start_data2;
};
@@ -702,6 +719,7 @@ extern int sysctl_ip_vs_cache_bypass;
extern int sysctl_ip_vs_expire_nodest_conn;
extern int sysctl_ip_vs_sync_threshold;
extern int sysctl_ip_vs_nat_icmp_send;
+extern int sysctl_ip_vs_expire_quiescent_template;
extern struct ip_vs_stats ip_vs_stats;
extern struct ip_vs_service *ip_vs_service_get(__u32 fwmark,
--- linux-2.4/net/ipv4/ipvs/ip_vs_ctl.c 2003-11-29 03:26:21.000000000 +0900
+++ linux-2.4.new/net/ipv4/ipvs/ip_vs_ctl.c 2004-02-23 15:41:42.000000000
+0900
@@ -79,6 +79,7 @@ int sysctl_ip_vs_cache_bypass = 0;
int sysctl_ip_vs_expire_nodest_conn = 0;
int sysctl_ip_vs_sync_threshold = 3;
int sysctl_ip_vs_nat_icmp_send = 0;
+int sysctl_ip_vs_expire_quiescent_template = 0;
#ifdef CONFIG_IP_VS_DEBUG
static int sysctl_ip_vs_debug_level = 0;
@@ -670,6 +671,20 @@ static void __ip_vs_update_dest(struct i
struct ip_vs_rule_user *ur)
{
int conn_flags;
+ int old_weight;
+
+ /* Set hints for slow start */
+ dest->flags |= IP_VS_DEST_F_WEIGHT_MASK;
+ dest->flags ^= IP_VS_DEST_F_WEIGHT_MASK;
+ old_weight = atomic_read(&dest->weight);
+ if (old_weight < ur->weight) {
+ atomic_set(&dest->slow_start_data, ur->weight - old_weight);
+ dest->flags |= IP_VS_DEST_F_WEIGHT_INC;
+ }
+ else if (old_weight > ur->weight) {
+ atomic_set(&dest->slow_start_data, old_weight - ur->weight);
+ dest->flags |= IP_VS_DEST_F_WEIGHT_DEC;
+ }
/*
* Set the weight and the flags
@@ -1436,6 +1451,9 @@ static struct ip_vs_sysctl_table ipv4_vs
{NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send",
&sysctl_ip_vs_nat_icmp_send, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE, "expire_quiescent_template",
+ &sysctl_ip_vs_expire_quiescent_template, sizeof(int), 0644, NULL,
+ &proc_dointvec},
{0}},
{{NET_IPV4_VS, "vs", NULL, 0, 0555, ipv4_vs_table.vs_vars},
{0}},
--- linux-2.4/net/ipv4/ipvs/ip_vs_wlc.c 2003-11-29 03:26:21.000000000 +0900
+++ linux-2.4.new/net/ipv4/ipvs/ip_vs_wlc.c 2004-02-23 15:48:45.000000000
+0900
@@ -51,9 +51,116 @@ ip_vs_wlc_update_svc(struct ip_vs_servic
return 0;
}
+static void
+ip_vs_wlc_set_slow_start(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ __u32 ss_handicap;
+ __u32 ss_shift;
+ __u32 ndest;
+ __u32 w = 0;
+ __u32 dest_w = 0;
+ struct list_head *l, *e;
+ struct ip_vs_dest *d;
+
+ /* If the weight is zero just set the slow_start hint
+ * and data to zero too as they won't be used */
+
+ if((dest->flags & IP_VS_DEST_F_WEIGHT_DEC) ||
+ !(dest_w = atomic_read(&dest->weight))) {
+ IP_VS_DBG(1, "slow_start: null\n");
+ atomic_set(&dest->slow_start_data, 0);
+ atomic_set(&dest->slow_start_data2, 0);
+ return;
+ }
+
+ /* Calculate a weighted number of connections
+ * this server would have if all the currently
+ * active connections were redistributed limited
+ * to a maximum of 64k */
+ l = &svc->destinations;
+ ss_handicap = 0;
+ ndest = 0;
+ for (e=l->next; e!=l; e=e->next) {
+ d = list_entry(e, struct ip_vs_dest, n_list);
+ w = atomic_read(&d->weight);
+ if (w < 1 || d == dest) {
+ continue;
+ }
+ ndest++;
+
+ /* Try to avoid overflowing ss_handicap */
+ ss_shift = atomic_read(&d->activeconns);
+ if(ss_shift & 0xffff0000)
+ ss_shift = 0xffff;
+ ss_shift = (ss_shift << 16 ) / (w & 0xffff);
+
+ if (~0L - ss_handicap < ss_shift) {
+ ss_handicap = ~0L;
+ break;
+ }
+ ss_handicap += ss_shift;
+ }
+ if (ndest)
+ ss_handicap = (ss_handicap * dest_w / ndest) >> 16;
+
+ /* ss_shift = log_2((ss_handicap & 0xfff) >> 3) */
+ if (ss_handicap) {
+ __u32 i;
+ ss_shift = ss_handicap;
+ for (i = 12; i > 0; i--) {
+ if(ss_shift & 0x8000)
+ break;
+ ss_shift <<= 1;
+ }
+ ss_shift = i;
+ ss_handicap <<= ss_shift;
+ }
+ else
+ ss_shift = 0;
+
+ atomic_set(&dest->slow_start_data, ss_handicap);
+ atomic_set(&dest->slow_start_data2, ss_shift);
+
+ IP_VS_DBG(1, "WLC slow_start_init: server %u.%u.%u.%u:%u "
+ "handicap=%u (%u) shift=%u ndest=%u\n",
+ NIPQUAD(dest->addr), ntohs(dest->port),
+ ss_handicap, ss_handicap >> ss_shift, ss_shift, ndest);
+}
+
+
+
+static inline unsigned int
+ip_vs_wlc_slowlstart_dest_handicap(struct ip_vs_dest *dest,
+ struct ip_vs_service *svc)
+{
+ unsigned int handicap;
+
+
+ /* Set up slow_start if weight has recently changed */
+ if (unlikely(dest->flags & IP_VS_DEST_F_WEIGHT_MASK)) {
+ ip_vs_wlc_set_slow_start(dest, svc);
+ dest->flags |= IP_VS_DEST_F_WEIGHT_MASK;
+ dest->flags ^= IP_VS_DEST_F_WEIGHT_MASK;
+ }
+
+ handicap = atomic_read(&dest->slow_start_data);
+ if (unlikely(!handicap))
+ return 0;
+
+ handicap--;
+ atomic_set(&dest->slow_start_data, handicap);
+
+#ifdef CONFIG_IP_VS_DEBUG
+ if (unlikely(!handicap))
+ IP_VS_DBG(1, "WLC slow_start_end: server %u.%u.%u.%u:%u\n",
+ NIPQUAD(dest->addr), ntohs(dest->port));
+#endif
+
+ return handicap >> atomic_read(&dest->slow_start_data2);
+}
static inline unsigned int
-ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
+ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
/*
* We think the overhead of processing active connections is 256
@@ -62,7 +169,8 @@ ip_vs_wlc_dest_overhead(struct ip_vs_des
* use the following formula to estimate the overhead now:
* dest->activeconns*256 + dest->inactconns
*/
- return (atomic_read(&dest->activeconns) << 8) +
+ return ((atomic_read(&dest->activeconns) +
+ ip_vs_wlc_slowlstart_dest_handicap(dest, svc)) << 8) +
atomic_read(&dest->inactconns);
}
@@ -96,7 +204,7 @@ ip_vs_wlc_schedule(struct ip_vs_service
for (e=l->next; e!=l; e=e->next) {
least = list_entry(e, struct ip_vs_dest, n_list);
if (atomic_read(&least->weight) > 0) {
- loh = ip_vs_wlc_dest_overhead(least);
+ loh = ip_vs_wlc_dest_overhead(least, svc);
goto nextstage;
}
}
@@ -109,7 +217,7 @@ ip_vs_wlc_schedule(struct ip_vs_service
for (e=e->next; e!=l; e=e->next) {
dest = list_entry(e, struct ip_vs_dest, n_list);
- doh = ip_vs_wlc_dest_overhead(dest);
+ doh = ip_vs_wlc_dest_overhead(dest, svc);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
|