Index: ipvs/ip_vs.h =================================================================== RCS file: /home/valinux/simon/cvs/ipvs-1.0/ipvs/ip_vs.h,v retrieving revision 1.1.1.6 diff -u -b -I$Id:.*Exp -r1.1.1.6 ip_vs.h --- ipvs/ip_vs.h 15 Nov 2002 02:25:39 -0000 1.1.1.6 +++ ipvs/ip_vs.h 22 Nov 2002 01:11:16 -0000 @@ -218,6 +218,129 @@ }; +/* + * IPVS connection entry hash table + */ + +#define VS_STATE_INPUT 0 +#define VS_STATE_OUTPUT 4 +#define VS_STATE_INPUT_ONLY 8 + +/* + * Delta sequence info structure + * Each ip_vs_conn has 2 (output AND input seq. changes). + * Only used in the VS/NAT. + */ +struct ip_vs_seq { + __u32 init_seq; /* Add delta from this seq */ + __u32 delta; /* Delta in sequence numbers */ + __u32 previous_delta; /* Delta in sequence numbers + before last resized pkt */ +}; + + +/* + * IPVS sync connection entry + */ +struct ip_vs_sync_conn { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __u16 cport; + __u16 vport; + __u16 dport; + __u32 caddr; /* client address */ + __u32 vaddr; /* virtual address */ + __u32 daddr; /* destination address */ + + /* Flags and state transition */ + __u16 flags; /* status flags */ + __u16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ) +#define IP_VS_SYNC_SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +#define IP_VS_SYNC_FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master multicasts messages to the backup load balancers in the + following format.
+ + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | Reserved | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + | . | + | . | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +struct ip_vs_sync_mesg { + __u8 nr_conns; + __u8 reserved; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + + +/* Trailing sizeof(struct ip_vs_sync_conn_options) is to allow + * the full connection count to be used by allowing a bit of extra + * space, just in case the last connection is FULL_CONN_SIZE + * instead of SIMPLE_CONN_SIZE */ + +/* At the very least the message needs to hold one connection */ +#define IP_VS_SYNC_MESG_MAX_SIZE_MIN \ + (IP_VS_SYNC_FULL_CONN_SIZE + sizeof(struct ip_vs_sync_mesg)) + +/* At most, 255 connections can be carried. This is because + * the nr_conns element in the ip_vs_sync_mesg structure + * is an unsigned 8bit integer, and thus has a valid range + * of 0 - 255. Beyond that a wraparound will occur. */ +#define IP_VS_SYNC_MESG_MAX_SIZE_MAX \ + (255 * IP_VS_SYNC_FULL_CONN_SIZE + sizeof(struct ip_vs_sync_mesg)) + +/* By default, send 50 connections per message. This fits nicely + * into a 1500 MTU packet */ +#define IP_VS_SYNC_MESG_MAX_SIZE_DEFAULT \ + (50 * IP_VS_SYNC_SIMPLE_CONN_SIZE + sizeof(struct ip_vs_sync_mesg) \ + + sizeof(struct ip_vs_sync_conn_options)) + +#define IP_VS_SYNC_MESG_MAX_SIZE \ + ((sysctl_ip_vs_sync_msg_max_size > IP_VS_SYNC_MESG_MAX_SIZE_MAX) ? \ + IP_VS_SYNC_MESG_MAX_SIZE_MAX: \ + ((sysctl_ip_vs_sync_msg_max_size < IP_VS_SYNC_MESG_MAX_SIZE_MIN) ?
\ + IP_VS_SYNC_MESG_MAX_SIZE_MIN : sysctl_ip_vs_sync_msg_max_size)) + + +#define IP_VS_SYNC_FREQUENCY_DEFAULT 50 +#define IP_VS_SYNC_FREQUENCY \ + ((sysctl_ip_vs_sync_frequency < 1) ? 1 : sysctl_ip_vs_sync_frequency) + +#define IP_VS_SYNC_THRESHOLD_DEFAULT 3 +#define IP_VS_SYNC_THRESHOLD \ + ((sysctl_ip_vs_sync_threshold < 0) ? 0 : \ + (sysctl_ip_vs_sync_threshold >= IP_VS_SYNC_FREQUENCY) ? \ + IP_VS_SYNC_FREQUENCY - 1 : sysctl_ip_vs_sync_threshold) + #ifdef __KERNEL__ #include @@ -316,7 +439,9 @@ NET_IPV4_VS_CACHE_BYPASS=22, NET_IPV4_VS_EXPIRE_NODEST_CONN=23, NET_IPV4_VS_SYNC_THRESHOLD=24, - NET_IPV4_VS_NAT_ICMP_SEND=25, + NET_IPV4_VS_SYNC_FREQUENCY=25, + NET_IPV4_VS_SYNC_MSG_MAX_SIZE=26, + NET_IPV4_VS_NAT_ICMP_SEND=27, NET_IPV4_VS_LAST }; @@ -362,19 +487,6 @@ /* - * Delta sequence info structure - * Each ip_vs_conn has 2 (output AND input seq. changes). - * Only used in the VS/NAT. - */ -struct ip_vs_seq { - __u32 init_seq; /* Add delta from this seq */ - __u32 delta; /* Delta in sequence numbers */ - __u32 previous_delta; /* Delta in sequence numbers - before last resized pkt */ -}; - - -/* * IPVS statistics object */ struct ip_vs_stats @@ -580,10 +692,6 @@ #define IP_VS_CONN_TAB_SIZE (1 << IP_VS_CONN_TAB_BITS) #define IP_VS_CONN_TAB_MASK (IP_VS_CONN_TAB_SIZE - 1) -#define VS_STATE_INPUT 0 -#define VS_STATE_OUTPUT 4 -#define VS_STATE_INPUT_ONLY 8 - extern struct ip_vs_timeout_table vs_timeout_table; extern struct ip_vs_timeout_table vs_timeout_table_dos; @@ -701,6 +809,8 @@ extern int sysctl_ip_vs_cache_bypass; extern int sysctl_ip_vs_expire_nodest_conn; extern int sysctl_ip_vs_sync_threshold; +extern int sysctl_ip_vs_sync_frequency; +extern int sysctl_ip_vs_sync_msg_max_size; extern int sysctl_ip_vs_nat_icmp_send; extern atomic_t ip_vs_dropentry; extern struct ip_vs_stats ip_vs_stats; Index: ipvs/ip_vs_ctl.c =================================================================== RCS file: /home/valinux/simon/cvs/ipvs-1.0/ipvs/ip_vs_ctl.c,v retrieving revision 
1.1.1.7 retrieving revision 1.1.1.2.2.3 diff -u -b -I$Id:.*Exp -r1.1.1.7 -r1.1.1.2.2.3 --- ipvs/ip_vs_ctl.c 15 Nov 2002 02:25:39 -0000 1.1.1.7 +++ ipvs/ip_vs_ctl.c 15 Nov 2002 04:17:24 -0000 1.1.1.2.2.3 @@ -78,7 +78,9 @@ static int sysctl_ip_vs_am_droprate = 10; int sysctl_ip_vs_cache_bypass = 0; int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_sync_threshold = 3; +int sysctl_ip_vs_sync_threshold = IP_VS_SYNC_THRESHOLD_DEFAULT; +int sysctl_ip_vs_sync_frequency = IP_VS_SYNC_FREQUENCY_DEFAULT; +int sysctl_ip_vs_sync_msg_max_size = IP_VS_SYNC_MESG_MAX_SIZE_DEFAULT; int sysctl_ip_vs_nat_icmp_send = 0; #ifdef CONFIG_IP_VS_DEBUG @@ -1410,6 +1412,12 @@ &proc_dointvec}, {NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold", &sysctl_ip_vs_sync_threshold, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_SYNC_FREQUENCY, "sync_frequency", + &sysctl_ip_vs_sync_frequency, sizeof(int), 0644, NULL, + &proc_dointvec}, + {NET_IPV4_VS_SYNC_MSG_MAX_SIZE, "sync_msg_max_size", + &sysctl_ip_vs_sync_msg_max_size, sizeof(int), 0644, NULL, &proc_dointvec}, {NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send", &sysctl_ip_vs_nat_icmp_send, sizeof(int), 0644, NULL, Index: ipvs/ip_vs_sync.c =================================================================== RCS file: /home/valinux/simon/cvs/ipvs-1.0/ipvs/ip_vs_sync.c,v retrieving revision 1.1.1.5 diff -u -b -I$Id:.*Exp -r1.1.1.5 ip_vs_sync.c --- ipvs/ip_vs_sync.c 6 Sep 2002 01:16:26 -0000 1.1.1.5 +++ ipvs/ip_vs_sync.c 22 Nov 2002 01:11:16 -0000 @@ -39,69 +39,6 @@ #define IP_VS_SYNC_PORT 8848 /* multicast port */ -/* - * IPVS sync connection entry - */ -struct ip_vs_sync_conn { - __u8 reserved; - - /* Protocol, addresses and port numbers */ - __u8 protocol; /* Which protocol (TCP/UDP) */ - __u16 cport; - __u16 vport; - __u16 dport; - __u32 caddr; /* client address */ - __u32 vaddr; /* virtual address */ - __u32 daddr; /* destination address */ - - /* Flags and state transition */ - __u16 flags; /* status flags */ - __u16 state; /* state info 
*/ - - /* The sequence options start here */ -}; - -struct ip_vs_sync_conn_options { - struct ip_vs_seq in_seq; /* incoming seq. struct */ - struct ip_vs_seq out_seq; /* outgoing seq. struct */ -}; - -#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ) -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) -#define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) - - -/* - The master mulitcasts messages to the backup load balancers in the - following format. - - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | Reserved | Size | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (1) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | . | - | . | - | . | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (n) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -*/ -#define SYNC_MESG_MAX_SIZE (24*50+4) -struct ip_vs_sync_mesg { - __u8 nr_conns; - __u8 reserved; - __u16 size; - - /* ip_vs_sync_conn entries start here */ -}; - - struct ip_vs_sync_buff { struct list_head list; unsigned long firstuse; @@ -153,14 +90,14 @@ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - if (!(sb->mesg=kmalloc(SYNC_MESG_MAX_SIZE, GFP_ATOMIC))) { + if (!(sb->mesg=kmalloc(IP_VS_SYNC_MESG_MAX_SIZE, GFP_ATOMIC))) { kfree(sb); return NULL; } sb->mesg->nr_conns = 0; sb->mesg->size = 4; sb->head = (unsigned char *)sb->mesg + 4; - sb->end = (unsigned char *)sb->mesg + SYNC_MESG_MAX_SIZE; + sb->end = (unsigned char *)sb->mesg + IP_VS_SYNC_MESG_MAX_SIZE; sb->firstuse = jiffies; return sb; } @@ -211,8 +148,8 @@ } } - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
IP_VS_SYNC_FULL_CONN_SIZE : + IP_VS_SYNC_SIMPLE_CONN_SIZE; m = curr_sb->mesg; s = (struct ip_vs_sync_conn *)curr_sb->head; @@ -237,7 +174,8 @@ curr_sb->head += len; /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { + if (curr_sb->head+IP_VS_SYNC_FULL_CONN_SIZE > curr_sb->end || + m->nr_conns == 255) { sb_queue_tail(curr_sb); curr_sb = NULL; } @@ -294,9 +232,9 @@ if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) { opt = (struct ip_vs_sync_conn_options *)&s[1]; memcpy(&cp->in_seq, opt, sizeof(*opt)); - p += FULL_CONN_SIZE; + p += IP_VS_SYNC_FULL_CONN_SIZE; } else - p += SIMPLE_CONN_SIZE; + p += IP_VS_SYNC_SIMPLE_CONN_SIZE; atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold); cp->timeout = IP_VS_SYNC_CONN_TIMEOUT; @@ -625,7 +563,7 @@ char *buf; int len; - if (!(buf=kmalloc(SYNC_MESG_MAX_SIZE, GFP_ATOMIC))) { + if (!(buf=kmalloc(IP_VS_SYNC_MESG_MAX_SIZE, GFP_ATOMIC))) { IP_VS_ERR("sync_backup_loop: kmalloc error\n"); return; } @@ -639,7 +577,7 @@ /* do you have data now? */ while (!skb_queue_empty(&(sock->sk->receive_queue))) { if ((len=ip_vs_receive(sock, buf, - SYNC_MESG_MAX_SIZE))<=0) { + IP_VS_SYNC_MESG_MAX_SIZE))<=0) { IP_VS_ERR("receiving message error\n"); break; } Index: ipvs/ipvsadm/ipvsadm.8 =================================================================== RCS file: /home/valinux/simon/cvs/ipvs-1.0/ipvs/ipvsadm/ipvsadm.8,v retrieving revision 1.1.1.6 diff -u -b -I$Id:.*Exp -r1.1.1.6 ipvsadm.8 --- ipvs/ipvsadm/ipvsadm.8 15 Nov 2002 02:22:55 -0000 1.1.1.6 +++ ipvs/ipvsadm/ipvsadm.8 22 Nov 2002 01:11:16 -0000 @@ -5,6 +5,7 @@ .\" .\" Authors: Mike Wangsmo .\" Wensong Zhang +.\" Horms .\" .\" Changes: .\" Horms : Updated to reflect recent change of ipvsadm @@ -14,6 +15,8 @@ .\" Horms : Tidy up some of the description and the .\" grammar in the -f and sysctl sections .\" Wensong Zhang : --set option description taken from ipchains(8) +.\" Horms : Document synchronisation daemon's proc +.\" entries. 
.\" .\" This program is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by @@ -154,19 +157,40 @@ the current timeout value of the corresponding entry is preserved. .TP .B --start-daemon \fIstate\fP -Start the connection synchronization daemon. The \fIstate\fP is to +Start the connection synchronisation daemon. The \fIstate\fP is to indicate that the daemon is started as \fImaster\fP or \fIbackup\fP. The -connection synchronization daemon is implemented inside the Linux -kernel. The master daemon running at the primary load balancer +connection synchronisation daemon is implemented inside the Linux +kernel. The master daemon running on the primary load balancer multicasts changes of connections periodically, and the backup daemon -running at the backup load balancers receives multicast message and -creates corresponding connections. Then, in case the primary load -balancer fails, a backup load balancer will takeover, and it has state -of almost all connections, so that almost all established connections +running on the backup load balancers receives multicast message and +creates corresponding connections. Then, if the primary load +balancer fails and a backup load balancer takes over, it has the state +of almost all connections. Thus, almost all established connections can continue to access the service. +.sp +There are 3 proc entries that affect the behaviour of the synchronisation +daemon. In the case of each of these proc entries, values outside of the +valid ranges given will be rounded up or down as necessary. +.sp +/proc/sys/net/ipv4/vs/sync_msg_max_size sets the maximum size of messages +sent by the synchronisation daemon in bytes. The default is 1228 and the +valid range is 52 through to 12244. +.sp +/proc/sys/net/ipv4/vs/sync_frequency sets synchronisation frequency \- +how often a connection is +synchronised in terms of the number of packets received.
The default is 50 +and the valid range is 1 through to 2147483647. +.sp +/proc/sys/net/ipv4/vs/sync_threshold sets the synchronisation threshold \- +the minimum number of packets a connection needs to receive before it will +be synchronised. The default is 3 and the valid range is from 0 up to +sync_frequency minus 1. Once sync_threshold is passed, the connection will be +synchronised every sync_frequency packets. For example, using the default +sync_frequency of 50 and the default sync_threshold of 3, synchronisation +will occur once the 3rd packet is received and then every 50th packet. .TP .B --stop-daemon -Stop the connection synchronization daemon. +Stop the connection synchronisation daemon. .TP \fB-h, --help\fR Display a description of the command syntax. @@ -192,8 +216,8 @@ virtual service instead of an address, port and protocol (UDP or TCP). The marking of packets with a firewall-mark is configured using the -m|--mark option to \fBiptables\fR(8). It can be used to build a -virtual service assoicated with the same real servers, covering -multiple IP addresss, port and protocol tripplets. +virtual service associated with the same real servers, covering +multiple IP addresses, port and protocol triplets. .sp Using firewall-mark virtual services provides a convenient method of grouping together different IP addresses, ports and protocols into a @@ -278,7 +302,7 @@ tunneling and direct routing methods, \fIport\fP must be equal to that of the service address. For normal services, the port specified in the service address will be used if \fIport\fP is not specified. For -fwmark services, \fIport\fP may be ommitted, in which case the +fwmark services, \fIport\fP may be omitted, in which case the destination port on the real server will be the destination port of the request sent to the virtual service.
.TP @@ -415,13 +439,13 @@ modprobe ip_vs_ftp .fi .SH NOTES -The Linux Virtual Server implements three defense strategies against +The Linux Virtual Server implements three defence strategies against some types of denial of service (DoS) attacks. The Linux Director creates an entry for each connection in order to keep its state, and each entry occupies 128 bytes effective memory. LVS's vulnerability to a DoS attack lies in the potential to increase the number entries as much as possible until the linux director runs out of memory. The -three defense strategies against the attack are: Randomly drop some +three defence strategies against the attack are: Randomly drop some entries in the table. Drop 1/rate packets before forwarding them. And use secure tcp state transition table and short timeouts. The strategies are controlled by sysctl variables and corresponding @@ -432,7 +456,7 @@ /proc/sys/net/ipv4/vs/secure_tcp .PP Valid values for each variable are 0 through to 3. The default value -is 0, which disables the respective defense strategy. 1 and 2 are +is 0, which disables the respective defence strategy. 1 and 2 are automatic modes - when there is no enough available memory, the respective strategy will be enabled and the variable is automatically set to 2, otherwise the strategy is disabled and the variable is set @@ -462,6 +486,10 @@ .br .I /proc/sys/net/ipv4/vs/secure_tcp .br +.I /proc/sys/net/ipv4/vs/sync_msg_max_size +.br +.I /proc/sys/net/ipv4/vs/sync_threshold +.br .I /proc/sys/net/ipv4/vs/timeout_close .br .I /proc/sys/net/ipv4/vs/timeout_closewait @@ -493,5 +521,5 @@ Peter Kese man page - Mike Wangsmo Wensong Zhang - Horms + Horms .fi