Hello,
On Sun, 2010-09-26 at 15:33 +0200, Simon Horman wrote:
> From: Julian Anastasov <ja@xxxxxx>
>
> Tunnel mode for IPv6 doesn't work.
>
> IPv6 encapsulation uses a bad source address for the tunnel.
> i.e. VIP will be used as local-addr and encap. dst addr.
> Decapsulation will not accept this.
>
> Example
> LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100)
> (eth0 2003::1:0:1/96)
> RS (ethX 2003::1:0:5/96)
>
> tcpdump
> 2003::2:0:100 > 2003::1:0:5:
> IP6 (hlim 63, next-header TCP (6) payload length: 40)
> 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312
> (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val
> 1904932 ecr 0,nop,wscale 3], length 0
>
> In Linux IPv6 impl. you can't have a tunnel with an anycast address
> receiving packets (I have not tried to interpret RFC 2473).
> To have receive capabilities the tunnel must have:
> - Local address set as multicast addr or a unicast addr
> - Remote address set as a unicast addr.
> - Loopback address or link-local address are not allowed.
>
> This causes us to setup a tunnel in the Real Server with the
> LVS as the remote address, here you can't use the VIP address since it's
> used inside the tunnel.
>
> Solution
> Use outgoing interface IPv6 address (match against the destination).
> i.e. use ip6_route_output() to look up the route cache and
> then use ipv6_dev_get_saddr(...) to set the source address of the
> encapsulated packet.
>
> Additionally, cache the results in new destination
> fields: dst_cookie and dst_saddr and properly check the
> returned dst from ip6_route_output. We now add xfrm_lookup
> call only for the tunneling method where the source address
> is a local one.
>
> Original patch by Hans Schillstrom.
> Check dst state and cache results for IPv6 by Julian Anastasov.
>
> Tested-by: Hans Schillstrom <hans.schillstrom@xxxxxxxxxxxx>
> Signed-off-by: Julian Anastasov <ja@xxxxxx>
> Signed-off-by: Simon Horman <horms@xxxxxxxxxxxx>
>
> ---
>
> * v1
>
> This is Julian's patch with a slightly edited version of the description
> from Hans's original patch.
>
> * v2
>
> Updated changelog as per comments from Julian
>
> Is everyone ok with pushing this?
>
> diff -urp net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h
> linux/include/net/ip_vs.h
> --- net-next-2.6-e548833-nfct_snat_reroute/linux/include/net/ip_vs.h
> 2010-09-16 09:03:48.000000000 +0300
> +++ linux/include/net/ip_vs.h 2010-09-22 10:50:18.548963467 +0300
> @@ -509,6 +509,10 @@ struct ip_vs_dest {
> spinlock_t dst_lock; /* lock of dst_cache */
> struct dst_entry *dst_cache; /* destination cache entry */
> u32 dst_rtos; /* RT_TOS(tos) for dst */
> + u32 dst_cookie;
> +#ifdef CONFIG_IP_VS_IPV6
> + struct in6_addr dst_saddr;
> +#endif
>
> /* for virtual service */
> struct ip_vs_service *svc; /* service it belongs to */
> diff -urp
> net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c
> linux/net/netfilter/ipvs/ip_vs_xmit.c
> ---
> net-next-2.6-e548833-nfct_snat_reroute/linux/net/netfilter/ipvs/ip_vs_xmit.c
> 2010-09-16 09:02:25.000000000 +0300
> +++ linux/net/netfilter/ipvs/ip_vs_xmit.c 2010-09-22 16:29:43.271964521
> +0300
> @@ -26,6 +26,7 @@
> #include <net/route.h> /* for ip_route_output */
> #include <net/ipv6.h>
> #include <net/ip6_route.h>
> +#include <net/addrconf.h>
> #include <linux/icmpv6.h>
> #include <linux/netfilter.h>
> #include <linux/netfilter_ipv4.h>
> @@ -37,26 +38,27 @@
> * Destination cache to speed up outgoing route lookup
> */
> static inline void
> -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
> +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
> + u32 dst_cookie)
> {
> struct dst_entry *old_dst;
>
> old_dst = dest->dst_cache;
> dest->dst_cache = dst;
> dest->dst_rtos = rtos;
> + dest->dst_cookie = dst_cookie;
> dst_release(old_dst);
> }
>
> static inline struct dst_entry *
> -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
> +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
> {
> struct dst_entry *dst = dest->dst_cache;
>
> if (!dst)
> return NULL;
> - if ((dst->obsolete
> - || (dest->af == AF_INET && rtos != dest->dst_rtos)) &&
> - dst->ops->check(dst, cookie) == NULL) {
> + if ((dst->obsolete || rtos != dest->dst_rtos) &&
> + dst->ops->check(dst, dest->dst_cookie) == NULL) {
> dest->dst_cache = NULL;
> dst_release(dst);
> return NULL;
> @@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *des
> }
>
> static struct rtable *
> -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
> +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos)
> {
> + struct net *net = dev_net(skb->dev);
> struct rtable *rt; /* Route to the other host */
> struct ip_vs_dest *dest = cp->dest;
>
> if (dest) {
> spin_lock(&dest->dst_lock);
> if (!(rt = (struct rtable *)
> - __ip_vs_dst_check(dest, rtos, 0))) {
> + __ip_vs_dst_check(dest, rtos))) {
> struct flowi fl = {
> .oif = 0,
> .nl_u = {
> @@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> .tos = rtos, } },
> };
>
> - if (ip_route_output_key(&init_net, &rt, &fl)) {
> + if (ip_route_output_key(net, &rt, &fl)) {
> spin_unlock(&dest->dst_lock);
> IP_VS_DBG_RL("ip_route_output error, dest:
> %pI4\n",
> &dest->addr.ip);
> return NULL;
> }
> - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst));
> + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
> IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n",
> &dest->addr.ip,
> atomic_read(&rt->dst.__refcnt), rtos);
> @@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> .tos = rtos, } },
> };
>
> - if (ip_route_output_key(&init_net, &rt, &fl)) {
> + if (ip_route_output_key(net, &rt, &fl)) {
> IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n",
> &cp->daddr.ip);
> return NULL;
> @@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp
> }
>
> #ifdef CONFIG_IP_VS_IPV6
> +
> +static struct dst_entry *
> +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
> + struct in6_addr *ret_saddr, int do_xfrm)
> +{
> + struct dst_entry *dst;
> + struct flowi fl = {
> + .oif = 0,
> + .nl_u = {
> + .ip6_u = {
> + .daddr = *daddr,
> + },
> + },
> + };
> +
> + dst = ip6_route_output(net, NULL, &fl);
> + if (dst->error)
> + goto out_err;
> + if (!ret_saddr)
> + return dst;
> + if (ipv6_addr_any(&fl.fl6_src) &&
> + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
> + &fl.fl6_dst, 0, &fl.fl6_src) < 0)
> + goto out_err;
> + if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0)
> + goto out_err;
> + ipv6_addr_copy(ret_saddr, &fl.fl6_src);
> + return dst;
> +
> +out_err:
> + dst_release(dst);
> + IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
> + return NULL;
> +}
> +
> static struct rt6_info *
> -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp)
> +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> + struct in6_addr *ret_saddr, int do_xfrm)
> {
> + struct net *net = dev_net(skb->dev);
> struct rt6_info *rt; /* Route to the other host */
> struct ip_vs_dest *dest = cp->dest;
> + struct dst_entry *dst;
>
> if (dest) {
> spin_lock(&dest->dst_lock);
> - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0);
> + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
> if (!rt) {
> - struct flowi fl = {
> - .oif = 0,
> - .nl_u = {
> - .ip6_u = {
> - .daddr = dest->addr.in6,
> - .saddr = {
> - .s6_addr32 =
> - { 0, 0, 0, 0
> },
> - },
> - },
> - },
> - };
> + u32 cookie;
>
> - rt = (struct rt6_info *)ip6_route_output(&init_net,
> - NULL, &fl);
> - if (!rt) {
> + dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
> + &dest->dst_saddr,
> + do_xfrm);
> + if (!dst) {
> spin_unlock(&dest->dst_lock);
> - IP_VS_DBG_RL("ip6_route_output error, dest:
> %pI6\n",
> - &dest->addr.in6);
> return NULL;
> }
> - __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst));
> - IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n",
> - &dest->addr.in6,
> + rt = (struct rt6_info *) dst;
> + cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
> + __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
> + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
> + &dest->addr.in6, &dest->dst_saddr,
> atomic_read(&rt->dst.__refcnt));
> }
> + if (ret_saddr)
> + ipv6_addr_copy(ret_saddr, &dest->dst_saddr);
> spin_unlock(&dest->dst_lock);
> } else {
> - struct flowi fl = {
> - .oif = 0,
> - .nl_u = {
> - .ip6_u = {
> - .daddr = cp->daddr.in6,
> - .saddr = {
> - .s6_addr32 = { 0, 0, 0, 0 },
> - },
> - },
> - },
> - };
> -
> - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL,
> &fl);
> - if (!rt) {
> - IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n",
> - &cp->daddr.in6);
> + dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr,
> + do_xfrm);
> + if (!dst)
> return NULL;
> - }
> + rt = (struct rt6_info *) dst;
> }
>
> return rt;
> @@ -248,6 +268,7 @@ int
> ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
> struct ip_vs_protocol *pp)
> {
> + struct net *net = dev_net(skb->dev);
> struct rtable *rt; /* Route to the other host */
> struct iphdr *iph = ip_hdr(skb);
> u8 tos = iph->tos;
> @@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, s
>
> EnterFunction(10);
>
> - if (ip_route_output_key(&init_net, &rt, &fl)) {
> + if (ip_route_output_key(net, &rt, &fl)) {
> IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n",
> __func__, &iph->daddr);
> goto tx_error_icmp;
> @@ -313,25 +334,18 @@ int
> ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
> struct ip_vs_protocol *pp)
> {
> + struct net *net = dev_net(skb->dev);
> + struct dst_entry *dst;
> struct rt6_info *rt; /* Route to the other host */
> struct ipv6hdr *iph = ipv6_hdr(skb);
> int mtu;
> - struct flowi fl = {
> - .oif = 0,
> - .nl_u = {
> - .ip6_u = {
> - .daddr = iph->daddr,
> - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } },
> - };
>
> EnterFunction(10);
>
> - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl);
> - if (!rt) {
> - IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n",
> - __func__, &iph->daddr);
> + dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0);
> + if (!dst)
> goto tx_error_icmp;
> - }
> + rt = (struct rt6_info *) dst;
>
> /* MTU checking */
> mtu = dst_mtu(&rt->dst);
> @@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
> IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
> }
>
> - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
> goto tx_error_icmp;
>
> /* MTU checking */
> @@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
> IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
> }
>
> - rt = __ip_vs_get_out_rt_v6(cp);
> + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> if (!rt)
> goto tx_error_icmp;
>
> @@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> struct iphdr *old_iph = ip_hdr(skb);
> u8 tos = old_iph->tos;
> __be16 df = old_iph->frag_off;
> - sk_buff_data_t old_transport_header = skb->transport_header;
> struct iphdr *iph; /* Our new IP header */
> unsigned int max_headroom; /* The extra header space
> needed */
> int mtu;
> @@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> goto tx_error;
> }
>
> - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
> + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos))))
> goto tx_error_icmp;
>
> tdev = rt->dst.dev;
> @@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, s
> old_iph = ip_hdr(skb);
> }
>
> - skb->transport_header = old_transport_header;
> + skb->transport_header = skb->network_header;
>
> /* fix old IP header checksum */
> ip_send_check(old_iph);
> @@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> struct ip_vs_protocol *pp)
> {
> struct rt6_info *rt; /* Route to the other host */
> + struct in6_addr saddr; /* Source for tunnel */
> struct net_device *tdev; /* Device to other host */
> struct ipv6hdr *old_iph = ipv6_hdr(skb);
> - sk_buff_data_t old_transport_header = skb->transport_header;
> struct ipv6hdr *iph; /* Our new IP header */
> unsigned int max_headroom; /* The extra header space needed */
> int mtu;
> @@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> goto tx_error;
> }
>
> - rt = __ip_vs_get_out_rt_v6(cp);
> + rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1);
> if (!rt)
> goto tx_error_icmp;
>
> tdev = rt->dst.dev;
>
> mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
> - /* TODO IPv6: do we need this check in IPv6? */
> - if (mtu < 1280) {
> + if (mtu < IPV6_MIN_MTU) {
> dst_release(&rt->dst);
> - IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__);
> + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
> + IPV6_MIN_MTU);
> goto tx_error;
> }
> if (skb_dst(skb))
> @@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> old_iph = ipv6_hdr(skb);
> }
>
> - skb->transport_header = old_transport_header;
> + skb->transport_header = skb->network_header;
>
> skb_push(skb, sizeof(struct ipv6hdr));
> skb_reset_network_header(skb);
> @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb
> be16_add_cpu(&iph->payload_len, sizeof(*old_iph));
> iph->priority = old_iph->priority;
> memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
> - iph->daddr = rt->rt6i_dst.addr;
> - iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr;
> */
> + ipv6_addr_copy(&iph->daddr, &rt->rt6i_dst.addr);
> + ipv6_addr_copy(&iph->saddr, &saddr);
> iph->hop_limit = old_iph->hop_limit;
>
> /* Another hack: avoid icmp_send in ip_fragment */
> @@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struc
>
> EnterFunction(10);
>
> - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
> + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos))))
> goto tx_error_icmp;
>
> /* MTU checking */
> @@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, st
>
> EnterFunction(10);
>
> - rt = __ip_vs_get_out_rt_v6(cp);
> + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> if (!rt)
> goto tx_error_icmp;
>
> @@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, str
> * mangle and send the packet here (only for VS/NAT)
> */
>
> - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
> + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos))))
> goto tx_error_icmp;
>
> /* MTU checking */
> @@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb,
> * mangle and send the packet here (only for VS/NAT)
> */
>
> - rt = __ip_vs_get_out_rt_v6(cp);
> + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0);
> if (!rt)
> goto tx_error_icmp;
>
Looks good, go ahead
--
Regards
Hans Schillstrom <hans.schillstrom@xxxxxxxxxxxx>
--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
|