LVS
lvs-devel
Google
 
Web LinuxVirtualServer.org

[RFC] ipvs ipv6 fragment handling

To: Patrick McHardy <kaber@xxxxxxxxx>, Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>, Julian Anastasov <ja@xxxxxx>, Simon Horman <horms@xxxxxxxxxxxx>, lvs-devel@xxxxxxxxxxxxxxx, netfilter-devel@xxxxxxxxxxxxxxx, netdev@xxxxxxxxxxxxxxx
Subject: [RFC] ipvs ipv6 fragment handling
From: Hans Schillstrom <hans@xxxxxxxxxxxxxxx>
Date: Thu, 9 Feb 2012 18:15:21 +0100
Hello,
Long time since the last ipvs patches from me...

IPVS does not have any kind of fragmentation handling for IPv6,
nor can it handle extension headers in front of upper-layer headers.

The latter issue is easy to fix with ipv6_find_hdr(), which is also part of
this RFC.
The fragment handling in this implementation relies on how nf_defrag_ipv6
works today.

Netfilter team (Patrick), please let me know if ipv6-nat makes any big changes
to the defrag procedure.

IPVS does what conntrack does, i.e. it uses skb->nfct_reasm.
(When all fragments have been collected, nf_ct_frag6_output() starts a "re-play"
of all fragments into the interrupted PREROUTING chain at prio -399, with
nfct_reasm pointing to the assembled packet.)

IPVS adds a new hook to the PREROUTING chain at prio NF_IP6_PRI_NAT_DST + 1
(i.e. right after prio -100) to catch fragments and copy the fw-mark and
routing info from the first fragment, the one carrying the upper-layer header.

For the first fragment, ip_vs_in() and ip_vs_out() use skb->nfct_reasm
to "store" the fw-mark and skb_dst(),
which are then copied into the following fragments.

This patch needs the "NETFILTER added flags to ipv6_find_hdr()" patch.

The patch is far from ready since this is only an RFC;
e.g. ip_vs_nat_xmit_v6() doesn't work since it writes right into the fragment
header.
Tunnel mode is not tested at all.
Only direct routing is tested (not to localhost).

Plenty of formatting rules are violated and there are also debug printouts to
remove...
(and kmail seems to corrupt tabs and wrap long lines)

IPVS will depend on ip6tables and its defragmentation, but NOT on conntrack.
Is the design idea OK?
Will IPv6-NAT break everything?
Is there a reason not to use skb->nfct_reasm as "storage" for the mark and
dst_entry?

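This doesn't violate the RFC 2460 rule that "fragmentation in IPv6 is performed
only by source nodes": the original fragments are forwarded one by one as they
arrived; IPVS does not create any new fragments.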




diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index ebe517f..2d3aeab 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -22,6 +22,9 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>                        /* for struct ipv6hdr */
 #include <net/ipv6.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 #include <net/netfilter/nf_conntrack.h>
 #endif
@@ -105,8 +108,10 @@ extern int ip_vs_conn_tab_size;
 
 
 struct ip_vs_iphdr {
-       int len;
-       __u8 protocol;
+       __s32 len;      /* offset where to find next header */
+       __s32 flags;
+       __u16 fragoffs;
+       __s16 protocol;
        union nf_inet_addr saddr;
        union nf_inet_addr daddr;
 };
@@ -132,6 +137,42 @@ ip_vs_fill_iphdr(int af, const void *nh, struct 
ip_vs_iphdr *iphdr)
        }
 }
 
+static inline void     /* fill *iphdr from the network header of skb */
+ip_vs_fill_iph_skb(int af, const struct sk_buff *skb, struct ip_vs_iphdr 
*iphdr)
+{
+       iphdr->len = 0;         /* passed by address to ipv6_find_hdr() below */
+       iphdr->flags = 0;
+#ifdef CONFIG_IP_VS_IPV6
+       if (af == AF_INET6) {
+               const struct ipv6hdr *iph = (struct ipv6hdr 
*)skb_network_header(skb);
+
+               iphdr->protocol = ipv6_find_hdr(skb, &iphdr->len, -1,   /* may be < 0; callers test that */
+                                               &iphdr->fragoffs,
+                                               &iphdr->flags);
+               iphdr->saddr.in6 = iph->saddr;
+               iphdr->daddr.in6 = iph->daddr;
+       } else                  /* any af other than AF_INET6 is handled as IPv4 */
+#endif
+       {                       /* NOTE(review): fragoffs is not written on this path */
+               const struct iphdr *iph = (struct iphdr 
*)skb_network_header(skb);
+               iphdr->len = iph->ihl * 4;      /* IPv4 header length in bytes (ihl is in 32-bit words) */
+               iphdr->protocol = iph->protocol;
+               iphdr->saddr.ip = iph->saddr;
+               iphdr->daddr.ip = iph->daddr;
+       }
+}
+#if defined(CONFIG_NF_DEFRAG_IPV6) || defined(CONFIG_NF_DEFRAG_IPV6_MODULE)
+static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb)
+{
+       return skb->nfct_reasm; /* reassembled packet this fragment belongs to, if any */
+}
+#else  /* no IPv6 defrag: stub must carry the same name the callers use */
+static inline struct sk_buff *skb_nfct_reasm(const struct sk_buff *skb)
+{
+       return NULL;    /* callers treat NULL as "no reassembled packet" */
+}
+#endif /* CONFIG_NF_DEFRAG_IPV6 || CONFIG_NF_DEFRAG_IPV6_MODULE */
+
 static inline void ip_vs_addr_copy(int af, union nf_inet_addr *dst,
                                   const union nf_inet_addr *src)
 {
@@ -412,7 +453,7 @@ struct ip_vs_protocol {
                        const struct sk_buff *skb,
                        const struct ip_vs_iphdr *iph,
                        unsigned int proto_off,
-                       int inverse);
+                       int inverse, unsigned short foffs);
 
        int (*snat_handler)(struct sk_buff *skb,
                            struct ip_vs_protocol *pp, struct ip_vs_conn *cp);
@@ -1001,7 +1042,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct 
ip_vs_conn_param *p);
 struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
                                             const struct ip_vs_iphdr *iph,
                                             unsigned int proto_off,
-                                            int inverse);
+                                            int inverse, unsigned short foffs);
 
 /* put back the conn without restarting its timer */
 static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 29fa5ba..36e65ca 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -432,13 +432,19 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct 
ip_vs_conn_param *p)
 
 struct ip_vs_conn *
 ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
-                        const struct ip_vs_iphdr *iph,
-                        unsigned int proto_off, int inverse)
+                        const struct ip_vs_iphdr *iph, unsigned int proto_off,
+                        int inverse, unsigned short foffs)
 {
        struct ip_vs_conn_param p;
 
-       if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
-               return NULL;
+       if (unlikely(foffs)) {
+               if (ip_vs_conn_fill_param_proto(af, skb_nfct_reasm(skb), iph,
+                                               proto_off, inverse, &p))
+                       return NULL;
+       } else {
+               if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, 
inverse, &p))
+                       return NULL;
+       }
 
        return ip_vs_conn_out_get(&p);
 }
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 611c335..e67144f 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -235,7 +235,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
        union nf_inet_addr snet;        /* source network of the client,
                                           after masking */
 
-       ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+       ip_vs_fill_iph_skb(svc->af, skb, &iph);
 
        /* Mask saddr with the netmask to adjust template granularity */
 #ifdef CONFIG_IP_VS_IPV6
@@ -402,7 +402,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff 
*skb,
        unsigned int flags;
 
        *ignored = 1;
-       ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+       ip_vs_fill_iph_skb(svc->af, skb, &iph);
        pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
        if (pptr == NULL)
                return NULL;
@@ -506,7 +506,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff 
*skb,
        int unicast;
 #endif
 
-       ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
+       ip_vs_fill_iph_skb(svc->af, skb, &iph);
 
        pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
        if (pptr == NULL) {
@@ -654,14 +654,6 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, 
u_int32_t user)
        return err;
 }
 
-#ifdef CONFIG_IP_VS_IPV6
-static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
-{
-       /* TODO IPv6: Find out what to do here for IPv6 */
-       return 0;
-}
-#endif
-
 static int ip_vs_route_me_harder(int af, struct sk_buff *skb)
 {
 #ifdef CONFIG_IP_VS_IPV6
@@ -819,6 +811,11 @@ static int handle_response_icmp(int af, struct sk_buff 
*skb,
        ip_vs_out_stats(cp, skb);
 
        skb->ipvs_property = 1;
+       IP_VS_DBG(1, " ICMP(%p) response set track(%x) %pI6c#%d %pI6c#%d 
%pI6c#%d fw:%d len:%d\n",
+                       skb, cp->flags & IP_VS_CONN_F_NFCT,
+                       &cp->caddr.in6, ntohs(cp->cport),
+                       &cp->vaddr.in6, ntohs(cp->vport),
+                       &cp->daddr.in6, ntohs(cp->dport), cp->fwmark, skb->len);
        if (!(cp->flags & IP_VS_CONN_F_NFCT))
                ip_vs_notrack(skb);
        else
@@ -902,7 +899,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
 
        ip_vs_fill_iphdr(AF_INET, cih, &ciph);
        /* The embedded headers contain source and dest in reverse order */
-       cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
+       cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1, 0);
        if (!cp)
                return NF_ACCEPT;
 
@@ -913,36 +910,34 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int 
*related,
 
 #ifdef CONFIG_IP_VS_IPV6
 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
-                            unsigned int hooknum)
+                            unsigned int hooknum, struct ip_vs_iphdr *ipvsh,
+                            unsigned short fragoffs)
 {
-       struct ipv6hdr *iph;
        struct icmp6hdr _icmph, *ic;
-       struct ipv6hdr  _ciph, *cih;    /* The ip header contained
+       struct ipv6hdr _ip6, *ip6;      /* The ip header contained
                                           within the ICMP */
-       struct ip_vs_iphdr ciph;
        struct ip_vs_conn *cp;
        struct ip_vs_protocol *pp;
-       unsigned int offset;
        union nf_inet_addr snet;
 
        *related = 1;
 
-       /* reassemble IP fragments */
-       if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-               if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
-                       return NF_STOLEN;
-       }
-
-       iph = ipv6_hdr(skb);
-       offset = sizeof(struct ipv6hdr);
-       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+       /*
+        * Fragment is before ICMP header which tells us that this is not an
+        * error message since they can't be fragmented.
+        */
+       if (unlikely(fragoffs)) {
+               IP_VS_DBG(1,"Outgoing ICMPv6 frag(%d) %pI6c->%pI6c offs:%d\n",
+                         fragoffs, &ipv6_hdr(skb)->saddr, 
&ipv6_hdr(skb)->daddr,
+                         ipvsh->fragoffs);
+               ic = skb_header_pointer(skb_nfct_reasm(skb), ipvsh->len,
+                                       sizeof(_icmph), &_icmph);
+       } else
+               ic = skb_header_pointer(skb, ipvsh->len, sizeof(_icmph),
+                                       &_icmph);
        if (ic == NULL)
                return NF_DROP;
 
-       IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
-                 ic->icmp6_type, ntohs(icmpv6_id(ic)),
-                 &iph->saddr, &iph->daddr);
-
        /*
         * Work through seeing if this is for us.
         * These checks are supposed to be in an order that means easy
@@ -956,36 +951,51 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int 
*related,
                *related = 0;
                return NF_ACCEPT;
        }
-
-       /* Now find the contained IP header */
-       offset += sizeof(_icmph);
-       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-       if (cih == NULL)
-               return NF_ACCEPT; /* The packet looks wrong, ignore */
-
-       pp = ip_vs_proto_get(cih->nexthdr);
-       if (!pp)
+       if (unlikely(fragoffs)) {
+               IP_VS_DBG(1, "***ERROR*** Don't frag an ICMPv6 Error(%d,%d)"
+                            " %pI6->%pI6\n",
+                            ic->icmp6_type, ntohs(icmpv6_id(ic)),
+                            &ipvsh->saddr, &ipvsh->daddr);
                return NF_ACCEPT;
+       }
 
-       /* Is the embedded protocol header present? */
-       /* TODO: we don't support fragmentation at the moment anyways */
-       if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
-               return NF_ACCEPT;
+       IP_VS_DBG(1, "Outgoing ICMPv6 Error (%d,%d) %pI6->%pI6\n",
+                 ic->icmp6_type, ntohs(icmpv6_id(ic)),
+                 &ipvsh->saddr, &ipvsh->daddr);
 
-       IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
-                     "Checking outgoing ICMPv6 for");
+       /* Now find the contained IP header after icmp hdr */
+       ipvsh->len += sizeof(_icmph);
+       ip6 = skb_header_pointer(skb, ipvsh->len, sizeof(_ip6), &_ip6);
+       ipvsh->protocol = ipv6_find_hdr(skb, &ipvsh->len, -1,
+                                       &ipvsh->fragoffs, &ipvsh->flags);
 
-       offset += sizeof(struct ipv6hdr);
+       /*
+        * Is the embedded protocol header present?
+        * If not we can't do very much, and if it's an error on a fragment
+        *  we don't have the history anyway..
+        */
+       pp = ip_vs_proto_get(ipvsh->protocol);
+       if (!pp || (ipvsh->protocol < 0))
+               return NF_ACCEPT;
+       /* fill the rest of ipvsh */
+       ipvsh->saddr.in6 = ip6->saddr;
+       ipvsh->daddr.in6 = ip6->daddr;
 
-       ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
        /* The embedded headers contain source and dest in reverse order */
-       cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
+       cp = pp->conn_out_get(AF_INET6, skb, ipvsh, ipvsh->len, 1, 0);
        if (!cp)
                return NF_ACCEPT;
+       {
+               __be16 _ports[2], *pptr;
+
+               pptr = skb_header_pointer(skb, ipvsh->len, sizeof(_ports), 
_ports);
 
-       snet.in6 = iph->saddr;
-       return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
-                                   pp, offset, sizeof(struct ipv6hdr));
+               IP_VS_DBG(1, "Outg ICMPv6 Error found %pI6[%d] -> %pI6[%d]\n",
+                 &ip6->saddr, ntohs(pptr[0]), &ip6->daddr, ntohs(pptr[1]));
+       }
+       snet.in6 = ipvsh->saddr.in6;
+       return handle_response_icmp(AF_INET6, skb, &snet, ipvsh->protocol, cp,
+                                   pp, ipvsh->len, sizeof(struct ipv6hdr));
 }
 #endif
 
@@ -1064,6 +1074,11 @@ handle_response(int af, struct sk_buff *skb, struct 
ip_vs_proto_data *pd,
        ip_vs_out_stats(cp, skb);
        ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
        skb->ipvs_property = 1;
+       IP_VS_DBG(1, " IP(%p) response set track(%x) %pI6c#%d %pI6c#%d %pI6c#%d 
fw:%d len:%d\n",
+                       skb, cp->flags & IP_VS_CONN_F_NFCT,
+                       &cp->caddr.in6, ntohs(cp->cport),
+                       &cp->vaddr.in6, ntohs(cp->vport),
+                       &cp->daddr.in6, ntohs(cp->dport), cp->fwmark, skb->len);
        if (!(cp->flags & IP_VS_CONN_F_NFCT))
                ip_vs_notrack(skb);
        else
@@ -1083,14 +1098,14 @@ drop:
 /*
  *     Check if outgoing packet belongs to the established ip_vs_conn.
  */
-static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
+static unsigned int ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int 
af)
 {
        struct net *net = NULL;
        struct ip_vs_iphdr iph;
        struct ip_vs_protocol *pp;
        struct ip_vs_proto_data *pd;
        struct ip_vs_conn *cp;
+       unsigned short fragoffs = 0;
 
        EnterFunction(11);
 
@@ -1115,17 +1130,32 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, 
int af)
        if (!net_ipvs(net)->enable)
                return NF_ACCEPT;
 
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+       /* It's a fragment take first frag */
+       if (skb_nfct_reasm(skb))
+               ip_vs_fill_iph_skb(af, skb_nfct_reasm(skb), &iph);
+       else
+               ip_vs_fill_iph_skb(af, skb, &iph);
+
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
+               if (skb_nfct_reasm(skb)) {
+                       struct sk_buff *reasm = skb_nfct_reasm(skb);
+                       int len = 0;
+
+                       ipv6_find_hdr(skb, &len, -1, &fragoffs, NULL);
+
+                       if (!fragoffs) {
+                               reasm->mark = skb->mark;
+                               skb_dst_copy(reasm, skb);
+                       }
+               }
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
                        int related;
-                       int verdict = ip_vs_out_icmp_v6(skb, &related,
-                                                       hooknum);
-
+                       int verdict = ip_vs_out_icmp_v6(skb, &related, hooknum,
+                                                       &iph, fragoffs);
+                       /* Related means an icmp error on an ipvs_conn */
                        if (related)
                                return verdict;
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }
        } else
 #endif
@@ -1135,7 +1165,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int 
af)
 
                        if (related)
                                return verdict;
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+                       ip_vs_fill_iph_skb(af, skb, &iph);
                }
 
        pd = ip_vs_proto_data_get(net, iph.protocol);
@@ -1145,28 +1175,20 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, 
int af)
 
        /* reassemble IP fragments */
 #ifdef CONFIG_IP_VS_IPV6
-       if (af == AF_INET6) {
-               if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-                       if (ip_vs_gather_frags_v6(skb,
-                                                 ip_vs_defrag_user(hooknum)))
-                               return NF_STOLEN;
-               }
-
-               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-       } else
+       if (af == AF_INET)
 #endif
                if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
                        if (ip_vs_gather_frags(skb,
                                               ip_vs_defrag_user(hooknum)))
                                return NF_STOLEN;
 
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+                       ip_vs_fill_iph_skb(af, skb, &iph);
                }
 
        /*
         * Check if the packet belongs to an existing entry
         */
-       cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
+       cp = pp->conn_out_get(af, skb, &iph, iph.len, 0, fragoffs);
 
        if (likely(cp))
                return handle_response(af, skb, pd, cp, iph.len);
@@ -1176,8 +1198,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int 
af)
             pp->protocol == IPPROTO_SCTP)) {
                __be16 _ports[2], *pptr;
 
-               pptr = skb_header_pointer(skb, iph.len,
-                                         sizeof(_ports), _ports);
+               /* Handle fragments */
+               if (fragoffs)
+                       pptr = skb_header_pointer(skb_nfct_reasm(skb), iph.len,
+                                                 sizeof(_ports), _ports);
+               else
+                       pptr = skb_header_pointer(skb, iph.len,
+                                                 sizeof(_ports), _ports);
                if (pptr == NULL)
                        return NF_ACCEPT;       /* Not for me */
                if (ip_vs_lookup_real_service(net, af, iph.protocol,
@@ -1389,14 +1416,11 @@ out:
 }
 
 #ifdef CONFIG_IP_VS_IPV6
-static int
-ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
+static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
+                           unsigned int hooknum, struct ip_vs_iphdr *iph)
 {
        struct net *net = NULL;
-       struct ipv6hdr *iph;
        struct icmp6hdr _icmph, *ic;
-       struct ipv6hdr  _ciph, *cih;    /* The ip header contained
-                                          within the ICMP */
        struct ip_vs_iphdr ciph;
        struct ip_vs_conn *cp;
        struct ip_vs_protocol *pp;
@@ -1405,15 +1429,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, 
unsigned int hooknum)
 
        *related = 1;
 
-       /* reassemble IP fragments */
-       if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
-               if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum)))
-                       return NF_STOLEN;
-       }
-
-       iph = ipv6_hdr(skb);
-       offset = sizeof(struct ipv6hdr);
-       ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+       ic = skb_header_pointer(skb, iph->len, sizeof(_icmph), &_icmph);
        if (ic == NULL)
                return NF_DROP;
 
@@ -1435,39 +1451,48 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, 
unsigned int hooknum)
                return NF_ACCEPT;
        }
 
-       /* Now find the contained IP header */
-       offset += sizeof(_icmph);
-       cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
-       if (cih == NULL)
-               return NF_ACCEPT; /* The packet looks wrong, ignore */
-
+       /*
+        * Now find the contained IP header
+        * an icmp error message will never be fragmented, but
+        * the contained IP header might contain an fragment...
+        */
+       ciph.len = iph->len + sizeof(_icmph);
+       ciph.flags = 0;
+       ciph.fragoffs = 0;
+       ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs,
+                                     &ciph.flags);
+       ciph.saddr = iph->saddr;        /* con_in_get() handles reverse order */
+       ciph.daddr = iph->daddr;
+
+       /* Hmm todo: check if find_hdr should be used instead */
        net = skb_net(skb);
-       pd = ip_vs_proto_data_get(net, cih->nexthdr);
+       pd = ip_vs_proto_data_get(net, ciph.protocol);
        if (!pd)
                return NF_ACCEPT;
        pp = pd->pp;
 
-       /* Is the embedded protocol header present? */
-       /* TODO: we don't support fragmentation at the moment anyways */
-       if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
+       /* Is the embedded protocol header present?
+        * If it's the second or later fragment we don't know what it is
+        * i.e. just let it through.
+        */
+       if (ciph.fragoffs)
                return NF_ACCEPT;
 
+       offset = ciph.len;
        IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset,
                      "Checking incoming ICMPv6 for");
 
-       offset += sizeof(struct ipv6hdr);
-
-       ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
        /* The embedded headers contain source and dest in reverse order */
-       cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
+       cp = pp->conn_in_get(AF_INET6, skb, &ciph, ciph.len, 1);
        if (!cp)
                return NF_ACCEPT;
 
        /* do the statistics and put it back */
        ip_vs_in_stats(cp, skb);
-       if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
-           IPPROTO_SCTP == cih->nexthdr)
-               offset += 2 * sizeof(__u16);
+       if (IPPROTO_TCP == ciph.protocol|| IPPROTO_UDP == ciph.protocol ||
+           IPPROTO_SCTP == ciph.protocol)
+               offset = ciph.len + (2 * sizeof(__u16));
+
        verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum);
 
        __ip_vs_conn_put(cp);
@@ -1489,13 +1514,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int 
af)
        struct ip_vs_protocol *pp;
        struct ip_vs_proto_data *pd;
        struct ip_vs_conn *cp;
+       unsigned short fragoffs = 0;
        int ret, pkts;
        struct netns_ipvs *ipvs;
 
+       struct sk_buff *reasm;
+
        /* Already marked as IPVS request or reply? */
        if (skb->ipvs_property)
                return NF_ACCEPT;
 
+       /* ipvs enabled in this netns ? */
+       net = skb_net(skb);
+       if (!net_ipvs(net)->enable)
+               return NF_ACCEPT;
+
        /*
         *      Big tappo:
         *      - remote client: only PACKET_HOST
@@ -1503,20 +1536,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int 
af)
         */
        if (unlikely((skb->pkt_type != PACKET_HOST &&
                      hooknum != NF_INET_LOCAL_OUT) ||
-                    !skb_dst(skb))) {
-               ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
-               IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
-                             " ignored in hook %u\n",
-                             skb->pkt_type, iph.protocol,
-                             IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
+                     !skb_dst(skb))) {
+               LeaveFunction(12);
                return NF_ACCEPT;
        }
-       /* ipvs enabled in this netns ? */
-       net = skb_net(skb);
-       if (!net_ipvs(net)->enable)
-               return NF_ACCEPT;
-
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
 
        /* Bad... Do not break raw sockets */
        if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
@@ -1528,15 +1551,36 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int 
af)
                        return NF_ACCEPT;
        }
 
+       reasm = skb_nfct_reasm(skb);
+       ip_vs_fill_iph_skb(af, (reasm ? reasm : skb), &iph);
+
 #ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6) {
+               if (reasm) {
+                       int len = 0;
+
+                       ipv6_find_hdr(skb, &len, -1, &fragoffs, NULL);
+                       IP_VS_DBG(1, "IN FRAG %s(%p s:%p) %pI6c %pI6c hdr:%d 
len:%d/%d %d fw:%u/%u\n",
+                                 (fragoffs) ? "+2:nd" : "1:st", reasm, skb,
+                                 &ipv6_hdr(reasm)->saddr, 
&ipv6_hdr(reasm)->daddr,
+                                 ipv6_hdr(reasm)->nexthdr, skb->len, 
reasm->len, fragoffs,
+                                 skb->mark, reasm->mark);
+               }
+               /*
+                * Save first fragment dst & fwmark to the re-assembly skb
+                */
+               if (!fragoffs && reasm) {
+                       reasm->mark = skb->mark;
+                       skb_dst_copy(reasm, skb);
+               }
+
                if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
                        int related;
-                       int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
+                       int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
+                                                      &iph);
 
                        if (related)
                                return verdict;
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
                }
        } else
 #endif
@@ -1546,7 +1590,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int 
af)
 
                        if (related)
                                return verdict;
-                       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+                       /* I don't think this one is needed ... /HS */
+                       ip_vs_fill_iph_skb(af, skb, &iph);
                }
 
        /* Protocol supported? */
@@ -1556,14 +1601,19 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int 
af)
        pp = pd->pp;
        /*
         * Check if the packet belongs to an existing connection entry
+        * Only sched first IPv6 fragment.
         */
-       cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
+       if (fragoffs)
+               cp = pp->conn_in_get(af, reasm, &iph, iph.len, 0);
+       else {
+               cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
 
-       if (unlikely(!cp)) {
-               int v;
+               if (unlikely(!cp)) {
+                       int v;
 
-               if (!pp->conn_schedule(af, skb, pd, &v, &cp))
-                       return v;
+                       if (!pp->conn_schedule(af, skb, pd, &v, &cp))
+                               return v;
+               }
        }
 
        if (unlikely(!cp)) {
@@ -1678,6 +1728,38 @@ ip_vs_local_request4(unsigned int hooknum, struct 
sk_buff *skb,
 }
 
 #ifdef CONFIG_IP_VS_IPV6
+/*
+ * AF_INET6 fragment handling: for a non-first fragment (fragoffs != 0) copy
+ * the mark and dst stored on the reassembly skb. Always returns NF_ACCEPT.
+ */
+static unsigned int
+ip_vs_preroute_frag6(unsigned int hooknum, struct sk_buff *skb,
+                     const struct net_device *in,
+                     const struct net_device *out,
+                     int (*okfn)(struct sk_buff *))
+{
+       struct ip_vs_iphdr iphdr  = { .len = 0, .flags = 0, };  /* unnamed members (fragoffs, ...) are zeroed too */
+       struct sk_buff *reasm = skb_nfct_reasm(skb);
+       struct net *net;
+
+       /* No reassembly skb: this is not a "replay" from nf_ct_frag6_output */
+       if (!reasm)
+               return NF_ACCEPT;
+
+       net = skb_net(skb);
+       if (!net_ipvs(net)->enable)     /* ipvs not enabled in this netns */
+               return NF_ACCEPT;
+
+       iphdr.protocol = ipv6_find_hdr(skb, &iphdr.len, -1, &iphdr.fragoffs,   /* only fragoffs is used below */
+                                      &iphdr.flags);
+       if (!iphdr.fragoffs)            /* fragment offset 0: leave this skb untouched */
+               return NF_ACCEPT;
+       /* Copy the mark & dst that ip_vs_in() / ip_vs_out() stored on reasm */
+       skb->mark = reasm->mark;
+       skb_dst_copy(skb,reasm);
+
+       return NF_ACCEPT;
+}
 
 /*
  *     AF_INET6 handler in NF_INET_LOCAL_IN chain
@@ -1749,8 +1831,10 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct 
sk_buff *skb,
 {
        int r;
        struct net *net;
+       struct ip_vs_iphdr iphdr;
 
-       if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+       ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
+       if (iphdr.protocol != IPPROTO_ICMPV6)
                return NF_ACCEPT;
 
        /* ipvs enabled in this netns ? */
@@ -1758,7 +1842,7 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct 
sk_buff *skb,
        if (!net_ipvs(net)->enable)
                return NF_ACCEPT;
 
-       return ip_vs_in_icmp_v6(skb, &r, hooknum);
+       return ip_vs_in_icmp_v6(skb, &r, hooknum, &iphdr);
 }
 #endif
 
@@ -1816,6 +1900,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
                .priority       = 100,
        },
 #ifdef CONFIG_IP_VS_IPV6
+       /* After mangle & nat fetch 2:nd fragment and following  */
+       {
+               .hook           = ip_vs_preroute_frag6,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET6,
+               .hooknum        = NF_INET_PRE_ROUTING,
+               .priority       = NF_IP6_PRI_NAT_DST + 1,
+       },
        /* After packet filtering, change source only for VS/NAT */
        {
                .hook           = ip_vs_reply6,
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c 
b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 5b8eb8b..bda8560 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -87,7 +87,7 @@ static struct ip_vs_conn *
 ah_esp_conn_out_get(int af, const struct sk_buff *skb,
                    const struct ip_vs_iphdr *iph,
                    unsigned int proto_off,
-                   int inverse)
+                   int inverse, unsigned short foffs)
 {
        struct ip_vs_conn *cp;
        struct ip_vs_conn_param p;
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c 
b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index ef8641f..0199bb2 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -40,7 +40,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct 
ip_vs_proto_data *pd,
        struct tcphdr _tcph, *th;
        struct ip_vs_iphdr iph;
 
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+       ip_vs_fill_iph_skb(af, skb, &iph);
 
        th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph);
        if (th == NULL) {
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c 
b/net/netfilter/ipvs/ip_vs_proto_udp.c
index f4b7262..6064f6e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -37,7 +37,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct 
ip_vs_proto_data *pd,
        struct udphdr _udph, *uh;
        struct ip_vs_iphdr iph;
 
-       ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
+       ip_vs_fill_iph_skb(af, skb, &iph);
 
        uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph);
        if (uh == NULL) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 7fd66de..1d1b5ad 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -1080,6 +1080,11 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn 
*cp,
                IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
        }
 
+       IP_VS_DBG(1, "DR XMIT(%p): %pI6c %pI6c via %pI6c dp:%d fw:%d hdr:%d 
proto:%d len:%d\n",
+                 skb, &cp->caddr, &cp->vaddr, &cp->daddr,
+                 ntohs(cp->dport), cp->fwmark,
+                 ipv6_hdr(skb)->nexthdr, cp->protocol, skb->len);
+
        /* MTU checking */
        mtu = dst_mtu(&rt->dst);
        if (skb->len > mtu) {
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index bb10b07..d475f01 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param 
*par)
        /*
         * Check if the packet belongs to an existing entry
         */
-       cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */);
+       cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */, 0);
        if (unlikely(cp == NULL)) {
                match = false;
                goto out;

---
Regards
Hans Schillstrom <hans@xxxxxxxxxxxxxxx>

Attachment: signature.asc
Description: This is a digitally signed message part.

<Prev in Thread] Current Thread [Next in Thread>