LVS
lvs-devel
Google
 
Web LinuxVirtualServer.org

[PATCH] ipvs: allow rescheduling of new connections when port reuse is d

To: lvs-devel@xxxxxxxxxxxxxxx
Subject: [PATCH] ipvs: allow rescheduling of new connections when port reuse is detected
Cc: Julian Anastasov <ja@xxxxxx>
From: Marcelo Ricardo Leitner <mleitner@xxxxxxxxxx>
Date: Fri, 20 Feb 2015 17:39:25 -0200
Currently, when TCP/SCTP port reusing happens, IPVS will find the old
entry and use it for the new one, behaving like a forced persistence.
But if you consider a cluster with a heavy load of small connections,
such reuse will happen often and may lead to a not optimal load
balancing and might prevent a new node from getting a fair load.

This patch introduces a new sysctl, conn_reuse_mode, that allows
controlling how to proceed when port reuse is detected. The default
value will allow rescheduling of new connections only if the old entry
was in TIME_WAIT state for TCP or CLOSED for SCTP.

Signed-off-by: Marcelo Ricardo Leitner <mleitner@xxxxxxxxxx>
---
 Documentation/networking/ipvs-sysctl.txt | 21 ++++++++++++++++++++
 include/net/ip_vs.h                      | 11 +++++++++++
 net/netfilter/ipvs/ip_vs_core.c          | 33 ++++++++++++++++++++++++++++----
 net/netfilter/ipvs/ip_vs_ctl.c           |  8 ++++++++
 net/netfilter/ipvs/ip_vs_sync.c          | 21 ++++++++++++++++++--
 5 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/ipvs-sysctl.txt 
b/Documentation/networking/ipvs-sysctl.txt
index 
7a3c047295914cbc8c4273506a9b6d35246a1750..3ba709531adba970595251fa73d6d471ed14c5c1
 100644
--- a/Documentation/networking/ipvs-sysctl.txt
+++ b/Documentation/networking/ipvs-sysctl.txt
@@ -22,6 +22,27 @@ backup_only - BOOLEAN
        If set, disable the director function while the server is
        in backup mode to avoid packet loops for DR/TUN methods.
 
+conn_reuse_mode - INTEGER
+       1 - default
+
+       Controls how ipvs will deal with connections that are detected
+       port reuse. It is a bitmap, with the values being:
+
+       0: disable any special handling on port reuse. The new
+       connection will be delivered to the same real server that was
+       servicing the previous connection. This will effectively
+       disable expire_nodest_conn.
+
+       bit 1: enable rescheduling of new connections when it is safe.
+       That is, whenever expire_nodest_conn and for TCP sockets, when
+       the connection is in TIME_WAIT state (which is only possible if
+       you use NAT mode).
+
+       bit 2: it is bit 1 plus, for TCP connections, when connections
+       are in FIN_WAIT state, as this is the last state seen by load
+       balancer in Direct Routing mode. This bit helps on adding new
+       real servers to a very busy cluster.
+
 conntrack - BOOLEAN
        0 - disabled (default)
        not 0 - enabled
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 
615b20b585452111a25085890d8fa875657dbe76..6c7ee0ae7ef1694671e4b6af0906b2fa077f5c7c
 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -924,6 +924,7 @@ struct netns_ipvs {
        int                     sysctl_nat_icmp_send;
        int                     sysctl_pmtu_disc;
        int                     sysctl_backup_only;
+       int                     sysctl_conn_reuse_mode;
 
        /* ip_vs_lblc */
        int                     sysctl_lblc_expiration;
@@ -1042,6 +1043,11 @@ static inline int sysctl_backup_only(struct netns_ipvs 
*ipvs)
               ipvs->sysctl_backup_only;
 }
 
+static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
+{
+       return ipvs->sysctl_conn_reuse_mode;
+}
+
 #else
 
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
@@ -1109,6 +1115,11 @@ static inline int sysctl_backup_only(struct netns_ipvs 
*ipvs)
        return 0;
 }
 
+static inline int sysctl_conn_reuse_mode(struct netns_ipvs *ipvs)
+{
+       return 1;
+}
+
 #endif
 
 /* IPVS core functions
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 
b87ca32efa0b4e6edc7f251c2c32c4ba3b55659c..3ec9b1a54024fa421f330cf1d0eeb67da9683127
 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1046,6 +1046,26 @@ static inline bool is_new_conn(const struct sk_buff *skb,
        }
 }
 
+static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
+                                       int conn_reuse_mode)
+{
+       /* Controlled (FTP DATA or persistence)? */
+       if (cp->control)
+               return false;
+
+       switch (cp->protocol) {
+       case IPPROTO_TCP:
+               return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
+                       ((conn_reuse_mode & 2) &&
+                        (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
+                        (cp->flags & IP_VS_CONN_F_NOOUTPUT));
+       case IPPROTO_SCTP:
+               return cp->state == IP_VS_SCTP_S_CLOSED;
+       default:
+               return false;
+       }
+}
+
 /* Handle response packets: rewrite addresses and send away...
  */
 static unsigned int
@@ -1585,6 +1605,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int 
af)
        struct ip_vs_conn *cp;
        int ret, pkts;
        struct netns_ipvs *ipvs;
+       int conn_reuse_mode;
 
        /* Already marked as IPVS request or reply? */
        if (skb->ipvs_property)
@@ -1653,10 +1674,14 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int 
af)
         */
        cp = pp->conn_in_get(af, skb, &iph, 0);
 
-       if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
-           unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
-           is_new_conn(skb, &iph)) {
-               ip_vs_conn_expire_now(cp);
+       conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
+       if (conn_reuse_mode && !iph.fragoffs &&
+           is_new_conn(skb, &iph) && cp &&
+           ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+             unlikely(!atomic_read(&cp->dest->weight))) ||
+            unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
+               if (!atomic_read(&cp->n_control))
+                       ip_vs_conn_expire_now(cp);
                __ip_vs_conn_put(cp);
                cp = NULL;
        }
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 
e55759056361c47ed1fcfa5c656541ba39bfd260..ec7f6f1e07cee1d15a6f839defc86aec8abd821e
 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1808,6 +1808,12 @@ static struct ctl_table vs_vars[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
+       {
+               .procname       = "conn_reuse_mode",
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
 #ifdef CONFIG_IP_VS_DEBUG
        {
                .procname       = "debug_level",
@@ -3732,6 +3738,8 @@ static int __net_init 
ip_vs_control_net_init_sysctl(struct net *net)
        ipvs->sysctl_pmtu_disc = 1;
        tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
        tbl[idx++].data = &ipvs->sysctl_backup_only;
+       ipvs->sysctl_conn_reuse_mode = 1;
+       tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
 
 
        ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 
c47ffd7a0a709cb73834c84652f251960f25db79..0b4ae02dabf784eebf0bb196f010e239d5c14838
 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -845,10 +845,25 @@ static void ip_vs_proc_conn(struct net *net, struct 
ip_vs_conn_param *param,
        struct ip_vs_conn *cp;
        struct netns_ipvs *ipvs = net_ipvs(net);
 
-       if (!(flags & IP_VS_CONN_F_TEMPLATE))
+       if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
                cp = ip_vs_conn_in_get(param);
-       else
+               if (cp && ((cp->dport != dport) ||
+                          !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
+                       if (!(flags & IP_VS_CONN_F_INACTIVE)) {
+                               ip_vs_conn_expire_now(cp);
+                               __ip_vs_conn_put(cp);
+                               cp = NULL;
+                       } else {
+                               /* This is the expiration message for the
+                                * connection that was already replaced, so we
+                                * just ignore it.
+                                */
+                               goto out;
+                       }
+               }
+       } else {
                cp = ip_vs_ct_in_get(param);
+       }
 
        if (cp) {
                /* Free pe_data */
@@ -925,6 +940,8 @@ static void ip_vs_proc_conn(struct net *net, struct 
ip_vs_conn_param *param,
                else
                        cp->timeout = (3*60*HZ);
        }
+
+out:
        ip_vs_conn_put(cp);
 }
 
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe lvs-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

<Prev in Thread] Current Thread [Next in Thread>