LVS
lvs-users
Google
 
Web LinuxVirtualServer.org

Release new code: Scheduler for distributed caching

To: Thomas Proell <Thomas.Proell@xxxxxxxxxx>
Subject: Release new code: Scheduler for distributed caching
Cc: lvs-users@xxxxxxxxxxxxxxxxxxxxxx
From: Thomas Proell <Thomas.Proell@xxxxxxxxxx>
Date: Mon, 23 Oct 2000 22:23:15 +0200 (MET DST)
Hi!

I hereby release a new scheduler under the GNU Public License.
This was made possible by Prof. Ernst Biersack and Institut
Eurecom, who support me with my master thesis, and who support
the distribution under GPL. However, the copyright is still at 
Institut Eurecom, France.

I apologize for the tricks I was using for coding, since they are
not always very nice, but we use it in a slightly different way
here. Moreover I didn't have much time :-(
So, I hope Wensong isn't mad with me, because I inserted some 
not very nice pieces of code into his LVS. I promise, I'll improve 
it; in the moment I just don't have time, since my work here ends 
soon, so I think people can play with it until I can do improvements
in 2001.

If there are problems with the patch, tell me, it was my first 
patch in my whole life.
This should be a patch on top of the newest IPVS-patch.

The code is running here very stable; please let me know, if you
have problems with patching, compiling, starting or whatever.
This software comes without warranty. If your PC is fuc^H^H^H not
working afterwards you'll be on your own.


What is it for?
***************

With the "Consistent Hashing Scheduler" you can set up a cache 
farm with LVS. It is NOT useable for a SERVER-farm, it is designed
for CACHES.

Incoming requests for certain IP-addresses are sent to the proxy-
caches. This could be achieved with the common round robin schedulers,
too. So, what's new?
With rr, you'll soon find all documents on all caches. This won't
be the case with the "Consistent Hashing" Scheduler.

A simple example for 2 caches would be: send the even IP-addresses
to cache1, send the odd ones to cache2.

This is an example; the CH-Scheduler works in a much more clever way,
which results in the following properties:

- if an IP-address is sent to one cache, it won't be sent to another:
  saves disk space/increases possibility of a hit.
- if one (out of n>2) caches fails, the documents for this cache
  are well distributed among the others.
- with that, inserting a new cache (upgrade or reboot) will take
  away the same amount of traffic from all servers
- the distribution of traffic is fair, that means all caches will
  experience the same load
- it is predictable: several redirectors will use the same cache
  for the same IP-address. Or if the director breaks down and comes
  up again, it will still distribute the addresses like it did before.


It would not be very useful to use this scheduler for a server-farm,
since all servers have the same IP-address, and all the load will be
sent to one unlucky machine :-/
So, it's quite a new way of using the LVS, isn't it?


Have fun with it!

Any comments are appreciated.


Thomas


P.S. For those who wonder how LVS can distribute load among caches:
You can set up the "ipchains" which seems to be a firewall. With
that you can mark all incoming packets. Then, you can set up LVS that
way, that it handles all marked packets. It's that easy!
diff -uNr linux1/include/net/crc16.h linux/include/net/crc16.h
--- linux1/include/net/crc16.h  Thu Jan  1 01:00:00 1970
+++ linux/include/net/crc16.h   Mon Oct 23 18:50:30 2000
@@ -0,0 +1,9 @@
+#ifndef __crc16_h__
+#define __crc16_h__
+
+#define RAND_MAX_COMBO ((unsigned short)65535)
+
+unsigned short int hashcrc16(char * www);
+void seed_rand_combo(unsigned long seed);
+unsigned short rand_combo(void);
+#endif
diff -uNr linux1/include/net/newhash.h linux/include/net/newhash.h
--- linux1/include/net/newhash.h        Thu Jan  1 01:00:00 1970
+++ linux/include/net/newhash.h Mon Oct 23 18:50:30 2000
@@ -0,0 +1,27 @@
+#ifndef __newhash_h__
+#define __newhash_h__
+
+#include <linux/types.h>
+#include <asm/types.h>
+
+struct wserver
+{
+       int     port;
+       __u32   name;
+};
+
+
+
+              
+unsigned char newhash(char *item, unsigned char *intervaltable);
+
+void filltable(int nservers, struct wserver *listwserver, unsigned char 
+               *intervaltable, unsigned int maxnumber, int cachecopies);
+
+void addservcop(char *server, int index, int copies, unsigned char 
+               *intervaltable, unsigned int maxnumber);
+
+void padtable(unsigned char *intervaltable, unsigned int maxnumber);
+
+
+#endif
diff -uNr linux1/net/ipv4/Config.in linux/net/ipv4/Config.in
--- linux1/net/ipv4/Config.in   Mon Oct 23 18:01:21 2000
+++ linux/net/ipv4/Config.in    Mon Oct 23 18:34:37 2000
@@ -58,6 +58,7 @@
           tristate 'IPVS: weighted round-robin scheduling' 
CONFIG_IP_MASQUERADE_VS_WRR
           tristate 'IPVS: least-connection scheduling' 
CONFIG_IP_MASQUERADE_VS_LC
           tristate 'IPVS: weighted least-connection scheduling' 
CONFIG_IP_MASQUERADE_VS_WLC
+         tristate 'IPVS: consistent hashing scheduling' 
CONFIG_IP_MASQUERADE_VS_CH
        fi
       fi
     fi
diff -uNr linux1/net/ipv4/Makefile linux/net/ipv4/Makefile
--- linux1/net/ipv4/Makefile    Mon Oct 23 18:01:21 2000
+++ linux/net/ipv4/Makefile     Mon Oct 23 18:30:54 2000
@@ -125,6 +125,16 @@
     M_OBJS += ip_vs_wlc.o
     endif
   endif
+
+  ifeq ($(CONFIG_IP_MASQUERADE_VS_CH),y)
+  IPV4_OBJS += ip_vs_ch.o
+  IPV4_OBJS += newhash.o
+  IPV4_OBJS += crc16.o
+  else
+    ifeq ($(CONFIG_IP_MASQUERADE_VS_CH),m)
+    M_OBJS += ip_vs_con.o
+    endif
+  endif
 endif
 
 M_OBJS += ip_masq_user.o
@@ -147,6 +157,9 @@
 endif
 
 include $(TOPDIR)/Rules.make
+
+ip_vs_con.o:   ip_vs_ch.o crc16.o newhash.o
+               ld -m elf_i386 -r -o ip_vs_con.o ip_vs_ch.o newhash.o crc16.o
 
 tar:
                tar -cvf /dev/f1 .
diff -uNr linux1/net/ipv4/crc16.c linux/net/ipv4/crc16.c
--- linux1/net/ipv4/crc16.c     Thu Jan  1 01:00:00 1970
+++ linux/net/ipv4/crc16.c      Mon Oct 23 18:38:04 2000
@@ -0,0 +1,94 @@
+/*
+ * linux/net/ipv4/crc16.c
+ *
+ * Copyright (C) 2000 Institut Eurecom, France
+ *
+ * This file is needed for the hashing routines of the IPVS-CH-scheduler 
+ * It is responsible for the CRC16 and the random number calculation
+ *
+ * Authors:    Guillermo Barrenecha
+ *             Thomas Proell <proellt@xxxxxx>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ * Changes:
+ *
+ */
+
+#include <net/crc16.h>
+#include <linux/kernel.h>
+#define __NO_VERSION__
+
+#ifdef MODULE
+#include <linux/module.h>
+#include <linux/version.h>
+
+#include <linux/modversions.h>
+#endif
+
+unsigned short int hashcrc16(char *www)
+{
+       unsigned int n, crc;
+
+       static unsigned short int crc16l[]= {
+       0x0000, 0xc0c1, 0xc181, 0x0140, 
+       0xc301, 0x03c0, 0x0280, 0xc241,
+       0xc601, 0x06c0, 0x0780, 0xc741, 
+       0x0500, 0xc5c1, 0xc481, 0x0440};        
+
+
+       static unsigned short int crc16h[]= {
+       0x0000, 0xcc01, 0xd801, 0x1400, 
+       0xf001, 0x3c00, 0x2800, 0xe401,
+       0xa001, 0x6c00, 0x7800, 0xb401, 
+       0x5000, 0x9c01, 0x8801, 0x4400};        
+
+
+       crc=0;
+
+       while((n=(unsigned char)(*www++))!=0) 
+       {
+               n ^= crc;
+               crc = crc16l[n&0x0F] ^ crc16h[(n>>4)&0x0F] ^ (crc>>8);
+       }
+
+       return crc;
+}
+
+
+/*
+ * Here's a very good random number generator, written by
+ * Scott Nelson (http://www.helsbreth.org)
+ * The algorithm itself was invented by George Marsaglia
+ * Thanks to both!
+ */
+
+
+
+static unsigned long combo_x = 3;
+static unsigned long combo_y = 1;
+static unsigned long combo_z = 1;
+static unsigned long combo_v;
+
+
+void seed_rand_combo(unsigned long seed) 
+{
+       combo_x = seed * 8 + 3;
+       combo_y = seed * 2 + 1;
+       combo_z = seed | 1;
+}
+
+unsigned short rand_combo(void) 
+{
+       combo_v = combo_x * combo_y;
+       combo_x = combo_y;
+       combo_y = combo_v;
+       combo_z = (combo_z & 65535) * 30903 + (combo_z >> 16);
+       return ((combo_y + combo_z)&65535);
+}
+
+
+
+
diff -uNr linux1/net/ipv4/ip_vs.c linux/net/ipv4/ip_vs.c
--- linux1/net/ipv4/ip_vs.c     Mon Oct 23 18:01:23 2000
+++ linux/net/ipv4/ip_vs.c      Mon Oct 23 21:25:14 2000
@@ -69,7 +69,8 @@
  *     Wensong Zhang           :    changed to two service hash tables
  *     Julian Anastasov        :    corrected trash_dest lookup for both
  *                                  normal service and fwmark service
- *
+ *     Thomas Proell           :    added consistent hashing scheduler support
+ *                                  
  */
 
 #include <linux/config.h>
@@ -1406,7 +1407,10 @@
          */
         list_add(&dest->n_list, &svc->destinations);
         atomic_inc(&dest->refcnt);
-        
+       
+       if(!strcmp(svc->scheduler->name, "ch"))
+               svc->scheduler->update_service(svc);
+ 
         write_unlock_bh(&__ip_vs_lock);
 
         LeaveFunction(3);
@@ -2140,6 +2144,15 @@
                 return NULL;
         }
         
+       /*
+        *   This one is really dirty, but
+        *   the ch-scheduler needs the
+        *   desired address.
+        */
+
+       if(!strcmp(svc->scheduler->name, "ch"))
+               svc->addr=iph->daddr;
+
         read_lock(&__ip_vs_lock);
         
         dest = svc->scheduler->schedule(svc);
@@ -2755,6 +2768,9 @@
 #endif
 #ifdef CONFIG_IP_MASQUERADE_VS_WLC
         ip_vs_wlc_init();
+#endif
+#ifdef CONFIG_IP_MASQUERADE_VS_CH
+        ip_vs_ch_init();
 #endif
         return 0;
 }
diff -uNr linux1/net/ipv4/ip_vs_ch.c linux/net/ipv4/ip_vs_ch.c
--- linux1/net/ipv4/ip_vs_ch.c  Thu Jan  1 01:00:00 1970
+++ linux/net/ipv4/ip_vs_ch.c   Mon Oct 23 18:38:25 2000
@@ -0,0 +1,286 @@
+/*
+ * linux/net/ipv4/ip_vs_ch.c
+ *
+ * Copyright (C) 2000 Institut Eurecom, France
+ *
+ * This is a scheduler for the Linux Virtual Server that works with
+ * consistent hashing. AFAIK it only makes sense when you distribute
+ * traffic between caches, and NOT between servers.
+ *             
+ * Consistent Hashing will distribute the load among the caches
+ * according to the requested IP-address (which would always be the
+ * same when you use servers).
+ *
+ * Authors:    Thomas Proell <proellt@xxxxxx>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ * 
+ * Changes:
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <net/ip_masq.h>
+#ifdef CONFIG_IP_MASQUERADE_MOD
+#include <net/ip_masq_mod.h>
+#endif
+#include <linux/sysctl.h>
+#include <linux/ip_fw.h>
+#include <net/ip_vs.h>
+
+#include <linux/string.h>
+#include <linux/socket.h>
+
+#include <net/crc16.h>
+#include <net/newhash.h>
+
+#include <math.h>
+#include <linux/vmalloc.h>
+
+#ifdef MODULE
+#include <linux/modversions.h>
+#endif
+
+static void build_wserver(struct list_head *server, int i);
+static struct ip_vs_dest* get_wserver(int webcache);
+
+
+static int number_of_realservers=0, maxnumber, cachecopies=1000, nbits=16;
+static unsigned char *intervaltable;
+static struct wserver *listwserver;
+
+
+static int ip_vs_ch_init_svc(struct ip_vs_service *svc)
+{
+        register struct list_head *q;
+       int i=0;
+
+       svc->sched_data = &svc->destinations;
+
+       q = (struct list_head *)&svc->sched_data;
+       q = q->next;
+
+       while((q->next)!=(svc->sched_data))
+        {
+                q = q->next;
+               i++;
+       }
+       number_of_realservers=i;
+
+       listwserver=(struct wserver*)kmalloc(number_of_realservers*
+               sizeof(struct wserver), GFP_KERNEL);
+       if(listwserver==NULL)
+       {
+               IP_VS_ERR("ip_vs_ch_init_svc: kmalloc failed.\n");
+               return 0;
+       }
+
+       for(i=0; i<number_of_realservers; i++)
+       {
+               q = q->next;
+               build_wserver(q, i);
+       }
+
+       maxnumber=pow(2, nbits);
+       intervaltable=(unsigned char*)vmalloc(maxnumber*sizeof(unsigned char));
+       if(intervaltable==NULL)
+       {
+               IP_VS_ERR("ip_vs_ch_init_svc: vmalloc failed.\n");
+               return 0;
+       }
+
+       filltable(number_of_realservers, listwserver, intervaltable, 
+               maxnumber, cachecopies);
+
+       padtable(intervaltable, maxnumber);
+
+
+        MOD_INC_USE_COUNT;
+        return 0;
+}
+
+
+static int ip_vs_ch_done_svc(struct ip_vs_service *svc)
+{
+        MOD_DEC_USE_COUNT;
+       kfree_s(listwserver, number_of_realservers*sizeof
+               (struct wserver));
+       vfree(intervaltable);
+        return 0;
+}
+
+
+static int ip_vs_ch_update_svc(struct ip_vs_service *svc)
+{
+        register struct list_head *q;
+        int i=0;
+
+       svc->sched_data = &svc->destinations;
+
+        q = (struct list_head *)&svc->sched_data;
+       q = q->next;
+
+        while((q->next)!=(svc->sched_data))
+        {
+                q = q->next;
+                i++;
+        }
+
+       if(i!=number_of_realservers)
+       {
+               number_of_realservers=i;
+               kfree_s(listwserver, number_of_realservers*sizeof
+                       (struct wserver));              
+               listwserver=(struct wserver*)kmalloc(number_of_realservers*
+                       sizeof(struct wserver), GFP_KERNEL);
+               if(listwserver==NULL)
+               {
+                       IP_VS_ERR("ip_vs_ch_update: kmalloc failed.\n");
+                       return 0;
+               }                       
+       }
+
+       q = q->next;
+       for(i=0; i<number_of_realservers; i++)
+       {
+               q = q->next;
+               build_wserver(q, i);
+       }
+
+
+       filltable(number_of_realservers, listwserver, intervaltable, 
+               maxnumber, cachecopies);
+
+       padtable(intervaltable, maxnumber);
+        return 0;
+}
+
+
+/*
+ * Consistent Hashing Scheduling
+ */
+static struct ip_vs_dest* ip_vs_ch_schedule(struct ip_vs_service *svc)
+{
+       register struct list_head *q;
+        static struct ip_vs_dest *dest;
+       static struct ip_vs_dest *compare;
+       int i;
+       char webserver[18];
+
+        IP_VS_DBG("ip_vs_ch_schedule(): Scheduling...\n");
+
+       sprintf(webserver, "%d.%d.%d.%d", NIPQUAD(svc->addr));
+       i=((unsigned char)newhash(webserver, intervaltable)-1);
+
+       dest=get_wserver(i);
+
+       q=(struct list_head *)&svc->sched_data; 
+       q=q->next;
+
+       while((q->next)!=(svc->sched_data))
+       {
+               q=q->next;
+               compare=list_entry(q, struct ip_vs_dest, n_list);
+               if(compare->addr==dest->addr)
+                       if(compare->port==dest->port)
+                       {
+                               IP_VS_DBG("CH: server %d.%d.%d.%d:%d "
+                               "activeconns %d refcnt %d weight %d\n",
+                               NIPQUAD(compare->addr), ntohs(compare->port),
+                               atomic_read(&compare->activeconns),
+                               atomic_read(&compare->refcnt), compare->weight);
+                               return compare;
+                       }
+       } 
+       return NULL;
+
+}
+
+
+static struct ip_vs_scheduler ip_vs_ch_scheduler = {
+        {0},                    /* n_list */
+        "ch",                   /* name */
+        ATOMIC_INIT(0),         /* refcnt */
+        ip_vs_ch_init_svc,      /* service initializer */
+        ip_vs_ch_done_svc,      /* service done */
+        ip_vs_ch_update_svc,    /* service updater */
+        ip_vs_ch_schedule       /* select a server from the destination list */
+};
+
+
+__initfunc(int ip_vs_ch_init(void))
+{
+        IP_VS_INFO("Initializing CH scheduling\n");
+        INIT_LIST_HEAD(&ip_vs_ch_scheduler.n_list);
+        return register_ip_vs_scheduler(&ip_vs_ch_scheduler) ;
+}
+
+
+static void build_wserver(struct list_head *server, int i)
+{
+        struct ip_vs_dest *destin;
+        destin = list_entry(server, struct ip_vs_dest, n_list);
+       listwserver[i].name=destin->addr;
+       listwserver[i].port=destin->port;
+}
+
+
+
+static struct ip_vs_dest* get_wserver(int webcache)
+{
+       static struct ip_vs_dest dest;
+       dest.port=listwserver[webcache].port;
+       dest.addr=listwserver[webcache].name;
+       return &dest;
+}
+
+
+
+
+
+
+#ifdef MODULE
+EXPORT_NO_SYMBOLS;
+
+int init_module(void)
+{
+        INIT_LIST_HEAD(&ip_vs_ch_scheduler.n_list);
+
+        /* module initialization by 'request_module' */
+        if(register_ip_vs_scheduler(&ip_vs_ch_scheduler) != 0)
+                return -EIO;
+
+        IP_VS_INFO("CH scheduling module loaded.\n");
+
+        return 0;
+}
+
+void cleanup_module(void)
+{
+        /* module cleanup by 'release_module' */
+        if(unregister_ip_vs_scheduler(&ip_vs_ch_scheduler) != 0)
+                IP_VS_INFO("cannot remove CH scheduling module\n");
+        else
+                IP_VS_INFO("CH scheduling module unloaded.\n");
+}
+#endif /* MODULE */
+
+
+
+
+
+
+
+
+
+
+
diff -uNr linux1/net/ipv4/newhash.c linux/net/ipv4/newhash.c
--- linux1/net/ipv4/newhash.c   Thu Jan  1 01:00:00 1970
+++ linux/net/ipv4/newhash.c    Mon Oct 23 21:24:09 2000
@@ -0,0 +1,99 @@
+/*
+ * linux/net/ipv4/newhash.c    
+ *
+ * Copyright (C) 2000 Institut Eurecom, France
+ *
+ * This file is included by the consistent hashing scheduler of IPVS.
+ * It is responsible for all hashing-operations.
+ *
+ * Authors:    Guillermo Barrenecha
+ *             Thomas Proell <proellt@xxxxxx>
+ *
+ *             This program is free software; you can redistribute it and/or
+ *             modify it under the terms of the GNU General Public License
+ *             as published by the Free Software Foundation; either version
+ *             2 of the License, or (at your option) any later version.
+ * Changes:
+ * 
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <math.h>
+#include <net/newhash.h>
+#include <net/crc16.h>
+
+#include <linux/kernel.h>
+#include <stdlib.h>
+
+#ifdef MODULE
+#define __NO_VERSION__
+#include <linux/module.h>
+#include <linux/version.h>
+
+#include <linux/modversions.h>
+#endif
+
+void filltable(        int nservers, struct wserver *listwserver, 
+               unsigned char *intervaltable,
+               unsigned int maxnumber, int cachecopies)
+{
+       int i;
+       char name[18];
+
+       for (i=0; (unsigned int)i<maxnumber; i++)
+               (unsigned char)intervaltable[i]=(unsigned char)0;
+
+       for (i=0; i<nservers; i++)
+       {
+               sprintf(name, "%d.%d.%d.%d", NIPQUAD(listwserver[i].name));
+               addservcop(name, i+1, cachecopies, intervaltable,maxnumber);
+       }
+}
+
+
+
+
+void addservcop(char *server, int index, int copies, 
+               unsigned char *intervaltable, unsigned int maxnumber)
+{
+       int i;
+       unsigned int seed;
+       seed=(unsigned short int)hashcrc16(server);
+       seed_rand_combo((unsigned int)seed);
+
+       for(i=0;i<copies;i++) 
+               intervaltable[(unsigned long)((rand_combo()*
+               maxnumber)/RAND_MAX_COMBO)]=index;
+}
+
+
+
+
+void padtable(unsigned char *intervaltable, unsigned int maxnumber)
+{
+       int i,j,k;
+       k=0;
+       for(i=0; i<maxnumber; i++)
+       {
+               if(intervaltable[i] != 0) 
+               {
+                       for(j=k; j<i; j++) 
+                               intervaltable[j]=intervaltable[i];
+                       k=i+1;
+               }
+       }
+       for(i=k; i<maxnumber; i++) 
+               intervaltable[i]=intervaltable[0];
+}
+
+
+unsigned char newhash(char *item, unsigned char *intervaltable)
+{
+       unsigned char i;
+       i=intervaltable[(unsigned short int)hashcrc16(item)];
+       return i;
+}




<Prev in Thread] Current Thread [Next in Thread>