load control system (6 of 8)

Keith Muller muller at sdcc3.UUCP
Wed Feb 13 05:03:14 AEST 1985


This is part 6 of the load control system. Part 1 must be unpacked before
any other part.
	Keith Muller
	ucbvax!sdcsvax!muller


# This is a shell archive.  Remove anything before this line,
# then unpack it by saving it in a file and typing "sh file".
#
# Wrapped by sdcc3!muller on Sat Feb  9 13:56:47 PST 1985
# Contents:  server/Makefile server/data.c server/globals.c server/main.c
 
echo x - server/Makefile
sed 's/^@//' > "server/Makefile" <<'@//E*O*F server/Makefile//'
#
# Makefile for batch server
#
# Targets:
#	all	- build the ldd server binary (default target)
#	install	- install ldd into $(DEST)
#	clean	- remove the objects, any core file and the binary
#	lint	- run lint over the server sources
#

# flags handed to the C compiler by make's built-in .c -> .o rule
CFLAGS= -O

# group given to the installed binary (see the install rule below)
BGID=	lddgrp

# directory the server binary is installed into
DEST=	/etc

# headers shared by the server sources; every object depends on them
HDR=	../h/common.h ../h/server.h

SRC=	main.c data.c globals.c setup.c commands.c

OBJ=	main.o data.o globals.o setup.o commands.o

all:	ldd

# link the server from its objects
ldd:    $(OBJ)
	cc -o ldd $(OBJ)

# a change to either header rebuilds every object
$(OBJ):	 $(HDR)

install: $(DEST)/ldd

# installed copy is owned by root, group $(BGID), mode 700
$(DEST)/ldd: ldd
	install -c -m 700 -o root -g $(BGID) ldd $(DEST)

clean:
	rm -f $(OBJ) core ldd

lint:
	lint -abchx $(SRC)
@//E*O*F server/Makefile//
chmod u=r,g=r,o=r server/Makefile
 
echo x - server/data.c
sed 's/^@//' > "server/data.c" <<'@//E*O*F server/data.c//'

/*-------------------------------------------------------------------------
 * data.c - server
 *
 * routines that deal with the data structures maintained by the server.
 * the server uses a double linked list with qhead pointing at the head
 * and qtail pointing at the tail. if the queue is not empty then
 * qhead->back is always QNIL and qtail->fow is always QNIL. Insertions
 * also require that the time field increase (older to younger) from qhead
 * to qtail. 
 *
 * NOTE: when a node is added to the free list only its fow link is
 * altered, so procedures that search through the list with the
 * intention of calling rmqueue must search from qtail to qhead, because
 * rmqueue will destroy the node's fow link.
 *-------------------------------------------------------------------------
 */

/* $Log$ */

#include "../h/common.h"
#include "../h/server.h"

extern struct qnode *qhead;
extern struct qnode *qtail;
extern struct qnode *freequeue;
extern int qcount;
extern int newlist;
extern int newstatus;

/*------------------------------------------------------------------------
 * rmqueue
 *
 * unlink the node pointed at by work from the double linked queue and
 * push it onto the front of the free list. qhead and qtail are updated
 * when work was the first and/or last node. the queue count is
 * decremented and the list and status files are flagged as out of date.
 *
 * NOTE: the node's fow link is reused as the free list link, so after
 * this call work->fow no longer points at the next queued job.
 *------------------------------------------------------------------------
 */
rmqueue(work)
struct qnode *work;
{
	register struct qnode *prev = work->back;
	register struct qnode *next = work->fow;

	/*
	 * detach from the predecessor (or from qhead if there is none)
	 */
	if (prev == QNIL)
		qhead = next;
	else
		prev->fow = next;

	/*
	 * detach from the successor (or from qtail if there is none)
	 */
	if (next == QNIL)
		qtail = prev;
	else
		next->back = prev;

	/*
	 * recycle the node on the free list
	 */
	work->fow = freequeue;
	freequeue = work;

	/*
	 * one job less; list and status files are now stale
	 */
	qcount--;
	newlist = 1;
	newstatus = 1;
}

/*-------------------------------------------------------------------------
 * addqueue
 *
 * add a node to the queue if it is not already in it.
 * note that when clients poll the server to see if it is still alive they
 * send another "queue" command. This is why addqueue must 
 * check if the job is still queued.
 *
 * work points at the request that carries the pid, uid, time, type and
 * command string of the job.
 *
 * returns:
 *	 1  the pid was already queued, or the job was inserted into a
 *	    queue that already held other jobs
 *	 0  the job was inserted into an empty queue
 *	-1  no memory for a new qnode
 *	-2  the queue is full (only checked when type is not POLLCMD)
 *
 * NOTE(review): the two successful insert paths return different values
 * (0 and 1); confirm against the callers that this is intended.
 *-------------------------------------------------------------------------
 */
addqueue(work)
struct request *work;
{
	register struct qnode *spot;
	register struct qnode *spot2;
	register struct qnode *ptr;
	extern int full;
	extern char *malloc();
	extern char *strcpy();

	/*
	 * find the place in the queue for this request. The
	 * time field is used for this: the oldest requests belong closer
	 * to the head of the queue. The scan runs from the tail towards
	 * the head and stops at the first node that is strictly older than
	 * the request; the request is later linked in right after it.
	 */
	for (spot = qtail; spot != QNIL; spot = spot->back){
		/*
		 * it might be already in the queue as a client
		 * is just polling the server to see if the server is
		 * still alive
		 */
		if (spot->pid == work->pid)
			return(1);

		/*
		 * stop at the first queued job older than this request
		 */
		if (work->time > spot->time)
			break;
	}

	/*
	 * At this point the job was not found among the nodes scanned so
	 * far. It is either a new job or a client checking to see if the
	 * server is alive. If this is a check, look for the job higher up
	 * in the queue.
	 */
	if (work->type != POLLCMD){
		/*
		 * at this point the node is a new one, reject if the
		 * queue is full.
		 */
		if (qcount >= full)
			return(-2);
	}else if (spot != QNIL){
		/*
		 * this job is just checking up to see if it is still
		 * queued. search the part of the queue not yet scanned.
		 */
		for (spot2 = spot->back; spot2 != QNIL; spot2 = spot2->back){
			/*
			 * job must have been moved
			 */
			if (spot2->pid == work->pid)
				return(1);
		}

		/*
		 * at this point the job is missing. it should have
		 * been in the queue. so put it back.
		 */
	}

	/*
	 * allocate space for qnode, check freelist first
	 */
	if (freequeue == QNIL)
		ptr = (struct qnode *)malloc(sizeof(struct qnode));
	else{
		ptr = freequeue;
		freequeue = ptr->fow;
	}
	if (ptr == QNIL){
		errlog("no space for a qnode");
		return(-1);
	}

	/*
	 * copy in the data from the datagram
	 *
	 * NOTE(review): the strcpy is unbounded; it assumes work->com is
	 * NUL terminated and fits in ptr->com. confirm against the
	 * structure definitions in the headers.
	 */
	ptr->pid = work->pid;
	ptr->uid = work->uid;
	ptr->time = work->time;
	(void)strcpy(ptr->com, work->com);

	/*
	 * special case if queue was empty
	 */
	if (qcount == 0){
		/*
		 * an empty queue must have no head or tail, otherwise the
		 * bookkeeping is corrupt and the server shuts down
		 */
		if ((qhead != QNIL) || (qtail != QNIL)){
			errlog("Addqueue: qcount should not be 0");
			cleanup();
		}
		qhead = qtail = ptr;
		ptr->fow = ptr->back = QNIL;
		newlist = 1;
		newstatus = 1;
		qcount = 1;
		return(0);
	}
	/*
	 * do two integrity checks, yes we are paranoid
	 */
	if (qhead == QNIL){
		errlog("Addqueue: qhead should not be QNIL");
		cleanup();
	}
	if (qtail == QNIL){
		errlog("Addqueue: qtail should not be QNIL");
		cleanup();
	}

	/*
	 * if spot == QNIL the scan ran off the head without finding an
	 * older job, so the request belongs at the very beginning of
	 * the queue
	 */
	if (spot == QNIL){
		qhead->back = ptr;
		ptr->fow = qhead;
		ptr->back = QNIL;
		qhead = ptr;
	}else{
		/*
		 * insert into the queue right after spot; if spot was
		 * the last node the new node becomes the tail
		 */
		ptr->fow = spot->fow;
		ptr->back = spot;
		if (spot->fow != QNIL)
			(spot->fow)->back = ptr;
		else
			qtail = ptr;
		spot->fow = ptr;
	}
	/*
	 * change newlist and newstatus to show queue has changed
	 */
	newlist = 1;
	newstatus = 1;
	qcount++;
	return(1);
}


/*-------------------------------------------------------------------------
 * movequeue
 *
 * move the job with process id pid to position pos in the queue (the
 * first position is 1, not 0). A pos beyond the end of the queue moves
 * the job to the tail. To keep the time field increasing from qhead to
 * qtail the time field of the moved job is rewritten to fit between its
 * new neighbours.
 *
 * returns -1 when no queued job has that pid, 0 otherwise (including the
 * cases where nothing had to be moved).
 *-------------------------------------------------------------------------
 */
movequeue(pos,pid)
u_long pos;
u_long pid;
{
	register struct qnode *job;	/* the node being moved */
	register struct qnode *target;	/* node the job is placed in front of */
	register struct qnode *before;
	register struct qnode *after;
	extern int qcount;

	/*
	 * locate the job, job ends up QNIL when the pid is not queued
	 */
	for (job = qhead; job != QNIL; job = job->fow)
		if (job->pid == pid)
			break;
	if (job == QNIL)
		return(-1);

	/*
	 * a single queued job has nowhere to go
	 */
	if (qcount == 1)
		return(0);

	/*
	 * walk to the node currently occupying the requested position.
	 * the job itself is not counted, as it will be taken out of the
	 * queue before being put back in.
	 */
	target = qhead;
	while ((target != QNIL) && (pos > 1)){
		if (target != job)
			pos--;
		target = target->fow;
	}

	/*
	 * nothing to do if the job already sits at the requested position,
	 * or the position is past the end and the job is already last
	 */
	if (target == job)
		return(0);
	if ((target == QNIL) && (qtail == job))
		return(0);

	newlist = 1;

	/*
	 * take the job out of the queue
	 */
	before = job->back;
	after = job->fow;
	if (after != QNIL)
		after->back = before;
	if (before != QNIL)
		before->fow = after;
	if (qtail == job)
		qtail = before;
	if (qhead == job)
		qhead = after;

	/*
	 * past the end: append, one tick younger than the old tail
	 */
	if (target == QNIL){
		job->back = qtail;
		job->fow = QNIL;
		job->time = qtail->time + 1;
		qtail->fow = job;
		qtail = job;
		return(0);
	}

	/*
	 * otherwise link the job in directly in front of target
	 */
	before = target->back;
	job->fow = target;
	job->back = before;
	if (before == QNIL){
		/*
		 * new head of the queue, one tick older than the old head
		 */
		qhead = job;
		job->time = target->time - 1;
	}else{
		/*
		 * between two nodes, take a time half way between them
		 */
		before->fow = job;
		job->time = target->time-((target->time-before->time)/2);
	}
	target->back = job;
	return(0);
}
@//E*O*F server/data.c//
chmod u=r,g=r,o=r server/data.c
 
echo x - server/globals.c
sed 's/^@//' > "server/globals.c" <<'@//E*O*F server/globals.c//'

/*-------------------------------------------------------------------------
 * globals.c - server
 *
 * allocation of the variables that are global to the server.
 *-------------------------------------------------------------------------
 */

/* $Log$ */

#include "../h/common.h"
#include "../h/server.h"
#include <sys/uio.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/time.h>
#include <stdio.h>

int kmem = -1;				/* file desc for kmem to get load */
int cntrlsock = -1;			/* socket desc for control messages */
int msgsock = -1;			/* socket for queue requests */
int qcount = 0;				/* count of jobs in the queue */
int newlist = 1;			/* 1 when queue is newer than last list */
int newstatus = 1;			/* 1 when status variables have changed */
int errorcount = 0;			/* count of recovered errors */
int timerstop = 1;			/* 1 when timer is stopped, 0 when running */
u_long mqtime = MAXQTIME;		/* max time a job can be in queue */
int descsize = 0;			/* desc table size for select */
long loadaddr = 0;			/* address of load aver in kmem */
int alrmmask = 0;			/* mask for blocking SIGALRM */
int full = MAXINQUEUE;			/* max number of jobs waiting to run */
FILE *errfile;				/* file where errors are logged */
struct qnode *qhead = QNIL;		/* points at queue head */
struct qnode *qtail = QNIL;		/* points at queue tail */
struct qnode *freequeue = QNIL;		/* pointer to local freelist of qnodes */
struct itimerval startalrm = {{ALRMTIME,0},{ALRMTIME,0}}; /* alrm time */
struct itimerval stopalrm = {{0,0},{0,0}}; /* value used to stop timer */
struct timeval polltime = {WAITTIME,0};    /* wait time during poll */

/*
 * on a sun the load average is kept as a long holding the load * 256,
 * so the limit is stored in the same scaled form
 */
#ifdef sun
long loadlevel = (long)(MAXLOAD*256);	/* load at which queueing starts */
#else
double loadlevel = MAXLOAD;		/* load at which queueing starts */
#endif
@//E*O*F server/globals.c//
chmod u=r,g=r,o=r server/globals.c
 
echo x - server/main.c
sed 's/^@//' > "server/main.c" <<'@//E*O*F server/main.c//'

/*-------------------------------------------------------------------------
 * main.c - server
 *
 * The server takes requests from client processes and the control
 * program, and performs various operations. The server's major task is
 * to attempt to maintain the system's load average close to a set limit
 * loadlevel. Client processes are kept in a queue and are waiting for a
 * command from the server (to run or abort). The server reads /dev/kmem
 * every ALRMTIME seconds checking to see if the load level has dropped
 * below the required loadlevel. If the queue is empty the timer is turned
 * off. While the timer is off, the server will only read /dev/kmem at the
 * receipt of a request to run from a client program.
 *
 * The server was designed to be as fault tolerant as possible and maintains
 * an errorfile of detectable errors. The server can safely be aborted and
 * restarted without deadlocking the clients. The server when restarted
 * will rebuild the queue of waiting processes to the state that existed
 * before the previous server exited. The entire system was designed to allow
 * execution of user programs (even those under load control) even if the
 * server is not functioning properly! (user jobs will ALWAYS run, the system
 * will never hang).
 *
 * The effectiveness of the system depends on what fraction of the programs
 * that are causing the system overload are maintained under this system.
 * Processes can only remain in queue a maximum of "mqtime" seconds
 * REGARDLESS of the loadlevel setting. This was done in case the programs 
 * that are keeping the systems loadlevel above the threshold are not
 * controlled by the server! So eventually all jobs will run.
 *
 * The control program allows users to remove their jobs from the queue and
 * allows root to adjust the operating parameters of the server while the
 * server is running.
 * 
 * All the programs and routines are commented and warnings about certain
 * sections of code are given when the code might be vague.
 * 
 * This system has ONLY BEEN RUN ON 4.2 UNIX (sun, vax and pyramid) and uses
 * datagrams in the AF_UNIX domain. (which seems to be extremely reliable).
 *
 * Author: Keith Muller
 *         University of California, San Diego
 *         Academic Computer Center C - 010
 *	   La Jolla, Ca 92093
 *	   (ucbvax!sdcsvax!sdcc3!muller)
 *	   (619) 452-6090
 *-------------------------------------------------------------------------
 */

/* $Log$ */

#include "../h/common.h"
#include "../h/server.h"
#include <sys/time.h>
#include <sys/file.h>
#include <stdio.h>
#include <errno.h>

/*--------------------------------------------------------------------------
 * main
 *
 * server entry point. processes the command line, initialises the server,
 * creates the sockets, polls the clients found in the spool and then loops
 * forever dispatching the datagrams that arrive on the control socket and
 * on the queue request socket.
 *--------------------------------------------------------------------------
 */
main(argc, argv)
int argc;
char **argv;
{
	register int msgbit;		/* select bit of the queue socket */
	register int cntrlbit;		/* select bit of the control socket */
	int watched;			/* both bits, reloaded every pass */
	int ready;			/* bits select reported as readable */
	int nready;			/* value returned by select */
	extern int msgsock;
	extern int cntrlsock;
	extern int descsize;
	extern int errno;

	/*
	 * startup: command line args, server setup, socket creation, then
	 * scan the spool for waiting clients and send them a POLLCMD
	 */
	doargs(argc, argv);
	setup();
	crsock();
	scanspool();

	/*
	 * build the bit mask select uses to decide which descriptors
	 * are checked for available input (datagrams).
	 */
	msgbit = 1 << msgsock;
	cntrlbit = 1 << cntrlsock;
	watched = msgbit | cntrlbit;

	while (1){
		/*
		 * select overwrites its mask, so reload it, then block until
		 * a datagram arrives
		 */
		ready = watched;
		nready = select(descsize,&ready,(int *)0,(int *)0,(struct timeval *)0);

		if (nready > 0){
			/*
			 * WARNING! BOTH SOCKETS are always checked when the
			 * select indicates at least one datagram is waiting.
			 * This prevents one socket from "locking" out the
			 * other if it is subject to high traffic! control
			 * messages are looked at first, then queue messages.
			 */
			if (ready & cntrlbit)
				cntrldis();
			if (ready & msgbit)
				msgdis();
			continue;
		}

		/*
		 * nothing to read. an interrupted select (the interval timer
		 * went off) just goes round again, anything else is fatal.
		 */
		if ((nready < 0) && (errno != EINTR)){
			errlog("select error");
			cleanup();
		}
	}

}


/*--------------------------------------------------------------------------
 * onalrm
 *
 * handler for the SIGALRM sent by the interval timer. This routine checks
 * the queue to see if there are any jobs that can be run. A job is run
 * either because getrun reports that the load leaves room for it, or
 * because the oldest job in the queue has been waiting longer than mqtime
 * seconds and must be run regardless of the load. When the queue has
 * drained the interval timer is switched off.
 *--------------------------------------------------------------------------
 */
onalrm()
{
	register int count;
	int sent;
	struct timezone zone;
	struct timeval now;
	struct itimerval oldalrm;
	extern struct itimerval stopalrm;
	extern struct qnode *qhead;
	extern u_long mqtime;
	extern int qcount;
	extern int timerstop;
	extern int newstatus;

	count = getrun();
	if (count != 0){
		/*
		 * the load is below the limit, so release as many jobs as
		 * getrun allows. this can overshoot the limit, but in most
		 * cases only slightly, and it keeps jobs from waiting
		 * needlessly because of a momentary load peak.
		 */
		while ((count > 0) && (qcount > 0)){
			/*
			 * the head job leaves the queue either way, but it
			 * only counts against the allowance if a client was
			 * really there to be told (the client could be dead)
			 */
			sent = outmsg(qhead->pid, RUNCMD);
			rmqueue(qhead);
			if (sent == 0)
				count--;
		}
	}else if (qcount > 0){
		/*
		 * load is too high to run a job, see if the oldest one has
		 * waited long enough to be run anyway
		 */
		if (gettimeofday(&now, &zone) < 0){
			errlog("onalrm cannot get time");
			return;
		}
		while (qcount > 0){
			if (((u_long)now.tv_sec - qhead->time) <= mqtime)
				break;
			/*
			 * oldest job has timed out. release it, and if it
			 * turns out to be dead move on to the next one
			 */
			sent = outmsg(qhead->pid, RUNCMD);
			rmqueue(qhead);
			if (sent == 0)
				break;
		}
	}

	/*
	 * the timer is only switched off when the queue is empty and the
	 * timer is still running
	 */
	if ((qcount == 0) && (timerstop != 1)){
		if (setitimer(ITIMER_REAL,&stopalrm, &oldalrm) < 0)
			errlog("stop timer error");
		else{
			timerstop = 1;
			newstatus = 1;
		}
	}
}


/*-------------------------------------------------------------------------
 * getrun
 *
 * determines how many jobs can be run after obtaining the current load
 * average from kmem. since the value read is an average, this should
 * provide some hysteresis so the server doesn't thrash around.
 *
 * returns 0 when the load is at or above loadlevel, otherwise 1 plus the
 * whole number of load units still available below loadlevel.
 *
 * a failed seek, a failed read or a read that returns fewer bytes than
 * the size of the load value is logged and the server shuts down through
 * cleanup, since a partially filled load value cannot be trusted.
 *-------------------------------------------------------------------------
 */
getrun()
{
	extern int kmem;
	extern long loadaddr;
	extern int errno;
#ifdef sun
	long load;
	long run;
	extern long loadlevel;
#else
	double load;
	double run;
	extern double loadlevel;
#endif /* sun */
	extern long lseek();

	/*
	 * seek out into kmem (yuck!!!)
	 */
	if (lseek(kmem, loadaddr, L_SET) == -1){
		errlog("lseek error");
		cleanup();
	}

	/*
	 * read the load. anything other than a complete value is an error;
	 * errno is cleared first so that a short read, which does not set
	 * errno, is not logged with a stale system error message.
	 */
	errno = 0;
	if (read(kmem, (char *)&load, sizeof(load)) != (int)sizeof(load)){
		errlog("kmem read error");
		cleanup();
	}

	/*
	 * calculate the number of jobs that can run
	 * (will always overshoot by the fraction)
	 */
	if ((run = loadlevel - load) > 0){
#ifdef sun
		/*
		 * sun encodes the load average in a long. It is the
		 * load average * 256
		 */
		return(1 + (int)(run >> 8));
#else
		return(1 + (int)run);
#endif
	}else
		return(0);
}


/*------------------------------------------------------------------------
 * errlog
 *
 * log an error into the error file. should be a small number of them
 * (hopefully zero!!). each entry is the caller's message (if any), the
 * system error text for errno (if errno holds a valid error number) and
 * the time of the occurrence. errorcount is incremented on every call.
 *
 * mess is the text to print first, or a null pointer for none.
 *------------------------------------------------------------------------
 */
errlog (mess)
char *mess;

{
	int saverr;
	struct timeval now;
	struct timezone zone;
	extern char *ctime();
	extern int errorcount;
	extern int errno;
	extern int sys_nerr;
	extern char *sys_errlist[];
	extern FILE *errfile;

	/*
	 * remember the error being reported before doing anything else.
	 * the stdio calls below are allowed to change errno themselves
	 * (even when they succeed), which would otherwise make the log
	 * show the wrong system message.
	 */
	saverr = errno;

	/*
	 * increase the errorcount
	 */
	errorcount = errorcount + 1;

	/*
	 * if called with an arg, print it first
	 */
	if (mess != (char *)0)
		fprintf(errfile,"%s: ", mess);
	/*
	 * if a valid error print the human message
	 */
	if ((saverr > 0) && (saverr < sys_nerr))
		fprintf(errfile," %s ", sys_errlist[saverr]);
	/*
	 * stamp the time of occurrence (ctime supplies the newline)
	 */
	if (gettimeofday(&now, &zone) < 0)
		fprintf(errfile,"errlog cannot get time of day\n");
	else
		fprintf(errfile,"%s", ctime(&(now.tv_sec)));
	(void)fflush(errfile);
}


/*-------------------------------------------------------------------------
 * cleanup
 *
 * the whole system fell apart. close down the sockets, remove their
 * path names, log the server termination and exit with status 1.
 * this routine does not return.
 *-------------------------------------------------------------------------
 */
cleanup()
{
	extern int msgsock;
	extern int cntrlsock;
	extern int errno;
	extern FILE *errfile;

	/*
	 * close both sockets and remove their names from the file system.
	 * failures are ignored, we are going down anyway.
	 */
	(void)close(msgsock);
	(void)close(cntrlsock);
	(void)unlink(MSGPATH);
	(void)unlink(CNTRLPATH);

	/*
	 * clear errno so that errlog prints only the termination message
	 * and the time, not whatever error the calls above left behind
	 */
	errno = 0;
	errlog("Server aborting at");
	(void)fclose(errfile);
	exit(1);
}
@//E*O*F server/main.c//
chmod u=r,g=r,o=r server/main.c
 
# Verify the unpacked files: compare the line/word/character counts that
# wc reports for them against the counts recorded when the archive was
# wrapped.
# NOTE: the table below has to be regenerated whenever one of the archived
# files is edited, otherwise the check reports a difference.
echo Inspecting for damage in transit...
# scratch files: expected counts and the diff output
temp=/tmp/shar$$; dtemp=/tmp/.shar$$
# remove the scratch files on exit and on signals 1, 2, 3 and 15
trap "rm -f $temp $dtemp; exit" 0 1 2 3 15
cat > $temp <<\!!!
      33      62     411 Makefile
     311    1144    7097 data.c
      44     288    1782 globals.c
     355    1341    9080 main.c
     743    2835   18370 total
!!!
# strip the directory part of each name so the output matches the table,
# then diff ignoring differences in the amount of white space
wc  server/Makefile server/data.c server/globals.c server/main.c | sed 's=[^ ]*/==' | diff -b $temp - >$dtemp
# a non-empty diff means at least one file does not match its counts
if [ -s $dtemp ]
then echo "Ouch [diff of wc output]:" ; cat $dtemp
else echo "No problems found."
fi
exit 0



More information about the Comp.sources.unix mailing list