/**************************************************************************
* *
* Copyright (C) 1989-1994 Silicon Graphics, Inc. *
* *
* These coded instructions, statements, and computer programs contain *
* unpublished proprietary information of Silicon Graphics, Inc., and *
* are protected by Federal copyright law. They may not be disclosed *
* to third parties or copied or duplicated in any form, in whole or *
* in part, without the prior written consent of Silicon Graphics, Inc. *
* *
**************************************************************************/
/* Copyright (c) 1984 AT&T */
/* All Rights Reserved */
/* THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T */
/* The copyright notice above does not evidence any */
/* actual or intended publication of such source code. */
#ident "$Revision: 3.401 $"
#include <stdarg.h>
#include <sys/types.h>
#include <ksys/as.h>
#include <os/as/as_private.h> /* XXX */
#include <sys/callo.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/getpages.h>
#include <sys/immu.h>
#include <sys/pfdat.h>
#include <sys/page.h>
#include <sys/kabi.h>
#include <sys/kmem.h>
#include <sys/ksignal.h>
#include <sys/par.h>
#include <sys/param.h>
#include <sys/pda.h>
#include <sys/prctl.h>
#include <sys/proc.h>
#include <sys/psw.h>
#include <sys/reg.h>
#include <sys/resource.h>
#include <sys/runq.h>
#include <sys/schedctl.h>
#include <sys/signal.h>
#include <sys/strmp.h>
#include <sys/swap.h>
#include <sys/sysinfo.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/ktime.h>
#include <sys/time.h>
#include <sys/tuneable.h>
#include <ksys/exception.h>
#include <sys/xlate.h>
#include <sys/capability.h>
#include <sys/sat.h>
#include <sys/space.h>
#include <ksys/sthread.h>
#include <ksys/xthread.h>
#include <sys/hwperfmacros.h>
#include <sys/atomic_ops.h>
#include <sys/calloinfo.h>
#include <sys/ddi.h>
#include <sys/klog.h>
#include <sys/rt.h>
#include "os/proc/pproc_private.h" /* XXX bogus */
#include <ksys/hwperf.h>
#ifdef CELL
#include <ksys/cell/cell_hb.h>
#endif
#include <sys/lpage.h>
#include <ksys/vhost.h>
#ifdef NUMA_BASE
#include <sys/nodepda.h>
#endif
/*
* Clock
*
* Functions:
* implement callouts
* maintain user/system times
* maintain date
* profile
* alarm clock signals
* jab the scheduler
*/
extern sema_t vfswakeup;
extern void onesec_maint(void);
extern void tick_maint(void);
extern void tick_actions(void);
static int updateload(proc_t *, void *, int);
static void calcrss(uthread_t *);
static void callout_itentry_init(callout_info_t *, int, char *);
static pgcnt_t getrss(uthread_t *);
sema_t second_sema; /* v'd once a second */
time_t time; /* time in seconds since 1970 */
time_t lbolt; /* time in HZ since last boot */
#if (MAXCPUS > 128)
int sched_tick_mask=0; /* (see comment where used below) */
#endif
int one_sec = 1;
extern int vhandkicklim; /* # pages before kick vhand */
int vhandcnt; /* counter for vhand kick */
int vfssynccnt; /* counter for vfs_syncr */
uint sxbrkcnt; /* count of uthreads which are SXBRK */
ulong freeswap; /* amount of free swap */
time_t lastadjtime; /* HZ since last adjtime(2) */
time_t lastadjtod = DIDADJTIME; /* for 1 hour after last adjtime(2) */
#if !defined(CLOCK_CTIME_IS_ABSOLUTE)
callout_info_t *calltodo;
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
callout_info_t *fastcatodo;
extern struct callout *timeout_get_queuehead(long list, struct callout *pnew);
static toid_t find_migrated_timeout(toid_t id);
static lock_t migrated_timeout_lock;
static int rqlen;
#ifdef notdef
static int sqlen;
#endif
extern int pdcount; /* count of pdinserted pages */
extern int pdflag; /* flag is nonzero if pdcount pages */
/*
* Compute a Tenex-style load average of a quantity on 1, 5 and 15 minute
* intervals, using 'fixed-point' arithmetic with 3 decimal digits to the right.
*
* avg[T+t] = avg[T] * exp(-t/c) + nrun * (1 - exp(-t/c))
* where c = 1, 5, and 15 minutes, t = calculation interval
*
* Exponential constants for the specified interval:
*/
#define AVENRUN_INTVL 4 /* interval between calculations */
#define CEXP_0 958 /* (int) (exp(-4/60) * 1024) */
#define CEXP_1 1010 /* (int) (exp(-4/300) * 1024) */
#define CEXP_2 1019 /* (int) (exp(-4/900) * 1024) */
__int32_t avenrun[3]; /* smoothed load averages */
static int nrun; /* # of runnable processes (see updateload) */
static void
calcavenrun(void)
{
register __int32_t *avg = avenrun;
#define AVGCALC(n, exp) \
avg[n] = (exp * (__int64_t)avg[n] + \
(((1024 - exp) * (__int64_t)nrun) << 10)) >> 10
AVGCALC(0, CEXP_0);
AVGCALC(1, CEXP_1);
AVGCALC(2, CEXP_2);
#undef AVGCALC
}
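/*
 * A minimal stand-alone sketch (plain user-space C, not compiled into the
 * kernel) of the AVGCALC step above.  It assumes the same scaling as the
 * real code: avenrun[] values and the CEXP_* constants are fixed-point
 * numbers in 1024ths.
 */
#if 0
#include <stdio.h>

static long
avg_step(long avg, long nrun, long cexp)
{
	/* avg' = avg*exp(-t/c) + nrun*(1 - exp(-t/c)), everything in 1024ths */
	return (cexp * avg + ((1024 - cexp) * nrun << 10)) >> 10;
}

int
main(void)
{
	long avg = 0;		/* load average, scaled by 1024 */
	long cexp = 958;	/* CEXP_0: exp(-4/60) in 1024ths */
	int i;

	/* one sample every AVENRUN_INTVL (4) seconds, for five minutes */
	for (i = 0; i < 300 / 4; i++)
		avg = avg_step(avg, 2, cexp);	/* constant run queue of 2 */
	printf("1-minute load: %.3f\n", avg / 1024.0);
	return 0;
}
#endif	/* illustrative sketch */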
#ifdef NUMA_BASE
#define SYSWAIT_NODE(_node, _field) NODEPDA(_node)->syswait._field
#define SYSWAIT_BACKOFF_THRESHOLD (HZ * 5) /* 5 secs */
#define SYSWAIT_BACKOFF_RATE 100 /* one out of 100 */
#define SYSWAIT_BIGSYS 10 /* one out of 10 */
/* Only the clock master re-calculates the global syswait count */
#define RECALC_SYSWAIT() \
{ \
if (private.p_flags & PDAF_CLOCK) { \
recalc_syswait(); \
if (syswait.iowait || syswait.swap || syswait.physio) { \
SYSINFO.wioocc++; \
SYSINFO.wioque += (syswait.iowait + \
syswait.swap + syswait.physio); \
} \
} \
}
/* Reset the node's syswait field by taking the current value and removing it.
* This should prevent overflows from occurring. We just can't set it to zero
* because it could change while we are resetting it. After doing this for
* all nodes the accum value should represent the global state of syswait.
* We can then apply that to our node to save it.
*/
#define RESET_SYSWAIT_VAL(_accum, _tmp, _node, _field) \
{ \
_tmp = SYSWAIT_NODE(_node, _field); \
_accum._field += _tmp; \
if (_tmp) { \
atomicAddInt(&NODEPDA(_node)->syswait._field, ~(_tmp - 1)); \
} \
}
static void
recalc_syswait(void)
{
int i;
struct syswait tmp_syswait;
static time_t last_wait = 0;
/* To eliminate contention on the global syswait counter
* each node has its own. We add up the syswait counts of all
* nodes and store the result in the global count that will be
* seen by the other cpus. Maybe the clock cpu is not the best
* choice for this since it gets stuck doing everything, but then
* again maybe that is a good reason to pick it ???
*/
#if defined(SN0XXL) || defined(SN1)
/*
* Large systems can't afford to make this calculation every tick.
*/
if (lbolt % SYSWAIT_BIGSYS != 0)
return;
#endif
if (lbolt - last_wait >= SYSWAIT_BACKOFF_THRESHOLD) {
/* We are backing off to a slower rate since nothing
* showed up within the threshold limit. If something
* shows up again we'll go back to every tick.
*/
if (lbolt % SYSWAIT_BACKOFF_RATE != 0) {
return;
}
}
bzero(&tmp_syswait, sizeof(tmp_syswait));
for (i = 0; i < numnodes; i++) {
/* If the total count ends up being negative we must
* have passed the node that incremented it and we only
* caught the decrement. Will set it to zero later.
*/
tmp_syswait.iowait += SYSWAIT_NODE(i, iowait);
tmp_syswait.swap += SYSWAIT_NODE(i, swap);
tmp_syswait.physio += SYSWAIT_NODE(i, physio);
}
/* Fix negative counts where the increment occurred on a cpu
* after we went past it in the loop above.
*/
if (tmp_syswait.iowait < 0) {
tmp_syswait.iowait = 0;
}
if (tmp_syswait.swap < 0) {
tmp_syswait.swap = 0;
}
if (tmp_syswait.physio < 0) {
tmp_syswait.physio = 0;
}
/* Update the global syswait counts with the accumulated result */
if (tmp_syswait.iowait != syswait.iowait) {
syswait.iowait = tmp_syswait.iowait;
}
if (tmp_syswait.swap != syswait.swap) {
syswait.swap = tmp_syswait.swap;
}
if (tmp_syswait.physio != syswait.physio) {
syswait.physio = tmp_syswait.physio;
}
if (tmp_syswait.iowait || tmp_syswait.swap || tmp_syswait.physio) {
/* Somebody is waiting for I/O, reset backoff */
last_wait = lbolt;
}
}
#define RESET_SYSWAIT_PERIOD 30 /* seconds */
#define RESET_SYSWAIT() reset_syswait()
static void
reset_syswait(void)
{
int i, j;
struct syswait tmp_syswait;
static int reset_syswait_timeout = RESET_SYSWAIT_PERIOD;
/* Periodically reset everyone's counter to prevent overflows and save
* the current syswait state in ours.
*/
if (--reset_syswait_timeout > 0) {
return;
}
reset_syswait_timeout = RESET_SYSWAIT_PERIOD;
bzero(&tmp_syswait, sizeof(tmp_syswait));
for (i = 0; i < numnodes; i++) {
RESET_SYSWAIT_VAL(tmp_syswait, j, i, iowait);
RESET_SYSWAIT_VAL(tmp_syswait, j, i, physio);
RESET_SYSWAIT_VAL(tmp_syswait, j, i, swap);
}
atomicAddInt(&nodepda->syswait.iowait, tmp_syswait.iowait);
atomicAddInt(&nodepda->syswait.physio, tmp_syswait.physio);
atomicAddInt(&nodepda->syswait.swap, tmp_syswait.swap);
}
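/*
 * A tiny stand-alone illustration (plain user-space C, no atomic ops) of the
 * reset-by-subtraction idiom in RESET_SYSWAIT_VAL above: ~(x - 1) is the
 * two's-complement negation of x, so atomically adding it subtracts exactly
 * the value we sampled while preserving any increments that raced in between.
 */
#if 0
#include <assert.h>

int
main(void)
{
	int counter = 7;	/* stands in for a per-node syswait counter */
	int sampled = counter;	/* value folded into the global accumulator */

	counter += 3;			/* a racing increment after the sample */
	counter += ~(sampled - 1);	/* same effect as counter -= sampled */

	assert(~(sampled - 1) == -sampled);
	assert(counter == 3);		/* the racing increment survives */
	return 0;
}
#endif	/* illustrative sketch */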
#else
#define RECALC_SYSWAIT() \
{ \
if (private.p_flags & PDAF_CLOCK) { \
if (syswait.iowait || syswait.swap || syswait.physio) { \
SYSINFO.wioocc++; \
SYSINFO.wioque += (syswait.iowait + \
syswait.swap + syswait.physio); \
} \
} \
}
#define RESET_SYSWAIT()
#endif /* NUMA_BASE */
#ifdef NUMA_BASE
#define PDCOUNT_NODE(_node) NODEPDA(_node)->pdcount
/* Only the clock master re-calculates the global pdcount */
#define RECALC_PDCOUNT() \
{ \
if (private.p_flags & PDAF_CLOCK) { \
recalc_pdcount(); \
} \
}
/* Reset the node's pdcount field by taking the current value and removing it.
* see above comments for RESET_SYSWAIT_VAL
*/
#define RESET_PDCOUNT_VAL(_accum, _tmp, _node) \
{ \
_tmp = swap_int(&NODEPDA(_node)->pdcount, 0); \
_accum += _tmp; \
}
#define PDCOUNT_RECALC_RATE HZ /* once a second */
static void
recalc_pdcount(void)
{
int i;
int tmp_pdcount;
static int pdcount_recalc_rate=0;
/*
* Note that only routines such as sar or osview actually
* use pdcount. Therefore we only update it once a second.
*/
if (--pdcount_recalc_rate > 0)
return;
pdcount_recalc_rate = PDCOUNT_RECALC_RATE;
tmp_pdcount=0;
for (i = 0; i < numnodes; i++) {
tmp_pdcount += PDCOUNT_NODE(i);
}
if (tmp_pdcount < 0) {
tmp_pdcount = 0;
}
if(tmp_pdcount != pdcount) {
pdcount = tmp_pdcount;
}
if (tmp_pdcount && !pdflag) {
pdflag = 1;
}
}
#define RESET_PDCOUNT_PERIOD 30 /* seconds */
#define RESET_PDCOUNT() reset_pdcount()
static void
reset_pdcount(void)
{
int i, j;
int tmp_pdcount;
static int reset_pdcount_timeout = RESET_PDCOUNT_PERIOD;
/* Periodically reset everyone's counter to prevent overflows and save
* the current pdcount state in ours.
*/
if (--reset_pdcount_timeout > 0) {
return;
}
reset_pdcount_timeout = RESET_PDCOUNT_PERIOD;
tmp_pdcount = 0;
for (i = 0; i < numnodes; i++) {
RESET_PDCOUNT_VAL(tmp_pdcount, j, i);
}
atomicAddInt(&nodepda->pdcount, tmp_pdcount);
}
#else
#define RECALC_PDCOUNT()
#define RESET_PDCOUNT()
#endif /* NUMA_BASE */
/* ARGSUSED */
void
second_thread(void *arg)
{
extern sv_t runout;
int coalesced_kick_timeout = COALESCED_KICK_PERIOD;
for (;;) {
psema(&second_sema, PZERO);
/*
* we really should run on the clock_processor
* XXX race with sysmp()
*/
if (cpuid() != clock_processor)
(void)setmustrun(clock_processor);
(void) nfreeswap(&freeswap);
rqlen = 0;
nrun = 0;
/* update load average */
procscan(updateload, 0);
if (rqlen) {
SYSINFO.runque += rqlen;
SYSINFO.runocc++;
}
if ((time % AVENRUN_INTVL) == 0)
calcavenrun();
/*
* Periodically update the amount of global system
* free memory. Do this before waking coalesced -- it wants
* a reasonably accurate picture.
*/
GLOBAL_FREEMEM_UPDATE();
if (--vfssynccnt <= 0) {
extern int vfs_syncr;
vfssynccnt = vfs_syncr;
cvsema(&vfswakeup);
}
if (--coalesced_kick_timeout <= 0) {
coalesced_kick_timeout = COALESCED_KICK_PERIOD;
COALESCED_KICK();
}
RESET_SYSWAIT();
RESET_PDCOUNT();
/* wake up sched every second */
sv_signal(&runout);
onesec_maint();
}
}
void
init_second()
{
extern int onesec_pri;
initnsema(&second_sema, 1, "second_sema");
sthread_create("onesec", NULL, 4096, 0, onesec_pri, KT_PS,
second_thread, 0, 0, 0, 0);
}
int
clock(eframe_t *ep)
{
kthread_t *kt = curthreadp;
k_machreg_t ps = ep->ef_sr;
uthread_t *ut = kt && KT_ISUTHREAD(kt) ? KT_TO_UT(kt) : NULL;
struct proc *pp = ut ? UT_TO_PROC(ut) : NULL;
register int a;
pgcnt_t rss;
ackrtclock(); /* acknowledge the clock interrupt */
ASSERT(issplhi(getsr()));
ASSERT(private.p_switching == 0);
/*
* Blip the LED's if necessary.
*/
bump_leds();
tick_actions(); /* machine-dependent per-tick activities */
#ifdef CELL
hb_update_local_heart_beat(); /* Update local heart beat */
#endif
RECALC_SYSWAIT();
RECALC_PDCOUNT();
#ifdef ULI
/* we may have interrupted out of a ULI proc. This is only
* really a clock tick during a user process if curuli is
* also clear.
*/
if (USERMODE(ps) && ut && pp && !private.p_curuli)
#else
if (USERMODE(ps) && ut && pp)
#endif
{
ASSERT(pp); /* won't always be true! */
a = CPU_USER;
if (pp->p_profn) {
/*
* Set a flag so that user will later accumulate
* a pc tick for this clock tick. We can't just
* call addupc here because it may take a page
* fault and need to sleep. It's a sin to sleep
* in the clock interrupt handler.
*/
if (pp->p_flag & SPROF) {
/*
* Have to check the flag now since it could
* be profiling with the R10000 counters,
* in which case SPROF will not be set.
*/
ut_flagset(ut, UT_OWEUPC);
PCB(pcb_resched) = 1;
}
}
#if !NO_WIRED_SEGMENTS
#if FAST_LOCORE_TFAULT
/* In "fast locore tfault" mode, the utas_segflags are almost
* always set and do not indicate segment table mode.
* Instead, locore uses u_nexttlb to point into the wired
* range, and if out-of-bounds it indicates that we're in
* segment table mode and should do random 2nd level dropins.
* To start filling the wired entries with new entries, we
* simply need to reset u_nexttlb and we will start re-using
* the wired entries for new 2nd level dropins ... without
* needing to clear all of the maps. We MUST change the tlbpid
* to eliminate the non-wired 2nd level entries (unfortunately
* this will also effectively flush all 1st level entries).
*/
if ((ut->ut_exception->u_nexttlb >= NWIREDENTRIES-TLBWIREDBASE)
&& (++ut->ut_as.utas_tlbcnt >= tune.t_tlbdrop)) {
ut->ut_as.utas_tlbcnt = 0;
ut->ut_exception->u_nexttlb = 0;
new_tlbpid(&ut->ut_as, VM_TLBINVAL);
}
#else /* !FAST_LOCORE_TFAULT */
/*
* If process is running in segment-table mode,
* see if it will be well-behaved for a while
* and use only NWIRED tlb entries.
* NOTE: No need for this code if we don't have wired
* second level tlbs (like TFP).
*/
if (++ut->ut_as.utas_tlbcnt >= tune.t_tlbdrop) {
ut->ut_as.utas_tlbcnt = 0;
if (ut->ut_as.utas_segflags) {
setup_wired_tlb(1);
#ifdef R4000
new_tlbpid(&ut->ut_as, VM_TLBINVAL);
#endif
}
}
#endif /* !FAST_LOCORE_TFAULT */
#endif /* !NO_WIRED_SEGMENTS */
/*
* if we should, update the process
* virtual itimer and if it expired post the correct signal
* NOTE: this occurs for the normal clock tick (10ms)
*/
if (timerisset(&ut->ut_timer[UT_ITIMER_VIRTUAL].it_value) &&
itimerdecr(&ut->ut_timer[UT_ITIMER_VIRTUAL],
USEC_PER_TICK) == 0)
sigtouthread(ut, SIGVTALRM, (k_siginfo_t *)NULL);
} else if (kt == NULL) {
/* idling - see how */
a = CPU_IDLE;
if (sxbrkcnt)
a = CPU_SXBRK;
/*
* any time we're idle - account for wait I/O
* this overrides interest in SXBRK
*/
if (syswait.iowait) {
a = CPU_WAIT;
SYSINFO.wait[W_IO]++;
}
if (syswait.swap) {
a = CPU_WAIT;
SYSINFO.wait[W_SWAP]++;
}
if (syswait.physio) {
a = CPU_WAIT;
SYSINFO.wait[W_PIO]++;
}
} else {
if (private.p_gfx_waitc) {
a = CPU_WAIT;
SYSINFO.wait[W_GFXC]++;
} else if (private.p_gfx_waitf) {
a = CPU_WAIT;
if (ut)
ut->ut_acct.ua_graphfifo++;
SYSINFO.wait[W_GFXF]++;
} else {
if (KT_ISXTHREAD(kt))
a = CPU_INTR;
else
a = CPU_KERNEL;
if (ut)
ut->ut_prftime++;
}
}
SYSINFO.cpu[a]++;
/*
* This could be a user thread that is exiting, so it might
* not have a process attached.
*/
if (pp && pp->p_stat == SRUN) {
struct rlimit *rlp;
timespec_t utime, stime;
ASSERT(KT_ISUTHREAD(kt) && ut);
rlp = &pp->p_rlimit[RLIMIT_CPU];
/* XXX if this is a multi-threaded app, this
* is incorrect - we are looking only at this
* thread's u+s time, not the whole procs.
* But... to look at the whole proc would be
* expensive, and since resource limits are
* not part of any posix spec, and this
* will still perform the useful attribute
* of RLIMIT_CPU - to catch runaway processes.
*/
ktimer_read(UT_TO_KT(ut), AS_USR_RUN, &utime);
ktimer_read(UT_TO_KT(ut), AS_SYS_RUN, &stime);
if (rlp->rlim_cur != RLIM_INFINITY &&
utime.tv_sec + stime.tv_sec +1 > rlp->rlim_cur)
{
extern int cpulimit_gracetime;
extern void qprintf(char *f, ...);
if (cpulimit_gracetime == 0) {
/*
* old behaviour
*/
sigtouthread(ut, SIGXCPU, (k_siginfo_t *)NULL);
/*
* Don't give the signal every clock tick.
*/
if (rlp->rlim_cur < rlp->rlim_max)
rlp->rlim_cur += 5;
} else {
/*
* new behaviour: we send SIGXCPU once, so the
* process can checkpoint or do whatever is necessary, and
* after a grace time defined by the system tuneable
* cpulimit_gracetime we send SIGKILL.
* The reason for this new behaviour is that it
* was possible for a process to completely ignore
* SIGXCPU.
*/
if (pp->p_flag & SGRACE) {
sigtouthread(ut,SIGKILL,(k_siginfo_t *)NULL);
} else {
sigtouthread(ut, SIGXCPU, (k_siginfo_t *)NULL);
rlp->rlim_cur += cpulimit_gracetime;
pp->p_flag |= SGRACE;
}
}
}
/*
* Update the process profile itimer if it is
* set and if it expired post the SIGPROF signal.
* NOTE: this occurs for the normal clock tick (10ms).
*/
if (timerisset(&ut->ut_timer[UT_ITIMER_PROF].it_value) &&
itimerdecr(&ut->ut_timer[UT_ITIMER_PROF], USEC_PER_TICK)
== 0)
sigtouthread(ut, SIGPROF, (k_siginfo_t *)NULL);
/*
* since # utlbmisses only updated on context switch
* update here also
*/
ut->ut_acct.ua_ufaults += private.p_utlbmisses;
private.p_utlbmisses = 0;
/*
* If this is a pthreads process, and it has a prda,
* and the prda's resched counter is non-null,
* decrement the counter -- and if it goes to zero,
* send pthread reschedule signal.
* XXX Is this still needed?
*/
if (ut->ut_flags & UT_PTHREAD && ut->ut_prda) {
/*
* Don't need atomic operator -- the uthread
* might be in the middle of updating this,
* but it only does processor-atomic stores,
* never increments/decrements.
* The worst that can happen is that we miss
* an update, and catch it next clock tick.
*/
if ((a = ut->ut_prda->t_sys.t_resched) > 0) {
ut->ut_prda->t_sys.t_resched = --a;
if (a == 0) {
int s = ut_lock(ut);
sigaddset(&ut->ut_sig, SIGPTRESCHED);
ut_unlock(ut, s);
}
}
}
/*
* Once a second update in calcrss() is not good enough?
* Count 1K blocks, not pages.
* XXX If rss is negative, vhand is busy; ignore it.
*/
rss = getrss(ut);
if (rss > 0) {
if ((rss * (NBPC / 1024)) >
ut->ut_pproxy->prxy_ru.ru_maxrss)
ut->ut_pproxy->prxy_ru.ru_maxrss =
rss * (NBPC / 1024);
ut->ut_acct.ua_mem += rss;
}
#ifdef DEBUG
else if (rss < 0) {
cmn_err(CE_DEBUG,
"Strange ut_mem update: pid=%d ut_mem=%lx rss=%lx\n",
pp->p_pid, ut->ut_acct.ua_mem, rss);
}
#endif
}
if (ut) {
/*
* Time slice and preemption checks.
*/
tschkRunq(ut);
}
#if (MAXCPUS > 128)
/*
* On large systems, calling the wtree routines every clock tick
* on every cpu causes EXTREMELY hot cache lines in the wtree & job_t structures.
* In fact, the system hangs under some circumstances when access to
* the structures starts to take > 10ms. In this case, the next clock tick has
* already occurred when we exit clock() & we stop making forward progress.
*
* The fix (aka hack) is to call the cpu_sched_tick on every nth clock
* tick. We pick lbolt as the randomizer. This variable is monotonically
* increasing & incremented by 1 cpu once every clock tick (10ms).
*
* The code below does the following:
* - at clock tick 0, cpu 0-63 will call cpu_sched_tick
* - at clock tick 1, cpu 64-127 will call cpu_sched_tick
*
* The sched_tick_mask is set during boot and is a function of the number
* of cpus in the system. The mask is 0 for 0-63 cpus, 1 for 64-127 cpus, ...
*/
if ((lbolt & sched_tick_mask) == ((cpuid()>>6)&sched_tick_mask))
#endif
cpu_sched_tick(kt);
/*
* Numa memory management periodic ops
*/
MEM_TICK();
if (private.p_flags & PDAF_CLOCK) {
unsigned long ofreemem;
++lbolt;
tick_maint(); /* muck with one_sec */
/* "double" long arithmetic for minfo.freemem */
ofreemem = MINFO.freemem[0];
MINFO.freemem[0] += GLOBAL_FREEMEM();
if (MINFO.freemem[0] < ofreemem)
MINFO.freemem[1]++;
if (--vhandcnt <= 0) {
if (GLOBAL_FREEMEM() < vhandkicklim) {
cvsema(&vhandsema);
}
vhandcnt = 2*HZ;
} else if (rsswaitcnt) {
cvsema(&vhandsema);
vhandcnt = 2*HZ;
} else if (GLOBAL_FREEMEM() < tune.t_gpgslo) {
/*
* Push up vhand if memory is really low.
* We don't just wake up vhand here because
* we want to give runnable processes (who
* possibly are about to release their regions)
* a chance to run.
*/
if (vhandcnt > 5)
vhandcnt = 5;
}
ASSERT(loclkok(ep));
if (one_sec) {
one_sec = 0;
vsema(&second_sema);
#if DEBUG
if (valusema(&second_sema))
cmn_err(CE_WARN,"one second clock processing still pending after %d seconds\n", valusema(&second_sema)-1);
#endif
/*
* Update memory usage for the current running process
*/
if (ut)
calcrss(ut);
}
/*
* klog_need_action is set in icmn_err, indicating klogwakeup needs
* to be called. Note that this is done only on the PDAF_CLOCK cpu.
*/
if (klog_need_action) {
klog_need_action = 0;
klog_unlocked_wakeup();
}
}
/*
* If we are using the event counters and there are
* more events being tracked than there are counters,
* multiplex the events every tick. (R10000)
*/
MULTIPLEX_HWPERF_COUNTERS();
return(0);
}
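/*
 * A stand-alone sketch (plain user-space C, not compiled into the kernel) of
 * the sched_tick_mask staggering described in the comment inside clock()
 * above.  The mask values mirror the ones chosen in thread_timein_init():
 * each group of 64 cpus calls cpu_sched_tick() on its own share of the ticks.
 */
#if 0
#include <stdio.h>

static int
pick_mask(int numcpus)
{
	if (numcpus <= 64)
		return 0;	/* every cpu, every tick */
	if (numcpus <= 128)
		return 1;	/* two groups of 64 cpus */
	if (numcpus <= 256)
		return 3;	/* four groups of 64 cpus */
	return 7;		/* eight groups of 64 cpus */
}

static int
runs_sched_tick(long bolt, int cpu, int mask)
{
	return (bolt & mask) == ((cpu >> 6) & mask);
}

int
main(void)
{
	int mask = pick_mask(256);
	long tick;
	int cpu;

	for (tick = 0; tick < 4; tick++)
		for (cpu = 0; cpu < 256; cpu += 64)
			if (runs_sched_tick(tick, cpu, mask))
				printf("tick %ld: cpus %d-%d\n",
				    tick, cpu, cpu + 63);
	return 0;
}
#endif	/* illustrative sketch */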
/*
* Update system load average.
*/
/*ARGSUSED*/
static int
updateload(proc_t *pp, void *arg, int mode)
{
if (mode == 1 && pp->p_stat == SRUN && uscan_tryaccess(&pp->p_proxy)) {
register uthread_t *ut;
for (ut = prxy_to_thread(&pp->p_proxy); ut; ut = ut->ut_next) {
kthread_t *kt;
/*
* If the thread is asleep but not sxbrk, don't
* mark it as on-the-runq, but do mark it as
* runnable if it isn't doing a long-term wait
* and it isn't breakable (this implies just waiting
* for some kinda mutex?).
* We don't bother locking ut_lock, 'cause we're
* just generating statistics.
*/
kt = UT_TO_KT(ut);
if ((kt->k_flags & KT_SLEEP) &&
!(ut->ut_flags & UT_SXBRK)) {
if ((kt->k_flags & (KT_LTWAIT|KT_NWAKE))
== KT_NWAKE)
nrun++;
continue;
}
if (ut->ut_flags & UT_STOP)
continue;
if (! is_weightless(UT_TO_KT(ut)))
nrun++;
rqlen++;
}
uscan_unlock(&pp->p_proxy);
}
return(0);
}
static pgcnt_t
getrss(uthread_t *ut)
{
vasid_t vasid;
ppas_t *ppas;
pas_t *pas;
int as_lookup_pinned(uthread_t *, vasid_t *);
/*
* Hack - this is the only code that tries all this grot from interrupt
* level, and we hope to remove this RSN.
* So, we need to practice careful reference here to avoid problems
* when a process happens to be execing.
* We have a chance since we are in fact the running process,
* so we can take snapshots.
*/
if (AS_ISNULL(&ut->ut_asid))
return 0;
if (as_lookup_pinned(ut, &vasid))
return 0;
ppas = (ppas_t *)vasid.vas_pasid;
pas = VASID_TO_PAS(vasid);
return pas->pas_rss + ppas->ppas_rss;
}
static void
calcrss(uthread_t *ut)
{
register preg_t *prp;
register reg_t *rp;
int doingshd = 0;
vasid_t vasid;
pas_t *pas;
ppas_t *ppas;
pgcnt_t rss = 0;
int as_lookup_pinned(uthread_t *, vasid_t *);
/*
* Hack - this is the only code that tries all this grot from interrupt
* level, and we hope to remove this RSN.
* So, we need to practice careful reference here to avoid problems
* when a process happens to be execing.
* We have a chance since we are in fact the running process,
* so we can take snapshots.
*/
if (AS_ISNULL(&ut->ut_asid))
return;
if (as_lookup_pinned(ut, &vasid))
return;
if (VAS_TRYLOCK(vasid, AS_SHARED)) {
pas = VASID_TO_PAS(vasid);
ppas = (ppas_t *)vasid.vas_pasid;
prp = PREG_FIRST(ppas->ppas_pregions);
doshd:
while (prp) {
rp = prp->p_reg;
if (rp->r_flags & RG_PHYS) {
prp = PREG_NEXT(prp);
continue;
}
rss += prp->p_nvalid;
prp = PREG_NEXT(prp);
}
if (!doingshd) {
ppas->ppas_rss = rss;
rss = 0;
doingshd++;
prp = PREG_FIRST(pas->pas_pregions);
goto doshd;
}
pas->pas_rss = rss;
VAS_UNLOCK(vasid);
} else {
/* no charges?? */
;
}
}
/*
* Call the "vanilla" shake routine for each cpu's free zone.
*/
/* ARGSUSED */
static int
callout_shake(int level)
{
cpuid_t cpu;
int page_count = 0;
ASSERT(level == SHAKEMGR_MEMORY);
for (cpu=0; cpu < maxcpus; cpu++) {
zone_t *callout_free_zone;
if ((pdaindr[cpu].CpuId == -1) || !(pdaindr[cpu].pda->p_flags & PDAF_ENABLED))
continue;
callout_free_zone = CI_FREE_ZONE(&CALLTODO(cpu));
page_count += zone_shake(callout_free_zone);
}
return(page_count);
}
/*
* Per-CPU callout initialization.
*/
void
calloutinit_cpu(cpuid_t cpu)
{
extern int ncallout;
int nbytes, num_per_page, numpages;
zone_t *callout_free_zone;
cnodeid_t cnode;
callout_free_zone = kmem_zone_private(sizeof(struct callout), "callout");
CI_FREE_ZONE(&CALLTODO(cpu)) = callout_free_zone;
(void)kmem_zone_private_mode_noalloc(callout_free_zone);
(void)kmem_zone_enable_shake(callout_free_zone);
nbytes = kmem_zone_unitsize(callout_free_zone);
ASSERT(nbytes > 0);
num_per_page = ctob(1) / nbytes;
ASSERT(num_per_page > 0);
/*
* Determine how many pages per CPU we'd need to allocate so that
* the total number of pages allocated stores ncallout structures.
*/
numpages = (((ncallout + num_per_page -1) / num_per_page) + maxcpus-1) / maxcpus;
ASSERT(numpages > 0);
cnode = cputocnode(cpu);
kmem_zone_reserve_node(cnode, callout_free_zone, ctob(1)*numpages);
/* Prevent shake routine from reclaiming entire callout list. */
kmem_zone_minsize(callout_free_zone, numpages*num_per_page);
shake_register(SHAKEMGR_MEMORY, callout_shake);
}
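/*
 * A stand-alone sketch (plain user-space C, made-up sizes) of the reservation
 * arithmetic in calloutinit_cpu() above: a double ceiling division so that,
 * across all cpus, enough pages are reserved to hold ncallout structures.
 */
#if 0
#include <stdio.h>

#define CEIL_DIV(a, b)	(((a) + (b) - 1) / (b))

int
main(void)
{
	int pagesize = 16384;	/* assumed page size in bytes */
	int co_size = 96;	/* assumed sizeof(struct callout) */
	int ncallout = 5000;	/* assumed value of the ncallout tuneable */
	int maxcpus = 32;	/* assumed cpu count */
	int num_per_page = pagesize / co_size;
	int total_pages = CEIL_DIV(ncallout, num_per_page);
	int pages_per_cpu = CEIL_DIV(total_pages, maxcpus);

	printf("%d callouts/page, %d pages total, %d page(s) per cpu\n",
	    num_per_page, total_pages, pages_per_cpu);
	return 0;
}
#endif	/* illustrative sketch */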
/*
* Called once to perform global initialization required for timeout processing.
*/
void
calloutinit()
{
#if !CLOCK_CTIME_IS_ABSOLUTE
calltodo = (callout_info_t *)kmem_zalloc(sizeof (*calltodo) * maxcpus,
VM_DIRECT|KM_SLEEP);
fastcatodo = (callout_info_t *)kmem_zalloc(sizeof (*fastcatodo),
VM_DIRECT|KM_SLEEP);
spinlock_init(&CI_LISTLOCK(fastcatodo), "fastcatodo");
#endif
calloutinit_cpu(master_procid);
spinlock_init(&migrated_timeout_lock, "migr_to");
}
/*
* Allocate a callout structure.
*
* Try to get a structure from the target cpu's zone.
* If this fails, try to allocate more memory from the
* target CPU's node and add it to the target's zone.
* If this fails, try all the other cpus' zones.
* If this fails, try to allocate any page to the target's zone.
* If this fails, we're probably in trouble; return NULL
* and let the caller deal with it.
*
* We raise to splprof before calling into any routine that needs
* to grab a spinlock, because callout_alloc may be called from
* an interrupt routine that interrupts at splprof, and we need
* to avoid double-tripping on whatever locks are used. We have
* elected to repeatedly raise and lower IPL level as we try the
* various alternatives rather than just holding it at splprof
* for the duration of this function. That way we won't end up
* holding off interrupts for too long (we especially want to avoid
* holdoffs that increase with the number of CPUs). It's fairly
* unusual to get all the way through the various cases, anyway;
* most of the time, the initial attempt will succeed and we will
* have done a single splprof/splx pair.
*/
static struct callout *
callout_alloc(cpuid_t targ)
{
struct callout *co;
zone_t *callout_free_zone;
cpuid_t cpu, my_cpuid;
int s;
/* Try to get a free structure from the target cpu's zone. */
callout_free_zone = CI_FREE_ZONE(&CALLTODO(targ));
again:
s = splprof();
co = kmem_zone_alloc(callout_free_zone, VM_NOSLEEP);
splx(s);
if (!co) {
/* Try to add another page to the empty zone */
void *ptr;
s = splprof();
ptr = kvpalloc_node(cputocnode(targ), 1, VM_NOSLEEP, 0);
splx(s);
if (ptr) {
kmem_zone_fill(callout_free_zone, ptr, ctob(1));
goto again;
}
} else {
co->c_ownercpu = targ;
return(co);
}
/*
* If we get here, it means that the target CPU's callout zone
* is empty and we were unable to allocate more memory to fill
* the zone.
*
* Try other cpus' lists.
*/
my_cpuid = cpuid();
for (cpu=my_cpuid+1; cpu < maxcpus; cpu++) {
if ((pdaindr[cpu].CpuId == -1) || !(pdaindr[cpu].pda->p_flags & PDAF_ENABLED))
continue;
callout_free_zone = CI_FREE_ZONE(&CALLTODO(cpu));
s = splprof();
co = kmem_zone_alloc(callout_free_zone, VM_NOSLEEP);
splx(s);
if (co) {
co->c_ownercpu = cpu;
return(co);
}
}
for (cpu=0; cpu < my_cpuid; cpu++) {
if ((pdaindr[cpu].CpuId == -1) || !(pdaindr[cpu].pda->p_flags & PDAF_ENABLED))
continue;
callout_free_zone = CI_FREE_ZONE(&CALLTODO(cpu));
s = splprof();
co = kmem_zone_alloc(callout_free_zone, VM_NOSLEEP);
splx(s);
if (co) {
co->c_ownercpu = cpu;
return(co);
}
}
/*
* Try a kvpalloc without specifying node.
* We'd really prefer not to do this, since the callout structures
* from another node will be stuck on this node indefinitely. Still,
* it's preferable to panicking.
*/
{
void *ptr;
s = splprof();
ptr = kvpalloc(1, VM_NOSLEEP, 0);
splx(s);
if (ptr) {
kmem_zone_fill(callout_free_zone, ptr, ctob(1));
goto again;
}
}
cmn_err_tag(317,CE_WARN, "Out of callouts cpu%d-->cpu%d\n", cpuid(), targ);
return(NULL);
}
/*
* Free the specified callout structure.
*/
void
callout_free(struct callout *co)
{
cpuid_t owner_cpuid = co->c_ownercpu;
zone_t *callout_free_zone = CI_FREE_ZONE(&CALLTODO(owner_cpuid));
int s;
ASSERT(owner_cpuid != CPU_NONE);
/* Put it back in the zone for the CPU that owns it. */
s = splprof();
kmem_zone_free(callout_free_zone, co);
splx(s);
}
/*
* timeout is called to arrange that fun(arg) is called in tim/HZ seconds.
* An entry is sorted into the callout structure.
* The time in each structure entry is the number of HZ's more
* than the previous entry. In this way, decrementing the
* first entry has the effect of updating all entries.
*
* The panic is there because there is nothing
* intelligent to be done if an entry won't fit.
*
* timeout now queues the function to be invoked on the same cpu from
* which timeout was called.
*/
toid_t
timeout(void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(cpuid(), fun, arg, tim, callout_get_pri(), C_NORM, ap);
va_end(ap);
if (retval == NULL)
/*
* The kernel was unable to allocate space for
* a timeout request. Since timeout is a compatibility
* interface, the kernel cannot return
* an error and allow processing to be handled
* by the caller.
*/
cmn_err(CE_PANIC,
"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
toid_t
timeout_pri(void (*fun)(), void *arg, long tim, int pri, ...)
{
toid_t retval;
va_list ap;
/* timeout priorities should never equal 0 */
ASSERT((pri > 0) && (pri <= 255));
va_start(ap, pri);
retval = dotimeout(cpuid(), fun, arg, tim, pri, C_NORM, ap);
va_end(ap);
return(retval);
}
/* This routine is identical to timeout() but causes timein processing
* to occur on the timein interrupt stack.
*
* NOTE: Should ONLY be used on short duration routines which are simply
* awakening another thread or incrementing a counter.
*/
toid_t
timeout_nothrd(void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(cpuid(), fun, arg, tim, 0, C_NORM_ISTK, ap);
va_end(ap);
if (retval == NULL)
/*
* The kernel was unable to allocate space for
* a timeout request. Since timeout is a compatibility
* interface, the kernel cannot return
* an error and allow processing to be handled
* by the caller.
*/
cmn_err(CE_PANIC,
"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
/*
* prtimeout - queue a timeout on specified processor
* If a processor with timeouts is isolated or restricted then the
* timeouts will migrate to the clock processor.
*/
toid_t
prtimeout(processorid_t prid, void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(prid, fun, arg, tim, callout_get_pri(), C_NORM, ap);
va_end(ap);
if (retval == NULL)
/*
* The kernel was unable to allocate space for
* a timeout request. Since timeout is a compatibility
* interface, the kernel cannot return
* an error and allow processing to be handled
* by the caller.
*/
cmn_err(CE_PANIC,
"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
#if RTINT_WAR
/* This routine is identical to prtimeout() but causes timein processing
* to occur on the timein interrupt stack.
*
* NOTE: Should ONLY be used on short duration routines which are simply
* awakening another thread or incrementing a counter.
*/
toid_t
prtimeout_nothrd(processorid_t prid, void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(prid, fun, arg, tim, 0, C_NORM_ISTK, ap);
va_end(ap);
if (retval == NULL)
/*
* The kernel was unable to allocate space for
* a timeout request. Since timeout is a compatibility
* interface, the kernel cannot return
* an error and allow processing to be handled
* by the caller.
*/
cmn_err(CE_PANIC,
"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
#endif
/*
* fast_prtimeout - like prtimeout, but use the fast timeouts
*/
extern int fastclock;
toid_t
fast_prtimeout(processorid_t targcpu, void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
if (!fastclock)
enable_fastclock();
va_start(ap, tim);
retval = dotimeout(targcpu, fun, arg, tim, callout_get_pri(), C_FAST, ap);
va_end(ap);
if (retval == NULL)
cmn_err(CE_PANIC,"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
/*
* timeout routines for DDI/DKI compliant drivers.
*/
extern pl_t plbase;
extern pl_t pltimeout;
extern pl_t pldisk;
extern pl_t plstr;
extern pl_t plhi;
/* ARGSUSED */
toid_t
dtimeout(void (*fun)(), void *arg, long tim, pl_t pl, processorid_t prid)
{
toid_t retval;
va_list ap;
va_start(ap, prid);
retval = dotimeout(prid, fun, arg, tim, callout_get_pri(), C_NORM, ap);
va_end(ap);
return(retval);
}
toid_t
itimeout(void (*fun)(), void *arg, long tim, pl_t pl, ...)
{
toid_t retval;
va_list ap;
va_start(ap, pl);
retval = dotimeout(cpuid(), fun, arg, tim, callout_get_pri(), C_NORM, ap);
va_end(ap);
return(retval);
}
toid_t
itimeout_nothrd(void (*fun)(), void *arg, long tim, pl_t pl, ...)
{
toid_t retval;
va_list ap;
va_start(ap, pl);
retval = dotimeout(cpuid(), fun, arg, tim, 0, C_NORM_ISTK,
ap);
va_end(ap);
return(retval);
}
toid_t
fast_itimeout(void (*fun)(), void *arg, long tim, pl_t pl, ...)
{
toid_t retval;
va_list ap;
if (!fastclock)
enable_fastclock();
va_start(ap, pl);
retval = dotimeout(cpuid(), fun, arg, tim, callout_get_pri(), C_FAST, ap);
va_end(ap);
return(retval);
}
toid_t
fast_itimeout_nothrd(void (*fun)(), void *arg, long tim, pl_t pl, ...)
{
toid_t retval;
va_list ap;
if (!fastclock)
enable_fastclock();
va_start(ap, pl);
retval = dotimeout(cpuid(), fun, arg, tim, 0, C_FAST_ISTK,
ap);
va_end(ap);
return(retval);
}
#ifdef CLOCK_CTIME_IS_ABSOLUTE
/*
* clock_prtimeout - like prtimeout, but use the absolute timeouts
* This function is only useful or used on systems that call
* set_timer_intr with an absolute time rather than a number
* of ticks.
*/
toid_t
clock_prtimeout(processorid_t targcpu, void (*fun)(), void *arg, __int64_t tim, int pri, ...)
{
toid_t retval;
va_list ap;
va_start(ap, pri);
retval = dotimeout(targcpu, fun, arg, tim, pri, C_CLOCK, ap);
va_end(ap);
if (retval == NULL)
cmn_err(CE_PANIC,"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
toid_t
clock_prtimeout_nothrd(processorid_t targcpu, void (*fun)(), void *arg,
__int64_t tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(targcpu, fun, arg, tim, 0, C_CLOCK_ISTK, ap);
va_end(ap);
if (retval == NULL)
cmn_err_tag(138,CE_PANIC,"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
#endif /* CLOCK_CTIME_IS_ABSOLUTE */
/*
** list is used to select which callout list to queue on
** list == C_FAST then queue em on the fast callout list
** list == C_NORM then queue em on processor 'targcpu' callout list
** list == C_CLOCK then queue em on the fast callout list using an
** absolute clock cycle count to cmp against. [EVEREST]
** Per-CPU lists each have an associated "list lock".
**
** if tim == TIMEPOKE_NOW then timepoke() is called immediately
*/
toid_t
dotimeout(
register processorid_t targcpu,
void (*fun)(),
void *arg,
__int64_t tim,
int pri,
long list,
va_list ap)
{
register struct callout *p1, *p2, *pnew, *phead;
register toid_t id;
void *arg1, *arg2, *arg3;
int s;
__int64_t tmp_tim;
ASSERT(targcpu >= 0 && targcpu < maxcpus);
arg1 = va_arg(ap, void *);
arg2 = va_arg(ap, void *);
arg3 = va_arg(ap, void *);
/*
* Frame Scheduler
*/
if (pdaindr[targcpu].pda->p_frs_flags) {
targcpu = clock_processor;
}
#ifdef ISOLATE_DEBUG
{
extern int isolate_drop;
if (pdaindr[targcpu].pda->p_flags & PDAF_ISOLATED) {
cmn_err(CE_WARN,
"Isolated processor %d executes dotimeout\n",
pdaindr[targcpu].pda->p_cpuid);
if (isolate_drop)
debug((char *)0);
}
}
#endif
pnew = callout_alloc(targcpu);
if (!pnew)
return(0);
pnew->c_id = 0;
pnew->c_flags = 0;
pnew->c_func = fun;
pnew->c_arg = arg;
pnew->c_arg1 = arg1;
pnew->c_arg2 = arg2;
pnew->c_arg3 = arg3;
pnew->c_pl = pri;
pnew->c_cpuid = targcpu;
pnew->c_time = tim;
/* Grab the per-list lock in order to enqueue the new callout request */
s = mutex_spinlock_spl(&CI_LISTLOCK(&CALLTODO(targcpu)), splprof);
/*
* Find the head of the queue to insert the
* callout, if needed adjust the c_time field
* to have the correct units for the hardware
*/
phead = timeout_get_queuehead(list, pnew);
/*
* Set any needed flags info.
*/
if (list == C_NORM_ISTK || list == C_FAST_ISTK || list == C_CLOCK_ISTK)
pnew->c_flags |= C_FLAG_ISTK;
else
pnew->c_flags |= C_FLAG_ITHRD;
ASSERT(phead);
tmp_tim = pnew->c_time;
id = pnew->c_id; /* return all bits */
/*
* Insert pnew into correct position in callout list.
*/
for (p1 = phead ; p2 = p1->c_next ; p1 = p2) {
if (p2->c_time > tmp_tim)
break;
#if !defined(CLOCK_CTIME_IS_ABSOLUTE)
/* Make times relative to prev callback */
tmp_tim -= p2->c_time;
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
}
p1->c_next = pnew;
pnew->c_next = p2;
#if !defined(CLOCK_CTIME_IS_ABSOLUTE)
pnew->c_time = tmp_tim;
/* Nothing to do for CLOCK_CTIME_IS_ABSOLUTE, times are absolute*/
if (p2)
p2->c_time -= tmp_tim;
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
/*
* If we've just put something at the head of the queue,
* ensure that a timer will go off at the right moment.
*/
if (phead == p1)
set_timer_intr(targcpu, tmp_tim, list);
mutex_spinunlock(&CI_LISTLOCK(&CALLTODO(targcpu)), s);
return(id);
}
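/*
 * A stand-alone model (plain user-space C, no locking, error checks omitted)
 * of the relative-time callout queue that dotimeout() maintains when
 * CLOCK_CTIME_IS_ABSOLUTE is not defined: each entry stores only the ticks
 * beyond its predecessor, so a clock tick only has to decrement the head.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

struct ent {
	struct ent	*next;
	long		delta;		/* ticks after the previous entry */
	int		id;
};

static struct ent head;			/* dummy list head */

static void
enqueue(int id, long ticks)
{
	struct ent *p1, *p2;
	struct ent *e = malloc(sizeof(*e));

	e->id = id;
	/* walk the list, converting 'ticks' into a delta as we go */
	for (p1 = &head; (p2 = p1->next) != NULL; p1 = p2) {
		if (p2->delta > ticks)
			break;
		ticks -= p2->delta;
	}
	e->delta = ticks;
	e->next = p2;
	p1->next = e;
	if (p2)
		p2->delta -= ticks;	/* keep the successor's delta right */
}

int
main(void)
{
	struct ent *e;

	enqueue(1, 30);
	enqueue(2, 10);
	enqueue(3, 25);
	for (e = head.next; e; e = e->next)
		printf("id %d: +%ld ticks\n", e->id, e->delta);
	/* prints: id 2: +10, id 3: +15, id 1: +5 */
	return 0;
}
#endif	/* illustrative sketch */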
/*
* Return a thread priority for a callout thread. Used whenever
* we are not given an explicit priority.
*/
int
callout_get_pri(void)
{
extern int default_timeout_pri;
if (private.p_kstackflag <= PDA_CURKERSTK) {
kthread_t *kt = curthreadp;
int pri = kt->k_basepri;
if (kt->k_copri) {
pri = kt->k_copri;
ASSERT(pri != 255);
} else if (pri < 0)
pri = default_timeout_pri;
else if (KT_ISUTHREAD(kt) && (pri < 255))
pri++;
ASSERT((pri >= 0) && (pri <= 255));
return pri;
}
return default_timeout_pri;
}
/*
* untimeout_body
*
* This routine attempts to find the specified timeout id and
* disable the entry if possible by checking the todo list, the
* pending list, timein thread list, and the migrated list.
*/
static int untimeout_migrated(toid_t, int);
static int
untimeout_body(toid_t id, int wait)
{
register struct callout *p1, *p2, *p3;
ci_itinfo_t *citp;
callout_info_t *cip;
register int s;
union c_tid c_tid; /* pick this name to use macro in callo.h */
__int64_t totaltime = 0;
int rc = 0, found = 0;
extern struct strintr *strintrrsrv;
/* Races here are benign */
if (strintrrsrv != NULL)
streams_untimeout(id);
/* determine which callout to search */
c_id = id;
cip = c_fast == 1 ? fastcatodo : &CALLTODO(c_cpuid);
/* Search todo list */
p1 = CI_TODO(cip);
s = mutex_spinlock_spl(&CI_LISTLOCK(cip), splprof);
for ( ; (p2 = p1->c_next) != 0; p1 = p2) {
#ifndef CLOCK_CTIME_IS_ABSOLUTE
totaltime += p2->c_time;
#endif /* CLOCK_CTIME_IS_ABSOLUTE */
if (p2->c_id == c_id) {
found = 1;
p3 = p2->c_next;
#ifdef CLOCK_CTIME_IS_ABSOLUTE
totaltime = p2->c_time;
#else
if (p3) /* carry overflow or delta */
p3->c_time += p2->c_time;
#endif /* CLOCK_CTIME_IS_ABSOLUTE */
p1->c_next = p3;
rc = callout_time_to_hz(totaltime, c_cpuid, c_fast);
break;
}
}
if (!found) {
/* Search pending callout list */
p1 = CI_PENDING(cip);
for ( ; (p2 = p1->c_next) != NULL; p1 = p2) {
if (p2->c_id == c_id) {
p1->c_next = p2->c_next;
rc = found = 1;
break;
}
}
}
if (found) {
mutex_spinunlock(&CI_LISTLOCK(cip), s);
callout_free(p2);
return(rc);
}
/*
* If we're in wait mode, then we need to search the
* list of xthread infos to see if any timein threads are
* executing the one we're looking for. If we find it,
* and we're not the thread executing the timeout (i.e.
* the timeout canceling itself -- see ec_recover()),
* then we go to sleep on the sync var.
*/
ASSERT(!found);
if (wait) {
int x;
citp = cip->ci_ithrdinfo;
for (x=0; x < cip->ci_ithrd_cnt; x++, citp++) {
if ((citp->cit_toid == c_id) && (citp->cit_ithread !=
(struct xthread *)private.p_curkthread)) {
citp->cit_flags |= CIT_WAITING;
sv_wait(&citp->cit_sync, PZERO, &CI_LISTLOCK(cip), s);
found = 1;
break;
}
}
}
if (!found) {
mutex_spinunlock(&CI_LISTLOCK(cip), s);
rc = untimeout_migrated(id, wait);
}
return(rc);
}
int
untimeout_wait(toid_t id)
{
ASSERT(id);
return (untimeout_body(id, 1));
}
int
untimeout(toid_t id)
{
/*
* If the id is 0 then there is no work to do.
*/
if (id == 0)
return(0);
return (untimeout_body(id, 0));
}
/*
* given an ID, return true(1) if still in timeout queue
* For the result to mean much, these functions should be called at spl > 1
*/
int
chktimeout(int id)
{
register struct callout *p1, *p2;
register int s, rv = 0;
union c_tid c_tid; /* pick this name to use macros in callo.h */
callout_info_t *callout_info;
c_id = id;
callout_info = &CALLTODO(c_cpuid);
s = mutex_spinlock_spl(&CI_LISTLOCK(callout_info), splprof);
for (p1 = CI_TODO(callout_info);
(p2 = p1->c_next) != 0; p1 = p2) {
if (p2->c_cid == c_cid) {
rv = 1;
break;
}
}
mutex_spinunlock(&CI_LISTLOCK(callout_info), s);
return(rv);
}
__int64_t
do_chktimeout_tick(callout_info_t *callout_info, toid_t id, void (*fun)(), void *arg)
{
register struct callout *phead, *p1, *p2;
__int64_t rv = 0;
union c_tid c_tid; /* pick this name to use macro in callo.h */
int s;
/*
* When passed nothing return nothing
*/
if ((id == 0) && (fun == NULL))
return (0);
c_id = id;
phead = CI_TODO(callout_info);
s = mutex_spinlock_spl(&CI_LISTLOCK(callout_info), splprof);
for ( p1 = phead; (p2 = p1->c_next) != 0; p1 = p2) {
#ifndef CLOCK_CTIME_IS_ABSOLUTE
/* accumulating ticks before it */
rv += p2->c_time;
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
if ((p2->c_id == c_id) ||
(p2->c_func == fun && p2->c_arg == arg)){
break;
}
}
mutex_spinunlock(&CI_LISTLOCK(callout_info), s);
if (p2 == 0) { /* did not find it */
if (id) {
c_id = find_migrated_timeout(id);
if(c_id)
return(do_chktimeout_tick(callout_info,c_id, NULL, NULL));
}
return(0);
}
#ifdef CLOCK_CTIME_IS_ABSOLUTE
rv = p2->c_time;
#endif /* CLOCK_CTIME_IS_ABSOLUTE */
if (rv <= 0) /* found it, but was negative or zero */
return(1); /* return a small positive value */
else
return(rv); /* found it, return accurate value */
}
/*
* Process a timeout entry and release it back to the free list.
*/
void
timein_entry(struct callout *timeout_entry)
{
register void *arg, *arg1, *arg2, *arg3;
register void (*func)(void *, void *, void *, void *);
#ifdef ISOLATE_DEBUG
pda_t *npda;
extern int isolate_debug, isolate_drop;
extern int wsyncv();
npda = pdaindr[cpuid()].pda;
if (npda->p_flags & PDAF_ISOLATED &&
isolate_debug &&
timeout_entry->c_func != (void (*)())wsyncv) {
cmn_err(CE_WARN,
"Isolated proc %d about to run timeout 0x%x\n",
npda->p_cpuid,timeout_entry->c_func );
if (isolate_drop) debug((char *) 0);
}
#endif /* ISOLATE_DEBUG */
arg = timeout_entry->c_arg;
arg1 = timeout_entry->c_arg1;
arg2 = timeout_entry->c_arg2;
arg3 = timeout_entry->c_arg3;
func = (void (*)(void *, void *, void *, void *))
timeout_entry->c_func;
callout_free(timeout_entry);
(*func)(arg, arg1, arg2, arg3); /* call the function */
}
/* ARGSUSED */
void
timein_entry_icvsema(ci_itinfo_t *citp, struct callout *to)
{
void timein_body_ithrd(ci_itinfo_t *);
void thread_timein(void *);
kthread_t *kt = curthreadp;
timein_body_ithrd(citp);
/* Reset func, args and pri, in case thread is next started
* by vsema (rather than icvsema). If it is again started
* by icvsema, these will be overridden.
*/
xthread_set_func(KT_TO_XT(kt), (xt_func_t *)thread_timein, citp);
ipsema(CIT_GET_SEMA(citp));
}
/*
* timein_entry_ithrd
*
* This routine links the specified callout entry to the tail
* of the xthread callout list and then bumps the counting
* semaphore.
*/
void
timein_entry_ithrd(callout_info_t *cip, struct callout *to, int s)
{
struct callout *p1;
ci_itinfo_t *citp;
kthread_t *ktscan;
kthread_t *kt;
xthread_t *xt;
int hipri;
int x;
xt = icvsema(CI_SEMA(cip), to->c_pl,
(xt_func_t *) timein_entry_icvsema,
NULL, to);
if (xt) {
/*
* Register untimeout information. Note that the newly
* runnable interrupt thread can't run yet because it's
* mustrun on the current CPU and we hold a spinlock.
* This prevents a race between the initialization here
* and the thread trying to read the information ...
*/
ASSERT_MP(XT_TO_KT(xt)->k_mustrun == cpuid());
citp = (ci_itinfo_t*) xt->xt_arg;
citp->cit_to = to;
citp->cit_toid = to->c_id;
mutex_spinunlock(&CI_LISTLOCK(cip), s);
return;
}
/*
* Normally, the icvsema above will succeed, and the timein thread
* will be started directly. However, when we get callouts faster
* than the timein threads can handle them, we fall back to having
* to queue the callout and wait for the next available timein thread.
*/
kt = NULL;
hipri = 0;
/*
* Before we queue this callout, we need to check if it is
* more important than the callouts currently being handled.
* If it is, then we need to boost the priority of one of the
* running timein threads to prevent priority inversion.
*/
citp = cip->ci_ithrdinfo;
for (x=0; x < cip->ci_ithrd_cnt; x++, citp++) {
ktscan = XT_TO_KT(citp->cit_ithread);
if (ktscan->k_basepri >= to->c_pl) {
kt = NULL;
break;
}
if (ktscan->k_basepri > hipri) {
hipri = ktscan->k_basepri;
kt = ktscan;
}
}
if (kt) {
/*
* Boost priority
*/
int dequeued = 0;
retry:
kt_nested_lock(kt);
if (kt->k_onrq != CPU_NONE) {
if (removerunq(kt))
dequeued++;
else {
kt_nested_unlock(kt);
goto retry;
}
}
if (kt->k_copri == 0)
kt->k_copri = kt->k_basepri;
kt->k_basepri = to->c_pl;
if (kt->k_pri < to->c_pl)
kt->k_pri = to->c_pl;
if (dequeued)
putrunq(kt, CPU_NONE);
kt_nested_unlock(kt);
}
/*
* Keep pending callout queue sorted by priority
*/
if (p1 = CI_PENDING_NEXT(cip)) {
struct callout *prev = CI_PENDING(cip);
while (p1->c_pl >= to->c_pl) {
if (p1->c_next == NULL) {
p1->c_next = to;
to->c_next = NULL;
goto done;
}
prev = p1;
p1 = p1->c_next;
}
to->c_next = prev->c_next;
prev->c_next = to;
} else {
CI_PENDING_NEXT(cip) = to;
to->c_next = NULL;
}
done:
mutex_spinunlock(&CI_LISTLOCK(cip), s);
vsema(CI_SEMA(cip));
}
/*
* timein_body_ithrd()
*
* This routine scans the list of xthread callouts and executes them
* at the proper spl/priority value (note that any entries on this
* list are by definition expired). We also handle untimeout()
* synchronization here.
*/
void
timein_body_ithrd(ci_itinfo_t *citp)
{
callout_info_t *cip = citp->cit_calloinfo;
register struct callout *list, *p1, *p2;
register int s;
kthread_t *kt = curthreadp;
ASSERT(cip);
ASSERT(cip->ci_flags & CA_ENABLED);
/*
* If cit_to is set, which will be the usual case,
* our timein thread has been started by icvsema directly,
* and we're running at the correct priority. Otherwise,
* we need to find the next timeout on the PENDING list.
*/
if ((p1 = citp->cit_to) == NULL) {
list = CI_PENDING(cip);
s = mutex_spinlock_spl(&CI_LISTLOCK(cip), splprof);
kt->k_copri = 0;
p1 = list->c_next;
if (p1 == 0) { /* we're done */
mutex_spinunlock(&CI_LISTLOCK(cip), s);
return;
}
p2 = p1->c_next; /* advance to next item */
list->c_next = p2;
if (p1->c_next == NULL)
CI_PENDING_NEXT(cip) = NULL;
/* register untimeout information */
citp->cit_toid = p1->c_id;
citp->cit_to = p1;
/*
* This is rare, but in the event our priority was boosted
* by a high priority callout, we must make sure we lower it
* after the urgent callout was handled.
*/
if (kt->k_basepri > p1->c_pl) {
kt_nested_lock(kt);
kt_initialize_pri(kt, p1->c_pl);
reset_pri(kt);
kt_nested_unlock(kt);
}
ASSERT(kt->k_basepri <= p1->c_pl);
ASSERT(kt->k_pri <= p1->c_pl);
mutex_spinunlock(&CI_LISTLOCK(cip), s);
if (private.p_runrun == 1)
qswtch(RESCHED_Y);
/* Create another thread since we're overloaded */
if (cip->ci_ithrd_cnt < CA_ITHRDS_PER_LIST)
callout_itentry_init(cip, cpuid(), "timein");
}
ASSERT(citp->cit_toid == p1->c_id);
ASSERT(citp->cit_to == p1);
timein_entry(p1);
/*
* Check to see if someone is waiting on the
* timeout we just fired. If so, clear the wait
* state and wake em up.
*/
s = mutex_spinlock_spl(&CI_LISTLOCK(cip), splprof);
if (citp->cit_flags & CIT_WAITING) {
citp->cit_flags &= ~CIT_WAITING;
sv_broadcast(&citp->cit_sync);
}
citp->cit_toid = 0;
citp->cit_to = NULL;
kt->k_copri = 0;
mutex_spinunlock(&CI_LISTLOCK(cip), s);
}
/*
* This function doesn't look like a loop, but it is.
* The ipsema call always comes out calling the function set up in the xthread.
*/
void
thread_timein(void *arg)
{
timein_body_ithrd(arg);
ipsema(CIT_GET_SEMA((ci_itinfo_t *)arg));
}
/*
* Do one-time thread setup. Won't get called more than once per thread
* since the ipsema() will come out in the function set up in the
* xthread_set_func() call.
*/
void
thread_timein_start(void *arg)
{
#if MP
int mustruncpu = CIT_TO_CPU((ci_itinfo_t *)arg);
#endif
/* NO-OP on UP */
(void) setmustrun(mustruncpu);
xthread_set_func(KT_TO_XT(curthreadp), (xt_func_t *)thread_timein, arg);
ipsema(CIT_GET_SEMA((ci_itinfo_t *)arg));
/* NOTREACHED */
}
/*
* callout_itentry_init
*
* Initialize one interrupt-thread entry for a callout info structure.
*/
static void
callout_itentry_init(callout_info_t *cip, int targcpu, char *name)
{
char threadname[20];
ci_itinfo_t *citp;
extern int default_timeout_pri;
/* fill in ithrd info */
if (atomicSetInt(&cip->ci_flags,CA_ITHRD_CREATING) & CA_ITHRD_CREATING)
return;
if (cip->ci_ithrd_cnt >= CA_ITHRDS_PER_LIST)
return;
citp = &cip->ci_ithrdinfo[cip->ci_ithrd_cnt];
init_sv(&citp->cit_sync, SV_DEFAULT, name, targcpu);
citp->cit_calloinfo = cip;
sprintf(threadname, "%s%d", name, targcpu);
citp->cit_ithread = xthread_create(threadname, 0,
KTHREAD_DEF_STACKSZ, 0,
default_timeout_pri, KT_PS,
(xt_func_t *)thread_timein_start,
(void *)(__psint_t)citp);
/*
* Don't increment new thread cursor till we're done initializing
* all associated state -- like cit_ithread. This avoids a race
* with timein_entry_ithrd() which scans the table up to the cursor.
*/
__synchronize();
cip->ci_ithrd_cnt++;
atomicClearInt(&cip->ci_flags, CA_ITHRD_CREATING);
}
static void
callout_info_init(callout_info_t *cip, int targcpu, char *name)
{
ci_itinfo_t *citp;
init_sema(&cip->ci_sema, 0, name, targcpu);
cip->ci_flags |= ((targcpu << CA_CPU_SHIFT) & CA_CPU_MASK);
/* allocate xthread info blocks */
citp = (ci_itinfo_t *)kmem_zalloc(sizeof (ci_itinfo_t) *
CA_ITHRDS_PER_LIST, KM_SLEEP);
/* now fill in ithrd info */
cip->ci_ithrdinfo = citp;
callout_itentry_init(cip, targcpu, name);
cip->ci_flags |= CA_ENABLED;
spinlock_init(&cip->ci_listlock, "cilock");
}
/*
* thread_timein_init
* This routine sets up each allocated callout list for
* xthread handling.
*/
void
thread_timein_init(void)
{
int i;
for (i = 0; i < maxcpus; i++) {
/* don't bother for CPUs that don't exist */
if (!cpu_enabled(i))
continue;
callout_info_init(&CALLTODO(i), i, "timein");
}
#if !defined(CLOCK_CTIME_IS_ABSOLUTE)
/* setup fastclock handling */
callout_info_init(fastcatodo, fastclock_processor, "ftimein");
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
#if (MAXCPUS > 128)
if (numcpus <= 64)
sched_tick_mask = 0;
else if (numcpus <= 128)
sched_tick_mask = 1;
else if (numcpus <= 256)
sched_tick_mask = 3;
else
sched_tick_mask = 7;
#endif
}
void
delay(long ticks)
{
timespec_t ts;
if (ticks == 0)
return;
tick_to_timespec(ticks, &ts, NSEC_PER_TICK);
nano_delay(&ts);
}
/*
* delay current thread. non-breakable
*/
void
nano_delay(timespec_t *ts)
{
if (ts->tv_sec != 0 || ts->tv_nsec != 0) {
kthread_t *kt = curthreadp;
int s = kt_lock(kt);
kt_timedwait(kt, 0, s, 1, ts, NULL);
}
}
/*
* Adjtime system call.
* If the delta is reasonable, do it.
*/
struct adjtimea {
struct timeval *delta;
struct timeval *olddelta;
};
int
adjtime(struct adjtimea *uap)
{
struct timeval atv; /* new adjustment */
struct timeval oatv; /* old adjustment */
long odelta;
/*REFERENCED(!MP)*/
#if _MIPS_SIM == _ABI64
int abi = get_current_abi();
#endif
if (!_CAP_ABLE(CAP_TIME_MGT))
return EPERM;
if (COPYIN_XLATE(uap->delta, &atv, sizeof atv,
irix5_to_timeval_xlate, abi, 1)) {
_SAT_CLOCK(0,EFAULT);
return EFAULT;
}
/* prevent overflow */
if (atv.tv_sec <= -0x7fffffff/USEC_PER_SEC
|| atv.tv_sec >= 0x7fffffff/USEC_PER_SEC) {
_SAT_CLOCK(atv.tv_sec,EINVAL);
return EINVAL;
}
VHOST_ADJ_TIME(atv.tv_sec*USEC_PER_SEC + atv.tv_usec, &odelta);
_SAT_CLOCK(atv.tv_sec,0); /* Log successful change */
/*
* Mark last adjtime so that onesec_maint will
* know to reset tod chip as needed
*/
lastadjtime = lbolt+DIDADJTIME;
/*
* return remaining old correction if asked
*/
if (uap->olddelta) {
oatv.tv_sec = odelta / USEC_PER_SEC;
oatv.tv_usec = odelta % USEC_PER_SEC;
if (XLATE_COPYOUT(&oatv, uap->olddelta, sizeof oatv,
timeval_to_irix5_xlate, abi, 1))
return EFAULT;
}
return 0;
}
/*
* Get current time in the BSD style.
*/
struct gettimeofdaya {
void *tvp;
};
/* ARGSUSED1 */
int
gettimeofday(struct gettimeofdaya *uap)
{
struct timeval tv;
/*
* in 64 bit mode the tv struct has an initial 32 bit pad - since
* in 6.1 we had tv_sec be a long, we need to guarantee that the
* top 32 bits are 0 - so to be safe, we bzero it here.
* Note that 64 bit apps running on 64 bit kernels simply end up
* calling copyout - so this 'tv' is the actual copy used.
*/
bzero(&tv, sizeof(tv));
microtime(&tv);
if (XLATE_COPYOUT(&tv, uap->tvp, sizeof tv,
timeval_to_irix5_xlate, get_current_abi(), 1))
return EFAULT;
return 0;
}
/*
* Set the current time, BSD style.
* Called from syssgi(2), not directly from sysent.
*/
int
settimeofday(void *uap)
{
struct timeval atv;
/*REFERENCED(!MP)*/
cpu_cookie_t was_running;
if (!_CAP_ABLE(CAP_TIME_MGT))
return EPERM;
if (COPYIN_XLATE(uap, &atv, sizeof atv,
irix5_to_timeval_xlate, get_current_abi(), 1)) {
_SAT_CLOCK(0,EFAULT);
return EFAULT;
}
/* assume the libc wrapper will have rounded the value. There
* is no reason to put code in the kernel unless necessary,
* and the super user can do far greater damage with the wrong
* time than with simply badly formatted time.
*/
was_running = setmustrun(clock_processor);
settime(atv.tv_sec, atv.tv_usec);
wtodc();
restoremustrun(was_running);
_SAT_CLOCK(atv.tv_sec,0);
return 0;
}
#if _MIPS_SIM == _ABI64
/*ARGSUSED*/
int
irix5_to_timeval_xlate(
enum xlate_mode mode,
void *to,
int count,
register xlate_info_t *info)
{
ASSERT(count == 1);
ASSERT(info->smallbuf != NULL);
ASSERT(mode == SETUP_BUFFER || mode == DO_XLATE);
if (mode == SETUP_BUFFER) {
ASSERT(info->copybuf == NULL);
ASSERT(info->copysize == 0);
if (sizeof(struct irix5_timeval) <= info->inbufsize)
info->copybuf = info->smallbuf;
else
info->copybuf = kern_malloc(
sizeof(struct irix5_timeval));
info->copysize = sizeof(struct irix5_timeval);
return 0;
}
ASSERT(info->copysize == sizeof(struct irix5_timeval));
ASSERT(info->copybuf != NULL);
irix5_to_timeval((struct timeval *)to,
(struct irix5_timeval *)info->copybuf);
return 0;
}
/*ARGSUSED*/
int
timeval_to_irix5_xlate(
void *from,
int count,
register xlate_info_t *info)
{
ASSERT(count == 1);
ASSERT(info->smallbuf != NULL);
if (sizeof(struct irix5_timeval) <= info->inbufsize)
info->copybuf = info->smallbuf;
else
info->copybuf = kern_malloc(sizeof(struct irix5_timeval));
info->copysize = sizeof(struct irix5_timeval);
timeval_to_irix5((struct timeval *)from,
(struct irix5_timeval *)info->copybuf);
return 0;
}
#endif /* _ABI64 */
/*
* The following code supports migrating timeouts.
*/
struct migrated_timeout {
struct migrated_timeout *next; /* link to next entry */
toid_t oldid; /* The id the timeout was assigned */
toid_t newid; /* The id of timeout after migration */
time_t time; /* Time we put on list */
};
static struct migrated_timeout *migrated_timeouts;
/*
* Given an old timeout id plus the time left before the
* timeout, add an item to the migrated timeouts list.
* This list is used to forward untimeout requests after a timeout has
* been migrated. It returns a pointer to the field for the new timeout
* id that is filled in after we get one.
*/
void *
allocate_migrate_timeout(void)
{
struct migrated_timeout *to;
to = (struct migrated_timeout *)kmem_alloc(sizeof(*to), KM_SLEEP);
return (to);
}
void
free_migrate_timeout(void *ptr)
{
kmem_free(ptr,sizeof(struct migrated_timeout));
}
volatile toid_t *
add_migrated_timeout(toid_t oldid, long sec, void *ptr)
{
struct migrated_timeout *to;
int s;
to = (struct migrated_timeout *)ptr;
to->oldid = oldid;
to->newid = 0;
to->time = lbolt / HZ + sec + 5; /* What time should
* this entry expire
*/
s = mutex_spinlock_spl(&migrated_timeout_lock, splprof);
to->next = migrated_timeouts;
migrated_timeouts = to;
mutex_spinunlock(&migrated_timeout_lock, s);
return (&to->newid);
}
/*
* given an ID, kill the corresponding migrated time-out
*/
static int
untimeout_migrated(toid_t id, int wait)
{
struct migrated_timeout *p1, *last;
struct migrated_timeout *free;
int return_val = 0;
register int s;
free = NULL;
/* If we do not have any migrated timeouts then return */
if (!migrated_timeouts)
return 0;
startover:
s = mutex_spinlock_spl(&migrated_timeout_lock, splprof);
p1 = migrated_timeouts;
/* Check if the head of the list is what we are looking for */
if (p1 && p1->oldid == id) {
free = p1;
migrated_timeouts = p1->next;
} else if (p1) {
for ( last = p1 ; (p1 = p1->next) != 0 ; last = p1) {
if((p1->oldid == id)) {
last->next = p1->next;
free = p1;
break;
}
if (lbolt / HZ > p1->time) {
/*
* if it is past the time that this timeout
* was due it can be removed.
*/
last->next = p1->next;
/* Need to drop lock before free */
mutex_spinunlock(&migrated_timeout_lock, s);
free_migrate_timeout((void *)p1);
/* As we dropped the lock
* we do not know the state of
* the list so we start all over
*/
goto startover;
}
}
}
mutex_spinunlock(&migrated_timeout_lock, s);
if (free) {
/*
* Avoid the short race where we might just have put the
* timeout onto the migrated queue
*/
while(free->newid == 0);
return_val = untimeout_body(free->newid, wait);
kmem_free (free, sizeof(*free));
}
return return_val;
}
static toid_t
find_migrated_timeout(toid_t id)
{
struct migrated_timeout *p1;
int s;
/* Peek to see if it's worth grabbing the migrated_timeout_lock. */
if ((p1 = migrated_timeouts) == NULL)
return 0;
s = mutex_spinlock_spl(&migrated_timeout_lock, splprof);
p1 = migrated_timeouts;
while (p1) {
if (p1->oldid == id)
break;
p1 = p1->next;
}
mutex_spinunlock(&migrated_timeout_lock, s);
return (p1 ? p1->newid : 0);
}