/**************************************************************************
* *
* Copyright (C) 1989-1994 Silicon Graphics, Inc. *
* *
* These coded instructions, statements, and computer programs contain *
* unpublished proprietary information of Silicon Graphics, Inc., and *
* are protected by Federal copyright law. They may not be disclosed *
* to third parties or copied or duplicated in any form, in whole or *
* in part, without the prior written consent of Silicon Graphics, Inc. *
* *
**************************************************************************/
/* Copyright (c) 1984 AT&T */
/* All Rights Reserved */
/* THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T */
/* The copyright notice above does not evidence any */
/* actual or intended publication of such source code. */
#ident "$Revision: 3.401 $"
#include <stdarg.h>
#include <sys/types.h>
#include <ksys/as.h>
#include <os/as/as_private.h> /* XXX */
#include <sys/callo.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <sys/getpages.h>
#include <sys/immu.h>
#include <sys/pfdat.h>
#include <sys/page.h>
#include <sys/kabi.h>
#include <sys/kmem.h>
#include <sys/ksignal.h>
#include <sys/par.h>
#include <sys/param.h>
#include <sys/pda.h>
#include <sys/prctl.h>
#include <sys/proc.h>
#include <sys/psw.h>
#include <sys/reg.h>
#include <sys/resource.h>
#include <sys/runq.h>
#include <sys/schedctl.h>
#include <sys/signal.h>
#include <sys/strmp.h>
#include <sys/swap.h>
#include <sys/sysinfo.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/ktime.h>
#include <sys/time.h>
#include <sys/tuneable.h>
#include <ksys/exception.h>
#include <sys/xlate.h>
#include <sys/capability.h>
#include <sys/sat.h>
#include <sys/space.h>
#include <ksys/sthread.h>
#include <ksys/xthread.h>
#include <sys/hwperfmacros.h>
#include <sys/atomic_ops.h>
#include <sys/calloinfo.h>
#include <sys/ddi.h>
#include <sys/klog.h>
#include <sys/rt.h>
#include "os/proc/pproc_private.h" /* XXX bogus */
#include <ksys/hwperf.h>
#ifdef CELL
#include <ksys/cell/cell_hb.h>
#endif
#include <sys/lpage.h>
#include <ksys/vhost.h>
#ifdef NUMA_BASE
#include <sys/nodepda.h>
#endif
/*
* Clock
*
* Functions:
* implement callouts
* maintain user/system times
* maintain date
* profile
* alarm clock signals
* jab the scheduler
*/
extern sema_t vfswakeup;
extern void onesec_maint(void);
extern void tick_maint(void);
extern void tick_actions(void);
static int updateload(proc_t *, void *, int);
static void calcrss(uthread_t *);
static void callout_itentry_init(callout_info_t *, int, char *);
static pgcnt_t getrss(uthread_t *);
sema_t second_sema; /* v'd once a second */
time_t time; /* time in seconds since 1970 */
time_t lbolt; /* time in HZ since last boot */
#if (MAXCPUS > 128)
int sched_tick_mask=0; /* (see comment where used below) */
#endif
int one_sec = 1;
extern int vhandkicklim; /* # pages before kick vhand */
int vhandcnt; /* counter for vhand kick */
int vfssynccnt; /* counter for vfs_syncr */
uint sxbrkcnt; /* count of uthreads which are SXBRK */
ulong freeswap; /* amount of free swap */
time_t lastadjtime; /* HZ since last adjtime(2) */
time_t lastadjtod = DIDADJTIME; /* for 1 hour after last adjtime(2) */
#if !defined(CLOCK_CTIME_IS_ABSOLUTE)
callout_info_t *calltodo;
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
callout_info_t *fastcatodo;
extern struct callout *timeout_get_queuehead(long list, struct callout *pnew);
static toid_t find_migrated_timeout(toid_t id);
static lock_t migrated_timeout_lock;
static int rqlen;
#ifdef notdef
static int sqlen;
#endif
extern int pdcount; /* count of pdinserted pages */
extern int pdflag; /* flag is nonzero if pdcount pages */
/*
* Compute a Tenex-style load average of a quantity on 1, 5 and 15 minute
* intervals, using 'fixed-point' arithmetic with 3 decimal digits to the right.
*
* avg[T+t] = avg[T] * exp(-t/c) + nrun * (1 - exp(-t/c))
* where c = 1, 5, and 15 minutes, t = calculation interval
*
* Exponential constants for the specified interval:
*/
#define AVENRUN_INTVL 4 /* interval between calculations */
#define CEXP_0 958 /* (int) (exp(-4/60) * 1024) */
#define CEXP_1 1010 /* (int) (exp(-4/300) * 1024) */
#define CEXP_2 1019 /* (int) (exp(-4/900) * 1024) */
__int32_t avenrun[3]; /* smoothed load averages */
static int nrun; /* # of runnable processes (see updateload) */
static void
calcavenrun(void)
{
register __int32_t *avg = avenrun;
#define AVGCALC(n, exp) \
avg[n] = (exp * (__int64_t)avg[n] + \
(((1024 - exp) * (__int64_t)nrun) << 10)) >> 10
AVGCALC(0, CEXP_0);
AVGCALC(1, CEXP_1);
AVGCALC(2, CEXP_2);
#undef AVGCALC
}
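/*
 * A minimal stand-alone sketch (plain user-space C, not compiled into the
 * kernel) of the AVGCALC step above.  It assumes the same scaling as the
 * real code: avenrun[] values and the CEXP_* constants are fixed-point
 * numbers in 1024ths.
 */
#if 0
#include <stdio.h>

static long
avg_step(long avg, long nrun, long cexp)
{
	/* avg' = avg*exp(-t/c) + nrun*(1 - exp(-t/c)), everything in 1024ths */
	return (cexp * avg + ((1024 - cexp) * nrun << 10)) >> 10;
}

int
main(void)
{
	long avg = 0;		/* load average, scaled by 1024 */
	long cexp = 958;	/* CEXP_0: exp(-4/60) in 1024ths */
	int i;

	/* one sample every AVENRUN_INTVL (4) seconds, for five minutes */
	for (i = 0; i < 300 / 4; i++)
		avg = avg_step(avg, 2, cexp);	/* constant run queue of 2 */
	printf("1-minute load: %.3f\n", avg / 1024.0);
	return 0;
}
#endif	/* illustrative sketch */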
#ifdef NUMA_BASE
#define SYSWAIT_NODE(_node, _field) NODEPDA(_node)->syswait._field
#define SYSWAIT_BACKOFF_THRESHOLD (HZ * 5) /* 5 secs */
#define SYSWAIT_BACKOFF_RATE 100 /* one out of 100 */
#define SYSWAIT_BIGSYS 10 /* one out of 10 */
/* Only the clock master re-calculates the global syswait count */
#define RECALC_SYSWAIT() \
{ \
if (private.p_flags & PDAF_CLOCK) { \
recalc_syswait(); \
if (syswait.iowait || syswait.swap || syswait.physio) { \
SYSINFO.wioocc++; \
SYSINFO.wioque += (syswait.iowait + \
syswait.swap + syswait.physio); \
} \
} \
}
/* Reset the node's syswait field by taking the current value and removing it.
* This should prevent overflows from occurring. We just can't set it to zero
* because it could change while we are resetting it. After doing this for
* all nodes the accum value should represent the global state of syswait.
* We can then apply that to our node to save it.
*/
#define RESET_SYSWAIT_VAL(_accum, _tmp, _node, _field) \
{ \
_tmp = SYSWAIT_NODE(_node, _field); \
_accum._field += _tmp; \
if (_tmp) { \
atomicAddInt(&NODEPDA(_node)->syswait._field, ~(_tmp - 1)); \
} \
}
static void
recalc_syswait(void)
{
int i;
struct syswait tmp_syswait;
static time_t last_wait = 0;
/* To eliminate contention on the global syswait counter
* each node has its own. We add up the syswait counts of all
* nodes and store the result in the global count that will be
* seen by the other cpus. Maybe the clock cpu is not the best
* choice for this since it gets stuck doing everything, but then
* again maybe that is a good reason to pick it ???
*/
#if defined(SN0XXL) || defined(SN1)
/*
* Large systems can't afford to make this calculation every tick.
*/
if (lbolt % SYSWAIT_BIGSYS != 0)
return;
#endif
if (lbolt - last_wait >= SYSWAIT_BACKOFF_THRESHOLD) {
/* We are backing off to a slower rate since nothing
* showed up within the threshold limit. If something
* shows up again we'll go back to every tick.
*/
if (lbolt % SYSWAIT_BACKOFF_RATE != 0) {
return;
}
}
bzero(&tmp_syswait, sizeof(tmp_syswait));
for (i = 0; i < numnodes; i++) {
/* If the total count ends up being negative we must
* have passed the node that incremented it and we only
* caught the decrement. Will set it to zero later.
*/
tmp_syswait.iowait += SYSWAIT_NODE(i, iowait);
tmp_syswait.swap += SYSWAIT_NODE(i, swap);
tmp_syswait.physio += SYSWAIT_NODE(i, physio);
}
/* Fix negative counts where the increment occurred on a cpu
* after we went past it in the loop above.
*/
if (tmp_syswait.iowait < 0) {
tmp_syswait.iowait = 0;
}
if (tmp_syswait.swap < 0) {
tmp_syswait.swap = 0;
}
if (tmp_syswait.physio < 0) {
tmp_syswait.physio = 0;
}
/* Update the global syswait counts with the accumulated result */
if (tmp_syswait.iowait != syswait.iowait) {
syswait.iowait = tmp_syswait.iowait;
}
if (tmp_syswait.swap != syswait.swap) {
syswait.swap = tmp_syswait.swap;
}
if (tmp_syswait.physio != syswait.physio) {
syswait.physio = tmp_syswait.physio;
}
if (tmp_syswait.iowait || tmp_syswait.swap || tmp_syswait.physio) {
/* Somebody is waiting for I/O, reset backoff */
last_wait = lbolt;
}
}
#define RESET_SYSWAIT_PERIOD 30 /* seconds */
#define RESET_SYSWAIT() reset_syswait()
static void
reset_syswait(void)
{
int i, j;
struct syswait tmp_syswait;
static int reset_syswait_timeout = RESET_SYSWAIT_PERIOD;
/* Periodically reset everyone's counter to prevent overflows and save
* the current syswait state in ours.
*/
if (--reset_syswait_timeout > 0) {
return;
}
reset_syswait_timeout = RESET_SYSWAIT_PERIOD;
bzero(&tmp_syswait, sizeof(tmp_syswait));
for (i = 0; i < numnodes; i++) {
RESET_SYSWAIT_VAL(tmp_syswait, j, i, iowait);
RESET_SYSWAIT_VAL(tmp_syswait, j, i, physio);
RESET_SYSWAIT_VAL(tmp_syswait, j, i, swap);
}
atomicAddInt(&nodepda->syswait.iowait, tmp_syswait.iowait);
atomicAddInt(&nodepda->syswait.physio, tmp_syswait.physio);
atomicAddInt(&nodepda->syswait.swap, tmp_syswait.swap);
}
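/*
 * A tiny stand-alone illustration (plain user-space C, no atomic ops) of the
 * reset-by-subtraction idiom in RESET_SYSWAIT_VAL above: ~(x - 1) is the
 * two's-complement negation of x, so atomically adding it subtracts exactly
 * the value we sampled while preserving any increments that raced in between.
 */
#if 0
#include <assert.h>

int
main(void)
{
	int counter = 7;	/* stands in for a per-node syswait counter */
	int sampled = counter;	/* value folded into the global accumulator */

	counter += 3;			/* a racing increment after the sample */
	counter += ~(sampled - 1);	/* same effect as counter -= sampled */

	assert(~(sampled - 1) == -sampled);
	assert(counter == 3);		/* the racing increment survives */
	return 0;
}
#endif	/* illustrative sketch */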
#else
#define RECALC_SYSWAIT() \
{ \
if (private.p_flags & PDAF_CLOCK) { \
if (syswait.iowait || syswait.swap || syswait.physio) { \
SYSINFO.wioocc++; \
SYSINFO.wioque += (syswait.iowait + \
syswait.swap + syswait.physio); \
} \
} \
}
#define RESET_SYSWAIT()
#endif /* NUMA_BASE */
#ifdef NUMA_BASE
#define PDCOUNT_NODE(_node) NODEPDA(_node)->pdcount
/* Only the clock master re-calculates the global pdcount */
#define RECALC_PDCOUNT() \
{ \
if (private.p_flags & PDAF_CLOCK) { \
recalc_pdcount(); \
} \
}
/* Reset the node's pdcount field by taking the current value and removing it.
* see above comments for RESET_SYSWAIT_VAL
*/
#define RESET_PDCOUNT_VAL(_accum, _tmp, _node) \
{ \
_tmp = swap_int(&NODEPDA(_node)->pdcount, 0); \
_accum += _tmp; \
}
#define PDCOUNT_RECALC_RATE HZ /* once a second */
static void
recalc_pdcount(void)
{
int i;
int tmp_pdcount;
static int pdcount_recalc_rate=0;
/*
* Note that only routines such as sar or osview actually
* use pdcount. Therefore we only update it once a second.
*/
if (--pdcount_recalc_rate > 0)
return;
pdcount_recalc_rate = PDCOUNT_RECALC_RATE;
tmp_pdcount=0;
for (i = 0; i < numnodes; i++) {
tmp_pdcount += PDCOUNT_NODE(i);
}
if (tmp_pdcount < 0) {
tmp_pdcount = 0;
}
if(tmp_pdcount != pdcount) {
pdcount = tmp_pdcount;
}
if (tmp_pdcount && !pdflag) {
pdflag = 1;
}
}
#define RESET_PDCOUNT_PERIOD 30 /* seconds */
#define RESET_PDCOUNT() reset_pdcount()
static void
reset_pdcount(void)
{
int i, j;
int tmp_pdcount;
static int reset_pdcount_timeout = RESET_PDCOUNT_PERIOD;
/* Periodically reset everyone's counter to prevent overflows and save
* the current pdcount state in ours.
*/
if (--reset_pdcount_timeout > 0) {
return;
}
reset_pdcount_timeout = RESET_PDCOUNT_PERIOD;
tmp_pdcount = 0;
for (i = 0; i < numnodes; i++) {
RESET_PDCOUNT_VAL(tmp_pdcount, j, i);
}
atomicAddInt(&nodepda->pdcount, tmp_pdcount);
}
#else
#define RECALC_PDCOUNT()
#define RESET_PDCOUNT()
#endif /* NUMA_BASE */
/* ARGSUSED */
void
second_thread(void *arg)
{
extern sv_t runout;
int coalesced_kick_timeout = COALESCED_KICK_PERIOD;
for (;;) {
psema(&second_sema, PZERO);
/*
* we really should run on the clock_processor
* XXX race with sysmp()
*/
if (cpuid() != clock_processor)
(void)setmustrun(clock_processor);
(void) nfreeswap(&freeswap);
rqlen = 0;
nrun = 0;
/* update load average */
procscan(updateload, 0);
if (rqlen) {
SYSINFO.runque += rqlen;
SYSINFO.runocc++;
}
if ((time % AVENRUN_INTVL) == 0)
calcavenrun();
/*
* Periodically update the amount of global system
* free memory. Do this before waking coalesced -- it wants
* a reasonably accurate picture.
*/
GLOBAL_FREEMEM_UPDATE();
if (--vfssynccnt <= 0) {
extern int vfs_syncr;
vfssynccnt = vfs_syncr;
cvsema(&vfswakeup);
}
if (--coalesced_kick_timeout <= 0) {
coalesced_kick_timeout = COALESCED_KICK_PERIOD;
COALESCED_KICK();
}
RESET_SYSWAIT();
RESET_PDCOUNT();
/* wake up sched every second */
sv_signal(&runout);
onesec_maint();
}
}
void
init_second()
{
extern int onesec_pri;
initnsema(&second_sema, 1, "second_sema");
sthread_create("onesec", NULL, 4096, 0, onesec_pri, KT_PS,
second_thread, 0, 0, 0, 0);
}
int
clock(eframe_t *ep)
{
kthread_t *kt = curthreadp;
k_machreg_t ps = ep->ef_sr;
uthread_t *ut = kt && KT_ISUTHREAD(kt) ? KT_TO_UT(kt) : NULL;
struct proc *pp = ut ? UT_TO_PROC(ut) : NULL;
register int a;
pgcnt_t rss;
ackrtclock(); /* acknowledge the clock interrupt */
ASSERT(issplhi(getsr()));
ASSERT(private.p_switching == 0);
/*
* Blip the LED's if necessary.
*/
bump_leds();
tick_actions(); /* machine-dependent per-tick activities */
#ifdef CELL
hb_update_local_heart_beat(); /* Update local heart beat */
#endif
RECALC_SYSWAIT();
RECALC_PDCOUNT();
#ifdef ULI
/* we may have interrupted out of a ULI proc. This is only
* really a clock tick during a user process if curuli is
* also clear.
*/
if (USERMODE(ps) && ut && pp && !private.p_curuli)
#else
if (USERMODE(ps) && ut && pp)
#endif
{
ASSERT(pp); /* won't always be true! */
a = CPU_USER;
if (pp->p_profn) {
/*
* Set a flag so that user will later accumulate
* a pc tick for this clock tick. We can't just
* call addupc here because it may take a page
* fault and need to sleep. It's a sin to sleep
* in the clock interrupt handler.
*/
if (pp->p_flag & SPROF) {
/*
* Have to check the flag now since it could
* be profiling with the R10000 counters,
* in which case SPROF will not be set.
*/
ut_flagset(ut, UT_OWEUPC);
PCB(pcb_resched) = 1;
}
}
#if !NO_WIRED_SEGMENTS
#if FAST_LOCORE_TFAULT
/* In "fast locore tfault" mode, the utas_segflags are almost
* always set and do not indicate segment table mode.
* Instead, locore uses u_nexttlb to point into the wired
* range, and if out-of-bounds it indicates that we're in
* segment table mode and should do random 2nd level dropins.
* To start filling the wired entries with new entries, we
* simply need to reset u_nexttlb and we will start re-using
* the wired entries for new 2nd level dropins ... without
* needing to clear all of the maps. We MUST change the tlbpid
* to eliminate the non-wired 2nd level entries (unfortunately
* this will also effectively flush all 1st level entries).
*/
if ((ut->ut_exception->u_nexttlb >= NWIREDENTRIES-TLBWIREDBASE)
&& (++ut->ut_as.utas_tlbcnt >= tune.t_tlbdrop)) {
ut->ut_as.utas_tlbcnt = 0;
ut->ut_exception->u_nexttlb = 0;
new_tlbpid(&ut->ut_as, VM_TLBINVAL);
}
#else /* !FAST_LOCORE_TFAULT */
/*
* If process is running in segment-table mode,
* see if it will be well-behaved for a while
* and use only NWIRED tlb entries.
* NOTE: No need for this code if we don't have wired
* second level tlbs (like TFP).
*/
if (++ut->ut_as.utas_tlbcnt >= tune.t_tlbdrop) {
ut->ut_as.utas_tlbcnt = 0;
if (ut->ut_as.utas_segflags) {
setup_wired_tlb(1);
#ifdef R4000
new_tlbpid(&ut->ut_as, VM_TLBINVAL);
#endif
}
}
#endif /* !FAST_LOCORE_TFAULT */
#endif /* !NO_WIRED_SEGMENTS */
/*
* if we should, update the process
* virtual itimer and if it expired post the correct signal
* NOTE: this occurs for the normal clock tick (10ms)
*/
if (timerisset(&ut->ut_timer[UT_ITIMER_VIRTUAL].it_value) &&
itimerdecr(&ut->ut_timer[UT_ITIMER_VIRTUAL],
USEC_PER_TICK) == 0)
sigtouthread(ut, SIGVTALRM, (k_siginfo_t *)NULL);
} else if (kt == NULL) {
/* idling - see how */
a = CPU_IDLE;
if (sxbrkcnt)
a = CPU_SXBRK;
/*
* any time we're idle - account for wait I/O
* this overrides interest in SXBRK
*/
if (syswait.iowait) {
a = CPU_WAIT;
SYSINFO.wait[W_IO]++;
}
if (syswait.swap) {
a = CPU_WAIT;
SYSINFO.wait[W_SWAP]++;
}
if (syswait.physio) {
a = CPU_WAIT;
SYSINFO.wait[W_PIO]++;
}
} else {
if (private.p_gfx_waitc) {
a = CPU_WAIT;
SYSINFO.wait[W_GFXC]++;
} else if (private.p_gfx_waitf) {
a = CPU_WAIT;
if (ut)
ut->ut_acct.ua_graphfifo++;
SYSINFO.wait[W_GFXF]++;
} else {
if (KT_ISXTHREAD(kt))
a = CPU_INTR;
else
a = CPU_KERNEL;
if (ut)
ut->ut_prftime++;
}
}
SYSINFO.cpu[a]++;
/*
* This could be a user thread that is exiting, so it might
* not have a process attached.
*/
if (pp && pp->p_stat == SRUN) {
struct rlimit *rlp;
timespec_t utime, stime;
ASSERT(KT_ISUTHREAD(kt) && ut);
rlp = &pp->p_rlimit[RLIMIT_CPU];
/* XXX if this is a multi-threaded app, this
* is incorrect - we are looking only at this
* thread's u+s time, not the whole procs.
* But... to look at the whole proc would be
* expensive, and since resource limits are
* not part of any posix spec, and this
* will still perform the useful attribute
* of RLIMIT_CPU - to catch runaway processes.
*/
ktimer_read(UT_TO_KT(ut), AS_USR_RUN, &utime);
ktimer_read(UT_TO_KT(ut), AS_SYS_RUN, &stime);
if (rlp->rlim_cur != RLIM_INFINITY &&
utime.tv_sec + stime.tv_sec +1 > rlp->rlim_cur)
{
extern int cpulimit_gracetime;
extern void qprintf(char *f, ...);
if (cpulimit_gracetime == 0) {
/*
* old behaviour
*/
sigtouthread(ut, SIGXCPU, (k_siginfo_t *)NULL);
/*
* Don't give the signal every clock tick.
*/
if (rlp->rlim_cur < rlp->rlim_max)
rlp->rlim_cur += 5;
} else {
/*
* new behaviour: we send SIGXCPU once, so the
* process can checkpoint or do whatever is necessary, and
* after a grace time defined by the system tuneable
* cpulimit_gracetime we send SIGKILL.
* The reason for this new behaviour is that it
* was possible for a process to completely ignore
* SIGXCPU.
*/
if (pp->p_flag & SGRACE) {
sigtouthread(ut,SIGKILL,(k_siginfo_t *)NULL);
} else {
sigtouthread(ut, SIGXCPU, (k_siginfo_t *)NULL);
rlp->rlim_cur += cpulimit_gracetime;
pp->p_flag |= SGRACE;
}
}
}
/*
* Update the process profile itimer if it is
* set and if it expired post the SIGPROF signal.
* NOTE: this occurs for the normal clock tick (10ms).
*/
if (timerisset(&ut->ut_timer[UT_ITIMER_PROF].it_value) &&
itimerdecr(&ut->ut_timer[UT_ITIMER_PROF], USEC_PER_TICK)
== 0)
sigtouthread(ut, SIGPROF, (k_siginfo_t *)NULL);
/*
* since # utlbmisses only updated on context switch
* update here also
*/
ut->ut_acct.ua_ufaults += private.p_utlbmisses;
private.p_utlbmisses = 0;
/*
* If this is a pthreads process, and it has a prda,
* and the prda's resched counter is non-null,
* decrement the counter -- and if it goes to zero,
* send pthread reschedule signal.
* XXX Is this still needed?
*/
if (ut->ut_flags & UT_PTHREAD && ut->ut_prda) {
/*
* Don't need atomic operator -- the uthread
* might be in the middle of updating this,
* but it only does processor-atomic stores,
* never increments/decrements.
* The worst that can happen is that we miss
* an update, and catch it next clock tick.
*/
if ((a = ut->ut_prda->t_sys.t_resched) > 0) {
ut->ut_prda->t_sys.t_resched = --a;
if (a == 0) {
int s = ut_lock(ut);
sigaddset(&ut->ut_sig, SIGPTRESCHED);
ut_unlock(ut, s);
}
}
}
/*
* Once a second update in calcrss() is not good enough?
* Count 1K blocks, not pages.
* XXX If rss is negative, vhand is busy; ignore it.
*/
rss = getrss(ut);
if (rss > 0) {
if ((rss * (NBPC / 1024)) >
ut->ut_pproxy->prxy_ru.ru_maxrss)
ut->ut_pproxy->prxy_ru.ru_maxrss =
rss * (NBPC / 1024);
ut->ut_acct.ua_mem += rss;
}
#ifdef DEBUG
else if (rss < 0) {
cmn_err(CE_DEBUG,
"Strange ut_mem update: pid=%d ut_mem=%lx rss=%lx\n",
pp->p_pid, ut->ut_acct.ua_mem, rss);
}
#endif
}
if (ut) {
/*
* Time slice and preemption checks.
*/
tschkRunq(ut);
}
#if (MAXCPUS > 128)
/*
* On large systems, calling the wtree routines every clock tick
* on every cpu causes EXTREMELY hot cache lines in the wtree & job_t structures.
* In fact, the system hangs under some circumstances when access to
* the structures starts to take > 10ms. In this case, the next clock tick has
* already occurred when we exit clock() & we stop making forward progress.
*
* The fix (aka hack) is to call the cpu_sched_tick on every nth clock
* tick. We pick lbolt as the randomizer. This variable is monotonically
* increasing & incremented by 1 cpu once every clock tick (10ms).
*
* The code below does the following:
* - at clock tick 0, cpu 0-63 will call cpu_sched_tick
* - at clock tick 1, cpu 64-127 will call cpu_sched_tick
*
* The sched_tick_mask is set during boot and is a function of the number
* of cpus in the system. The mask is 0 for 0-63 cpus, 1 for 64-127 cpus, ...
*/
if ((lbolt & sched_tick_mask) == ((cpuid()>>6)&sched_tick_mask))
#endif
cpu_sched_tick(kt);
/*
* Numa memory management periodic ops
*/
MEM_TICK();
if (private.p_flags & PDAF_CLOCK) {
unsigned long ofreemem;
++lbolt;
tick_maint(); /* muck with one_sec */
/* "double" long arithmetic for minfo.freemem */
ofreemem = MINFO.freemem[0];
MINFO.freemem[0] += GLOBAL_FREEMEM();
if (MINFO.freemem[0] < ofreemem)
MINFO.freemem[1]++;
if (--vhandcnt <= 0) {
if (GLOBAL_FREEMEM() < vhandkicklim) {
cvsema(&vhandsema);
}
vhandcnt = 2*HZ;
} else if (rsswaitcnt) {
cvsema(&vhandsema);
vhandcnt = 2*HZ;
} else if (GLOBAL_FREEMEM() < tune.t_gpgslo) {
/*
* Push up vhand if memory is really low.
* We don't just wake up vhand here because
* we want to give runnable processes (who
* possibly are about to release their regions)
* a chance to run.
*/
if (vhandcnt > 5)
vhandcnt = 5;
}
ASSERT(loclkok(ep));
if (one_sec) {
one_sec = 0;
vsema(&second_sema);
#if DEBUG
if (valusema(&second_sema))
cmn_err(CE_WARN,"one second clock processing still pending after %d seconds\n", valusema(&second_sema)-1);
#endif
/*
* Update memory usage for the current running process
*/
if (ut)
calcrss(ut);
}
/*
* klog_need_action is set in icmn_err, indicating klogwakeup needs
* to be called. Note that this is done only on the PDAF_CLOCK cpu.
*/
if (klog_need_action) {
klog_need_action = 0;
klog_unlocked_wakeup();
}
}
/*
* If we are using the event counters and there are
* more events being tracked than there are counters,
* multiplex the events every tick. (R10000)
*/
MULTIPLEX_HWPERF_COUNTERS();
return(0);
}
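/*
 * A stand-alone sketch (plain user-space C, not compiled into the kernel) of
 * the sched_tick_mask staggering described in the comment inside clock()
 * above.  The mask values mirror the ones chosen in thread_timein_init():
 * each group of 64 cpus calls cpu_sched_tick() on its own share of the ticks.
 */
#if 0
#include <stdio.h>

static int
pick_mask(int numcpus)
{
	if (numcpus <= 64)
		return 0;	/* every cpu, every tick */
	if (numcpus <= 128)
		return 1;	/* two groups of 64 cpus */
	if (numcpus <= 256)
		return 3;	/* four groups of 64 cpus */
	return 7;		/* eight groups of 64 cpus */
}

static int
runs_sched_tick(long bolt, int cpu, int mask)
{
	return (bolt & mask) == ((cpu >> 6) & mask);
}

int
main(void)
{
	int mask = pick_mask(256);
	long tick;
	int cpu;

	for (tick = 0; tick < 4; tick++)
		for (cpu = 0; cpu < 256; cpu += 64)
			if (runs_sched_tick(tick, cpu, mask))
				printf("tick %ld: cpus %d-%d\n",
				    tick, cpu, cpu + 63);
	return 0;
}
#endif	/* illustrative sketch */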
/*
* Update system load average.
*/
/*ARGSUSED*/
static int
updateload(proc_t *pp, void *arg, int mode)
{
if (mode == 1 && pp->p_stat == SRUN && uscan_tryaccess(&pp->p_proxy)) {
register uthread_t *ut;
for (ut = prxy_to_thread(&pp->p_proxy); ut; ut = ut->ut_next) {
kthread_t *kt;
/*
* If the thread is asleep but not sxbrk, don't
* mark it as on-the-runq, but do mark it as
* runnable if it isn't doing a long-term wait
* and it isn't breakable (this implies just waiting
* for some kinda mutex?).
* We don't bother locking ut_lock, 'cause we're
* just generating statistics.
*/
kt = UT_TO_KT(ut);
if ((kt->k_flags & KT_SLEEP) &&
!(ut->ut_flags & UT_SXBRK)) {
if ((kt->k_flags & (KT_LTWAIT|KT_NWAKE))
== KT_NWAKE)
nrun++;
continue;
}
if (ut->ut_flags & UT_STOP)
continue;
if (! is_weightless(UT_TO_KT(ut)))
nrun++;
rqlen++;
}
uscan_unlock(&pp->p_proxy);
}
return(0);
}
static pgcnt_t
getrss(uthread_t *ut)
{
vasid_t vasid;
ppas_t *ppas;
pas_t *pas;
int as_lookup_pinned(uthread_t *, vasid_t *);
/*
* Hack - this is the only code that tries all this grot from interrupt
* level, and we hope to remove this RSN.
* So, we need to practice careful reference here to avoid problems
* when a process happens to be execing.
* We have a chance since we are in fact the running process,
* so we can take snapshots.
*/
if (AS_ISNULL(&ut->ut_asid))
return 0;
if (as_lookup_pinned(ut, &vasid))
return 0;
ppas = (ppas_t *)vasid.vas_pasid;
pas = VASID_TO_PAS(vasid);
return pas->pas_rss + ppas->ppas_rss;
}
static void
calcrss(uthread_t *ut)
{
register preg_t *prp;
register reg_t *rp;
int doingshd = 0;
vasid_t vasid;
pas_t *pas;
ppas_t *ppas;
pgcnt_t rss = 0;
int as_lookup_pinned(uthread_t *, vasid_t *);
/*
* Hack - this is the only code that tries all this grot from interrupt
* level, and we hope to remove this RSN.
* So, we need to practice careful reference here to avoid problems
* when a process happens to be execing.
* We have a chance since we are in fact the running process,
* so we can take snapshots.
*/
if (AS_ISNULL(&ut->ut_asid))
return;
if (as_lookup_pinned(ut, &vasid))
return;
if (VAS_TRYLOCK(vasid, AS_SHARED)) {
pas = VASID_TO_PAS(vasid);
ppas = (ppas_t *)vasid.vas_pasid;
prp = PREG_FIRST(ppas->ppas_pregions);
doshd:
while (prp) {
rp = prp->p_reg;
if (rp->r_flags & RG_PHYS) {
prp = PREG_NEXT(prp);
continue;
}
rss += prp->p_nvalid;
prp = PREG_NEXT(prp);
}
if (!doingshd) {
ppas->ppas_rss = rss;
rss = 0;
doingshd++;
prp = PREG_FIRST(pas->pas_pregions);
goto doshd;
}
pas->pas_rss = rss;
VAS_UNLOCK(vasid);
} else {
/* no charges?? */
;
}
}
/*
* Call the "vanilla" shake routine for each cpu's free zone.
*/
/* ARGSUSED */
static int
callout_shake(int level)
{
cpuid_t cpu;
int page_count = 0;
ASSERT(level == SHAKEMGR_MEMORY);
for (cpu=0; cpu < maxcpus; cpu++) {
zone_t *callout_free_zone;
if ((pdaindr[cpu].CpuId == -1) || !(pdaindr[cpu].pda->p_flags & PDAF_ENABLED))
continue;
callout_free_zone = CI_FREE_ZONE(&CALLTODO(cpu));
page_count += zone_shake(callout_free_zone);
}
return(page_count);
}
/*
* Per-CPU callout initialization.
*/
void
calloutinit_cpu(cpuid_t cpu)
{
extern int ncallout;
int nbytes, num_per_page, numpages;
zone_t *callout_free_zone;
cnodeid_t cnode;
callout_free_zone = kmem_zone_private(sizeof(struct callout), "callout");
CI_FREE_ZONE(&CALLTODO(cpu)) = callout_free_zone;
(void)kmem_zone_private_mode_noalloc(callout_free_zone);
(void)kmem_zone_enable_shake(callout_free_zone);
nbytes = kmem_zone_unitsize(callout_free_zone);
ASSERT(nbytes > 0);
num_per_page = ctob(1) / nbytes;
ASSERT(num_per_page > 0);
/*
* Determine how many pages per CPU we'd need to allocate so that
* the total number of pages allocated stores ncallout structures.
*/
numpages = (((ncallout + num_per_page -1) / num_per_page) + maxcpus-1) / maxcpus;
ASSERT(numpages > 0);
cnode = cputocnode(cpu);
kmem_zone_reserve_node(cnode, callout_free_zone, ctob(1)*numpages);
/* Prevent shake routine from reclaiming entire callout list. */
kmem_zone_minsize(callout_free_zone, numpages*num_per_page);
shake_register(SHAKEMGR_MEMORY, callout_shake);
}
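/*
 * A stand-alone sketch (plain user-space C, made-up sizes) of the reservation
 * arithmetic in calloutinit_cpu() above: a double ceiling division so that,
 * across all cpus, enough pages are reserved to hold ncallout structures.
 */
#if 0
#include <stdio.h>

#define CEIL_DIV(a, b)	(((a) + (b) - 1) / (b))

int
main(void)
{
	int pagesize = 16384;	/* assumed page size in bytes */
	int co_size = 96;	/* assumed sizeof(struct callout) */
	int ncallout = 5000;	/* assumed value of the ncallout tuneable */
	int maxcpus = 32;	/* assumed cpu count */
	int num_per_page = pagesize / co_size;
	int total_pages = CEIL_DIV(ncallout, num_per_page);
	int pages_per_cpu = CEIL_DIV(total_pages, maxcpus);

	printf("%d callouts/page, %d pages total, %d page(s) per cpu\n",
	    num_per_page, total_pages, pages_per_cpu);
	return 0;
}
#endif	/* illustrative sketch */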
/*
* Called once to perform global initialization required for timeout processing.
*/
void
calloutinit()
{
#if !CLOCK_CTIME_IS_ABSOLUTE
calltodo = (callout_info_t *)kmem_zalloc(sizeof (*calltodo) * maxcpus,
VM_DIRECT|KM_SLEEP);
fastcatodo = (callout_info_t *)kmem_zalloc(sizeof (*fastcatodo),
VM_DIRECT|KM_SLEEP);
spinlock_init(&CI_LISTLOCK(fastcatodo), "fastcatodo");
#endif
calloutinit_cpu(master_procid);
spinlock_init(&migrated_timeout_lock, "migr_to");
}
/*
* Allocate a callout structure.
*
* Try to get a structure from the target cpu's zone.
* If this fails, try to allocate more memory from the
* target CPU's node and add it to the target's zone.
* If this fails, try all the other cpus' zones.
* If this fails, try to allocate any page to the target's zone.
* If this fails, we're probably in trouble; return NULL
* and let the caller deal with it.
*
* We raise to splprof before calling into any routine that needs
* to grab a spinlock, because callout_alloc may be called from
* an interrupt routine that interrupts at splprof, and we need
* to avoid double-tripping on whatever locks are used. We have
* elected to repeatedly raise and lower IPL level as we try the
* various alternatives rather than just holding it at splprof
* for the duration of this function. That way we won't end up
* holding off interrupts for too long (we especially want to avoid
* holdoffs that increase with the number of CPUs). It's fairly
* unusual to get all the way through the various cases, anyway;
* most of the time, the initial attempt will succeed and we will
* have done a single splprof/splx pair.
*/
static struct callout *
callout_alloc(cpuid_t targ)
{
struct callout *co;
zone_t *callout_free_zone;
cpuid_t cpu, my_cpuid;
int s;
/* Try to get a free structure from the target cpu's zone. */
callout_free_zone = CI_FREE_ZONE(&CALLTODO(targ));
again:
s = splprof();
co = kmem_zone_alloc(callout_free_zone, VM_NOSLEEP);
splx(s);
if (!co) {
/* Try to add another page to the empty zone */
void *ptr;
s = splprof();
ptr = kvpalloc_node(cputocnode(targ), 1, VM_NOSLEEP, 0);
splx(s);
if (ptr) {
kmem_zone_fill(callout_free_zone, ptr, ctob(1));
goto again;
}
} else {
co->c_ownercpu = targ;
return(co);
}
/*
* If we get here, it means that the target CPU's callout zone
* is empty and we were unable to allocate more memory to fill
* the zone.
*
* Try other cpus' lists.
*/
my_cpuid = cpuid();
for (cpu=my_cpuid+1; cpu < maxcpus; cpu++) {
if ((pdaindr[cpu].CpuId == -1) || !(pdaindr[cpu].pda->p_flags & PDAF_ENABLED))
continue;
callout_free_zone = CI_FREE_ZONE(&CALLTODO(cpu));
s = splprof();
co = kmem_zone_alloc(callout_free_zone, VM_NOSLEEP);
splx(s);
if (co) {
co->c_ownercpu = cpu;
return(co);
}
}
for (cpu=0; cpu < my_cpuid; cpu++) {
if ((pdaindr[cpu].CpuId == -1) || !(pdaindr[cpu].pda->p_flags & PDAF_ENABLED))
continue;
callout_free_zone = CI_FREE_ZONE(&CALLTODO(cpu));
s = splprof();
co = kmem_zone_alloc(callout_free_zone, VM_NOSLEEP);
splx(s);
if (co) {
co->c_ownercpu = cpu;
return(co);
}
}
/*
* Try a kvpalloc without specifying node.
* We'd really prefer not to do this, since the callout structures
* from another node will be stuck on this node indefinitely. Still,
* it's preferable to panicking.
*/
{
void *ptr;
s = splprof();
ptr = kvpalloc(1, VM_NOSLEEP, 0);
splx(s);
if (ptr) {
kmem_zone_fill(callout_free_zone, ptr, ctob(1));
goto again;
}
}
cmn_err_tag(317,CE_WARN, "Out of callouts cpu%d-->cpu%d\n", cpuid(), targ);
return(NULL);
}
/*
* Free the specified callout structure.
*/
void
callout_free(struct callout *co)
{
cpuid_t owner_cpuid = co->c_ownercpu;
zone_t *callout_free_zone = CI_FREE_ZONE(&CALLTODO(owner_cpuid));
int s;
ASSERT(owner_cpuid != CPU_NONE);
/* Put it back in the zone for the CPU that owns it. */
s = splprof();
kmem_zone_free(callout_free_zone, co);
splx(s);
}
/*
* timeout is called to arrange that fun(arg) is called in tim/HZ seconds.
* An entry is sorted into the callout structure.
* The time in each structure entry is the number of HZ's more
* than the previous entry. In this way, decrementing the
* first entry has the effect of updating all entries.
*
* The panic is there because there is nothing
* intelligent to be done if an entry won't fit.
*
* timeout now queues the function to be invoked on the same cpu from
* which timeout was called.
*/
toid_t
timeout(void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(cpuid(), fun, arg, tim, callout_get_pri(), C_NORM, ap);
va_end(ap);
if (retval == NULL)
/*
* The kernel was unable to allocate space for
* a timeout request. Since timeout is a compatibility
* interface, the kernel cannot return
* an error and allow processing to be handled
* by the caller.
*/
cmn_err(CE_PANIC,
"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
toid_t
timeout_pri(void (*fun)(), void *arg, long tim, int pri, ...)
{
toid_t retval;
va_list ap;
/* timeout priorities should never equal 0 */
ASSERT((pri > 0) && (pri <= 255));
va_start(ap, pri);
retval = dotimeout(cpuid(), fun, arg, tim, pri, C_NORM, ap);
va_end(ap);
return(retval);
}
/* This routine is identical to timeout() but causes timein processing
* to occur on the timein interrupt stack.
*
* NOTE: Should ONLY be used on short duration routines which are simply
* awakening another thread or incrementing a counter.
*/
toid_t
timeout_nothrd(void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(cpuid(), fun, arg, tim, 0, C_NORM_ISTK, ap);
va_end(ap);
if (retval == NULL)
/*
* The kernel was unable to allocate space for
* a timeout request. Since timeout is a compatibility
* interface, the kernel cannot return
* an error and allow processing to be handled
* by the caller.
*/
cmn_err(CE_PANIC,
"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
/*
* prtimeout - queue a timeout on specified processor
* If a processor with timeouts is isolated or restricted then the
* timeouts will migrate to the clock processor.
*/
toid_t
prtimeout(processorid_t prid, void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(prid, fun, arg, tim, callout_get_pri(), C_NORM, ap);
va_end(ap);
if (retval == NULL)
/*
* The kernel was unable to allocate space for
* a timeout request. Since timeout is a compatibility
* interface, the kernel cannot return
* an error and allow processing to be handled
* by the caller.
*/
cmn_err(CE_PANIC,
"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
#if RTINT_WAR
/* This routine is identical to prtimeout() but causes timein processing
* to occur on the timein interrupt stack.
*
* NOTE: Should ONLY be used on short duration routines which are simply
* awakening another thread or incrementing a counter.
*/
toid_t
prtimeout_nothrd(processorid_t prid, void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(prid, fun, arg, tim, 0, C_NORM_ISTK, ap);
va_end(ap);
if (retval == NULL)
/*
* The kernel was unable to allocate space for
* a timeout request. Since timeout is a compatibility
* interface, the kernel cannot return
* an error and allow processing to be handled
* by the caller.
*/
cmn_err(CE_PANIC,
"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
#endif
/*
* fast_prtimeout - like prtimeout, but use the fast timeouts
*/
extern int fastclock;
toid_t
fast_prtimeout(processorid_t targcpu, void (*fun)(), void *arg, long tim, ...)
{
toid_t retval;
va_list ap;
if (!fastclock)
enable_fastclock();
va_start(ap, tim);
retval = dotimeout(targcpu, fun, arg, tim, callout_get_pri(), C_FAST, ap);
va_end(ap);
if (retval == NULL)
cmn_err(CE_PANIC,"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
/*
* timeout routines for DDI/DKI compliant drivers.
*/
extern pl_t plbase;
extern pl_t pltimeout;
extern pl_t pldisk;
extern pl_t plstr;
extern pl_t plhi;
/* ARGSUSED */
toid_t
dtimeout(void (*fun)(), void *arg, long tim, pl_t pl, processorid_t prid)
{
toid_t retval;
va_list ap;
va_start(ap, prid);
retval = dotimeout(prid, fun, arg, tim, callout_get_pri(), C_NORM, ap);
va_end(ap);
return(retval);
}
toid_t
itimeout(void (*fun)(), void *arg, long tim, pl_t pl, ...)
{
toid_t retval;
va_list ap;
va_start(ap, pl);
retval = dotimeout(cpuid(), fun, arg, tim, callout_get_pri(), C_NORM, ap);
va_end(ap);
return(retval);
}
toid_t
itimeout_nothrd(void (*fun)(), void *arg, long tim, pl_t pl, ...)
{
toid_t retval;
va_list ap;
va_start(ap, pl);
retval = dotimeout(cpuid(), fun, arg, tim, 0, C_NORM_ISTK,
ap);
va_end(ap);
return(retval);
}
toid_t
fast_itimeout(void (*fun)(), void *arg, long tim, pl_t pl, ...)
{
toid_t retval;
va_list ap;
if (!fastclock)
enable_fastclock();
va_start(ap, pl);
retval = dotimeout(cpuid(), fun, arg, tim, callout_get_pri(), C_FAST, ap);
va_end(ap);
return(retval);
}
toid_t
fast_itimeout_nothrd(void (*fun)(), void *arg, long tim, pl_t pl, ...)
{
toid_t retval;
va_list ap;
if (!fastclock)
enable_fastclock();
va_start(ap, pl);
retval = dotimeout(cpuid(), fun, arg, tim, 0, C_FAST_ISTK,
ap);
va_end(ap);
return(retval);
}
#ifdef CLOCK_CTIME_IS_ABSOLUTE
/*
* clock_prtimeout - like prtimeout, but use the absolute timeouts
* This function is only useful or used on systems that call
* set_timer_intr with an absolute time rather than a number
* of ticks.
*/
toid_t
clock_prtimeout(processorid_t targcpu, void (*fun)(), void *arg, __int64_t tim, int pri, ...)
{
toid_t retval;
va_list ap;
va_start(ap, pri);
retval = dotimeout(targcpu, fun, arg, tim, pri, C_CLOCK, ap);
va_end(ap);
if (retval == NULL)
cmn_err(CE_PANIC,"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
toid_t
clock_prtimeout_nothrd(processorid_t targcpu, void (*fun)(), void *arg,
__int64_t tim, ...)
{
toid_t retval;
va_list ap;
va_start(ap, tim);
retval = dotimeout(targcpu, fun, arg, tim, 0, C_CLOCK_ISTK, ap);
va_end(ap);
if (retval == NULL)
cmn_err_tag(138,CE_PANIC,"Timeout table overflow.\n Tune ncallout to a higher value.");
return(retval);
}
#endif /* CLOCK_CTIME_IS_ABSOLUTE */
/*
** list is used to select which callout list to queue on
** list == C_FAST then queue em on the fast callout list
** list == C_NORM then queue em on processor 'targcpu' callout list
** list == C_CLOCK then queue em on the fast callout list using an
** absolute clock cycle count to cmp against. [EVEREST]
** Per-CPU lists each have an associated "list lock".
**
** if tim == TIMEPOKE_NOW then timepoke() is called immediately
*/
toid_t
dotimeout(
register processorid_t targcpu,
void (*fun)(),
void *arg,
__int64_t tim,
int pri,
long list,
va_list ap)
{
register struct callout *p1, *p2, *pnew, *phead;
register toid_t id;
void *arg1, *arg2, *arg3;
int s;
__int64_t tmp_tim;
ASSERT(targcpu >= 0 && targcpu < maxcpus);
arg1 = va_arg(ap, void *);
arg2 = va_arg(ap, void *);
arg3 = va_arg(ap, void *);
/*
* Frame Scheduler
*/
if (pdaindr[targcpu].pda->p_frs_flags) {
targcpu = clock_processor;
}
#ifdef ISOLATE_DEBUG
{
extern int isolate_drop;
if (pdaindr[targcpu].pda->p_flags & PDAF_ISOLATED) {
cmn_err(CE_WARN,
"Isolated processor %d executes dotimeout\n",
pdaindr[targcpu].pda->p_cpuid);
if (isolate_drop)
debug((char *)0);
}
}
#endif
pnew = callout_alloc(targcpu);
if (!pnew)
return(0);
pnew->c_id = 0;
pnew->c_flags = 0;
pnew->c_func = fun;
pnew->c_arg = arg;
pnew->c_arg1 = arg1;
pnew->c_arg2 = arg2;
pnew->c_arg3 = arg3;
pnew->c_pl = pri;
pnew->c_cpuid = targcpu;
pnew->c_time = tim;
/* Grab the per-list lock in order to enqueue the new callout request */
s = mutex_spinlock_spl(&CI_LISTLOCK(&CALLTODO(targcpu)), splprof);
/*
* Find the head of the queue to insert the
* callout, if needed adjust the c_time field
* to have the correct units for the hardware
*/
phead = timeout_get_queuehead(list, pnew);
/*
* Set any needed flags info.
*/
if (list == C_NORM_ISTK || list == C_FAST_ISTK || list == C_CLOCK_ISTK)
pnew->c_flags |= C_FLAG_ISTK;
else
pnew->c_flags |= C_FLAG_ITHRD;
ASSERT(phead);
tmp_tim = pnew->c_time;
id = pnew->c_id; /* return all bits */
/*
* Insert pnew into correct position in callout list.
*/
for (p1 = phead ; p2 = p1->c_next ; p1 = p2) {
if (p2->c_time > tmp_tim)
break;
#if !defined(CLOCK_CTIME_IS_ABSOLUTE)
/* Make times relative to prev callback */
tmp_tim -= p2->c_time;
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
}
p1->c_next = pnew;
pnew->c_next = p2;
#if !defined(CLOCK_CTIME_IS_ABSOLUTE)
pnew->c_time = tmp_tim;
/* Nothing to do for CLOCK_CTIME_IS_ABSOLUTE, times are absolute*/
if (p2)
p2->c_time -= tmp_tim;
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
/*
* If we've just put something at the head of the queue,
* ensure that a timer will go off at the right moment.
*/
if (phead == p1)
set_timer_intr(targcpu, tmp_tim, list);
mutex_spinunlock(&CI_LISTLOCK(&CALLTODO(targcpu)), s);
return(id);
}
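/*
 * A stand-alone model (plain user-space C, no locking, error checks omitted)
 * of the relative-time callout queue that dotimeout() maintains when
 * CLOCK_CTIME_IS_ABSOLUTE is not defined: each entry stores only the ticks
 * beyond its predecessor, so a clock tick only has to decrement the head.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

struct ent {
	struct ent	*next;
	long		delta;		/* ticks after the previous entry */
	int		id;
};

static struct ent head;			/* dummy list head */

static void
enqueue(int id, long ticks)
{
	struct ent *p1, *p2;
	struct ent *e = malloc(sizeof(*e));

	e->id = id;
	/* walk the list, converting 'ticks' into a delta as we go */
	for (p1 = &head; (p2 = p1->next) != NULL; p1 = p2) {
		if (p2->delta > ticks)
			break;
		ticks -= p2->delta;
	}
	e->delta = ticks;
	e->next = p2;
	p1->next = e;
	if (p2)
		p2->delta -= ticks;	/* keep the successor's delta right */
}

int
main(void)
{
	struct ent *e;

	enqueue(1, 30);
	enqueue(2, 10);
	enqueue(3, 25);
	for (e = head.next; e; e = e->next)
		printf("id %d: +%ld ticks\n", e->id, e->delta);
	/* prints: id 2: +10, id 3: +15, id 1: +5 */
	return 0;
}
#endif	/* illustrative sketch */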
/*
* Return a thread priority for a callout thread. Used whenever
* we are not given an explicit priority.
*/
int
callout_get_pri(void)
{
extern int default_timeout_pri;
if (private.p_kstackflag <= PDA_CURKERSTK) {
kthread_t *kt = curthreadp;
int pri = kt->k_basepri;
if (kt->k_copri) {
pri = kt->k_copri;
ASSERT(pri != 255);
} else if (pri < 0)
pri = default_timeout_pri;
else if (KT_ISUTHREAD(kt) && (pri < 255))
pri++;
ASSERT((pri >= 0) && (pri <= 255));
return pri;
}
return default_timeout_pri;
}
/*
* untimeout_body
*
* This routine attempts to find the specified timeout id and
* disable the entry if possible by checking the todo list, the
* pending list, timein thread list, and the migrated list.
*/
static int untimeout_migrated(toid_t, int);
static int
untimeout_body(toid_t id, int wait)
{
register struct callout *p1, *p2, *p3;
ci_itinfo_t *citp;
callout_info_t *cip;
register int s;
union c_tid c_tid; /* pick this name to use macro in callo.h */
__int64_t totaltime = 0;
int rc = 0, found = 0;
extern struct strintr *strintrrsrv;
/* Races here are benign */
if (strintrrsrv != NULL)
streams_untimeout(id);
/* determine which callout to search */
c_id = id;
cip = c_fast == 1 ? fastcatodo : &CALLTODO(c_cpuid);
/* Search todo list */
p1 = CI_TODO(cip);
s = mutex_spinlock_spl(&CI_LISTLOCK(cip), splprof);
for ( ; (p2 = p1->c_next) != 0; p1 = p2) {
#ifndef CLOCK_CTIME_IS_ABSOLUTE
totaltime += p2->c_time;
#endif /* CLOCK_CTIME_IS_ABSOLUTE */
if (p2->c_id == c_id) {
found = 1;
p3 = p2->c_next;
#ifdef CLOCK_CTIME_IS_ABSOLUTE
totaltime = p2->c_time;
#else
if (p3) /* carry overflow or delta */
p3->c_time += p2->c_time;
#endif /* CLOCK_CTIME_IS_ABSOLUTE */
p1->c_next = p3;
rc = callout_time_to_hz(totaltime, c_cpuid, c_fast);
break;
}
}
if (!found) {
/* Search pending callout list */
p1 = CI_PENDING(cip);
for ( ; (p2 = p1->c_next) != NULL; p1 = p2) {
if (p2->c_id == c_id) {
p1->c_next = p2->c_next;
rc = found = 1;
break;
}
}
}
if (found) {
mutex_spinunlock(&CI_LISTLOCK(cip), s);
callout_free(p2);
return(rc);
}
/*
* If we're in wait mode, then we need to search the
* list of xthread infos to see if any timein threads are
* executing the one we're looking for. If we find it,
* and we're not the thread executing the timeout (i.e.
* the timeout canceling itself -- see ec_recover()),
* then we go to sleep on the sync var.
*/
ASSERT(!found);
if (wait) {
int x;
citp = cip->ci_ithrdinfo;
for (x=0; x < cip->ci_ithrd_cnt; x++, citp++) {
if ((citp->cit_toid == c_id) && (citp->cit_ithread !=
(struct xthread *)private.p_curkthread)) {
citp->cit_flags |= CIT_WAITING;
sv_wait(&citp->cit_sync, PZERO, &CI_LISTLOCK(cip), s);
found = 1;
break;
}
}
}
if (!found) {
mutex_spinunlock(&CI_LISTLOCK(cip), s);
rc = untimeout_migrated(id, wait);
}
return(rc);
}
int
untimeout_wait(toid_t id)
{
ASSERT(id);
return (untimeout_body(id, 1));
}
int
untimeout(toid_t id)
{
/*
* If the id is 0 then there is no work to do.
*/
if (id == 0)
return(0);
return (untimeout_body(id, 0));
}
/*
* given an ID, return true(1) if still in timeout queue
* For the result to mean much, these functions should be called at spl > 1
*/
int
chktimeout(int id)
{
register struct callout *p1, *p2;
register int s, rv = 0;
union c_tid c_tid; /* pick this name to use macros in callo.h */
callout_info_t *callout_info;
c_id = id;
callout_info = &CALLTODO(c_cpuid);
s = mutex_spinlock_spl(&CI_LISTLOCK(callout_info), splprof);
for (p1 = CI_TODO(callout_info);
(p2 = p1->c_next) != 0; p1 = p2) {
if (p2->c_cid == c_cid) {
rv = 1;
break;
}
}
mutex_spinunlock(&CI_LISTLOCK(callout_info), s);
return(rv);
}
__int64_t
do_chktimeout_tick(callout_info_t *callout_info, toid_t id, void (*fun)(), void *arg)
{
register struct callout *phead, *p1, *p2;
__int64_t rv = 0;
union c_tid c_tid; /* pick this name to use macro in callo.h */
int s;
/*
* When passed nothing return nothing
*/
if ((id == 0) && (fun == NULL))
return (0);
c_id = id;
phead = CI_TODO(callout_info);
s = mutex_spinlock_spl(&CI_LISTLOCK(callout_info), splprof);
for ( p1 = phead; (p2 = p1->c_next) != 0; p1 = p2) {
#ifndef CLOCK_CTIME_IS_ABSOLUTE
/* accumulating ticks before it */
rv += p2->c_time;
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
if ((p2->c_id == c_id) ||
(p2->c_func == fun && p2->c_arg == arg)){
break;
}
}
mutex_spinunlock(&CI_LISTLOCK(callout_info), s);
if (p2 == 0) { /* did not find it */
if (id) {
c_id = find_migrated_timeout(id);
if(c_id)
return(do_chktimeout_tick(callout_info,c_id, NULL, NULL));
}
return(0);
}
#ifdef CLOCK_CTIME_IS_ABSOLUTE
rv = p2->c_time;
#endif /* CLOCK_CTIME_IS_ABSOLUTE */
if (rv <= 0) /* found it, but was negative or zero */
return(1); /* return a small positive value */
else
return(rv); /* found it, return accurate value */
}
/*
* Process a timeout entry and release it back to the free list.
*/
void
timein_entry(struct callout *timeout_entry)
{
register void *arg, *arg1, *arg2, *arg3;
register void (*func)(void *, void *, void *, void *);
#ifdef ISOLATE_DEBUG
pda_t *npda;
extern int isolate_debug, isolate_drop;
extern int wsyncv();
npda = pdaindr[cpuid()].pda;
if (npda->p_flags & PDAF_ISOLATED &&
isolate_debug &&
timeout_entry->c_func != (void (*)())wsyncv) {
cmn_err(CE_WARN,
"Isolated proc %d about to run timeout 0x%x\n",
npda->p_cpuid,timeout_entry->c_func );
if (isolate_drop) debug((char *) 0);
}
#endif /* ISOLATE_DEBUG */
arg = timeout_entry->c_arg;
arg1 = timeout_entry->c_arg1;
arg2 = timeout_entry->c_arg2;
arg3 = timeout_entry->c_arg3;
func = (void (*)(void *, void *, void *, void *))
timeout_entry->c_func;
callout_free(timeout_entry);
(*func)(arg, arg1, arg2, arg3); /* call the function */
}
/* ARGSUSED */
void
timein_entry_icvsema(ci_itinfo_t *citp, struct callout *to)
{
void timein_body_ithrd(ci_itinfo_t *);
void thread_timein(void *);
kthread_t *kt = curthreadp;
timein_body_ithrd(citp);
/* Reset func, args and pri, in case thread is next started
* by vsema (rather than icvsema). If it is again started
* by icvsema, these will be overridden.
*/
xthread_set_func(KT_TO_XT(kt), (xt_func_t *)thread_timein, citp);
ipsema(CIT_GET_SEMA(citp));
}
/*
* timein_entry_ithrd
*
* This routine links the specified callout entry to the tail
* of the xthread callout list and then bumps the counting
* semaphore.
*/
void
timein_entry_ithrd(callout_info_t *cip, struct callout *to, int s)
{
struct callout *p1;
ci_itinfo_t *citp;
kthread_t *ktscan;
kthread_t *kt;
xthread_t *xt;
int hipri;
int x;
xt = icvsema(CI_SEMA(cip), to->c_pl,
(xt_func_t *) timein_entry_icvsema,
NULL, to);
if (xt) {
/*
* Register untimeout information. Note that the newly
* runnable interrupt thread can't run yet because it's
* mustrun on the current CPU and we hold a spinlock.
* This prevents a race between the initialization here
* and the thread trying to read the information ...
*/
ASSERT_MP(XT_TO_KT(xt)->k_mustrun == cpuid());
citp = (ci_itinfo_t*) xt->xt_arg;
citp->cit_to = to;
citp->cit_toid = to->c_id;
mutex_spinunlock(&CI_LISTLOCK(cip), s);
return;
}
/*
* Normally, the icvsema above will succeed, and the timein thread
* will be started directly. However, when we get callouts faster
* than the timein threads can handle them, we fall back to having
* to queue the callout and wait for the next available timein thread.
*/
kt = NULL;
hipri = 0;
/*
* Before we queue this callout, we need to check if it is
* more important than the callouts currently being handled.
* If it is, then we need to boost the priority of one of the
* running timein threads to prevent priority inversion.
*/
citp = cip->ci_ithrdinfo;
for (x=0; x < cip->ci_ithrd_cnt; x++, citp++) {
ktscan = XT_TO_KT(citp->cit_ithread);
if (ktscan->k_basepri >= to->c_pl) {
kt = NULL;
break;
}
if (ktscan->k_basepri > hipri) {
hipri = ktscan->k_basepri;
kt = ktscan;
}
}
if (kt) {
/*
* Boost priority
*/
int dequeued = 0;
retry:
kt_nested_lock(kt);
if (kt->k_onrq != CPU_NONE) {
if (removerunq(kt))
dequeued++;
else {
kt_nested_unlock(kt);
goto retry;
}
}
if (kt->k_copri == 0)
kt->k_copri = kt->k_basepri;
kt->k_basepri = to->c_pl;
if (kt->k_pri < to->c_pl)
kt->k_pri = to->c_pl;
if (dequeued)
putrunq(kt, CPU_NONE);
kt_nested_unlock(kt);
}
/*
* Keep pending callout queue sorted by priority
*/
if (p1 = CI_PENDING_NEXT(cip)) {
struct callout *prev = CI_PENDING(cip);
while (p1->c_pl >= to->c_pl) {
if (p1->c_next == NULL) {
p1->c_next = to;
to->c_next = NULL;
goto done;
}
prev = p1;
p1 = p1->c_next;
}
to->c_next = prev->c_next;
prev->c_next = to;
} else {
CI_PENDING_NEXT(cip) = to;
to->c_next = NULL;
}
done:
mutex_spinunlock(&CI_LISTLOCK(cip), s);
vsema(CI_SEMA(cip));
}
/*
* timein_body_ithrd()
*
* This routine scans the list of xthread callouts and executes them
* at the proper spl/priority value (note that any entries on this
* list are by definition expired). We also handle untimeout()
* synchronization here.
*/
void
timein_body_ithrd(ci_itinfo_t *citp)
{
callout_info_t *cip = citp->cit_calloinfo;
register struct callout *list, *p1, *p2;
register int s;
kthread_t *kt = curthreadp;
ASSERT(cip);
ASSERT(cip->ci_flags & CA_ENABLED);
/*
* If cit_to is set, which will be the usual case,
* our timein thread has been started by icvsema directly,
* and we're running at the correct priority. Otherwise,
* we need to find the next timeout on the PENDING list.
*/
if ((p1 = citp->cit_to) == NULL) {
list = CI_PENDING(cip);
s = mutex_spinlock_spl(&CI_LISTLOCK(cip), splprof);
kt->k_copri = 0;
p1 = list->c_next;
if (p1 == 0) { /* we're done */
mutex_spinunlock(&CI_LISTLOCK(cip), s);
return;
}
p2 = p1->c_next; /* advance to next item */
list->c_next = p2;
if (p1->c_next == NULL)
CI_PENDING_NEXT(cip) = NULL;
/* register untimeout information */
citp->cit_toid = p1->c_id;
citp->cit_to = p1;
/*
* This is rare, but in the event our priority was boosted
* by a high priority callout, we must make sure we lower it
* after the urgent callout was handled.
*/
if (kt->k_basepri > p1->c_pl) {
kt_nested_lock(kt);
kt_initialize_pri(kt, p1->c_pl);
reset_pri(kt);
kt_nested_unlock(kt);
}
ASSERT(kt->k_basepri <= p1->c_pl);
ASSERT(kt->k_pri <= p1->c_pl);
mutex_spinunlock(&CI_LISTLOCK(cip), s);
if (private.p_runrun == 1)
qswtch(RESCHED_Y);
/* Create another thread since we're overloaded */
if (cip->ci_ithrd_cnt < CA_ITHRDS_PER_LIST)
callout_itentry_init(cip, cpuid(), "timein");
}
ASSERT(citp->cit_toid == p1->c_id);
ASSERT(citp->cit_to == p1);
timein_entry(p1);
/*
* Check to see if someone is waiting on the
* timeout we just fired. If so, clear the wait
* state and wake em up.
*/
s = mutex_spinlock_spl(&CI_LISTLOCK(cip), splprof);
if (citp->cit_flags & CIT_WAITING) {
citp->cit_flags &= ~CIT_WAITING;
sv_broadcast(&citp->cit_sync);
}
citp->cit_toid = 0;
citp->cit_to = NULL;
kt->k_copri = 0;
mutex_spinunlock(&CI_LISTLOCK(cip), s);
}
/*
* This function doesn't look like a loop, but it is.
* The ipsema call always comes out calling the function set up in the xthread.
*/
void
thread_timein(void *arg)
{
timein_body_ithrd(arg);
ipsema(CIT_GET_SEMA((ci_itinfo_t *)arg));
}
/*
* Do one-time thread setup. Won't get called more than once per thread
* since the ipsema() will come out in the function set up in the
* xthread_set_func() call.
*/
void
thread_timein_start(void *arg)
{
#if MP
int mustruncpu = CIT_TO_CPU((ci_itinfo_t *)arg);
#endif
/* NO-OP on UP */
(void) setmustrun(mustruncpu);
xthread_set_func(KT_TO_XT(curthreadp), (xt_func_t *)thread_timein, arg);
ipsema(CIT_GET_SEMA((ci_itinfo_t *)arg));
/* NOTREACHED */
}
/*
* callout_itentry_init
*
* Initialize one interrupt-thread entry for a callout info structure.
*/
static void
callout_itentry_init(callout_info_t *cip, int targcpu, char *name)
{
char threadname[20];
ci_itinfo_t *citp;
extern int default_timeout_pri;
/* fill in ithrd info */
if (atomicSetInt(&cip->ci_flags,CA_ITHRD_CREATING) & CA_ITHRD_CREATING)
return;
if (cip->ci_ithrd_cnt >= CA_ITHRDS_PER_LIST)
return;
citp = &cip->ci_ithrdinfo[cip->ci_ithrd_cnt];
init_sv(&citp->cit_sync, SV_DEFAULT, name, targcpu);
citp->cit_calloinfo = cip;
sprintf(threadname, "%s%d", name, targcpu);
citp->cit_ithread = xthread_create(threadname, 0,
KTHREAD_DEF_STACKSZ, 0,
default_timeout_pri, KT_PS,
(xt_func_t *)thread_timein_start,
(void *)(__psint_t)citp);
/*
* Don't increment new thread cursor till we're done initializing
* all associated state -- like cit_ithread. This avoids a race
* with timein_entry_ithrd() which scans the table up to the cursor.
*/
__synchronize();
cip->ci_ithrd_cnt++;
atomicClearInt(&cip->ci_flags, CA_ITHRD_CREATING);
}
static void
callout_info_init(callout_info_t *cip, int targcpu, char *name)
{
ci_itinfo_t *citp;
init_sema(&cip->ci_sema, 0, name, targcpu);
cip->ci_flags |= ((targcpu << CA_CPU_SHIFT) & CA_CPU_MASK);
/* allocate xthread info blocks */
citp = (ci_itinfo_t *)kmem_zalloc(sizeof (ci_itinfo_t) *
CA_ITHRDS_PER_LIST, KM_SLEEP);
/* now fill in ithrd info */
cip->ci_ithrdinfo = citp;
callout_itentry_init(cip, targcpu, name);
cip->ci_flags |= CA_ENABLED;
spinlock_init(&cip->ci_listlock, "cilock");
}
/*
* thread_timein_init
* This routine sets up each allocated callout list for
* xthread handling.
*/
void
thread_timein_init(void)
{
int i;
for (i = 0; i < maxcpus; i++) {
/* don't bother for CPUs that don't exist */
if (!cpu_enabled(i))
continue;
callout_info_init(&CALLTODO(i), i, "timein");
}
#if !defined(CLOCK_CTIME_IS_ABSOLUTE)
/* setup fastclock handling */
callout_info_init(fastcatodo, fastclock_processor, "ftimein");
#endif /* !CLOCK_CTIME_IS_ABSOLUTE */
#if (MAXCPUS > 128)
if (numcpus <= 64)
sched_tick_mask = 0;
else if (numcpus <= 128)
sched_tick_mask = 1;
else if (numcpus <= 256)
sched_tick_mask = 3;
else
sched_tick_mask = 7;
#endif
}
void
delay(long ticks)
{
timespec_t ts;
if (ticks == 0)
return;
tick_to_timespec(ticks, &ts, NSEC_PER_TICK);
nano_delay(&ts);
}
/*
* delay current thread. non-breakable
*/
void
nano_delay(timespec_t *ts)
{
if (ts->tv_sec != 0 || ts->tv_nsec != 0) {
kthread_t *kt = curthreadp;
int s = kt_lock(kt);
kt_timedwait(kt, 0, s, 1, ts, NULL);
}
}
/*
* Adjtime system call.
* If the delta is reasonable, do it.
*/
struct adjtimea {
struct timeval *delta;
struct timeval *olddelta;
};
int
adjtime(struct adjtimea *uap)
{
struct timeval atv; /* new adjustment */
struct timeval oatv; /* old adjustment */
long odelta;
/*REFERENCED(!MP)*/
#if _MIPS_SIM == _ABI64
int abi = get_current_abi();
#endif
if (!_CAP_ABLE(CAP_TIME_MGT))
return EPERM;
if (COPYIN_XLATE(uap->delta, &atv, sizeof atv,
irix5_to_timeval_xlate, abi, 1)) {
_SAT_CLOCK(0,EFAULT);
return EFAULT;
}
/* prevent overflow */
if (atv.tv_sec <= -0x7fffffff/USEC_PER_SEC
|| atv.tv_sec >= 0x7fffffff/USEC_PER_SEC) {
_SAT_CLOCK(atv.tv_sec,EINVAL);
return EINVAL;
}
VHOST_ADJ_TIME(atv.tv_sec*USEC_PER_SEC + atv.tv_usec, &odelta);
_SAT_CLOCK(atv.tv_sec,0); /* Log successful change */
/*
* Mark last adjtime so that onesec_maint will
* know to reset tod chip as needed
*/
lastadjtime = lbolt+DIDADJTIME;
/*
* return remaining old correction if asked
*/
if (uap->olddelta) {
oatv.tv_sec = odelta / USEC_PER_SEC;
oatv.tv_usec = odelta % USEC_PER_SEC;
if (XLATE_COPYOUT(&oatv, uap->olddelta, sizeof oatv,
timeval_to_irix5_xlate, abi, 1))
return EFAULT;
}
return 0;
}
/*
* Get current time in the BSD style.
*/
struct gettimeofdaya {
void *tvp;
};
/* ARGSUSED1 */
int
gettimeofday(struct gettimeofdaya *uap)
{
struct timeval tv;
/*
* in 64 bit mode the tv struct has an initial 32 bit pad - since
* in 6.1 we had tv_sec be a long, we need to guarantee that the
* top 32 bits are 0 - so to be safe, we bzero it here.
* Note that 64 bit apps running on 64 bit kernels simply end up
* calling copyout - so this 'tv' is the actual copy used.
*/
bzero(&tv, sizeof(tv));
microtime(&tv);
if (XLATE_COPYOUT(&tv, uap->tvp, sizeof tv,
timeval_to_irix5_xlate, get_current_abi(), 1))
return EFAULT;
return 0;
}
/*
* Set the current time, BSD style.
* Called from syssgi(2), not directly from sysent.
*/
int
settimeofday(void *uap)
{
struct timeval atv;
/*REFERENCED(!MP)*/
cpu_cookie_t was_running;
if (!_CAP_ABLE(CAP_TIME_MGT))
return EPERM;
if (COPYIN_XLATE(uap, &atv, sizeof atv,
irix5_to_timeval_xlate, get_current_abi(), 1)) {
_SAT_CLOCK(0,EFAULT);
return EFAULT;
}
/* assume the libc wrapper will have rounded the value. There
* is no reason to put code in the kernel unless necessary,
* and the super user can do far greater damage with the wrong
* time than with simply badly formatted time.
*/
was_running = setmustrun(clock_processor);
settime(atv.tv_sec, atv.tv_usec);
wtodc();
restoremustrun(was_running);
_SAT_CLOCK(atv.tv_sec,0);
return 0;
}
#if _MIPS_SIM == _ABI64
/*ARGSUSED*/
int
irix5_to_timeval_xlate(
enum xlate_mode mode,
void *to,
int count,
register xlate_info_t *info)
{
ASSERT(count == 1);
ASSERT(info->smallbuf != NULL);
ASSERT(mode == SETUP_BUFFER || mode == DO_XLATE);
if (mode == SETUP_BUFFER) {
ASSERT(info->copybuf == NULL);
ASSERT(info->copysize == 0);
if (sizeof(struct irix5_timeval) <= info->inbufsize)
info->copybuf = info->smallbuf;
else
info->copybuf = kern_malloc(
sizeof(struct irix5_timeval));
info->copysize = sizeof(struct irix5_timeval);
return 0;
}
ASSERT(info->copysize == sizeof(struct irix5_timeval));
ASSERT(info->copybuf != NULL);
irix5_to_timeval((struct timeval *)to,
(struct irix5_timeval *)info->copybuf);
return 0;
}
/*ARGSUSED*/
int
timeval_to_irix5_xlate(
void *from,
int count,
register xlate_info_t *info)
{
ASSERT(count == 1);
ASSERT(info->smallbuf != NULL);
if (sizeof(struct irix5_timeval) <= info->inbufsize)
info->copybuf = info->smallbuf;
else
info->copybuf = kern_malloc(sizeof(struct irix5_timeval));
info->copysize = sizeof(struct irix5_timeval);
timeval_to_irix5((struct timeval *)from,
(struct irix5_timeval *)info->copybuf);
return 0;
}
#endif /* _ABI64 */
/*
* The following code supports migrating timeouts.
*/
struct migrated_timeout {
struct migrated_timeout *next; /* link to next entry */
toid_t oldid; /* The id the timeout was assigned */
toid_t newid; /* The id of timeout after migration */
time_t time; /* Time we put on list */
};
static struct migrated_timeout *migrated_timeouts;
/*
* Given an old timeout id plus the time left before the
* timeout, add an item to the migrated timeouts list.
* This list is used to forward untimeout requests after a timeout has
* been migrated. It returns a pointer to the field for the new timeout
* id that is filled in after we get one.
*/
void *
allocate_migrate_timeout(void)
{
struct migrated_timeout *to;
to = (struct migrated_timeout *)kmem_alloc(sizeof(*to), KM_SLEEP);
return (to);
}
void
free_migrate_timeout(void *ptr)
{
kmem_free(ptr,sizeof(struct migrated_timeout));
}
volatile toid_t *
add_migrated_timeout(toid_t oldid, long sec, void *ptr)
{
struct migrated_timeout *to;
int s;
to = (struct migrated_timeout *)ptr;
to->oldid = oldid;
to->newid = 0;
to->time = lbolt / HZ + sec + 5; /* What time should
* this entry expire
*/
s = mutex_spinlock_spl(&migrated_timeout_lock, splprof);
to->next = migrated_timeouts;
migrated_timeouts = to;
mutex_spinunlock(&migrated_timeout_lock, s);
return (&to->newid);
}
/*
* given an ID, kill the corresponding migrated time-out
*/
static int
untimeout_migrated(toid_t id, int wait)
{
struct migrated_timeout *p1, *last;
struct migrated_timeout *free;
int return_val = 0;
register int s;
free = NULL;
/* If we do not have any migrated timeouts then return */
if (!migrated_timeouts)
return 0;
startover:
s = mutex_spinlock_spl(&migrated_timeout_lock, splprof);
p1 = migrated_timeouts;
/* Check if the head of the list is what we are looking for */
if (p1 && p1->oldid == id) {
free = p1;
migrated_timeouts = p1->next;
} else if (p1) {
for ( last = p1 ; (p1 = p1->next) != 0 ; last = p1) {
if((p1->oldid == id)) {
last->next = p1->next;
free = p1;
break;
}
if (lbolt / HZ > p1->time) {
/*
* if it is past the time that this timeout
* was due it can be removed.
*/
last->next = p1->next;
/* Need to drop lock before free */
mutex_spinunlock(&migrated_timeout_lock, s);
free_migrate_timeout((void *)p1);
/* As we dropped the lock
* we do not know the state of
* the list so we start all over
*/
goto startover;
}
}
}
mutex_spinunlock(&migrated_timeout_lock, s);
if (free) {
/*
* Avoid the short race where we might just have put the
* timeout onto the migrated queue
*/
while(free->newid == 0);
return_val = untimeout_body(free->newid, wait);
kmem_free (free, sizeof(*free));
}
return return_val;
}
static toid_t
find_migrated_timeout(toid_t id)
{
struct migrated_timeout *p1;
int s;
/* Peek to see if it's worth grabbing the migrated_timeout_lock. */
if ((p1 = migrated_timeouts) == NULL)
return 0;
s = mutex_spinlock_spl(&migrated_timeout_lock, splprof);
p1 = migrated_timeouts;
while (p1) {
if (p1->oldid == id)
break;
p1 = p1->next;
}
mutex_spinunlock(&migrated_timeout_lock, s);
return (p1 ? p1->newid : 0);
}