irix-657m-src/eoe/cmd/par/par.c

/**************************************************************************
 *									  *
 * 		 Copyright (C) 1988-1996 Silicon Graphics, Inc.		  *
 *									  *
 *  These coded instructions, statements, and computer programs  contain  *
 *  unpublished  proprietary  information of Silicon Graphics, Inc., and  *
 *  are protected by Federal copyright law.  They  may  not be disclosed  *
 *  to  third  parties  or copied or duplicated in any form, in whole or  *
 *  in part, without the prior written consent of Silicon Graphics, Inc.  *
 *									  *
 **************************************************************************/

#ident "$Revision: 1.152 $"

/*
 *	par -- process activity reporter
 *
 *	It generates a report either by taking padc's output
 *	or by invoking padc to get process activity data.
 *
 *	usage: par <collect_functions> <display_options> <command + argument list>
 *	       par <collect_functions> <display_options> -p pid
 *	       par <collect_functions> <display_options> -t 'time'
 *	       par <collect_functions> <display_options>
 *	       par <display_options>
 *
 *	In the first instance, par execs the supplied command with the
 *	specified arguments and monitors the activity of that process.
 *
 *	If -p is specified, then par reports on the activity of the
 *	specified process.
 *
 *	If -t is specified, then par records information for 'time' seconds
 *	If no -t is given, continuous information is collected.
 *
 *	In the last instance, par reads a padc output file from
 *	standard input and reports on its contents.
 *
 *	The types of activity which can be reported are controlled
 *	by the following functions and options:
 *
 *	collection functions:
 *      	-s - trace system calls with parameters
 *      	-r - trace scheduler
 *      	-x - trace network queueing
 *      	-X - trace network throughput
 *		-k - trace disk activity
 *		-p - trace a specific pid
 *		-i - inherit tracing on fork
 *		-O file - copy trace data to specified file
 *
 *	display options:
 *		-l 		list system call in long format
 *		-n syscall-num	select system call by system call number/name
 *		-e syscall-num	exclude system call by system call number/name
 *		-N syscall-name	select system call by system call name/number
 *		-P pid		trace named process only
 *		-S		print summary of system call and signal counts
 *		-SS		print system calls and signals trace
 *		-Q		print scheduling summary
 *		-QQ		print scheduling trace
 *		-QQQ		print full runq context scheduling trace
 *		-c		do not print cpuid
 *		-d		show any syscall time delta
 *		-u		show times in uS
 *		-o file		output to file
 *		-a len		max ascii length to print
 *		-b len		max binary length to print
 *		-B		force binary output rather than ascii
 *		-A		force ascii output rather than binary
 *		-z		sort syscall summary by syscall name
 *
 * NB: event processing and display is handled by common code found
 *     in -lrtmon; go there to add new support for decoding system
 *     calls and other events.
 */
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <signal.h>
#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>

#include <rtmon.h>
#include <sys/par.h>
#include <sys/param.h>
#include <sys/schedctl.h>

/*
 * Path of the event collector.  main() copies this into funs as the
 * start of the command line that is later run with popen().
 */
#define PADC	"/usr/sbin/padc"	/* name of event collector app */

/* Unsigned 64-bit type used for all event timestamps and durations. */
typedef __uint64_t ptime_t;		/* time base in par */

/*
 * Doubly-linked list linkage.  It is embedded as the first member of
 * struct kidq (where it acts as the list head) and of struct kidrec
 * (where it links the record into a queue).
 */
struct kidlist {		/* linked list of processes */
    struct kidlist* next;	/* following element (or the head) */
    struct kidlist* prev;	/* preceding element (or the head) */
};

/*
 * A queue of kidrec's.  Qinit() makes the head point at itself,
 * Qinsert() keeps entries ordered by getpri() and tracks the
 * high-water mark in qmaxlen, Qremove() unlinks an entry.
 */
struct kidq {			/* queue of processes */
    struct kidlist h;		/* list of processes */
    int		qlen;		/* queue length */
    int		qmaxlen;	/* max queue length over time */
};
struct kidq runq;			/* global run queue */
struct kidq localqs[RTMOND_MAXCPU];	/* per-cpu affinity queue */

/*
 * Scheduling statistics kept for each CPU; the table is indexed by
 * the cpu field of an event (see cpuidle() and cpusched()).
 */
struct cpurec {			/* per-CPU statistics */
    ptime_t	idletimes;	/* total number of times went idle */
    ptime_t	idleticks;	/* total mS spent idle */
    ptime_t	wentidle;	/* tick number last went idle */
    int		disps;		/* total times dispatched */
    int		hotdisps;	/* times dispatched from local/affinity q */
    ptime_t	runticks;	/* total mS spent running */
    int		swtchs;		/* number of calls to swtch */
    int64_t	curkid;		/* running kid */
    int64_t	lastkid;	/* last running kid */
    int		runsame;	/* times called swtch but ran same guy */
    int		nothighest;	/* ran guy other than top of run q */
    int		curpri;		/* priority of proc running at dispatch */
};
struct cpurec cpus[RTMOND_MAXCPU];	/* cpu statistics table */

/*
 * State and statistics for one thread (kid).  Records are created by
 * newkid(), chained into kidhash through hnext, and may additionally
 * sit on one scheduling queue (curq) through the h links.
 *
 * NB: h must remain the first member; Qinsert() casts a
 *     struct kidlist* taken from a queue back to a struct kidrec*.
 */
struct kidrec {		/* per-thread state+statistics */
    struct kidlist h;		/* queue links */
    struct kidq* curq;		/* current q process is on */
    struct kidrec* hnext;	/* hash chain */
    int		traced;		/* 1 if being traced */
    char*	name;		/* process name string */
    int64_t	kid;		/* thread id */
	/*    pid_t	pid;		 process id */
    int		pri;		/* scheduling priority */
    int		rtpri;		/* priority if realtime */
    int		lastrun;	/* last CPU proc ran on */
    int		slices;		/* # times switched in */
    ptime_t	starttick;	/* time last switched in */
    ptime_t	runtime;	/* total time spent running */
    int		queued;		/* # times placed on global run queue */
    int		lqueued;	/* # times placed on local run queue */
    ptime_t	queuedtick;	/* time last placed on global queue */
    ptime_t	queuedtime;	/* total time spent on global queue */
    ptime_t	lqueuedtime;	/* total time spent on local queue */
    ptime_t	sleeptick;	/* time last put to sleep */
    ptime_t	sleeptime;	/* total time spent asleep */
    int		cpuswitch;	/* # times moved between CPUs */
    int		preempt_short;	/* # times preempted for <10 ticks */
    int		preempt_long;	/* # times preempted for >=10 ticks */
    int		swtch;		/* # times switched out */
    int		sleep;		/* # times put to sleep */
    int		yield;		/* # times voluntarily yielding CPU */
    int		mustrun;	/* # times switched out 'cuz of mustrun proc */
};

/*
 * Per-process state+statistics are maintained separately from
 * the state maintained by the library support code.  We should
 * be able to hook our data on to that of the library but there's
 * currently no way to do that.
 *
 * We use an egregiously large hash table size in order to handle
 * cases where we may be keeping track of hundreds or even thousands
 * of processes.  Since pids tend to clump together we probably will
 * only use a few cache lines out of each table when we're working
 * with a small number of processes.
 */
/* Number of hash buckets; KIDHASH masks with KIDHASHSIZE-1. */
#define KIDHASHSIZE  4096			/* must be power of 2 */
/* Bucket index is the low-order bits of the thread id. */
#define KIDHASH(kid) ((kid) & (KIDHASHSIZE-1))	/* hash kid to hash index */
/* Each bucket is a singly-linked chain through kidrec.hnext. */
struct kidrec *kidhash[KIDHASHSIZE];	/* process statistics table */

/*
 * Program-wide option and state variables; most are set while
 * main() parses the command line.
 */
char	funs[512];		/* padc command line built by main(), run via popen() */
FILE	*nfd = stdin;		/* event input: stdin, or the pipe from padc */
int	debug = 0;		/* print debug information */
int	collect;		/* we're collecting data */
int	spawn;			/* should we spawn process to monitor */
int	Qflag;			/* scheduling display flag */
int	Sflag;			/* syscall display flag */
int	display;		/* disk/net/misc. collect/display flag */
int	cflag = 0;		/* 1 if -c option specified */
ptime_t	msecs;			/* current time */
int	syscallp;		/* report restricted to pids?? */
int	longl = 0;		/* long listing (-l); makes main() set RTPRINT_SHOWPID */
int	inherit = 0;		/* have tracing inherited on fork */
FILE*	outf = stdout;		/* output file */
FILE*	errf = stderr;		/* error file */
const	char* padcfile = NULL;	/* name of optional event data file */
FILE*	dataf = NULL;		/* event data file */
rtmonPrintState* rs;		/* decoder state */
int	syscalls = 0;		/* # syscalls specified */
int	zflag = 0;		/* sort syscall summary by name */
u_int	contextswtch = 0;	/* number of context switches */
u_int	lost = 0;		/* number of events lost */

/*
 * Forward declarations.
 *
 * NB: newkid(), getkid() and kidfind() name their parameter "pid"
 *     here, but the definitions take a thread id (kid).
 */
void	usage(char *progname);
void	readpadc(FILE*);
void	copypadc(FILE*);
void	setsyscalltrace(const char* arg, int onoff);
void	cpuidle(const tstamp_event_entry_t*);
void	cpusched(const tstamp_event_entry_t*);
void	kidexit(const tstamp_event_entry_t*);
void	cpusummary(void);
void	runqsummary(void);
void	printkidsum(register struct kidrec *kp);
void	kidsummary(void);
void	schedsummary(void);
void	dumpcpushort(void);
void	dumprunq(void);
void	showstat(void);
void	dumperrors(char *msg);
void	dumpsums(void);
void	sys_summary(void);
void	Qinit(struct kidq *);
void	Qinsert(struct kidq *qp, struct kidrec *entp);
void	Qremove(struct kidrec *entp);
void	kidinsert(struct kidrec *kp);
struct	kidrec *newkid(int64_t pid);
struct	kidrec *getkid(int64_t pid);
struct	kidrec *kidfind(register int64_t pid);
int	getpri(struct kidrec *kp);
void	kidwalk(void (*)(struct kidrec *));
void	handleevent(const tstamp_event_entry_t* ev);
void	merge(uint64_t, const tstamp_event_entry_t* pending[], int npending[]);

/*
 * Signal handler that deliberately does nothing.  main() installs it
 * for SIGUSR1 in the forked child before calling pause().
 */
void
sigcatcher(int sig)
{
    (void) sig;			/* signal number is not needed */
}

/*
 * Write a diagnostic to the error stream (errf).  The message is
 * prefixed with "par: " and followed by a newline; fmt and the
 * remaining arguments follow printf(3) conventions.
 */
static void
error(const char* fmt, ...)
{
    va_list args;

    fputs("par: ", errf);
    va_start(args, fmt);
    (void) vfprintf(errf, fmt, args);
    va_end(args);
    putc('\n', errf);
}

int
main(int argc, char **argv)
{
    int monitoring = 0;		/* number of pids we're monitoring */
    pid_t childid;		/* pid of child executing command */
    int c;			/* option switch */
    const char* optstr = "Aa:Bb:cdDe:iKklN:n:O:o:P:p:rst:uSQXxz";

    rs = rtmon_printBegin();
    rs->eventmask = 0;		/* by default, display nothing */
    strcpy(funs, PADC);
    while ((c = getopt(argc, argv, optstr)) != -1) {
	switch(c) {
	case 'i':		/* inherit tracing */
	    inherit++;
	    strcat(funs, " -i");
	    break;
#ifdef NET_DEBUG
	case 'X':		/* network */
	    rs->eventmask |= RTMON_NETFLOW;
	    display=1;
	    goto collect_arg;
	case 'x':		/* network */
	    rs->eventmask |= RTMON_NETSCHED;
	    display=1;
	    goto collect_arg;
#endif
	case 'K':
	    rs->flags |= (RTPRINT_SHOWKID | RTPRINT_SHOWPID);
	    display=1;
	    break;
	case 'k':		/* disk */
	    rs->eventmask |= RTMON_DISK;
	    display=1;
	    goto collect_arg;
	case 'r':		/* scheduler */
	case 's':		/* system calls+signals */
	collect_arg:
	    collect++;
	    sprintf(strchr(funs,'\0'), " -%c", c);
	    break;
	case 't':		/* collect for a period of time */
	    collect++;
	    sprintf(strchr(funs,'\0'), " -t %s", optarg);
	    break;
	case 'p':		/* collect events from a specific pid */
	    collect++;
	    monitoring++;
	    sprintf(strchr(funs,'\0'), " -p %s", optarg);
	    /* simulate previous behaviour--limit reporting */
	    rtmon_traceProcess(rs, atoi(optarg));
	    break;
	case 'l':		/* force long listing */
	    ++longl;
	    break;
	case 'a':		/* constrain max ascii string length */
	    rs->max_asciilen = atoi(optarg);
	    break;
	case 'A':		/* print data as ascii */
	    rs->flags |= RTPRINT_ASCII;
	    rs->flags &= ~RTPRINT_BINARY;
	    break;
	case 'b':		/* constrain max binary data length */
	    rs->max_binlen = atoi(optarg);
	    break;
	case 'B':		/* print data as binary */
	    rs->flags &= ~RTPRINT_ASCII;
	    rs->flags |= RTPRINT_BINARY;
	    break;
	case 'P':		/* filter reporting by pid */
	    rtmon_traceProcess(rs, atoi(optarg));
	    syscallp++;
	    break;
	case 'Q':		/* report scheduling activity */
	    if (++Qflag > 1)
		rs->eventmask |= RTMON_TASK;
	    break;
	case 'S':		/* report system call activity */
	    if (++Sflag > 1)
		rs->eventmask |= RTMON_SYSCALL|RTMON_SIGNAL;
	    break;
	case 'n':		/* filter reporting by sys call name/number */
	case 'e':		/* exclude sys call name/number */
	case 'N':		/* for compatibility */
	    setsyscalltrace(optarg, c != 'e');
	    break;
	case 'c':		/* disable CPU printing */
	    rs->flags &= ~RTPRINT_SHOWCPU;
	    cflag = 1;
	    break;
	case 'd':		/* show system calls as begin+end */
	    rs->syscalldelta = 0;
	    break;
	case 'u':		/* show times as ms+us delta */
	    rs->flags |= RTPRINT_USEUS;
	    break;
	case 'D':
	    rs->flags |= RTPRINT_INTERNAL;
	    debug++;
	    break;
	case 'o':		/* send report output to specified file */
	    outf = fopen(optarg, "w+");
	    if (outf == NULL) {
		error("%s: Cannot open: %s", optarg, strerror(errno));
		exit(1);
	    }
	    errf = outf;
	    break;
	case 'O':		/* send event data to specified file */
	    padcfile = optarg;
	    break;
	case 'z':
	    zflag = 1;
	    break;
	default:
	    usage(argv[0]);
	    exit(1);
	}
    }
    /* if any args left then we should spawn a process */
    if (optind < argc) {
	if (!Qflag && !Sflag && !display && !collect) {
	    /*
	     * If no analysis or collection options were specified
	     * and a command line appears to be present, then do a
	     * system call trace since that's what most people seem
	     * to want (e.g. par ls -> par -sSS ls).
	     */
	    Sflag = 2;
	    rs->eventmask |= RTMON_SYSCALL|RTMON_SIGNAL;
	    strcat(funs, " -s");
	}
	collect++, spawn++;
    }

    if (!(Qflag || Sflag || display)) {	/* no analysis options */
	if (!collect) {
	    error("No statistics collection or analysis options specified.");
	    usage(argv[0]);
	    exit(1);
	} else if (monitoring) {
	    /*
	     * If no analysis or collection options were specified
	     * then do a system call trace since that's what most
	     * people seem to want (e.g. par -p 178 -> par -sSS -p 178).
	     */
	    Sflag = 2;
	    rs->eventmask |= RTMON_SYSCALL|RTMON_SIGNAL;
	    strcat(funs, " -s");
	} else if (padcfile == NULL) {
	    error("No event data file specified "
		"for data collection; use the -O option.");
	    usage(argv[0]);
	    exit(1);
	}
    }
    if (padcfile != NULL) {
	dataf = fopen(padcfile, "w");
	if (dataf == NULL) {
	    error("Cannot open event data file %s: %s.",
		padcfile, strerror(errno));
	    exit(1);
	}
    }

    if (isatty(fileno(outf)))		/* line buffer output to tty */
	setlinebuf(outf);

    /*
     * Force a long listing if more than one
     * process might appear in the traces.
     */
    if (!longl)
	longl = (Qflag > 1 || syscallp > 1 || inherit ||
		    (!spawn && monitoring != 1)) ||
		    (rs->flags & RTPRINT_SHOWKID) ;
    if (longl)
	rs->flags |= RTPRINT_SHOWPID;
    else
	rs->flags &= ~RTPRINT_SHOWPID;

    if (collect) {		   	/* collect event data */
	if (spawn) {			/* fork a child to exec command */
	    switch (childid = fork()) {
	    case 0:			/* child */
		/*
		 * Synchronize w/ padc by pausing until we
		 * receive SIGUSR1.  This lets padc start up
		 * event collection before the command is exec'd.
		 */
		signal(SIGUSR1, sigcatcher);
		pause();
		execvp(argv[optind], &argv[optind]);
		dumperrors("exec failed");
		/*NOTREACHED*/
	    case -1:
		dumperrors("command fork failed");
		/*NOTREACHED*/
	    default:			/* parent */
		/*
		 * Pass child's pid to padc so it can send
		 * it SIGUSR1 when it's setup to collect
		 * event data for the command.
		 */
		sprintf(strchr(funs,'\0'), " -P %d", childid);
		/* restrict tracing to child proc */
		rtmon_traceProcess(rs, childid);
		syscallp++;
		break;
	    }
	}
	/*
	 * Spawn padc to collect data.  We add args
	 * to force padc to ``line buffer'' its output
	 * so long-running processes w/ small amounts
	 * of event data don't have their results buffered
	 * indefinitely; and pass the max # bytes to
	 * collect for indirect params.  Note that the
	 * latter is actually a system-wide parameter
	 * so one padc can affect another (sigh).
	 */
	sprintf(strchr(funs,'\0'), " -b 0 -I %d",
	    rs->max_asciilen > rs->max_binlen ?
		rs->max_asciilen : rs->max_binlen);
	if (debug > 1)
	    strcat(funs, debug > 2 ? " -DD" : " -D");
	if ((nfd = popen(funs, "r")) == 0)
	    dumperrors("padc exec failed");
    }

    if (Qflag || Sflag || display) {
	if (Qflag) {
	    u_int i;
	    Qinit(&runq);
	    for (i = 0; i < RTMOND_MAXCPU; i++)
		Qinit(&localqs[i]);
	}

	readpadc(nfd);
	if (Sflag)
	    sys_summary();

	if (Qflag) {
	    dumpsums();
	    schedsummary();
#ifdef NET_DEBUG
	    netsummary();
#endif
	}
    } else
	copypadc(nfd);

    if (collect) {
	if (spawn)	/* in case padc neglected to do this */
	    kill(childid, SIGUSR1);
	pclose(nfd);
    }
    if (outf != stdout)
	fclose(outf);
    if (dataf != NULL)
	fclose(dataf);
    return (0);
}

/*
 * Enable (onoff != 0) or disable (onoff == 0) reporting of one
 * system call.  arg is a system call name or number: when it starts
 * with a digit it is tried as a number first and as a name second,
 * otherwise only as a name.  An unknown call is a fatal error.
 *
 * On success the count of selected calls is bumped, syscall
 * reporting is forced on (Sflag >= 2, eventmask) and " -s" is added
 * to the padc command line so the events get collected.
 */
void
setsyscalltrace(const char* arg, int onoff)
{
    static const char sopt[] = " -s";
    size_t used;
    int found;

    /* cast: isdigit() is undefined for negative char values */
    if (isdigit((unsigned char) arg[0]))
	found = rtmon_settracebynum(rs, atoi(arg), onoff) ||
	    rtmon_settracebyname(rs, arg, onoff);
    else
	found = rtmon_settracebyname(rs, arg, onoff);
    if (!found) {
	error("Unknown system call name/number %s.", arg);
	exit(1);
	/*NOTREACHED*/
    }

    syscalls++;
    if (Sflag < 2)				/* enable syscall reporting */
	Sflag = 2;
    rs->eventmask |= RTMON_SYSCALL|RTMON_SIGNAL;

    /*
     * This runs once per -n/-e/-N option, so make sure the
     * fixed-size command buffer is not overrun by repeated appends.
     */
    used = strlen(funs);
    if (sizeof (sopt) > sizeof (funs) - used) {
	error("padc command line too long.");
	exit(1);
    }
    strcpy(funs + used, sopt);
}

/*
 * Read the remainder of a partial event.
 *
 * fd	stream to read from
 * buf	buffer that already holds cc bytes
 * cc	offset in buf at which new data is stored
 * max	byte count wanted; the amount actually read is max less the
 *	part of an event (cc modulo the event size) already present
 *
 * Returns 1 when everything was read, 0 if fread() returned no data
 * (EOF or error) before that.
 */
static int
readrem(FILE* fd, char* buf, size_t cc, size_t max)
{
    size_t rem = max - (cc % sizeof (tstamp_event_entry_t));

    while (rem > 0) {
	size_t n = fread(&buf[cc], 1, rem, fd);
	if (n == 0)
	    break;
	cc += n;		/* next read must land after this data */
	rem -= n;
    }
    return (rem == 0);
}

void
readpadc(FILE* fd)
{
    const tstamp_event_entry_t* pending[RTMOND_MAXCPU];
    int npending[RTMOND_MAXCPU];
    char* data[RTMOND_MAXCPU];
    size_t datalen[RTMOND_MAXCPU];
    uint64_t tlast[RTMOND_MAXCPU];

    memset(pending, 0, sizeof (pending));
    memset(npending, 0, sizeof (npending));
    memset(data, 0, sizeof (data));
    memset(datalen, 0, sizeof (datalen));
    memset(tlast, 0, sizeof (tlast));
    for (;;) {
	union {
		tstamp_event_entry_t ev[NUMB_KERNEL_TSTAMPS];
		char buf[1];
	} u;
	size_t n;
	tstamp_event_entry_t* ev;

	n = fread(u.buf, 1, sizeof (u.ev[0]), fd);
	if (n == 0)
	    break;
	if (n != sizeof (u.ev[0]) && !readrem(fd, u.buf, n, sizeof (u.ev[0]))) {
	    error("Partial event read. n is %d and size is %d evt is %d", n,sizeof(u.ev[0]),u.ev[0].evt);
	    break;
	}
	ev = u.ev;
	if (ev->jumbocnt > 0) {		/* jumbo event, read the remainder */
	    n = ev->jumbocnt * sizeof (*ev);
	    if (!readrem(fd, (char*) u.buf, sizeof (u.ev[0]), n)) {
		error("Partial event read, terminating.");
		break;
	    }
	}
	if (dataf != NULL &&
	  fwrite(u.buf, (1+ev->jumbocnt)*sizeof (*ev), 1, dataf) != 1) {
	    error("%s: Output write error: %s.", padcfile, strerror(errno));
	    break;		/* XXX? */
	}
	if (ev->evt == TSTAMP_EV_SORECORD) {
	    int cpu;
	    size_t chunksize;
	    size_t nevts;
	    size_t off;
	    uint64_t threshold;
	    /*
	     * A new "record" of data to process.  Read the
	     * new chunk of event data into the per-CPU buffer.
	     * We move data or expand the buffer as needed to
	     * accomodate the new events.
	     */
	    cpu = ev->cpu;
	    assert(cpu < RTMOND_MAXCPU);
	    chunksize = (size_t) ev->qual[0];
	    nevts = chunksize / sizeof (*ev);
	    off = ((const char*) pending[cpu]) - data[cpu];
	    if (off + (npending[cpu]+nevts)*sizeof (*ev) > datalen[cpu]) {
		if (chunksize <= off) {
		    /*
		     * Space is available at the front of the buffer;
		     * just copy the pending events down.
		     */
		    if (npending[cpu])
			memmove(data[cpu], pending[cpu],
			    npending[cpu]*sizeof (*ev));
		    pending[cpu] = (const tstamp_event_entry_t*) data[cpu];
		} else {
		    char* dp;
		    /*
		     * Grow the data buffer to hold this chunk of
		     * events and any that are pending; and reset
		     * the pending reference to the buffer.  We
		     * allocate a new buffer and copy pending data
		     * to the front instead of realloc'ing the
		     * existing buffer because we're likely to
		     * get more data soon and that would just
		     * cause us to copy lots of data to the front
		     * anyway.  This way we copy (hopefully) less
		     * data and read the new chunk directly into
		     * the right spot--eliminating a copy.
		     */
		    datalen[cpu] = off+(npending[cpu]+nevts)*sizeof (*ev);
		    dp = (char*) malloc(datalen[cpu]);
		    if (dp == NULL) {
			error("Out of memory (event data).");
			break;
		    }
		    if (data[cpu]) {
			memcpy(dp, pending[cpu], npending[cpu]*sizeof (*ev));
			free(data[cpu]);
		    }
		    data[cpu] = dp;
		    pending[cpu] = (const tstamp_event_entry_t*) dp;
		}
	    }
	    /*
	     * Read the chunk of event data.
	     */
	    if (chunksize && fread((char*)(pending[cpu]+npending[cpu]), chunksize, 1, fd) != 1) {
		if (debug)
		    fprintf(outf, "short read of %d bytes of event data\n",
			chunksize);
		break;
	    }
	    npending[cpu] += nevts;
	    /*
	     * Calculate the latest time that we can merge
	     * events up to.  We know each stream of events
	     * is ordered so by comparing the time of the
	     * last event for each CPU we can select a time
	     * that is safe to use in selecting events to merge.
	     */
	    tlast[cpu] = ev->qual[2];
	    threshold = tlast[0];
	    if (debug)
		    fprintf(stdout,"threashold %lld\n",threshold);
	    for (cpu = rtmon_ncpus(rs)-1; cpu > 0; cpu--)
		if (tlast[cpu] < threshold)
		    threshold = tlast[cpu];
	    merge(threshold, pending, npending);
	} else
	    handleevent(ev);
	if (debug)
	    fflush(outf);
    }
    if (debug)
	    fprintf(stdout,"threashold %lld\n",-1LL);
    merge(-1LL, pending, npending);		/* flush any remainder */
}

void
handleevent(const tstamp_event_entry_t* ev)
{
    static int lost = 0;
    static ptime_t starttime = 0;

    /*
     * Use the first non-config event to set the
     * base time for calculating relative time
     * and to decide whether or not to display
     * CPU numbers in the listing.
     */
    if (!starttime && ev->evt != TSTAMP_EV_CONFIG) {
	starttime = ev->tstamp;
	if (!cflag) {
	    if (rtmon_ncpus(rs) > 1)
		rs->flags |= RTPRINT_SHOWCPU;
	    else
		rs->flags &= ~RTPRINT_SHOWCPU;
	}
    }
    msecs = ev->tstamp - starttime;

    switch (ev->evt) {
    case TSTAMP_EV_EXIT:		/* process exit'd */
	kidexit(ev);
	rtmon_printEvent(rs, outf, ev);
	break;
    case TSTAMP_EV_FORK: {		/* process fork'd */
	const tstamp_event_fork_data_t* fdp =
	    (const tstamp_event_fork_data_t*) &ev->qual[0];

	if (inherit && _rtmon_kid_istraced(&rs->ds, (int64_t) fdp->pkid))
	    rtmon_traceKid(rs, (int64_t)fdp->ckid);

	rtmon_printEvent(rs, outf, ev);
	break;
    }
    case EVENT_TASK_STATECHANGE:	/* proc state change */
	contextswtch++;
	cpusched(ev);
	break;
    case TSTAMP_EV_EOSWITCH:		/* new proc to run */
    case TSTAMP_EV_EODISP:		/* put proc on runq */
	cpusched(ev);
	break;
    case EVENT_WIND_EXIT_IDLE:		/* idle processor */
	contextswtch++;
	cpuidle(ev);
	break;
    case TSTAMP_EV_LOST_TSTAMP:		/* server lost events */
	lost += (int) ev->qual[0];
	fprintf(outf, "OVERFLOW: CPU %d lost %d so far\n", ev->cpu, lost);
	break;
    default:
	rtmon_printEvent(rs, outf, ev);
	break;
    }
}

/*
 * Merge event data up to tmax time.  Merged events
 * are handed to handleevent() in time order.  This code
 * assumes that each CPU's set of events are already
 * ordered in time; we just merge the event streams
 * from different CPUs.
 *
 * tmax		time limit for this pass
 * pending	per-CPU pointer to the oldest unprocessed event;
 *		advanced as events are consumed
 * npending	per-CPU count of unprocessed event slots;
 *		decremented as events are consumed
 *
 * NOTE(review): a CPU is only picked up when its first event is
 * strictly earlier than tmax, but once picked up its events are
 * processed while tstamp <= tmax; confirm the boundary is intended.
 * NOTE(review): pending[]/npending[] are indexed by the cpu field
 * stored in each event, which is assumed to match the CPU whose
 * buffer holds the event.
 */
void
merge(uint64_t tmax, const tstamp_event_entry_t* pending[], int npending[])
{
    const tstamp_event_entry_t* candidate[RTMOND_MAXCPU];
    const tstamp_event_entry_t* ev;
    int nc, i, j, k;

    /*
     * Find CPUs with events to merge and do a
     * first-level sort of the top events for
     * each CPU.  We assume below that each CPU's
     * events are already sorted.
     */
    nc = 0;
    if (debug > 1)
	fprintf(outf, "par: merge to %llu: ", tmax);
    for (i = rtmon_ncpus(rs)-1; i >= 0; i--) {
	if (npending[i] && (ev = pending[i])->tstamp < tmax) {
	    /* find the slot that keeps candidate[] in ascending time order */
	    for (j = 0; j < nc && ev->tstamp > candidate[j]->tstamp; j++)
		;
	    if (j < nc) {			/* insert in middle */
		for (k = nc-1; k >= j; k--)
		    candidate[k+1] = candidate[k];
	    }
	    candidate[j] = ev, nc++;
	    if (debug > 1)
		fprintf(outf, " CPU[%d] %d", i, npending[i]);
	} else if (debug > 1)
	    fprintf(outf, " !CPU[%d](%d,%lld)", i, npending[i],
		npending[i] ? pending[i]->tstamp : 0);
    }
    if (debug > 1)
	fprintf(outf, "\n");
    /* candidate[0] is always the oldest outstanding event */
    while (nc > 0) {
	ev = candidate[0];			/* sorted event */
	if (ev->tstamp > tmax) {
	    /* past the limit: remember where each CPU stopped */
	    for (i = 0; i < nc; i++)
		pending[candidate[i]->cpu] = candidate[i];
	    break;
	}
	handleevent(ev);			/* process event */
	j = 1+ev->jumbocnt;			/* slots used by event */
	assert(j <= npending[ev->cpu]);
	npending[ev->cpu] -= j;
	if (npending[ev->cpu]) {		/* merge next event for CPU */
	    /*
	     * Advance to next event for this CPU and
	     * re-sort the top events based on time.
	     * Since we know that all other candidate
	     * events are already sorted by time this
	     * just entails inserting the new event in
	     * the correct place.
	     */
	    ev += j;
	    for (i = 0; i < nc-1 && ev->tstamp > candidate[i+1]->tstamp; i++)
		candidate[i] = candidate[i+1];
	    candidate[i] = ev;
	} else {				/* no more events for CPU */
	    pending[ev->cpu] = ev+j;
	    nc--;
	    for (i = 1; i <= nc; i++)		/* shift down, already sorted */
		candidate[i-1] = candidate[i];
	}
    }
}

/*
 * Check a server-calculated checksum.  By
 * default data does *not* have checksums;
 * these are added only for debugging.
 *
 * cpu	CPU number, used only in the diagnostic
 * ev	start of the data to sum, read as 64-bit words
 * cc0	number of bytes to sum
 * esum	expected sum
 *
 * On a mismatch the sums and every word are dumped to stderr.
 */
void
checksum(int cpu, const tstamp_event_entry_t* ev, size_t cc0, uint64_t esum)
{
    const uint64_t* lp = (const uint64_t*) ev;
    uint64_t sum = 0;
    size_t cc;

    for (cc = cc0; (ssize_t) cc > 0; cc -= sizeof (*lp))
	sum += *lp++;
    if (sum != esum) {
	fprintf(stderr,
"padc: checksum mismatch, cpu %d, count %lu, got %#llx, expected %#llx\n",
	    cpu, (unsigned long) cc0,
	    (unsigned long long) sum, (unsigned long long) esum);
	lp = (const uint64_t*) ev;
	/*
	 * Use "<" rather than "!=" so that a count which is not
	 * a multiple of the word size still ends the loop.
	 */
	for (cc = 0; cc < cc0; cc += sizeof (*lp))
	    fprintf(stderr, "%4lu: %#llx\n",
		(unsigned long) cc, (unsigned long long) *lp++);
    }
}

/*
 * Copy the raw event stream from fd to the event data file (dataf)
 * without decoding it.  Stops at the first empty read or at the
 * first write failure, which is reported.
 */
void
copypadc(FILE* fd)
{
    char chunk[128*1024];
    size_t got;

    while ((got = fread(chunk, 1, sizeof (chunk), fd)) != 0) {
	if (fwrite(chunk, got, 1, dataf) == 1)
	    continue;
	error("%s: Output write error: %s.", padcfile, strerror(errno));
	return;
    }
}

void
dumpsums(void)
{
    if (contextswtch && msecs) {
	u_int secs = (u_int)(rtmon_toms(rs,msecs) / 1000);
	fprintf(outf, " %d context switches / sec\n",
	    contextswtch / (secs ? secs : 1));
    }
}

/*
 * One row of the system call summary, filled in by recordstat()
 * and printed by sys_summary().
 */
typedef struct {
    const char*	name;		/* system call name */
    int		count;		/* number of calls */
    ptime_t	time;		/* total time, in timestamp units */
} scallstat_t;
static	scallstat_t* stats = NULL;	/* growable array of rows */
static	int nstats;			/* number of rows in stats */

/*
 * rtmon_walksyscalls() callback: append one system call's totals to
 * the stats array, growing the array by one entry.  The decoder
 * state argument is unused.  Running out of memory is fatal.
 */
static void
recordstat(rtmonPrintState* rs, const char* name, int count, __int64_t time)
{
    scallstat_t* grown;

    (void) rs;
    if (nstats == 0)
	grown = (scallstat_t*) malloc(sizeof (*stats));
    else
	grown = (scallstat_t*) realloc(stats, (nstats+1) * sizeof (*stats));
    if (grown == NULL) {
	error("Out of memory (system call summary).");
	exit(1);
    }
    stats = grown;
    stats[nstats].name = name;
    stats[nstats].count = count;
    stats[nstats].time = time;
    nstats++;
}
/*
 * qsort() comparison for the default summary order: largest total
 * time first, then largest call count, then name ascending.
 *
 * The keys are compared rather than subtracted: the difference of
 * two 64-bit times does not fit in the int qsort() expects and
 * truncating it can yield the wrong sign (or zero).
 */
static int
statcompar(const void* a, const void* b)
{
    const scallstat_t* sa = (const scallstat_t*) a;
    const scallstat_t* sb = (const scallstat_t*) b;

    if (sa->time != sb->time)
	return (sa->time < sb->time ? 1 : -1);
    if (sa->count != sb->count)
	return (sa->count < sb->count ? 1 : -1);
    return (strcmp(sa->name, sb->name));
}
/*
 * qsort() comparison used with -z: order summary rows by system
 * call name.
 */
static int
namecompar(const void* a, const void* b)
{
    const char* lhs = ((const scallstat_t*) a)->name;
    const char* rhs = ((const scallstat_t*) b)->name;

    return (strcmp(lhs, rhs));
}
/*
 * Print one heading line of the system call summary; the four
 * strings are laid out in the same column widths as the data rows.
 */
static void
banner(const char* c1, const char* c2, const char* c3, const char* c4)
{
    static const char layout[] = "%-14s %6s %9s %9s\n";

    (void) fprintf(outf, layout, c1, c2, c3, c4);
}
void
sys_summary(void)
{
    int i;

    rtmon_walksyscalls(rs, recordstat);
    qsort(stats, nstats, sizeof (scallstat_t), zflag ? namecompar : statcompar);
    if (nstats > 0) {
	if (Sflag > 1)
	    fputc('\n', outf);
	fprintf(outf, "System call summary:\n");
	banner(    "",       "", "Average",     "Total");
	banner("Name", "#Calls", "Time(ms)", "Time(ms)");
	fprintf(outf, "-----------------------------------------\n");
	for (i = 0; i < nstats; i++) {
	    const scallstat_t* st = &stats[i];
	    fprintf(outf, "%-14s %6d %9.2f %9.2f\n"
		, st->name
		, st->count
		, rtmon_tomsf(rs, st->time/st->count)
		, rtmon_tomsf(rs, st->time)
	    );
	}
	free(stats);
    }
}

/*
 * Queue manipulation primitives
 */
void
Qinit(struct kidq* qp)
{
    qp->h.next = qp->h.prev = &qp->h;
    qp->qlen = 0;
    qp->qmaxlen = 0;
}

/*
 * True when the list head qp (a struct kidlist*) links to itself.
 * The argument is parenthesized so expressions such as &q->h work;
 * note that it is evaluated twice.
 */
#define	Qempty(qp)	((qp)->next == (qp))

/*
 * Return the priority used to order a thread on a queue: pri when
 * it is at or below PZERO or when rtpri is zero, otherwise rtpri.
 */
int
getpri(struct kidrec* kp)
{
    if (kp->pri <= PZERO)
	return (kp->pri);
    if (kp->rtpri == 0)
	return (kp->pri);
    return (kp->rtpri);
}

/*
 * Link entp into queue qp in front of the first entry whose
 * getpri() value is greater than entp's (at the tail if there is
 * none), record the queue in entp->curq and update the queue
 * length and its high-water mark.
 */
void
Qinsert(struct kidq* qp, struct kidrec* entp)
{
    struct kidlist* const head = &qp->h;
    struct kidlist* const node = &entp->h;
    struct kidlist* succ = head->next;
    const int pri = getpri(entp);

    /* the links are the first member of a kidrec, hence the cast */
    while (succ != head && !(pri < getpri((struct kidrec*) succ)))
	succ = succ->next;

    entp->curq = qp;
    node->next = succ;
    node->prev = succ->prev;
    succ->prev = node;
    node->prev->next = node;

    qp->qlen++;
    if (qp->qlen > qp->qmaxlen)
	qp->qmaxlen = qp->qlen;
}

void
Qremove(struct kidrec* entp)
{
    struct kidq* qp = entp->curq;

    assert(qp != NULL);
    entp->h.next->prev = entp->h.prev;
    entp->h.prev->next = entp->h.next;
    entp->h.next = entp->h.prev = NULL;
    entp->curq = NULL;
    qp->qlen--;
    assert(qp->qlen >= 0);
}

/*
 * Append kp to the end of its hash chain.  Asserts that no record
 * with the same kid is already on the chain and that kp is not
 * linked to anything yet.
 */
void
kidinsert(struct kidrec* kp)
{
    struct kidrec** link;

    for (link = &kidhash[KIDHASH(kp->kid)]; *link != NULL;
      link = &(*link)->hnext)
	assert((*link)->kid != kp->kid);
    assert(kp->hnext == NULL);
    *link = kp;
}

/*
 * Look up the record for thread id kid in the hash table.
 * Returns the record, or NULL if there is none.
 */
struct kidrec*
kidfind(register int64_t kid)
{
    struct kidrec* kp = kidhash[KIDHASH(kid)];

    while (kp != NULL) {
	if (kp->kid == kid)
	    break;
	kp = kp->hnext;
    }
    return (kp);
}

/*
 * Call func once for every record in the hash table, bucket by
 * bucket and in chain order within a bucket.
 */
void
kidwalk(void (*func)(struct kidrec*))
{
    struct kidrec** bucket;
    struct kidrec** const end = kidhash + KIDHASHSIZE;

    for (bucket = kidhash; bucket != end; bucket++) {
	struct kidrec* kp = *bucket;

	while (kp != NULL) {
	    func(kp);
	    kp = kp->hnext;
	}
    }
}

struct kidrec*
newkid(int64_t kid)
{
    register struct kidrec* kp;

    if ((kp = (struct kidrec*) malloc(sizeof(*kp))) == NULL) {
	error("Out of memory: can not allocate %d bytes for KID %d.",
	    sizeof (*kp), kid);
	exit(1);
    }

    memset(kp, 0, sizeof(*kp));
    kp->traced =
       (rs->flags & RTPRINT_ALLPROCS) != 0 || _rtmon_kid_istraced(&rs->ds, kid);
    kp->kid = kid;
    kp->pri = PIDLE;
    kp->rtpri = PIDLE;
    kp->lastrun = -1;
    kp->starttick = -1;
    kp->queuedtick = -1;
    kp->sleeptick = -1;

    /*
     * Insert the new record
     */
    kidinsert(kp);

    return (kp);
}

/*
 * Return the record for thread id kid, creating it with newkid()
 * if it does not exist yet.
 */
struct kidrec*
getkid(int64_t kid)
{
    struct kidrec* found = kidfind(kid);

    if (found != NULL)
	return (found);
    return (newkid(kid));
}

/*
 * Handle an idle event for the CPU named in ev.  Ignored unless
 * scheduling analysis (-Q) is on.  If the CPU is not already
 * recorded as idle (wentidle == 0), count the transition, mark
 * the CPU as running nothing at idle priority, remember when it
 * went idle and print the event.
 */
void
cpuidle(const tstamp_event_entry_t* ev)
{
    struct cpurec* crp;

    if (Qflag == 0)
	return;

    crp = &cpus[ev->cpu];
    if (crp->wentidle != 0)		/* already idle */
	return;

    crp->idletimes++;
    crp->curkid = -1;
    crp->curpri = PIDLE;
    crp->wentidle = ev->tstamp;
    rtmon_printEvent(rs, outf, ev);
}

/*
 * Unpack the 64-bit word that carries scheduling information:
 *	bits 63..32 -> f   (stored as int)
 *	bits 31..16 -> bp  (stored as a signed 16-bit value)
 *	bits 15..0  -> p   (stored as a signed 16-bit value)
 *
 * Every argument is parenthesised so expression arguments (e.g. a | b)
 * unpack correctly, and the body is a do/while(0) so the macro acts as
 * a single statement, including inside an unbraced if/else.
 * NB: v is evaluated three times; do not pass an expression with side
 *     effects.
 */
#define RTMON_UNPACKPRI(v, f, p, bp) do { \
    (f) = (int)((v) >> 32); \
    (p) = (short)((v) & 0xffff); \
    (bp) = (short)(((v) >> 16) & 0xffff); \
} while (0)

/*
 * handle scheduling event
 * Always maintain all statistics even if only tracing a single pid
 * The display options only handle what is printed.
 * This lets us print reasonable summary stats
 *
 * Three event types are accounted for here:
 *	TSTAMP_EV_EODISP	kid placed on a run queue
 *	EVENT_TASK_STATECHANGE	kid switched out of a cpu
 *	TSTAMP_EV_EOSWITCH	kid switched in on a cpu
 * Any other event type falls through the switch and is only printed.
 *
 * ev->qual[0] carries the kid and ev->qual[1] the packed flags and
 * priorities (see RTMON_UNPACKPRI).  The meaning of the unpacked flags
 * field depends on the event: for EODISP it is the target run queue
 * cpu, for STATECHANGE it is the reason for the switch.
 *
 * Does nothing unless scheduler reporting (Qflag) is enabled.
 */
void
cpusched(const tstamp_event_entry_t* ev)
{
    register struct kidrec *kp;
    register struct cpurec *crp;
    ptime_t now;
    cpuid_t onrq;
    int64_t kid;
    int flags, pri, basepri;

    if (!Qflag)
	    return;

    kid = (int64_t) ev->qual[0];
    kp = getkid(kid);		/* creates the record on first sight */
    crp = &cpus[ev->cpu];
    now = ev->tstamp;
    RTMON_UNPACKPRI(ev->qual[1], flags, pri, basepri);

    switch (ev->evt) {
    case TSTAMP_EV_EODISP:		/* put on a run q */
	    kp->pri = pri;
	    kp->rtpri = basepri;
	    /*
	     * Close any open sleep interval.  A timestamp earlier than
	     * the start of the interval is clamped to the start so the
	     * interval is never negative; the clamped value of now is
	     * also what gets recorded as queuedtick below.
	     */
	    if (kp->sleeptick != -1) {
		    if (! (now >= kp->sleeptick))
			    now = kp->sleeptick;
		    kp->sleeptime += now - kp->sleeptick;
		    kp->sleeptick = -1;
	    }
	    kp->queuedtick = now;
	    /*
	     * Processor affinity is implemented by placing
	     * processes/threads on a per-cpu local q first,
	     * then moving them to the global run q if not
	     * re-scheduled quickly.  This can result in
	     * multiple event records, one for each move.
	     *
	     * NB: cpu identifies the processor where the event
	     *     record was generated.  The target processor
	     *     (or ncpus if none) is identified separately
	     *     since they need not be the same.
	     */
	    if (kp->curq != NULL)		/* remove from previous q */
		    Qremove(kp);
	    onrq = (cpuid_t) flags;		/* NB: packed in w/ pri's */
	    assert(onrq != (cpuid_t) -1);
	    /* onrq < ncpus: local q of that cpu; == ncpus: global q */
	    if (onrq < rtmon_ncpus(rs)) {
		    kp->lqueued++;
		    Qinsert(&localqs[onrq], kp);
	    } else if (onrq == rtmon_ncpus(rs)) {
		    kp->queued++;
		    Qinsert(&runq, kp);
	    } else {
		    error(
			  "Scheduling event for invalid CPU %u (ncpus %u); max CPUs %u.",
			  onrq, rtmon_ncpus(rs), RTMOND_MAXCPU);
		    exit(1);
	    }
	    break;
    case EVENT_TASK_STATECHANGE:	/* switched out */
	    crp->swtchs++;
	    crp->lastkid = kp->kid;
	    /*
	     * set up for idle here - only really important
	     * when not tracing everyone - so runq stuff still looks
	     * right
	     */
	    crp->curkid = -1;
	    crp->curpri = PIDLE;
	    kp->pri = pri;
	    kp->rtpri = basepri;
	    /*
	     * Close the open run interval and charge it to both the
	     * kid and the cpu; now is clamped as for EODISP above.
	     */
	    if (kp->starttick != -1) {
		    ptime_t runtime;
		    if (! (now >= kp->starttick))
			    now = kp->starttick;
		    runtime = now - kp->starttick;
		    kp->runtime += runtime;
		    crp->runticks += runtime;
	    }
	    kp->swtch++;
	    /* classify the switch by its reason code */
	    switch (flags) {
	    case SEMAWAIT:
	    case MUTEXWAIT:
	    case SVWAIT:
	    case MRWAIT:
	    case GANGWAIT:
		    /* blocked: a sleep interval opens here */
		    kp->sleep++;
		    kp->sleeptick = now;
		    break;
	    case MUSTRUNCPU:
		    kp->mustrun++;
		    break;
	    case RESCHED_P:
	    case RESCHED_KP:
		    /*
		     * A preemption counts as "short" when the kid ran for
		     * less than 10 units as converted by rtmon_toms
		     * (presumably milliseconds - confirm against librtmon).
		     * starttick is still set here; it is reset after this
		     * inner switch.
		     */
		    if (kp->starttick != -1 && rtmon_toms(rs,now - kp->starttick) < 10)
			    kp->preempt_short++;
		    else
			    kp->preempt_long++;
		    break;
	    case RESCHED_S:
	    case RESCHED_Y:
	    case RESCHED_D:
		    kp->yield++;
		    break;
	    default:
		    fprintf(outf, "Unknown sched reason %d\n", flags);
		    break;
	    }
	    kp->starttick = -1;

	    break;
	case TSTAMP_EV_EOSWITCH:	/* switched in */
		kp->pri = pri;
		kp->rtpri = basepri;
		if (kp->queuedtick != -1 && now > kp->queuedtick) {
				/*
				 * NB: We cannot properly attribute queue time
				 *     if we lose the set on run q event that
				 *     tells us which queue the process is on.
				 */
			if (kp->curq == &runq || kp->curq == NULL)
				kp->queuedtime += now - kp->queuedtick;
			else
				kp->lqueuedtime += now - kp->queuedtick;
		}
		kp->slices++;
		kp->starttick = now;
		kp->queuedtick = -1;
		/* count a migration when the kid last ran on another cpu */
		if (kp->lastrun != -1 && kp->lastrun != ev->cpu)
			kp->cpuswitch++;
		kp->lastrun = ev->cpu;
		/*
		 * The cpu leaves idle: close its idle interval.  now is
		 * clamped here only after starttick was recorded above.
		 */
		if (crp->wentidle > 0) {
			if (! (now >= crp->wentidle))
				now = crp->wentidle;
			crp->idleticks += now - crp->wentidle;
			crp->wentidle = 0;
		}
		crp->curpri = getpri(kp);
		if (crp->lastkid == kp->kid)
			crp->runsame++;
		crp->curkid = kp->kid;
		crp->disps++;
		if (kp->curq != NULL) {
			/* dispatched kid was not at the head of its queue */
			if (kp != (struct kidrec*) kp->curq->h.next)
				crp->nothighest++;
			if (kp->curq != &runq)	/* processor affinity win */
				crp->hotdisps++;
			Qremove(kp);
		}
		break;
    }
    rtmon_printEvent(rs, outf, ev);
    /* Qflag > 2 additionally dumps cpu and run queue state per event */
    if (Qflag > 2)
	    showstat();
}

/*
 * Process termination event.
 *
 * If the kid was on a cpu when it exited, its open run interval is
 * closed and added to its run time.  When a scheduling summary will be
 * produced for this kid, a private copy of its name is saved.
 */
void
kidexit(const tstamp_event_entry_t* ev)
{
    struct kidrec* kp = getkid(ev->qual[0]);

    assert(kp != NULL);

    if (kp->starttick != -1) {
        ptime_t stamp = ev->tstamp;
        /* never let the interval end before it started */
        ptime_t endtick = (stamp >= kp->starttick) ? stamp : kp->starttick;
        ptime_t ran = endtick - kp->starttick;

        kp->runtime += ran;
        kp->starttick = -1;
    }

    if (!Qflag || !kp->traced)
        return;

    /*
     * Sigh, if we're going to generate a summary
     * we must copy the process name string from
     * the library because it'll free it when the
     * process terminates.
     */
    kp->name = strdup(rtmon_kidLookup(&rs->ds, kp->kid));
}

/* NZ: x itself, or 1 when x is zero (keeps divisors non-zero). */
#define	NZ(x)	((x) != 0 ? (x) : 1)
/* PCT: a expressed as a percentage of (a + b); 0 when both are zero. */
#define	PCT(a,b)	(100.0 * (double) (a) / (double) NZ((a) + (b)))

void
cpusummary(void)
{
    register int i;
    register struct cpurec *cp;
    register struct kidrec *kp;

    for (i = 0, cp = cpus; i < RTMOND_MAXCPU; i++, cp++) {
	if (cp->idletimes == 0 && cp->disps == 0 && cp->swtchs == 0)
	    continue;
	/*
	 * Tally the final execution segment.
	 */
	if (cp->wentidle > 0 && cp->wentidle <= msecs) {
	    assert(cp->curpri == PIDLE);
	    cp->idleticks += msecs - cp->wentidle;
	} else if (cp->curkid > 0) {
	    kp = getkid(cp->curkid);
	    assert(kp != NULL);
	    if (kp->starttick != -1 && kp->starttick <= msecs)
		cp->runticks += msecs - kp->starttick;
	}
	fprintf(outf, "\nCPU %d summary:\n", i);
	fprintf(outf, "  Total mS idle        = %lld\n",
	    rtmon_toms(rs,cp->idleticks));
	fprintf(outf, "  Total mS running     = %lld (saturation = %0.1f%%)\n",
	    rtmon_toms(rs,cp->runticks),
	    PCT(cp->runticks, cp->idleticks));
	fprintf(outf, "  Processes dispatched = %d (local %d global %d)\n",
	    cp->disps, cp->hotdisps, cp->disps - cp->hotdisps);
	fprintf(outf, "  Ran same             = %d (%0.1f%%)\n",
	    cp->runsame, (100.0 * cp->runsame)/NZ(cp->disps));
	fprintf(outf, "  Ran not highest      = %d (%0.1f%%)\n",
	    cp->nothighest, (100.0 * cp->nothighest)/NZ(cp->disps));
    }

}

/*
 * Print the summary for the global run queue: the greatest length it
 * reached.
 */
void
runqsummary(void)
{
    fputs("\nRun queue summary:\n", outf);
    fprintf(outf, "  Max queue len\t= %d\n", runq.qmaxlen);
}

/*
 * Print the scheduling summary for one kid (used as a kidwalk callback).
 *
 * Untraced kids are skipped.  Whichever interval is still open for the
 * kid (running, queued or sleeping) is first charged up to msecs, so
 * the record is modified by this call.
 */
void
printkidsum(register struct kidrec *kp)
{
#define	AVG(a,b)	((a) > 0 ? rtmon_tomsf(rs,(b)/(a)) : 0.0)
    pid_t pid;
    const char *who;

    if (!kp->traced)
        return;

    /* charge the interval still in progress, provided it began by msecs */
    if (kp->starttick != -1 && kp->starttick <= msecs) {
        kp->runtime += msecs - kp->starttick;
    } else if (kp->queuedtick != -1 && kp->queuedtick <= msecs) {
        /* an unknown queue (curq == NULL) is accounted as the global one */
        if (kp->curq != &runq && kp->curq != NULL)
            kp->lqueuedtime += msecs - kp->queuedtick;
        else
            kp->queuedtime += msecs - kp->queuedtick;
    } else if (kp->sleeptick != -1 && kp->sleeptick <= msecs) {
        kp->sleeptime += msecs - kp->sleeptick;
    }

    pid = rtmon_pidLookup(&rs->ds, kp->kid);
    /* prefer the name saved at exit time over a library lookup */
    who = kp->name != NULL ? kp->name : rtmon_kidLookup(&rs->ds, kp->kid);

    if (rs->flags & RTPRINT_SHOWKID) {
        fprintf(outf, "\nProcess summary for %s (pid:kid %d:%lld):\n",
                who, pid, kp->kid);
    } else {
        fprintf(outf, "\nProcess summary for %s (%s %lld):\n",
                who,
                (pid != 0) ? "pid" : "kid",
                (pid != 0) ? pid : kp->kid);
    }

    fprintf(outf, "  Time slices %d\n", kp->slices);
    fprintf(outf, "  Total run mS %lld (avg mS/slice %0.2f)\n",
            rtmon_toms(rs,kp->runtime), AVG(kp->slices, kp->runtime));

    if (rtmon_ncpus(rs) > 1) {
        /* multiprocessor: report local and global queues separately */
        fprintf(outf, "  Times queued %d (local %d global %d)\n",
                kp->queued+kp->lqueued, kp->lqueued, kp->queued);
        fprintf(outf, "  Total mS queued locally %llu (avg residence %0.2f)\n",
                rtmon_toms(rs,kp->lqueuedtime),
                AVG(kp->lqueued, kp->lqueuedtime));
        fprintf(outf, "  Total mS queued globally %llu (avg residence %0.2f)\n",
                rtmon_toms(rs,kp->queuedtime),
                AVG(kp->queued, kp->queuedtime));
    } else {
        fprintf(outf, "  Times queued %d\n", kp->queued+kp->lqueued);
        fprintf(outf, "  Total mS queued %llu (avg queue residence %0.2f)\n",
                rtmon_toms(rs,kp->queuedtime+kp->lqueuedtime),
                AVG(kp->lqueued+kp->queued, kp->lqueuedtime+kp->queuedtime));
    }

    fprintf(outf, "  Total mS sleeping %llu (avg wait time %0.2f)\n",
            rtmon_toms(rs,kp->sleeptime), AVG(kp->sleep, kp->sleeptime));
    fprintf(outf, "  Total context switches %d (counts by reason for swtch):\n",
            kp->swtch);
    fprintf(outf, "    preempt short\t%d (%0.1f%%)\n",
            kp->preempt_short, (100.0*kp->preempt_short)/NZ(kp->swtch));
    fprintf(outf, "    preempt long\t%d (%0.1f%%)\n",
            kp->preempt_long, (100.0*kp->preempt_long)/NZ(kp->swtch));
    fprintf(outf, "    sleep\t\t%d (%0.1f%%)\n",
            kp->sleep, (100.0*kp->sleep)/NZ(kp->swtch));
    fprintf(outf, "    yield\t\t%d (%0.1f%%)\n",
            kp->yield, (100.0*kp->yield)/NZ(kp->swtch));
    fprintf(outf, "    mustrun\t\t%d (%0.1f%%)\n",
            kp->mustrun, (100.0*kp->mustrun)/NZ(kp->swtch));
#undef AVG
}

/*
 * Print a scheduling summary for every kid in the hash table.
 */
void
kidsummary(void)
{
    kidwalk(&printkidsum);
}

/*
 * Print the complete scheduling summary: per-cpu figures first, then
 * the run queue, then one section per kid.
 */
void
schedsummary(void)
{
    cpusummary();
    runqsummary();
    kidsummary();
}

void
dumpcpushort(void)
{
    register int i;
    register struct cpurec *cp;

    fprintf(outf, "  CPU Status:");
    for (i = 0, cp = cpus; i < RTMOND_MAXCPU; i++, cp++) {
	pid_t pid;
	if (cp->idletimes == 0 && cp->disps == 0 && cp->swtchs == 0)
	    continue;
	if (cp->curpri == PIDLE) {
	    fprintf(outf, "  [%d] IDLE", i);
	    continue;
	}
	pid =  rtmon_pidLookup(&rs->ds, cp->curkid);
	if (rs->flags & RTPRINT_SHOWKID)
		fprintf(outf, "  [%d] %d:%lld(pri %d)", i, pid, cp->curkid, cp->curpri);
	else
		fprintf(outf, "  [%d] %lld(pri %d)", i, pid ? (int64_t)pid :  cp->curkid, cp->curpri);
    }
    fprintf(outf, "\n");
}

/*
 * Print a one-line snapshot of every non-empty run queue: the per-cpu
 * local queues ("[n]") followed by the global queue ("[G]").  Each
 * entry shows the kid (by pid when one is known) and its priority.
 */
void
dumprunq(void)
{
    register int i;

    fprintf(outf, "  Run Queue:");

    for (i = 0; i <= RTMOND_MAXCPU; i++) {
	register struct kidlist *lp;
	register struct kidlist *kp;

	/* index RTMOND_MAXCPU stands for the global run queue */
	lp = (i == RTMOND_MAXCPU) ? &runq.h : &localqs[i].h;
	/*
	 * lp is the address of an object and so can never be NULL;
	 * what needs guarding is a queue head that was never linked
	 * (next still NULL) as well as one that is simply empty
	 * (head points back at itself).
	 */
	if (lp->next == NULL || lp->next == lp)
	    continue;

	if (i == RTMOND_MAXCPU)
	    fprintf(outf, "  [G]");
	else
	    fprintf(outf, "  [%d]", i);

	for (kp = lp->next; kp != lp; kp = kp->next) {
	    pid_t pid;
	    /* the list links are the leading member of a kid record */
	    struct kidrec* pkp = (struct kidrec*) kp;

	    pid = rtmon_pidLookup(&rs->ds, pkp->kid);

	    if (rs->flags & RTPRINT_SHOWKID)
		    fprintf(outf, " %d:%lld(pri %d)", pid, pkp->kid, getpri(pkp));
	    else
		    /* explicit widening, matching dumpcpushort and %lld */
		    fprintf(outf, " %lld(pri %d)",
			pid ? (int64_t)pid : pkp->kid, getpri(pkp));
	}
    }

    fprintf(outf, "\n");
}

/*
 * Dump the current scheduler state: the cpu status line followed by
 * the run queue line.
 */
void
showstat(void)
{
    dumpcpushort();
    dumprunq();
}

/*
 * Write the usage message to errf: the invocation forms, then the
 * collection functions, then the display options.  The network options
 * are listed only when built with NET_DEBUG.
 */
void
usage(char *progname)
{
    fprintf(errf,
        "Usage: %s <functions> <options> <command + args>\n"
        "       %s <functions> <options> -p pid\n"
        "       %s <functions> <options> -t time\n"
        "       %s <Display Options> <logfile\n",
        progname, progname, progname, progname);

    fputs(
        "Collection Functions:\n"
        "       -s              collect system call events\n"
        "       -r              collect process scheduling events\n"
        "       -k              collect disk activity events\n"
        "       -i              inherit event collection on fork\n"
        "       -O file         write event data to file\n",
        errf);
#ifdef NET_DEBUG
    fputs(
        "       -x              collect network queueing events\n"
        "       -X              collect network throughput events\n",
        errf);
#endif

    fputs(
        "Display Options:\n"
        "       -l              list system calls in long format\n"
        "       -n syscall      select system call by name or number\n"
        "       -e syscall      exclude system call by name or number\n"
        "       -P pid          list system calls for <pid> only\n"
        "       -d              show time delta when printing calls\n"
        "       -c              do not show cpu id when printing calls\n"
        "       -u              show delta time in uSec\n"
        "	-k		print disk I/O trace (same as collection flag)\n"
        "       -S              print system call summary\n"
        "       -SS             print system call trace\n"
        "       -Q              print scheduling summary\n"
        "       -QQ             print scheduling trace\n"
        "       -QQQ            print runq trace also\n"
        "       -o file         write analysis listing to file\n"
        "       -a len          max bytes to print for ascii data\n"
        "       -b len          max bytes to print for binary data\n"
        "       -A              force character data printing\n"
        "       -B              force hex data printing\n"
        "       -z              sort system call summary by syscall name\n",
        errf);
}

void
dumperrors(char *msg)
{
    const char* emsg = strerror(errno);
    fflush(outf);
    fflush(errf);
    error("%s: %s", msg, emsg);
    if (collect)
	pclose(nfd);
    exit(1);
}