/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T	*/
/*	  All Rights Reserved  	*/

/*	THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF     	*/
/*	UNIX System Laboratories, Inc.                     	*/
/*	The copyright notice above does not evidence any   	*/
/*	actual or intended publication of such source code.	*/

/*
 * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 * 		PROPRIETARY NOTICE (Combined)
 * 
 * This source code is unpublished proprietary information
 * constituting, or derived under license from AT&T's UNIX(r) System V.
 * In addition, portions of such source code were derived from Berkeley
 * 4.3 BSD under license from the Regents of the University of
 * California.
 * 
 * 
 * 
 * 		Copyright Notice 
 * 
 * Notice of copyright on this source code product does not indicate 
 * publication.
 * 
 * 	(c) 1986,1987,1988,1989  Sun Microsystems, Inc
 * 	(c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 * 	          All rights reserved.
 *  
 */

#ident	"$Revision: 1.215 $"

#include <limits.h>
#include <sys/types.h>
#include <sys/cred.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <ksys/vfile.h>
#include <sys/flock.h>
#include <sys/fs_subr.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/pathname.h>
#include <sys/sema.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/kthread.h>
#include <sys/uthread.h>
#include <ksys/vproc.h>
#include <sys/vfs.h>
#include <sys/vnode_private.h>
#include <sys/sysinfo.h>
#include <sys/ksa.h>
#include <sys/dnlc.h>
#include <sys/sysmacros.h>
#include <sys/pda.h>
#include <sys/pfdat.h>
#include <sys/sat.h>
#include <ksys/vpag.h>
#include <sys/imon.h>
#include <sys/cmn_err.h>
#include <sys/atomic_ops.h>
#include <sys/psema_cntl.h>
#ifdef _SHAREII
#include	<sys/shareIIstubs.h>
#endif /* _SHAREII */
#ifdef VNODE_TRACING
#include <sys/ktrace.h>
#endif

/*
 * Managing the pool of allocated and free vnodes:
 *
 * Whenever a vnode is needed and the number of free vnodes is above
 * (vn_vnumber - vn_epoch), and ncsize, an attempt is made to
 * reclaim a vnode from a vnode freelist.  Otherwise, or if a short search
 * of a freelist doesn't produce a reclaimable vnode, a vnode is
 * constructed from the heap.
 *
 * It is up to vn_shake to deconstruct free vnodes.
 */

/*
 * Internal data structures.
 */
/*
 * Vnode hash list bucket.
 */
typedef struct vhash_s {
	struct vnode   *vh_vnode;	/* head of this bucket's chain (linked via v_hashn/v_hashp) */
	lock_t		vh_lock;	/* spinlock held while walking or editing the chain */
} vhash_t;

/* 
 * Macros and defines.
 */
/*
 * Freelist helpers.  VFREELIST yields the address of the vnode list
 * embedded in freelist slot `count'; the LOCK/UNLOCK variants take
 * either a slot index (..._VFREELIST) or a vfreelist_t pointer (..._VFP).
 * The NESTED forms are used when another spinlock is already held.
 *
 * The replacement lists are fully parenthesized so the macros can be
 * used safely inside larger expressions.
 */
#define	VFREELIST(count)	(&vfreelist[count].vf_freelist)
#define LOCK_VFREELIST(list)	mutex_spinlock(&vfreelist[list].vf_lock)
#define UNLOCK_VFREELIST(l,s)	mutex_spinunlock(&vfreelist[l].vf_lock, s)

#define LOCK_VFP(listp)		mutex_spinlock(&(listp)->vf_lock)
#define UNLOCK_VFP(listp,s)	mutex_spinunlock(&(listp)->vf_lock, s)
#define NESTED_LOCK_VFP(listp)	nested_spinlock(&(listp)->vf_lock)
#define NESTED_UNLOCK_VFP(listp) nested_spinunlock(&(listp)->vf_lock)

/* Vnode hash: bucket is selected by the low bits of the vnode number. */
#define VHASHMASK 		127
#define VHASH(vnumber)		(&vhash[(vnumber) & VHASHMASK])

/*
 * Sync variables are shared: a vnode's address is hashed onto one of
 * NVSYNC entries.  The argument is parenthesized before the cast so that
 * an expression such as `vp + 1' is converted as a whole.
 */
#define	NVSYNC			37		/* prime */
#define	vptosync(v)		(&vsync[((unsigned long)(v)) % NVSYNC])

/*
 * Vnode global data.
 */
static hotUint64Counter_t vn_generation; /* source of the v_number values assigned in vn_alloc */
hotUlongCounter_t vn_vnumber;	/* # of vnodes ever allocated from the heap zone */
hotIntCounter_t	vn_nfree;	/* # of vnodes currently on the free lists */

#if MP
#pragma fill_symbol (vn_generation, 128)
#pragma fill_symbol (vn_vnumber, 128)
#pragma fill_symbol (vn_nfree, 128)
#endif

static zone_t	*vn_zone;	/* vnode heap zone, created in vn_init */
int		vn_epoch;	/* # of vnodes freed back to the zone; */
				/* bumped with every freelist lock held */
				/* vn_vnumber - vn_epoch == # current vnodes */
static int	vn_minvn;	/* minimum # vnodes before reclaiming: */
				/* MAX(nproc, ncsize), set in vn_init */
static int	vn_shaken;	/* damper for vn_alloc: raised by 4 per vnode */
				/* destroyed, drained by vn_alloc */
static uint_t 	vn_coin;	/* coin for vn_alloc's heap-vs-freelist choice */
static vhash_t	*vhash;		/* hash buckets for active vnodes */
vnode_t		*rootdir;	/* pointer to root vnode */
vfreelist_t	*vfreelist;	/* pointer to array of freelist structs */
static int	vfreelistmask;	/* number of free-lists - 1 (count is a power of 2) */

/*
 * Following is global data that can't be cellularized until any given
 * vnode is accessed from only one cell.
 */
static sv_t 	vsync[NVSYNC];	/* vnode inactive/reclaim sync semaphores, */
				/* indexed through vptosync() */
lock_t		mreg_lock;	/* spinlock protecting all vp->v_mreg */
#if MP
#pragma align_symbol (mreg_lock, L2cacheline) 
#pragma fill_symbol (mreg_lock, L2cacheline)
#endif /* MP */


/*
 * Imon data - must be here instead of imon.c so that we don't get
 * gp-relative link errors.
 */
/*
 * Hook pointers and enable flag for imon.  Nothing in this part of the
 * file assigns or calls them; NOTE(review): presumably filled in by the
 * imon driver when it is configured -- confirm against imon.c.
 */
void		(*imon_event)(struct vnode *, struct cred *cr, int);
void		(*imon_hook)(struct vnode *, dev_t, ino_t);
void 		(*imon_broadcast)(dev_t, int);
int             imon_enabled;

/*
 * Externs and static functions.
 */
/* Freelist splice helper used by vn_alloc when reordering a free list. */
static void 	vn_relink(vnlist_t *, vnode_t *, vnode_t *);
/* Memory-shake callback; registered with shake_register() in vn_init. */
static int	vn_shake(int);

/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 */
/*
 * File-format to vnode-type table, 16 entries.
 * NOTE(review): presumably indexed by (mode & S_IFMT) >> 12 -- confirm
 * against the IFTOVT macro in the vnode header.
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

/*
 * Vnode-type to S_IF* format bits table, 9 entries; entries with no
 * corresponding file format are 0.
 * NOTE(review): presumably indexed by enum vtype value -- confirm the
 * enum order in the vnode header.
 */
u_short vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
};

/*
 * Vnode operations for a free or killed vnode.
 */
/*
 * Operations vector for a free or killed vnode.  Every slot is one of the
 * generic fs_* stubs cast to the slot's function type, so no
 * filesystem-specific code is reachable through it.  close, inactive and
 * reclaim use fs_noerr; map uses fs_nodev; the remaining slots use
 * fs_nosys or fs_noval.
 * NOTE(review): the initializer is positional -- entries must stay in the
 * same order as the members of vnodeops_t.
 */
vnodeops_t dead_vnodeops = {
	BHV_IDENTITY_INIT_POSITION(VNODE_POSITION_INVALID),
	(vop_open_t)fs_nosys,
	(vop_close_t)fs_noerr,
	(vop_read_t)fs_nosys,
	(vop_write_t)fs_nosys,
	(vop_ioctl_t)fs_nosys,
	(vop_setfl_t)fs_nosys,
	(vop_getattr_t)fs_nosys,
	(vop_setattr_t)fs_nosys,
	(vop_access_t)fs_nosys,
	(vop_lookup_t)fs_nosys,
	(vop_create_t)fs_nosys,
	(vop_remove_t)fs_nosys,
	(vop_link_t)fs_nosys,
	(vop_rename_t)fs_nosys,
	(vop_mkdir_t)fs_nosys,
	(vop_rmdir_t)fs_nosys,
	(vop_readdir_t)fs_nosys,
	(vop_symlink_t)fs_nosys,
	(vop_readlink_t)fs_nosys,
	(vop_fsync_t)fs_nosys,
	(vop_inactive_t)fs_noerr,
	(vop_fid_t)fs_nosys,
	(vop_fid2_t)fs_nosys,
	(vop_rwlock_t)fs_noval,
	(vop_rwunlock_t)fs_noval,
	(vop_seek_t)fs_nosys,
	(vop_cmp_t)fs_nosys,
	(vop_frlock_t)fs_nosys,
	(vop_realvp_t)fs_nosys,
	(vop_bmap_t)fs_nosys,
	(vop_strategy_t)fs_noval,
	(vop_map_t)fs_nodev,
	(vop_addmap_t)fs_nosys,
	(vop_delmap_t)fs_nosys,
	(vop_poll_t)fs_nosys,
	(vop_dump_t)fs_nosys,
	(vop_pathconf_t)fs_nosys,
	(vop_allocstore_t)fs_nosys,
	(vop_fcntl_t)fs_nosys,
	(vop_reclaim_t)fs_noerr,
	(vop_attr_get_t)fs_nosys,
	(vop_attr_set_t)fs_nosys,
	(vop_attr_remove_t)fs_nosys,
	(vop_attr_list_t)fs_nosys,
	(vop_cover_t)fs_nosys,
	(vop_link_removed_t)fs_noval,
	(vop_vnode_change_t)fs_nosys,
	(vop_ptossvp_t)fs_noval,
	(vop_pflushinvalvp_t)fs_noval,
	(vop_pflushvp_t)fs_noval,
	(vop_pinvalfree_t)fs_noval,
	(vop_sethole_t)fs_noval,
	(vop_commit_t)fs_nosys,
	(vop_readbuf_t)fs_nosys,
	(vop_strgetmsg_t)fs_nosys,
	(vop_strputmsg_t)fs_nosys,
};

/*
 * One-time initialization of the vnode layer: builds the ring of free
 * lists, the vnode heap zone, the shared sync variables, the vnode hash
 * table, and registers vn_shake as a memory-shake callback.
 */
void
vn_init(void)
{
	extern int nproc, ncsize;
	vfreelist_t *flp;
	int nlists;
	int n;

	/*
	 * Use up to 16 free lists (bounded by the cpu count), rounded
	 * down to a power of two so that a list can be chosen by masking
	 * with vfreelistmask.  Several lists let multiple clients
	 * allocate at once and keep each list reasonably short.
	 */
	nlists = MIN(numcpus, 16);
	while (nlists & (nlists - 1))
		nlists--;
	vfreelistmask = nlists - 1;

	vfreelist = (vfreelist_t *)
			kmem_zalloc(nlists * sizeof(vfreelist_t), KM_SLEEP);

	/* Set up each list; every vf_next points at the following slot. */
	for (n = 0; n <= vfreelistmask; n++) {
		flp = &vfreelist[n];
		vn_initlist(&flp->vf_freelist);
		flp->vf_next = flp + 1;
		flp->vf_listid = n;
		init_spinlock(&flp->vf_lock, "vf_lock", n);
	}
	/* Close the ring: the last slot wraps back to the first. */
	vfreelist[vfreelistmask].vf_next = vfreelist;

	vn_zone = kmem_zone_init(sizeof(vnode_t), "Vnodes");

	spinlock_init(&mreg_lock, "mreg_lock");

	for (n = 0; n < NVSYNC; n++)
		init_sv(&vsync[n], SV_DEFAULT, "vsy", n);

	repl_init();		/* vnode replication */

	shake_register(SHAKEMGR_MEMORY, vn_shake);

	vhash = (vhash_t *)kmem_zalloc((VHASHMASK+1) * sizeof(vhash_t),
					KM_SLEEP);
	for (n = 0; n <= VHASHMASK; n++)
		init_spinlock(&vhash[n].vh_lock, "vhash", n);

	vn_minvn = MAX(nproc, ncsize);

	/*
	 * Referencing vn_passthrup here forces the linker to bring in
	 * the vn_passthru ops.
	 */
	if (!vn_passthrup) {
		cmn_err_tag(139,CE_PANIC,"vnode pass through mode not initialized ?");
	}
}

/*
 * Find vnode in hash table with the given number.  If the vnode is 
 * found then it must be the same as the one passed in.  
 *
 * NOTE:  must not dereference 'vp' because it's possible the memory that 
 * vp refers to has been reallocated to some other use (because it was
 * freed by the vn_shake mechanism).
 */
/*
 * Search the hash bucket for `number'.  Returns the hashed vnode whose
 * v_number matches, or NULL if there is none.  A match that is not the
 * caller's `vp' is a fatal inconsistency and panics.  `vp' itself is
 * only compared, never dereferenced.
 */
vnode_t *
vn_find(vnode_t *vp, vnumber_t number)
{
	vhash_t	*bucket = VHASH(number);
	vnode_t	*cur;
	int	spl;

	spl = mutex_spinlock(&bucket->vh_lock);

	cur = bucket->vh_vnode;
	while (cur != NULL) {
		ASSERT(cur->v_number != 0);
		if (cur->v_number == number) {
			if (cur != vp) {
				printf("vn_find: vp=0x%x lvp=0x%x\n", vp, cur);
				panic("vn_find error");
			}
			break;
		}
		cur = cur->v_hashn;
	}

	mutex_spinunlock(&bucket->vh_lock, spl);

	return cur;
}

/*
 * Put vnode in hash table.  Must have a non-zero v_number.
 */
void
vn_hash(register vnode_t *vp)
{
	register vhash_t *vhp = VHASH(vp->v_number);
	register int s = mutex_spinlock(&vhp->vh_lock);
	register vnode_t **vpp = &vhp->vh_vnode;

	ASSERT(vp->v_number);
	vp->v_hashp = (vnode_t *)NULL;
	vp->v_hashn = *vpp;
	if (vp->v_hashn)
		vp->v_hashn->v_hashp = vp;
	*vpp = vp;
	mutex_spinunlock(&vhp->vh_lock, s);
}

/*
 * Remove vnode from hash table.  v_number is set to 0 so that vn_find
 * won't find it.
 */
void
vn_unhash(register vnode_t *vp)
{
	register vhash_t *vhp = VHASH(vp->v_number);
	register int s = mutex_spinlock(&vhp->vh_lock);
	register vnode_t *vnext = vp->v_hashn;
	register vnode_t *vprev = vp->v_hashp;

	if (vprev)
		vprev->v_hashn = vnext;
	else
		vhp->vh_vnode = vnext;

	if (vnext)
		vnext->v_hashp = vprev;

	mutex_spinunlock(&vhp->vh_lock, s);
	vp->v_hashp = vp->v_hashn = (vnode_t *)NULL;
	vp->v_number = 0;
}

/*
 * Clean a vnode of filesystem-specific data and prepare it for reuse.
 */
/*
 * vn_reclaim: strip filesystem state from vp so it can be reused or freed.
 *
 * vp   - vnode being reclaimed.
 * flag - passed through unchanged to VOP_RECLAIM.
 *
 * Returns the non-zero error from VOP_RECLAIM if that op fails; in that
 * case none of the cleanup below it is performed.  Otherwise returns 0
 * after leftover pages have been flushed, the vnode has been removed from
 * the vnode hash (which zeroes v_number), its flags have been trimmed and
 * its page-cache state has been reclaimed.
 */
static int
vn_reclaim(struct vnode *vp, int flag)
{
	int error, s;
#ifdef CKPT
	extern int ckpt_enabled;
#endif

	VOPINFO.vn_reclaim++;

	/*
	 * Only make the VOP_RECLAIM call if there are behaviors
	 * to call.
	 */
	if (vp->v_fbhv != NULL) {
		VOP_RECLAIM(vp, flag, error);
		if (error)
			return error;
	}
	ASSERT(vp->v_fbhv == NULL);

	/*
	 * File system erred somewhere along the line, and there
	 * are still pages/buffers associated with the object.
	 * Remove the debris and print a warning.
	 * XXX LONG_MAX won't work for 64-bit offsets!
	 */
	if (vp->v_pgcnt || vp->v_dpages || vp->v_buf) {
#ifdef DEBUG
		int i;

		/* fstype index for the warning; 0 when no vfs is attached */
		if (vp->v_vfsp)
			i = vp->v_vfsp->vfs_fstype;
		else
			i = 0;

		cmn_err(CE_WARN,
			"vn_reclaim: vnode 0x%x fstype %d (%s) has unreclaimed data (pgcnt %d dbuf %d dpages 0x%x), flag:%x",
			vp, i, vfssw[i].vsw_name ? vfssw[i].vsw_name : "?",
			vp->v_pgcnt, vp->v_dbuf, vp->v_dpages, vp->v_flag);
#endif
		VOP_FLUSHINVAL_PAGES(vp, 0, LONG_MAX, FI_NONE);
	}

	ASSERT(vp->v_dpages == NULL && vp->v_dbuf == 0 && vp->v_pgcnt == 0);
	/*
	 * The v_pgcnt assertion will catch debug systems that screw up.
	 * Patch up v_pgcnt for non-debug systems -- v_pgcnt probably
	 * means accounting problem here, not hashed data.
	 */
	vp->v_pgcnt = 0;

	s = VN_LOCK(vp);
	vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);

	/* Drop the vnode from the vnode hash; vn_unhash zeroes v_number. */
	if (vp->v_number) {
		vn_unhash(vp);
	}
	ASSERT(vp->v_hashp == (vnode_t *)NULL);
	ASSERT(vp->v_hashn == (vnode_t *)NULL);

	/*
	 * Clear all flags except the ones relevant to the fact
	 * that it's being reclaimed.
	 */
	vp->v_flag &= (VRECLM|VWAIT|VSHARE|VLOCK);

	VN_UNLOCK(vp, s);
	vp->v_stream = NULL;
	vp->v_type = VNON;
	vp->v_fbhv = NULL;

	/*
	 * All locks should have been released by now, but
	 * the lock sema data structure needs to be taken care of.
	 */
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Reclaim all page cache related data.
	 * This could sleep waiting to synchronize with the
	 * threads trying to recycle hashed pages.. 
	 */
	vnode_pcache_reclaim(vp);
#ifdef CKPT
	/*
	 * Free lookup info...
	 */
	if (ckpt_enabled)
		ckpt_vnode_free(vp);
#endif
	/* v_mreg pointing back at the vnode is the empty-list state set in vn_alloc */
	ASSERT(vp->v_mreg == (struct pregion *)vp);
	ASSERT(vp->v_intpcount == 0);
	return 0;
}

/*
 * Finish a reclaim attempt on vp: under the vnode lock, wake any
 * threads that flagged themselves with VWAIT, then clear both VRECLM
 * and VWAIT.
 */
static void
vn_wakeup(struct vnode *vp)
{
	int spl = VN_LOCK(vp);

	vn_trace_entry(vp, "vn_wakeup", (inst_t *)__return_address);

	if ((vp->v_flag & VWAIT) != 0)
		sv_broadcast_bounded(vptosync(vp));

	vp->v_flag &= ~(VRECLM|VWAIT);
	VN_UNLOCK(vp, spl);
}

/*
 * Allocate a vnode struct for filesystem usage.
 * Reclaim the oldest on the global freelist if there are any,
 * otherwise allocate another.
 */
/*
 * vn_alloc: hand out a vnode for filesystem use.
 *
 * vfsp - stored in the new vnode's v_vfsp.
 * type - stored in v_type.
 * dev  - stored in v_rdev.
 *
 * Either recycles a vnode from one of the free lists (reclaiming it via
 * vn_reclaim) or takes a zeroed one from the vnode zone.  In both cases
 * the vnode is given a fresh v_number from vn_generation, entered in the
 * vnode hash, and returned with v_count == 1 and v_next/v_prev pointing
 * at itself.  The heap allocation uses KM_SLEEP.
 */
struct vnode *
vn_alloc(struct vfs *vfsp, enum vtype type, dev_t dev)
{
	register struct vnode *vp;
	register int list;
	register vnlist_t *vlist;
	register int cnt, s, i;
	register u_long vnumber;
	long target;
	int alloced = 0;	/* set when vp came from the heap zone */
	int error;
#define VN_ALLOC_TRIES 4

	VOPINFO.vn_alloc++;

	/* Choose a free list by rotating private.p_hand, and lock it. */
	if (vfreelistmask) {
		list = ++private.p_hand & vfreelistmask;
		s = LOCK_VFREELIST(list);
	} else {
		s = LOCK_VFP(vfreelist);
		list = 0;
	}

	vlist = VFREELIST(list);
	vp = vlist->vl_next;

	/*
	 * Easy cases: if list is empty, allocate a new vnode from the
	 * heap; if first vnode on the list is empty, use it.
	 */
	if (vp == (struct vnode *)vlist)
		goto alloc;

	if (vp->v_fbhv == NULL) {
		ASSERT(!vp->v_dbuf && !vp->v_dpages && !vp->v_pgcnt);
		cnt = VN_ALLOC_TRIES;
		goto get;
	}

#if VNODE_THRASH_TEST
	/*
	 * Turn this ifdef on to force races with 
	 * vn_alloc/vn_reclaim/vn_get/dnlc_lookup_fast, etc.
	 */
	if (fetchIntHot(&vn_nfree) > 0) {
		cnt = VN_ALLOC_TRIES;
		goto get;
	}
#endif

	/*
	 * Allocate a minimum of vn_minvn vnodes.
	 * XXX  Do this from vn_init?
	 */
	vnumber = fetchUlongHot(&vn_vnumber) - vn_epoch;	/* # of extant vnodes */
	if (vnumber < vn_minvn)
		goto alloc;

	cnt = fetchIntHot(&vn_nfree);
	vnumber -= cnt;				/* # of vnodes in use */

	/*
	 * If number of free vnode < number in-use, just alloc a new vnode.
	 */
	if (cnt < vnumber)
		goto alloc;

	/*
	 * Calculate target # of total vnodes to have allocated.
	 */
	target = vnumber;

	/*
	 * If number of free vnode < half of target, alloc a new vnode.
	 */
	if (cnt < target/2)
		goto alloc;

	/*
	 * If below target # of free vnodes, devise the chance that
	 * we'll manufacture a new vnode from the heap.  The closer
	 * we are to target, the more likely we'll just allocate from
	 * the freelist -- don't want to manufacture vnodes willy-nilly
	 * just to have vhand/vn_shake decommission them.
	 */
	if (cnt < target) {
		vnumber = target / 16;
		i = 0xf;
		if (vn_shaken > 0) {
			vn_shaken--;
			vnumber <<= 1;
		}

		while (cnt < target - vnumber) {
			i >>= 1;
			vnumber <<= 1;
		}

		if (!(++vn_coin & i))
			goto alloc;
	}

	/*
	 * If a reclaimable vnode isn't found after searching a very
	 * few vnodes, put those vnodes on the tail of the free list
	 * and allocate a vnode from the heap.  This shouldn't happen
	 * often, and vn_shake will trim down the number of vnodes if
	 * the count rises too high.
	 */
	cnt = VN_ALLOC_TRIES;
again:
	/* The freelist lock for `list' is held throughout this scan. */
	for ( ; vp != (struct vnode *)vlist ; vp = vp->v_next) {
		ASSERT(vp->v_listid == list);

		/* Skip vnodes still carrying buffers, dirty pages or >8 pages. */
		if (vp->v_dbuf || vp->v_dpages || vp->v_pgcnt > 8) {
			VOPINFO.vn_afreeloops++;
			if (--cnt < 0) {
				if (vlist->vl_next == vp) {
					vn_unlink(vp);
					vn_append(vp, vlist);
				} else if (vp->v_next !=
					   (struct vnode *)vlist) {
					vn_relink(vlist, vp, vp->v_prev);
				}
				VOPINFO.vn_afreemiss++;
				break;
			}
			continue;
		}
	get:
		VOPINFO.vn_afreeloops++;

		NESTED_VN_LOCK(vp);
		ASSERT(vp->v_count == 0);
		ASSERT((vp->v_flag & VINACT) == 0);
		if ((vp->v_flag & VRECLM) == 0) {

			/* Claim the vnode for reclaim before dropping any lock. */
			vp->v_flag |= VRECLM;
			NESTED_VN_UNLOCK(vp);

			if (vlist->vl_next != vp && vlist->vl_prev != vp) {
				vn_relink(vlist, vp->v_next, vp->v_prev);
				vp->v_next = vp->v_prev = vp;
			} else
				vn_unlink(vp);
			ASSERT(vlist->vl_next->v_prev == (struct vnode *)vlist);
			ASSERT(vlist->vl_prev->v_next == (struct vnode *)vlist);
			ASSERT(vp->v_listid == list);
			ASSERT(vfreelist[list].vf_lsize > 0);
			vfreelist[list].vf_lsize--;
			UNLOCK_VFREELIST(list, s);

			/* Called with the freelist lock dropped. */
			error = vn_reclaim(vp, 0);

			if (error) {
				/*
				 * Freelist lock must be held before cvsema'ing
				 * vnode.  A vn_get could happen on this vnode:
				 * just after this process releases vp, it gets
				 * an interrupt; the vn_get process acquires
				 * freelist lock and dequeues it from nowhere;
				 * then this process puts it back on free list.
				 */
				vn_trace_entry(vp, "REC FAIL1",
					(inst_t *)__return_address);
				s = LOCK_VFREELIST(list);
				vn_wakeup(vp);

				/* Put the vnode back at the tail and rescan from the head. */
				ASSERT(vp->v_listid == list);
				ASSERT(vfreelist[list].vf_lsize >= 0);
				vfreelist[list].vf_lsize++;
				vn_append(vp, vlist);
				vp = vlist->vl_next;
				vn_trace_entry(vp, "REC FAIL2",
					(inst_t *)__return_address);
				if (--cnt < 0) {
					break;
				}
				goto again;
			} else {
				vn_wakeup(vp);
				VOPINFO.vn_afree++;
				atomicAddIntHot(&vn_nfree, -1);

				ASSERT(!(vp->v_number));
#if defined(DEBUG) && defined(VNODE_INIT_BITLOCK)
				destroy_bitlock(&vp->v_flag);
#endif
				vp->v_number =
					atomicAddUint64Hot(&vn_generation, 1);
				goto gotit;
			}
		}
		NESTED_VN_UNLOCK(vp);
	}
alloc:
	/* Every path reaching here still holds the freelist lock. */
	UNLOCK_VFREELIST(list, s);

	VOPINFO.vn_aheap++;

	vp = kmem_zone_zalloc(vn_zone, KM_SLEEP);
	vp->v_number = atomicAddUint64Hot(&vn_generation, 1);
	alloced = 1;

#ifdef VNODE_TRACING
	vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, 0);
#endif
	vp->v_flag = VSHARE;
	(void) atomicAddUlongHot(&vn_vnumber, 1);

	init_bitlock(&vp->v_pcacheflag, VNODE_PCACHE_LOCKBIT, "v_pcache",
		     (long)vp->v_number);
# if defined(DEBUG) && defined(VNODE_INIT_BITLOCK)
	init_bitlock(&vp->v_flag, VLOCK, "vnode", (long)vp->v_number);
# endif
	init_mutex(&vp->v_filocksem, MUTEX_DEFAULT, "vfl", (long)vp->v_number);
	init_mutex(&vp->v_buf_lock, MUTEX_DEFAULT, "vnbuf", (long)vp->v_number);

	/* Empty region list: both links point back at the vnode itself. */
	vp->v_mreg = vp->v_mregb = (struct pregion *)vp;
	vnode_pcache_init(vp);
gotit:
	/* Common tail for recycled and freshly allocated vnodes. */
	vn_hash(vp);

	ASSERT(vp->v_count == 0);
	ASSERT(vp->v_dpages == NULL && vp->v_dbuf == 0 && vp->v_pgcnt == 0);
	ASSERT(vp->v_filocks == NULL);
	ASSERT(vp->v_intpcount == 0);
	ASSERT(vp->v_flag & VSHARE);

	/* 
	 * VLOCK may or may not be set, because other threads may be
	 * trying to lock this vnode (in vn_get) in order to see if
	 * it's the one they're looking for.  They won't be able to
	 * use the vnode though because the v_number won't match.
	*/
	ASSERT(!(vp->v_flag & (VNOSWAP | VISSWAP |
			       VREPLICABLE |
			   /*  VNONREPLICABLE | XXX uncomment this */
			       VFRLOCKS | VENF_LOCKING |
			       VREMAPPING | VDOCMP | VDUP |
			       VSEMAPHORE | VUSYNC |
			       VINACT | VRECLM | VEVICT | VWAIT |
			       VFLUSH | VLOCKHOLD | VINACTIVE_TEARDOWN |
			       VROOT | VMOUNTING)));

	vnode_pcache_reinit(vp);

	/* Initialize the first behavior and the behavior chain head. */
	if (!alloced) {
		ASSERT(VN_BHV_NOT_READ_LOCKED(VN_BHV_HEAD(vp)) && 
		       VN_BHV_NOT_WRITE_LOCKED(VN_BHV_HEAD(vp)));
		vn_bhv_head_reinit(VN_BHV_HEAD(vp));
	} else
		vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");

	vp->v_count = 1;
	vp->v_vfsp = vfsp;
	vp->v_type = type;
	vp->v_rdev = dev;
	vp->v_next = vp->v_prev = vp;

	vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address);
#ifdef CKPT
	ASSERT(vp->v_ckpt == NULL);
#endif
	return vp;
}

/*
 * Free an isolated vnode, putting it at the front of a vfreelist.
 * The vnode must not have any other references.
 */
/*
 * Release an isolated vnode onto a free list.  The caller holds the only
 * reference (v_count == 1).  With multiple free lists, the list picked by
 * private.p_hand is compared with its ring successor and the shorter of
 * the two receives the vnode.
 */
void
vn_free(struct vnode *vp)
{
	vfreelist_t *target;
	const int mask = vfreelistmask;
	int spl;
#ifdef CKPT
	extern int ckpt_enabled;
#endif

#ifdef DEBUG
	ASSERT(vp->v_count == 1);
	if (vp->v_intpcount)
	    printf("vn_free: v_intpcount = %d\n", vp->v_intpcount);
	ASSERT(vp->v_intpcount == 0);
#endif

	if (mask == 0) {
		/* Single free list: nothing to choose. */
		vp->v_listid = 0;
		target = vfreelist;
	} else {
		vfreelist_t *neighbor;

		target = &vfreelist[private.p_hand++ & mask];
		neighbor = target->vf_next;
		if (target->vf_lsize > neighbor->vf_lsize)
			target = neighbor;

		vp->v_listid = target->vf_listid;
	}

	vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address);
	vp->v_count = 0;
	vp->v_fbhv = NULL;

	vnode_pcache_free(vp);
#ifdef CKPT
	if (ckpt_enabled)
		ckpt_vnode_free(vp);
#endif

	/* Link the vnode onto the chosen list under that list's lock. */
	spl = LOCK_VFP(target);
	ASSERT(vp->v_listid == target->vf_listid);
	target->vf_lsize++;
	vn_insert(vp, &target->vf_freelist);
	UNLOCK_VFP(target, spl);

	atomicAddIntHot(&vn_nfree, 1);

	VOPINFO.vn_rele++;
}

/*
 * vn_shake_freelist: return free vnodes to the vnode zone.
 *
 * nfree - budget of reclaim attempts; decremented after every attempt,
 *         whether it destroyed a vnode or failed in vn_reclaim.
 *
 * Each pass picks the free list with the largest vf_lsize, looks for a
 * vnode with no pages (v_pgcnt == 0) that is not already being reclaimed,
 * reclaims it, purges it from the name cache, bumps vn_epoch with every
 * freelist lock held, and frees its memory.  Returns the number of vnodes
 * actually destroyed.  Stops early if vn_epoch has reached INT_MAX or the
 * chosen list has no eligible vnode.
 */
static int
vn_shake_freelist(register int nfree)
{
	register struct vnode *vp;
	register vfreelist_t *vfp = vfreelist;
	register vnlist_t *vlist;
	register int list;
	int s, error;
	int shaken = 0;		/* vnodes destroyed so far */
#ifdef DEBUG
	static int vn_shake_lock = 0;
#endif

again:
	if (vn_epoch == INT_MAX || nfree <= 0)
		return shaken;

	/* With several free lists, walk the ring and pick the longest. */
	if (list = vfreelistmask) {
		register int count = vfp->vf_lsize;
		register vfreelist_t *tp = vfp;

		do {
			vfp = vfp->vf_next;
			if (vfp->vf_lsize > count) {
				count = vfp->vf_lsize;
				tp = vfp;
			}
		} while (--list > 0);

		vfp = tp;
		list = tp->vf_listid;
	}

	vlist = &vfp->vf_freelist;
	s = LOCK_VFP(vfp);

	for (vp = vlist->vl_next; vp != (struct vnode *)vlist; vp = vp->v_next)
	{
		ASSERT(vp->v_listid == list);

		/* Leave vnodes that still have cached pages alone. */
		if (vp->v_pgcnt)
			continue;

		NESTED_VN_LOCK(vp);
		ASSERT((vp->v_flag & VINACT) == 0);
		if ((vp->v_flag & VRECLM) == 0) {
			vp->v_flag |= VRECLM;
			ASSERT(vp->v_count == 0);
			NESTED_VN_UNLOCK(vp);

			vn_unlink(vp);
			ASSERT(vfp->vf_lsize > 0);
			vfp->vf_lsize--;
			UNLOCK_VFP(vfp, s);

			/* Called with the freelist lock dropped. */
			error = vn_reclaim(vp, 0);

			/*
			 * Purge soft references to the vnode
			 * from the name cache.
			 */
			if (!error) {
				dnlc_remove_vp(vp);
				ASSERT(fetchIntHot(&vn_nfree) > 0);
				atomicAddIntHot(&vn_nfree, -1);
			} else {
				/*
				 * See comments in vn_alloc for explanation
				 * why we lock vfreelist before cvsema in
				 * error case.
				 */
				s = LOCK_VFP(vfp);
				vn_wakeup(vp);

				/* Reclaim failed: put the vnode back at the tail. */
				vp->v_listid = list;
				vn_append(vp, vlist);

				ASSERT(vfp->vf_lsize >= 0);
				vfp->vf_lsize++;
				UNLOCK_VFP(vfp, s);
				/*
				 * Decrement nfree if we fail so we don't
				 * get stuck here all day.
				 */
				if (nfree-- <= 0)
					return shaken;
				goto again;
			}

			/*
			 * Take every freelist lock: the first with
			 * LOCK_VFP, the rest of the ring as nested locks.
			 */
			s = LOCK_VFP(vfreelist);
			for (list = vfreelistmask, vfp = vfreelist->vf_next;
			     list-- > 0;
			     vfp = vfp->vf_next) {
				NESTED_LOCK_VFP(vfp);
			}
#ifdef DEBUG
			vn_shake_lock = 1;
#endif
			vn_wakeup(vp);

			/*
			 * Define a new epoch in the history of vnodes...
			 * Protected by having *every* freelist lock held.
			 */
			vn_epoch++;
			vn_shaken += 4;	/* dampen vn_alloc's desire to */
					/* allocate vnodes from heap */
			VOPINFO.vn_destroy++;

			for (list = vfreelistmask, vfp = vfreelist->vf_next;
			     list-- > 0;
			     vfp = vfp->vf_next) {
				NESTED_UNLOCK_VFP(vfp);
			}
			UNLOCK_VFP(vfreelist, s);
#ifdef DEBUG
			ASSERT(vn_shake_lock == 1);
			vn_shake_lock = 0;
#endif /* DEBUG */
#ifdef VNODE_TRACING
			ktrace_free(vp->v_trace);
#endif /* VNODE_TRACING */

			/* Teardown behavior chain state. */
			vn_bhv_head_destroy(VN_BHV_HEAD(vp));

			/* Undo the per-vnode lock setup done in vn_alloc, then free. */
			vp->v_flag = 0;	/* debug */
			destroy_bitlock(&vp->v_pcacheflag);
#if defined(DEBUG) && defined(VNODE_INIT_BITLOCK)
			destroy_bitlock(&vp->v_flag);
#endif
			mutex_destroy(&vp->v_filocksem);
			mutex_destroy(&vp->v_buf_lock);
			kmem_zone_free(vn_zone, vp);
			shaken++;

			if (nfree-- <= 0)
				return shaken;

			goto again;
		}
		NESTED_VN_UNLOCK(vp);
	}
	/* No eligible vnode on the chosen list: give up. */
	UNLOCK_VFP(vfp, s);

	return shaken;
}

/*
 * vn_shake: memory-shake callback registered in vn_init.
 *
 * level - unused.
 *
 * Decides how many free vnodes to give back to the heap and passes that
 * budget to vn_shake_freelist.  Nothing is freed while the number of
 * extant vnodes is below vn_minvn, or while free vnodes do not outnumber
 * in-use vnodes.  Otherwise the budget is the larger of 1/16th of the
 * excess of free over in-use vnodes and 1/128th of the free vnodes,
 * capped at 512 and further limited so that the extant count does not
 * drop below vn_minvn.
 *
 * Returns the number of vnodes actually destroyed.
 */
/* ARGSUSED */
int
vn_shake(int level)
{
	int	total_vnodes = fetchUlongHot(&vn_vnumber) - vn_epoch;
	int	free = fetchIntHot(&vn_nfree);
	int	num_to_free;
	int	v;

	/*
	 * If we're below our configured minimum number of
	 * vnodes, then just get out.
	 */
	if (total_vnodes < vn_minvn) {
		return 0;
	}

	v = total_vnodes - free;	/* # of in-use vnodes */
	ASSERT(v >= 0);

	if (free <= v)	{		/* don't steal any vnodes */
		return 0;		/* if free count <= inuse count */
	}

	/*
	 * free > v from here on, so the excess is strictly positive.
	 */
	v = (free - v) / 16;		/* take either 1/Nth of # over target */
	free = free / 128;		/* or 1/Mth of free vnodes... */

	num_to_free = MAX(free, v);
	num_to_free = MIN(num_to_free, 512);	/* but no more than 512 */

	/*
	 * Don't pull the number of extant vnodes below the desired
	 * minimum.
	 */
	if ((total_vnodes - num_to_free) < vn_minvn) {
		num_to_free = total_vnodes - vn_minvn;
	}

	return vn_shake_freelist(num_to_free);
}

/*
 * Routine which 1) makes a vnode invisible to vn_get, and 2) wakes up
 * anyone that has already found the vnode via vn_get but is blocked
 * there on the VINACT flag.  
 * 
 * This routine may only be called during inactivation processing.
 * It's used by nfs_inactive to resolve a deadlock condition.
 */
/*
 * Mark vp VGONE so that vn_get refuses it, waking any thread already
 * parked on it with VWAIT.  The vnode must have no references and must
 * be in the middle of inactivation (VINACT set).
 */
void
vn_gone(struct vnode *vp)
{
	int spl;

	ASSERT(vp->v_count == 0);
	ASSERT(vp->v_flag & VINACT);

	spl = VN_LOCK(vp);
	vn_trace_entry(vp, "vn_gone", (inst_t *)__return_address);

	/* Release the waiters first, then publish VGONE. */
	if ((vp->v_flag & VWAIT) != 0) {
		sv_broadcast_bounded(vptosync(vp));
		vp->v_flag &= ~VWAIT;
	}
	vp->v_flag |= VGONE;

	VN_UNLOCK(vp, spl);
}

/*
 * Based on the value of the vnode's reference count, attempt to set
 * a vnode's VEVICT flag.
 *
 * If v_count > 1, then do nothing and return 1.
 * If v_count == 1, then set the vnode's VEVICT flag and return 0.
 *
 * The VEVICT flag causes callers of vn_get to wait until the vnode is
 * inactivated, or the evict condition is otherwise cleared (currently, 
 * there's no interface to clear an evict condition).
 * 
 * vn_evict is used by a caller who wishes to prevent additional references
 * to a vnode iff it holds the only reference.
 */
/*
 * Try to mark vp for eviction.  Under the vnode lock: if the caller
 * holds the only reference (v_count == 1), set VEVICT and return 0;
 * otherwise leave the vnode untouched and return 1.
 */
int
vn_evict(struct vnode *vp)
{
	int busy = 1;
	int spl;

	spl = VN_LOCK(vp);
	vn_trace_entry(vp, "vn_evict", (inst_t *)__return_address);
	ASSERT(vp->v_count >= 1);

	if (vp->v_count == 1) {
		vp->v_flag |= VEVICT;
		busy = 0;
	}

	VN_UNLOCK(vp, spl);
	return busy;
}

/*
 * Get and reference a vnode, possibly removing it from the freelist.
 * If v_count is zero and VINACT is set, then vn_rele is inactivating
 * and we must wait for vp to go on the freelist, or to be reclaimed.
 * If v_count is zero and VRECLM is set, vn_alloc is reclaiming vp;
 * we must sleep till vp is reclaimed, then return false to our caller,
 * who will try again to hash vp's identifier in its filesystem cache.
 * If during the sleep on vfreelock we miss a reclaim, we will notice
 * that v_number has changed.
 */
/*
 * vn_get parameters and results:
 *
 * vp    - vnode pointer sampled earlier by the caller; it is not
 *         dereferenced until it has been validated.
 * vmap  - the caller's snapshot of the vnode: v_id (free list index),
 *         v_epoch and v_number.  v_number must be non-zero.
 * flags - VN_GET_NOWAIT makes the call fail rather than sleep when the
 *         vnode is being inactivated, reclaimed or evicted.
 *
 * Returns vp with one additional reference, or NULL.  On a NULL return
 * vmap->v_id reports the reason: 0 when the vnode could not be found, its
 * v_number no longer matches, or it has no behaviors attached; -1 when
 * VGONE is set; -2 when VN_GET_NOWAIT prevented a wait.  If the free list
 * index is out of range, v_id is left as the caller passed it.
 */
vnode_t *
vn_get(register struct vnode *vp, register vmap_t *vmap, uint flags)
{
	register int list;
	register vfreelist_t *vfp;
	register int s;

	list = vmap->v_id;
	VOPINFO.vn_get++;

	/*
	 * A note about v_number:  it gets set to a non-zero value at
	 * vn_alloc() time and then reset to zero in vn_reclaim after
	 * the call to VOP_RECLAIM.  A file system calling vn_get will 
	 * have snapshotted the v_number and put it in the vmap structure
	 * while the vnode was in its hash table:  meaning it must have
	 * been done between the time vn_alloc() completed and the time 
	 * VOP_RECLAIM completed.  Hence, vmap->v_number should always be 
	 * non-zero
	 */ 
	if (vmap->v_number == 0) {
#pragma mips_frequency_hint NEVER
	        printf("vn_get error: vp=0x%x vmap->v_number=%d\n",
		       vp, vmap->v_number);
	        panic("vn_get: vmap->v_number == 0");
	}
again:
	/*
	 * NOTE:  must not dereference 'vp' until after verifying that
	 * it still refers to a vnode.
	 */
	if (list < 0 || list > vfreelistmask)
		goto fail;

	vfp = &vfreelist[list];
	s = LOCK_VFP(vfp);

	/*
	 * Check that the epoch of vnodes hasn't changed.  Epoch only
	 * changes when a vnode is deallocated, which means that sampled
	 * vnode pointers in filesystem caches may now be stale.  If the
	 * epoch has changed, search for the vnode in the vnode hash.
	 */
	if (vmap->v_epoch != vn_epoch) {
		/* if vn_find succeeds, it's guaranteed to find vp */
		if (vn_find(vp, vmap->v_number) == NULL) {
#pragma mips_frequency_hint NEVER
			UNLOCK_VFP(vfp, s);
			VOPINFO.vn_gchg++;
			vmap->v_id = 0;
			goto fail;
		}
	}

	/*
	 * Now it's ok to dereference 'vp'.
	 */
	vn_trace_entry(vp, "GET AGAIN", (inst_t *)__return_address);
#ifdef CKPT
	ASSERT(vp->v_ckpt != (ckpt_handle_t)-1L);
#endif
	NESTED_VN_LOCK(vp);
	vn_trace_entry(vp, "GET LOCKED", (inst_t *)__return_address);
	if (vp->v_number != vmap->v_number) {
#pragma mips_frequency_hint NEVER
		/*
		 * The vnode was recycled.  The freelist lock is dropped
		 * first as a nested unlock; VN_UNLOCK then consumes `s'.
		 */
		NESTED_UNLOCK_VFP(vfp);
		vmap->v_id = 0;
		vn_trace_entry(vp, "GET VERS", (inst_t *)__return_address);
		VN_UNLOCK(vp, s);
		VOPINFO.vn_gchg++;
		goto fail;
	}

	/*
	 * If the vnode is being inactivated, reclaimed, or evicted,
	 * then wait until the condition clears (unless VN_GET_NOWAIT
	 * is specified).  If the vnode has VGONE set, then return
	 * immediately.
	 */
	if (vp->v_flag & (VINACT|VRECLM|VGONE|VEVICT)) {
#pragma mips_frequency_hint NEVER

		ASSERT((vp->v_flag & VEVICT) ? vp->v_count <= 1 : 1);
		ASSERT((vp->v_flag & (VINACT|VRECLM|VGONE)) ? 
		       vp->v_count == 0 : 1);

		if (vp->v_flag & VGONE) {
			vmap->v_id = -1;
			vn_trace_entry(vp, "GET GONE",
				       (inst_t *)__return_address);
			NESTED_VN_UNLOCK(vp);
			UNLOCK_VFP(vfp, s);
			goto fail;
		}
		/*
		 * If the caller cannot get stuck waiting
		 * for the vnode to complete its inactive
		 * or reclaim routine, then return NULL.
		 * Set v_id to -2 to indicate that this is
		 * why NULL was returned.
		 */
		if (flags & VN_GET_NOWAIT) {
			vmap->v_id = -2;
			vn_trace_entry(vp, "GET NOWAIT",
				       (inst_t *)__return_address);
			NESTED_VN_UNLOCK(vp);
			UNLOCK_VFP(vfp, s);
			goto fail;
		}
		/*
		 * Drop the freelist lock, flag ourselves as a waiter and
		 * sleep on the vnode's shared sync variable; the vnode
		 * bit lock and `s' are handed to sv_bitlock_wait.  Start
		 * over from scratch once woken.
		 */
		NESTED_UNLOCK_VFP(vfp);
		vp->v_flag |= VWAIT;
		vn_trace_entry(vp, "GET RECL",
			(inst_t *)__return_address);
		sv_bitlock_wait(vptosync(vp), PINOD,
				&vp->v_flag, VLOCK, s);
		VOPINFO.vn_gchg++;
		goto again;
	}

	if (vp->v_count == 0) {
		/*
		 * vnode could have travelled from one freelist to
		 * another since it was sampled by caller.
		 */
		if (list != vp->v_listid) {
#pragma mips_frequency_hint NEVER
			list = vp->v_listid;
			vn_trace_entry(vp, "GET SWTCH",
				(inst_t *)__return_address);
			NESTED_VN_UNLOCK(vp);
			UNLOCK_VFP(vfp, s);
			VOPINFO.vn_gchg++;
			goto again;
		}

		/*
		 * If there are no behaviors attached to this vnode,
		 * there is no point in giving it back to the caller.
		 * This can happen if the behavior was detached in
		 * the filesystem's inactive routine.
		 */
		if (vp->v_fbhv == NULL) {
#pragma mips_frequency_hint NEVER
			vn_trace_entry(vp, "GET NO BHV",
				(inst_t *)__return_address);
			NESTED_VN_UNLOCK(vp);
			UNLOCK_VFP(vfp, s);
			vmap->v_id = 0;
			goto fail;
		}

		/*
		 * Give vp one reference for our caller and unlink it from
		 * the vnode freelist.
		 */
		vp->v_count = 1;
		vn_trace_hold(vp, __FILE__, __LINE__,
			(inst_t *)__return_address);
		NESTED_VN_UNLOCK(vp);

		ASSERT(vp->v_next != vp && vp->v_prev != vp);
		ASSERT(vp->v_flag & VSHARE);
		ASSERT(vp->v_filocks == NULL);

		vn_unlink(vp);
		ASSERT(vfp->vf_lsize > 0);
		vfp->vf_lsize--;
		UNLOCK_VFP(vfp, s);

		ASSERT(fetchIntHot(&vn_nfree) > 0);
		atomicAddIntHot(&vn_nfree, -1);
		VOPINFO.vn_gfree++;

	} else {
		/* Already referenced, so not on a free list: just add a hold. */
		vp->v_count++;
		vn_trace_hold(vp, __FILE__, __LINE__, 
			      (inst_t *)__return_address);
		NESTED_VN_UNLOCK(vp);
		UNLOCK_VFP(vfp, s);
	}

#ifdef CKPT
	ASSERT(vp->v_ckpt != (ckpt_handle_t)-1L);
#endif
	return vp;
 fail:
	return NULL;
}

/*
 * Purge a vnode from the cache: reclaim it and put it back on the head
 * of its freelist.
 *
 *	vp	vnode to purge.
 *	vmap	identity of vp as sampled earlier by the caller; v_id is
 *		used as the freelist index, and v_number / v_epoch are
 *		compared against the current state to detect staleness.
 *
 * At this point the vnode is guaranteed to have no references (v_count == 0)
 * The caller has to make sure that there are no ways someone could
 * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
 *
 * Returns silently (without purging) when the freelist index is out of
 * range, when the vnode can no longer be found after an epoch change,
 * when its v_number no longer matches, or when it has picked up a
 * reference.  Panics if vmap->v_number is 0 or if vn_reclaim fails.
 */
void
vn_purge(struct vnode *vp, vmap_t *vmap)
{
	register vfreelist_t *vfp;
	register int list;
	register int s;

	list = vmap->v_id;

	/*
	 * See the note about v_number in vn_get.
	 */
	if (vmap->v_number == 0) {
	        printf("vn_purge error: vp=0x%x vmap->v_number=%d\n",
		       vp, vmap->v_number);
	        panic("vn_purge: vmap->v_number == 0");
	}
again:
	/* A freelist index outside [0, vfreelistmask] names no freelist. */
	if (list < 0 || list > vfreelistmask)
		return;

	vfp = &vfreelist[list];
	s = LOCK_VFP(vfp);

	/*
	 * Check that the epoch of vnodes hasn't changed.  Epoch only
	 * changes when a vnode is deallocated, which means that sampled
	 * vnode pointers in filesystem caches may now be stale.  If the
	 * epoch has changed, search for the vnode in the vnode hash.
	 */
	if (vmap->v_epoch != vn_epoch) {
		/* if vn_find succeeds, it's guaranteed to find vp */
		if (vn_find(vp, vmap->v_number) == NULL) {
			UNLOCK_VFP(vfp, s);
			VOPINFO.vn_gchg++;
			return;
		}
	}

	/* if you don't SHARE you don't get to play */
	ASSERT(vp->v_flag & VSHARE);	

	/*
	 * Check whether vp has already been reclaimed since our caller
	 * sampled its version while holding a filesystem cache lock that
	 * its VOP_RECLAIM function acquires.
	 */
	NESTED_VN_LOCK(vp);
	vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
	if (vp->v_number != vmap->v_number) {
		NESTED_VN_UNLOCK(vp);
		UNLOCK_VFP(vfp, s);
		return;
	}

	/*
	 * If vp is being reclaimed or inactivated, wait until it is inert,
	 * then proceed.  Can't assume that vnode is actually reclaimed
	 * just because the reclaimed flag is asserted -- a vn_alloc
	 * reclaim can fail.
	 *
	 * The freelist lock is dropped first; sv_bitlock_wait is handed
	 * the vnode's VLOCK bit and the saved 's', and we restart from
	 * 'again' once it returns.
	 */
	if (vp->v_flag & (VINACT | VRECLM)) {
		ASSERT(vp->v_count == 0);
		NESTED_UNLOCK_VFP(vfp);
		vp->v_flag |= VWAIT;
		sv_bitlock_wait(vptosync(vp), PINOD, &vp->v_flag, VLOCK, s);
		goto again;
	}

	/*
	 * Another process could have raced in and gotten this vnode...
	 */
	if (vp->v_count > 0) {
		NESTED_UNLOCK_VFP(vfp);
		VN_UNLOCK(vp, s);
		return;
	}

	/*
	 * vnode could have travelled from one freelist to
	 * another since it was sampled by caller.
	 * Retry against the freelist it is on now.
	 */
	if (list != vp->v_listid) {
		list = vp->v_listid;
		NESTED_UNLOCK_VFP(vfp);
		VN_UNLOCK(vp, s);
		VOPINFO.vn_gchg++;
		goto again;
	}

	/* Mark the vnode as being reclaimed before dropping its lock. */
	vp->v_flag |= VRECLM;
	NESTED_VN_UNLOCK(vp);

	vn_unlink(vp);
	/*
	 * XXX	There is no routine that relies on a freelist's vf_lsize
	 * XXX	exactly matching the number of free list entries.  Since
	 * XXX	this vnode is going right back on the same freelist, we
	 * XXX	won't bother to decrement, and later, increment vf_lsize.
	ASSERT(vfp->vf_lsize > 0);
	vfp->vf_lsize--;
	 */
	UNLOCK_VFP(vfp, s);

	/*
	 * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells
	 * vp's filesystem to flush and invalidate all cached resources.
	 * When vn_reclaim returns, vp should have no private data,
	 * either in a system cache or attached to the behavior chain.
	 */
	if (vn_reclaim(vp, FSYNC_INVAL) != 0)
		panic("vn_purge: cannot reclaim");

	/*
	 * Setting v_listid is protected by VRECLM flag above...
	vp->v_listid = list;
	 */
	s = LOCK_VFP(vfp);
	ASSERT(vp->v_listid == list);
	/* vn_insert places vp at the head of the freelist. */
	vn_insert(vp, &vfp->vf_freelist);

	/*
	 * XXX	See comments above about vf_lsize.
	ASSERT(vfp->vf_lsize >= 0);
	vfp->vf_lsize++;
	 */
	UNLOCK_VFP(vfp, s);

	/*
	 * Wakeup anyone waiting for vp to be reclaimed.
	 */
	vn_wakeup(vp);
}

/*
 * Cause a vnode to be inaccessible to everyone except those that
 * already have a reference.
 *
 *	vp	vnode to kill; asserted to be a VSHARE character-device
 *		vnode (VCHR) with at least one reference.
 *
 * A temporary reference is taken around the work, any file locks are
 * cleaned, and VOP_CLOSE is called with the FROM_VN_KILL marker.  The
 * status VOP_CLOSE reports (closerr) is not examined.
 */
void
vn_kill(struct vnode *vp)
{
	register int s;
	/*REFERENCED*/
	int  closerr;

	/* if you don't SHARE you don't get to play */
	ASSERT(vp->v_flag & VSHARE);
	ASSERT(vp->v_type == VCHR);
	ASSERT(vp->v_count > 0);

	s = VN_LOCK(vp);
	/* 
	 * Add ref so don't race with vn_rele/vn_reclaim.
	 *
	 * XXX How does this prevent a race?  Since the VOP_CLOSE below 
	 * doesn't cause a vn_rele, then whatever vn_rele we're worried 
	 * about could equally as well happen prior to us bumping the
	 * count here.  Could this code be an artifact of the bug
	 * in vhangup where it was calling vn_kill on a session vnode
	 * that was already deallocated?
	 */
	vp->v_count++;
	vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address);
	VN_UNLOCK(vp, s);

	/*
	 * Clean out any file locks in the unlikely event there are any.
	 */
	if (vp->v_flag & VFRLOCKS)
		fs_cleanlock(vp, sys_flid, sys_cred);

	/*
	 * Pass special note to close routine - this won't really
	 * close anything but will prevent any further "gets" from
	 * succeeding on the vnode.
	 */
	VOP_CLOSE(vp, 0, FROM_VN_KILL, sys_cred, closerr);

	/* release our added ref */
	VN_RELE(vp);
}

/*
 * Add a reference to a referenced vnode and hand the vnode back.
 *
 * The hold count is validated under the vnode lock before it is bumped:
 *
 *   - A negative count is corrupt; always panic.
 *   - A zero count on a VSHARE vnode most likely means the vnode is
 *     sitting on a free list.  The effects of holding such a vnode are
 *     dire enough that we always panic rather than go on.
 *   - A zero count on a non-VSHARE vnode is tolerated.  Those vnodes
 *     never get into the vnode free lists, and autofs takes advantage
 *     of this dubious privilege.
 *
 * autofs in the CELL_IRIX case has been modified not to hold a vnode
 * whose v_count is zero, so stricter requirements could be enforced
 * there; nobody bothers since CELL_IRIX is a dead issue in the kudzu
 * base.  This will be an issue when this is merged into teak, but by
 * that time teak should be enforcing the stricter requirements in all
 * configurations.  This will require hand-merging in any case.
 */
struct vnode *
vn_hold(struct vnode *vp)
{
	register int s;

	s = VN_LOCK(vp);

	if (vp->v_count < 0 ||
	    (vp->v_count == 0 && (vp->v_flag & VSHARE))) {
#pragma mips_frequency_hint NEVER
		cmn_err_tag(140,CE_PANIC, 
			"holding vnode on free list %x(%x,%d)", 
			vp, vp->v_flag, vp->v_count);
	}

	vp->v_count += 1;
	VN_UNLOCK(vp, s);

	return vp;
}

/*
 * Release a vnode.  Decrements reference count and calls
 * VOP_INACTIVE on last reference.
 *
 *	vp	vnode whose reference is being dropped.
 *
 * When the count does not reach zero this only decrements under the
 * vnode lock.  On the last reference the vnode is marked VINACT, left
 * over file locks and usync state are cleaned, VOP_INACTIVE is called
 * (if a behavior is attached) and, for VSHARE vnodes, the vnode is
 * placed on a freelist and any waiters are woken.  Panics if the count
 * goes negative.
 */
void
vn_rele(struct vnode *vp)
{
	register vfreelist_t *vfp;
	register int s;
	register int mask;
	int private_vnode;
	/*REFERENCED*/
	int cache;
	extern void usync_cleanup(caddr_t);

	VOPINFO.vn_rele++;

	s = VN_LOCK(vp);
	if (--vp->v_count <= 0) {

		/*
		 * Make sure that the count has not gone through 
		 * zero.  If we were to let this go through unchecked,
		 * all kinds of terrible things could happen to the
		 * vnode lists and we wouldn't have a clue as we
		 * surveyed the wreckage.
		 */
        	if (vp->v_count < 0) {
#pragma mips_frequency_hint NEVER
	        	cmn_err_tag(141,CE_PANIC, 
				    "vnode ref count negative %x(%x,%d)", 
				    vp, vp->v_flag, vp->v_count);
		}

		/*
		 * It is absolutely, positively the case that
		 * the lock manager will not be releasing vnodes
		 * without first having released all of its locks.
		 */
		ASSERT(!(vp->v_flag & VLOCKHOLD));
		/*
		 * Sample VSHARE now: for a private (non-VSHARE) vnode
		 * vp must not be touched after VOP_INACTIVE.
		 */
		private_vnode = !(vp->v_flag & VSHARE);

		/*
		 * As soon as we turn this on, no one can find us
		 * in vn_get until we turn off VINACT or VRECLM
		 */
		vp->v_flag |= VINACT;
		VN_UNLOCK(vp, s);

		/*
		 * Cleanup left over file locks.  These can get here 
		 * due to the NLM interfaces that could keep the last
		 * close logic from finding them.  We need to do it
		 * now instead of reclaim time because XFS can reuse
		 * vnodes/inodes without going through reclaim.
		 *
		 * Note that the intent here is merely to clean up
		 * state associated with this particular vnode, which
		 * is why cleanlocks() is used instead of fs_cleanlock()
		 * (the latter being used when distributed vnodes need
		 * to be taken into account).  
		 */
		if (vp->v_filocks != NULL)
			cleanlocks(vp, IGN_PID, 0L);
		ASSERT(vp->v_filocks == NULL);

		/*
		 * If the address space backed by this vnode represents
		 * any usync objects, clean them up.
		 */
		if (vp->v_flag & VUSYNC) {
			usync_cleanup((caddr_t) vp);
			VN_FLAGCLR(vp, VUSYNC);
		}

		/*
		 * Do not make the VOP_INACTIVE call if there
		 * are no behaviors attached to the vnode to call.
		 *
		 * For shared vnodes the value returned in 'cache' is
		 * asserted to agree with the VINACTIVE_TEARDOWN flag:
		 * VN_INACTIVE_CACHE implies the flag is clear, anything
		 * else implies it is set.
		 */
		if (vp->v_fbhv != NULL) {
			VOP_INACTIVE(vp, get_current_cred(), cache);
			ASSERT(private_vnode ? 1 : cache == VN_INACTIVE_CACHE ?
			       !(vp->v_flag & VINACTIVE_TEARDOWN) : 
			       (vp->v_flag & VINACTIVE_TEARDOWN));
		}

		/*
		 * For filesystems that do not want to be part of
		 * the global vnode cache, we must not touch the
		 * vp after we have called inactive
		 */
		if (private_vnode)
			return;

		/* The vnode must be self-linked, i.e. on no list yet. */
		ASSERT(vp->v_next == vp && vp->v_prev == vp);

		/*
		 * Pick a freelist.  With more than one freelist
		 * (vfreelistmask != 0) the per-cpu hand selects a
		 * candidate, and its vf_next neighbour is used instead
		 * when that neighbour currently has the smaller vf_lsize.
		 */
		if (mask = vfreelistmask) {
			register vfreelist_t *tp;

			vfp = &vfreelist[private.p_hand++ & mask];
			tp = vfp->vf_next;
			if (vfp->vf_lsize > tp->vf_lsize)
				vfp = tp;

			vp->v_listid = vfp->vf_listid;
		} else {
			vp->v_listid = 0;
			vfp = vfreelist;
		}

		ASSERT(vp->v_listid <= vfreelistmask);
		s = LOCK_VFP(vfp);
		/*
		 * A vnode with no behavior attached goes to the front
		 * of the freelist (vn_insert); otherwise vn_append is
		 * used.
		 */
		if (vp->v_fbhv == NULL)
			vn_insert(vp, &vfp->vf_freelist);
		else
			vn_append(vp, &vfp->vf_freelist);

		ASSERT(vfp->vf_lsize >= 0);
		vfp->vf_lsize++;

		/*
		 * Must hold freelist lock here to prevent
		 * vnode from being deallocated first.
		 *
		 * VRECLM is turned off too because it may
		 * have been vn_reclaim'd above (this stmt.
		 * not true anymore).
		 */
		NESTED_VN_LOCK(vp);
		if (vp->v_flag & VWAIT) {
			vn_trace_entry(vp, "RELE WAKEUP",
				(inst_t *)__return_address);
			sv_broadcast_bounded(vptosync(vp));
		}
		vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VGONE|VEVICT);

		/* 
		 * If not interposed for replication, and if the vnode 
		 * was opened for write, NONREPLICABLE bit is turned on.
		 * Turn it off now.
		 */
		VN_CLRNONREPLICABLE(vp);

		/*
		 * Drop the freelist lock first, then the vnode lock
		 * using the 's' saved by LOCK_VFP.
		 */
		NESTED_UNLOCK_VFP(vfp);
		VN_UNLOCK(vp, s);

		ASSERT(fetchIntHot(&vn_nfree) >= 0);
		atomicAddIntHot(&vn_nfree, 1);
	} else
		VN_UNLOCK(vp, s);
}

/*
 * Vnode list primitives.  The callers must exclude one another.
 */

/*
 * Make a list head empty by pointing both of its links back at itself.
 */
void
vn_initlist(struct vnlist *vl)
{
	struct vnode *self = (struct vnode *)vl;

	vl->vl_prev = self;
	vl->vl_next = self;
}

/*
 * Insert vp at the front of list vl, i.e. directly after the list head.
 *
 *	vp	vnode to insert.
 *	vl	list head; it is cast to a vnode pointer when stored in
 *		v_prev, so the head is linked as if it were a vnode.
 */
void
vn_insert(struct vnode *vp, struct vnlist *vl)
{
	vp->v_next = vl->vl_next;
	vp->v_prev = (struct vnode *)vl;
	vl->vl_next = vp;

	/*
	 * imon depends on this...
	 * NOTE(review): the original comment read "imon depends on vp
	 * this..."; presumably imon relies on the old first element's
	 * back pointer being updated last -- confirm before reordering.
	 */
	vp->v_next->v_prev = vp;
}

/*
 * Take vp off whatever list it is on by joining its two neighbours,
 * then leave vp linked only to itself.
 */
void
vn_unlink(register struct vnode *vp)
{
	register struct vnode *after = vp->v_next;
	register struct vnode *before = vp->v_prev;

	/* Bridge over vp. */
	after->v_prev = before;
	before->v_next = after;

	/* A vnode that is on no list points at itself in both directions. */
	vp->v_prev = vp;
	vp->v_next = vp;
}

/*
 * Move the list head 'vlist' to a new position in its ring.
 *
 *	vlist	list head to reposition.
 *	next	vnode that becomes the first element (vl_next).
 *	prev	vnode that becomes the last element (vl_prev).
 *
 * The head's old neighbours (N and P) are joined to each other, and the
 * head is then linked between 'prev' and 'next'.
 * NOTE(review): presumably 'prev' and 'next' are adjacent on the same
 * ring as 'vlist'; the order of the stores matters when any of these
 * pointers alias -- confirm against the caller before reordering.
 */
static void
vn_relink(
	register vnlist_t *vlist,
	register vnode_t *next,
	register vnode_t *prev)
{
	/* Current first and last elements, sampled before any stores. */
	register struct vnode *N = vlist->vl_next;
	register struct vnode *P = vlist->vl_prev;

	next->v_prev = (struct vnode *)vlist;
	/* Close the gap the head leaves behind. */
	P->v_next = N;
	N->v_prev = P;
	/* Hook the head in at its new position. */
	vlist->vl_next = next;
	vlist->vl_prev = prev;
	prev->v_next = (struct vnode *)vlist;
}

/*
 * Read or write a vnode on behalf of kernel code.
 *
 * A single-segment uio describing [base, base + len) at 'offset' is
 * built and handed to VOP_READ or VOP_WRITE according to 'rw'.
 *
 * Returns EROFS for a write to a read-only filesystem, EINVAL when len
 * is negative when viewed as an ssize_t, otherwise the status reported
 * by the VOP.  If residp is supplied the residual count is stored
 * through it; if it is not, any residual turns the result into EIO.
 */
int
vn_rdwr(enum uio_rw rw,
	struct vnode *vp,
	caddr_t base,
	size_t len,
	off_t offset,
	enum uio_seg seg,
	int ioflag,
	off_t ulimit,		/* meaningful only if rw is UIO_WRITE */
	cred_t *cr,
	ssize_t *residp,
	struct flid *fl)
{
	struct iovec vec;
	struct uio io;
	int error;

	/* Writes are refused up front on a read-only filesystem. */
	if (rw == UIO_WRITE) {
		if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
			return EROFS;
	}

	if ((ssize_t)len < 0)
		return EINVAL;

	/* The one and only I/O segment. */
	vec.iov_base = base;
	vec.iov_len = len;

	io.uio_iov = &vec;
	io.uio_iovcnt = 1;
	io.uio_offset = offset;
	io.uio_segflg = seg;
	io.uio_resid = len;
	io.uio_limit = ulimit;
	io.uio_sigpipe = 0;
	io.uio_pmp = NULL;
	io.uio_pio = 0;
	io.uio_readiolog = 0;
	io.uio_writeiolog = 0;
	io.uio_pbuf = 0;

	if (rw != UIO_WRITE) {
		io.uio_fmode = FREAD;
		VOP_READ(vp, &io, ioflag, cr, fl, error);
	} else {
		io.uio_fmode = FWRITE;
		VOP_WRITE(vp, &io, ioflag, cr, fl, error);
	}
	ASSERT(io.uio_sigpipe == 0);

	if (residp != NULL) {
		*residp = io.uio_resid;
	} else if (io.uio_resid != 0) {
		/* Caller did not ask for the residual: short I/O is an error. */
		error = EIO;
	}
	return error;
}


/*
 * Open/create a vnode.
 * This may be callable by the kernel, the only known use
 * of user context being that the current user credentials
 * are used for permissions.  crwhy is defined iff filemode & FCREAT.
 *
 *	pnamep		pathname to open.
 *	seg		segment pnamep lives in (passed to the lookup).
 *	filemode	F* open flags (FREAD, FWRITE, FTRUNC, FCREAT, FEXCL).
 *	createmode	mode for a newly created file (FCREAT only).
 *	vpp		on success, receives the opened vnode.
 *	crwhy		create reason, passed to vn_create.
 *	cflags		create flags; VEXCL is added when FEXCL is set.
 *	ckpt		if non-NULL, receives a checkpoint lookup handle
 *			index or -1 (only touched here under CKPT; it is
 *			also passed on to vn_create).
 *
 * Returns 0 with *vpp set, or an errno value with all references taken
 * here released.
 */
int
vn_open(char *pnamep,
	enum uio_seg seg,
	register int filemode,
	mode_t createmode,
	struct vnode **vpp,
	enum create crwhy,
	int cflags,
	int *ckpt)
{
	struct vnode *vp, *tvp, *openvp;
	register int mode;
	register int error;
	struct vattr vattr;
#ifdef CKPT
	ckpt_handle_t lookup = NULL;
#endif
	tvp = (struct vnode *)NULL;
	/* Translate the open flags into the V* access bits to check. */
	mode = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;
 
	if (filemode & FCREAT) {
		/*
		 * Wish to create a file.
		 */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			cflags |= VEXCL;
		/*
		 * FTRUNC and FEXCL have been folded into vattr and cflags
		 * above; drop them so they are not seen again below.
		 */
		filemode &= ~(FTRUNC|FEXCL);
#ifdef _SHAREII
		/*
		 * Adjust mode for share umask.
		 */
		SHR_SETATTR(vattr.va_mask, &vattr.va_mode);
#endif /* _SHAREII */
		
		/* 
		 * vn_create can take a while, so preempt.
		 * NOTE(review): no preemption call is visible here; this
		 * comment looks stale -- confirm.
		 */
		if (error = vn_create(pnamep, seg, &vattr, cflags, mode, &vp,
						crwhy, ckpt))
			return error;
		/*
		 * Take a second reference under the name tvp; it is
		 * dropped once VOP_OPEN succeeds or at 'out' on error.
		 */
		tvp = vp;
		VN_HOLD(tvp);

		VOP_SETFL(vp, 0, filemode, get_current_cred(), error);
		if (error)
			goto out;
	} else {
		/*
		 * Wish to open a file.  Just look it up.
		 */
		if (error = lookupname(pnamep, seg, FOLLOW, NULLVPP, &vp,
#ifdef CKPT
				(ckpt)? &lookup : NULL))
#else
				NULL))
#endif
			return error;

		/* Second reference, as in the create case above. */
		tvp = vp;
		VN_HOLD(tvp);
#ifdef CKPT
		if (ckpt) {
			if (lookup)
				*ckpt = ckpt_lookup_add(vp, lookup);
			else
				*ckpt = -1;
		}
#endif
		VOP_SETFL(vp, 0, filemode, get_current_cred(), error);
		if (error)
			goto out;

		/*
		 * Can't write directories, active texts, swap files, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
			if (vp->v_vfsp->vfs_flag & VFS_RDONLY) {
				error = EROFS;
				goto out;
			}
			if ((vp->v_flag & VISSWAP) && vp->v_type == VREG) {
				error = EBUSY;
				goto out;
			}
			/*
			 * Can't truncate files on which mandatory locking
			 * is in effect and locks exist on the file.
			 */
			if ((filemode & FTRUNC) && 
			    (vp->v_flag & (VFRLOCKS|VENF_LOCKING)) ==
			    (VFRLOCKS|VENF_LOCKING)) {
				error = EAGAIN;
			}
			if (error)
				goto out;
		}
		/* Check discretionary permissions.*/
		VOP_ACCESS(vp, mode, get_current_cred(), error);
		if (error)
			goto out;
	}

	if ((filemode & FWRITE) && !VN_ISREPLICABLE(vp)){
		VN_FLAGSET(vp, VNONREPLICABLE);
	}

	/*
	 * Do opening protocol.
	 * VOP_OPEN is given the address of vp, so vp may name a
	 * different vnode afterwards; openvp keeps the original.
	 */
	openvp = vp;
	VOP_OPEN(openvp, &vp, filemode, get_current_cred(), error);
	if (!error) {
		/*REFERENCED*/
		int unused;

		if (tvp) {
			VN_RELE(tvp);
			/* avoid extra VN_RELE in error case below */
			tvp = NULL;	
		}
		/*
		 * Truncate if required.
		 * (FTRUNC can only still be set here on the non-create
		 * path; the create path cleared it above.)
		 */
		if ((filemode & FTRUNC) && vp->v_type == VREG) {
			vattr.va_size = 0;
			vattr.va_mask = AT_SIZE;
			VOP_SETATTR(vp, &vattr, 0, get_current_cred(), error);
			if (error)
				/*
				 * since the open never succeeded, there can't
				 * be any locks
				 */
				VOP_CLOSE(vp, filemode, L_TRUE, 
					  get_current_cred(), unused);
		}
	}
out:
	/*
	 * On failure drop the extra tvp reference (if still held) and
	 * the reference on vp; on success hand vp to the caller.
	 */
	if (error) {
		if (tvp)
			VN_RELE(tvp);
		VN_RELE(vp);
	} else
		*vpp = vp;
	return error;
}

/*
 * Create a vnode (makenode).
 *
 *	pnamep	pathname of the object to create.
 *	seg	segment pnamep lives in (passed to pn_get).
 *	vap	attributes for the new object; va_mask must include
 *		AT_TYPE and AT_MODE.  va_projid / AT_PROJID are filled in
 *		here, and VSVTX is stripped from va_mode unless
 *		why == CRMKNOD.
 *	flags	create flags; VEXCL selects an exclusive create.
 *	mode	access mode passed through to VOP_CREATE.
 *	vpp	receives the created (or already existing) vnode.
 *	why	create reason (CRMKDIR, CRMKNOD, ...).
 *	ckpt	if non-NULL (and CKPT is configured), receives a
 *		checkpoint lookup index or -1.
 *
 * Returns 0 on success, otherwise an errno value.  If the security
 * attributes (MAC label, default ACL) cannot be applied to a newly
 * created object, the object is removed again, the reference in *vpp
 * is released, *vpp is set to NULL and the attribute error is returned.
 */
/*ARGSUSED*/
int
vn_create(
        char		*pnamep,
	enum uio_seg 	seg,
	vattr_t 	*vap,
	int 		flags,
	int 		mode,
	vnode_t 	**vpp,
	enum create 	why,
	int		*ckpt)
{
	vnode_t		*dvp;	/* ptr to parent dir vnode */
	pathname_t	 pn;
	int 		error;
	vpagg_t		*vpag;
	int		existing = 0;	/* set when the target already existed */

	ckpt_handle_t	*lookupp = NULL;
#ifdef CKPT
	ckpt_handle_t	lookup = NULL;
#endif
	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/*
	 * VOP_CREATE/MKDIR needs the project id.
	 */
	VPROC_GETVPAGG(curvprocp, &vpag);
	vap->va_projid = VPAG_GETPRID(vpag);
	vap->va_mask |= AT_PROJID;

	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return error;
	_SAT_PN_SAVE(&pn, curuthread);
	dvp = NULL;
	*vpp = NULL;
#ifdef CKPT
	lookupp = (ckpt)? &lookup : NULL;
#endif
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	if (flags & VEXCL) 
		error = lookuppn(&pn, NO_FOLLOW, &dvp, NULLVPP, lookupp); 
	else 
		error = lookuppn(&pn, FOLLOW, &dvp, vpp, lookupp); 
	if (error) {
		pn_free(&pn);
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return error;
	}
	ASSERT(dvp->v_count > 0);
	vn_trace_entry(dvp, "vn_create", (inst_t *)__return_address);
	if (*vpp)
		vn_trace_entry(*vpp, "vn_create:f", (inst_t *)__return_address);

	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * Make sure filesystem is writeable.
	 */
	if (dvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (!(flags & VEXCL) && *vpp != NULL) {
		register struct vnode *vp = *vpp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return EAGAIN.
		 */
		if ((vp->v_flag & (VFRLOCKS|VENF_LOCKING)) == 
		    (VFRLOCKS|VENF_LOCKING)) {
			error = EAGAIN;
			VN_RELE(vp);
			goto out;
		}

		/* do not permit truncating a swap file */
		if ((vp->v_flag & VISSWAP) && vp->v_type == VREG) {
			error = EBUSY;
			VN_RELE(vp);
			goto out;
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply VOP_CREATE to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The intercession of the file system is necessary to
		 * ensure that the appropriate permission checks are
		 * done.
		 */
		if (vp->v_flag & VROOT) {
			/*
			 * lvp (NULL) is needed since VOP_CREATE now has vpp
			 * as an in and out parameter.
			 * It has special meaning if set.
			 */
			vnode_t 	*lvp = NULL;

			ASSERT(why != CRMKDIR);
			VOP_CREATE(vp, "", vap, flags, mode, &lvp, 
				   get_current_cred(), error);
			/*
			 * If the create succeeded, it will have created
			 * a new reference to the vnode.  Give up the
			 * original reference.
			 */
			VN_RELE(vp);
			goto out;
		}

		/*
		 * Don't throw the vnode. Give it to VOP_CREATE
		 * so it can prevent another lookup and then
		 * deal with it in a non-racy manner.
		 */
		ASSERT(*vpp == vp);
		if (why == CRMKDIR) /* Won't be going to VOP_CREATE */
			VN_RELE(vp);
		ASSERT(!error);
		existing = 1;
	}

	if (error == 0) {
		/*
		 * Call fs dependent mkdir() to create dir.  Otherwise, fs
		 * dependent create.
		 */
		if (why == CRMKDIR || (why == CRMKNOD && vap->va_type == VDIR)) {
			VOP_MKDIR(dvp, pn.pn_path, vap, vpp, 
				  get_current_cred(), error);
		} else {
			VOP_CREATE(dvp, pn.pn_path, vap, flags, mode,
					   vpp, get_current_cred(), error);
			if (!error && *vpp) {
				IMON_EVENT(*vpp, get_current_cred(), 	
					   IMON_CONTENT);
			} else if ((error == ENOSYS) && *vpp) {
				VN_RELE(*vpp);
			}
		}
	}
out:
#ifdef CKPT
	if (lookup) {

		ASSERT(ckpt);

		if (!error && *vpp)
			*ckpt = ckpt_lookup_add(*vpp, lookup);
		else {
			*ckpt = -1;
			ckpt_lookup_free(lookup);
		}

	} else if (ckpt)
		*ckpt = -1;
#endif

        /*
         * Set Trix extended attributes on the vnode if:
         *      there was no previous error, and
         *      the vnode is newly created, and
         *      we have a handle on that vnode
	 *
         * At this time, those attributes are
         *      MAC label
         *      directory default ACL
         *
         * If appropriate extended security attributes cannot
         * be set on a filesystem object, it is removed.
         */
        if (!error && !existing && *vpp) {

		/* MAC label
		 *
		 * No file/directory should ever get created with the
		 * moldy bit set by default: check to see if the process
		 * label has the moldy bit set and set the label without
		 * it.
		 */
		mac_label * label = get_current_cred()->cr_mac;
                if ( _MAC_IS_MOLDY ( label ) )
		{
		    if ( label = _MAC_DEMLD ( label ) )
		    {
			error = _MAC_VSETLABEL(*vpp, label );
			kern_free ( label );
		    }
		    else
			error = ENOMEM;
		}
		else
		    error = _MAC_VSETLABEL(*vpp, label );

		/* Directory default ACL */
                if ( error || (error = _ACL_INHERIT(dvp, *vpp, vap))) {
			/*
			 * The removal status goes into its own variable.
			 * Handing 'error' to VOP_REMOVE would overwrite
			 * the attribute failure, and a successful removal
			 * would then make vn_create report success for an
			 * object that no longer exists.
			 */
			/*REFERENCED*/
			int rmerror;

                        cmn_err_tag(318,CE_NOTE, "vn_create: %s(%d)",
                                __FILE__, __LINE__);
                        VOP_REMOVE(dvp, pn.pn_path, get_current_cred(),
				   rmerror);
			/*
			 * We are returning an error, so the caller will
			 * not release the vnode: drop the reference here
			 * and do not hand back a pointer to it.
			 *
			 * NOTE(review): under CKPT, *ckpt may already hold
			 * an index from ckpt_lookup_add above; vn_open's
			 * lookup path likewise returns errors after
			 * ckpt_lookup_add -- confirm callers cope.
			 */
			VN_RELE(*vpp);
			*vpp = NULL;
                }
        }
 
	pn_free(&pn);
	VN_RELE(dvp);
	return error;
}

/*
 * Link.
 */
int
vn_link(char *from, char *to, enum uio_seg seg, enum symfollow follow)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	register int error;
	struct vattr vattr;
	long fsid;

	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return error;
	_SAT_PN_SAVE(&pn, curuthread);

	if (error = lookupname(from, seg, follow, NULLVPP, &fvp, NULL))
		goto out;

	if (error = lookuppn(&pn, FOLLOW, &tdvp, NULLVPP, NULL))
		goto out;

	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	VOP_GETATTR(fvp, &vattr, 0, get_current_cred(), error);
	if (error)
		goto out;
	fsid = vattr.va_fsid;
	VOP_GETATTR(tdvp, &vattr, 0, get_current_cred(), error);
	if (error)
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	if (!error)
		VOP_LINK(tdvp, fvp, pn.pn_path, get_current_cred(), error);
out:
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	_SAT_ACCESS2(SAT_FILE_CRT_DEL2, error);
	return error;
}

/*
 * Rename.
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	struct vnode *fdvp;		/* from directory vnode ptr */
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	register int error;

	fdvp = tdvp = fvp = NULL;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(from, seg, &fpn))
		return error;
	_SAT_PN_SAVE(&fpn, curuthread);

	if (error = pn_get(to, seg, &tpn)) {
		pn_free(&fpn);
		return error;
	}
	_SAT_PN_SAVE(&tpn, curuthread);

	/*
	 * Lookup to and from directories.
	 */

	if (error = lookuppn(&fpn, NO_FOLLOW, &fdvp, &fvp, NULL))
		goto out;
	vn_trace_entry(fdvp, "vn_rename:fd", (inst_t *)__return_address);
	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}
	vn_trace_entry(fvp, "vn_rename:f", (inst_t *)__return_address);
	/*
	 * Make sure we're not moving an active swap file.
	 * This prevents mv/reboot/rm from being able to
	 * remove the swap file.  Must do swap -d first.
	 */
	if (fvp->v_flag & VISSWAP && fvp->v_type == VREG) {
		error = EBUSY;
		goto out;
	}

	if (error = lookuppn(&tpn, NO_FOLLOW, &tdvp, NULLVPP, NULL))
		goto out;
	vn_trace_entry(tdvp, "vn_rename:td", (inst_t *)__return_address);
	/*
	 * Make sure that the from vnode and to directory are 
	 * in the same vfs, or that the from vnode is not a 
	 * mount point (for lofs renames), and that the from and 
	 * to directories share the same vfs.
	 * Also make sure that the to directory is writable.
	 * XXX this traditional vnodes test differs from the va_fsid
	 *     test used by vn_link
	 */
	if ((fvp->v_vfsp != tdvp->v_vfsp && (fvp->v_flag & VROOT) != 0) ||
            (fdvp->v_vfsp != tdvp->v_vfsp)) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	VOP_RENAME(fdvp, fpn.pn_path, tdvp, tpn.pn_path, &tpn,
			   get_current_cred(), error);
	/*
	 *  Must explicitly post imon events because imon-vnode
	 *  layer will never see it if fdvp isn't monitored. K<bob> 6/22/94
	 */
	if (error == 0) {
		IMON_EVENT(tdvp, get_current_cred(), IMON_CONTENT);
		IMON_EVENT(fvp, get_current_cred(), IMON_RENAME);
	}
out:
	pn_free(&fpn);
	pn_free(&tpn);
	if (fvp)
		VN_RELE(fvp);
	if (fdvp)
		VN_RELE(fdvp);
	if (tdvp)
		VN_RELE(tdvp);
	_SAT_ACCESS2(SAT_FILE_CRT_DEL2, error);
	return error;
}

/*
 * Remove a file or directory.
 */
int
vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag)
{
	struct vnode *vp;		/* entry vnode */
	struct vnode *dvp;		/* ptr to parent dir vnode */
	struct pathname pn;		/* name of entry */
	enum vtype vtype;
	register int error;
	register struct vfs *vfsp;

	if (error = pn_get(fnamep, seg, &pn))
		return error;
	_SAT_PN_SAVE(&pn, curuthread);
	vp = NULL;
	if (error = lookuppn(&pn, NO_FOLLOW, &dvp, &vp, NULL)) {
		pn_free(&pn);
		return error;
	}

	vn_trace_entry(dvp, "vn_remove", (inst_t *)__return_address);
	/*
	 * Make sure there is an entry.
	 */
	if (vp == NULL) {
		error = ENOENT;
		goto out;
	}
	vn_trace_entry(vp, "vn_remove", (inst_t *)__return_address);

	vfsp = vp->v_vfsp;

	/*
	 * If the named file is the root of a mounted filesystem, fail.
	 */
	if (vp->v_flag & VROOT) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Make sure filesystem is writeable.
	 */
	if (vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}

	/*
	 * Make sure we're not removing an active swap file.
	 */
	if (vp->v_flag & VISSWAP && vp->v_type == VREG) {
		error = EBUSY;
		goto out;
	}

	/*
	 * If vnode represents a named semaphore,
	 * cleanup the kernel semaphore state.
	 */
	if (vp->v_flag & VSEMAPHORE) {
		if (error = psema_indirect_unlink(vp))
			goto out;
	}

	/*
	 * Release vnode before removing.
	 */
	vtype = vp->v_type;
	VN_RELE(vp);
	vp = NULL;
	/*
	 * If caller is using rmdir(2), it can be applied only to directories.
	 * Unlink(2) can be applied to anything.
	 */
	if (dirflag == RMDIRECTORY) {
		if (vtype != VDIR) {
			error = ENOTDIR;
			goto out;
		}
		VOP_RMDIR(dvp, pn.pn_path, curuthread->ut_cdir,
			  get_current_cred(), error);
	} else
		VOP_REMOVE(dvp, pn.pn_path, get_current_cred(), error);
out:
	pn_free(&pn);
	if (vp != NULL)
		VN_RELE(vp);
	VN_RELE(dvp);
	return error;
}

/*
 * Compare two vnodes; returns non-zero when they are considered equal.
 *
 * Identical pointers always match.  Otherwise both vnodes must be
 * non-NULL and have a behavior attached, and for now VOP_CMP() is only
 * consulted when the base behaviors of the two vnodes share the same
 * ops; in every other case the answer is 0.
 */
int
vn_cmp(vnode_t *vp1, vnode_t *vp2)
{
	bhv_desc_t	*base1;
	bhv_desc_t	*base2;
	int		result;

	if (vp1 == vp2)
		return 1;

	if (vp1 == NULL || vp2 == NULL)
		return 0;
	if (vp1->v_fbhv == NULL || vp2->v_fbhv == NULL)
		return 0;

	base1 = vn_bhv_base_unlocked(VN_BHV_HEAD(vp1));
	base2 = vn_bhv_base_unlocked(VN_BHV_HEAD(vp2));
	if (BHV_OPS(base1) != BHV_OPS(base2))
		return 0;

	VOP_CMP(vp1, vp2, result);
	return result;
}

/*
 * Find the page for (vp, pageno) via vnode_pfind.  With
 * NUMA_REPLICATION configured, a page found on a replicable vnode is
 * passed through repl_pfind, which decides what is returned.
 * 'pm' is only used in the NUMA_REPLICATION case.
 */
/*ARGSUSED*/
pfd_t *
vn_pfind(struct vnode *vp, pgno_t pageno, int ckey, void *pm)
{
	pfd_t	*page = vnode_pfind(vp, pageno, ckey);

#if  defined(NUMA_REPLICATION)
	/*
	 * Let the replication module have the final say on a
	 * page that belongs to a replication candidate.
	 */
	if (page != NULL && VN_ISREPLICABLE(vp))
		page = repl_pfind(vp, pageno, ckey, pm, page);
#endif	/* defined(NUMA_REPLICATION) */

	return page;
}

#ifdef VNODE_TRACING
/*
 * Vnode tracing code.
 */
/*
 * Record a VNODE_KTRACE_ENTRY event in vp's trace buffer.
 *
 *	vp	vnode being traced.
 *	func	label for the event (callers pass a short string).
 *	ra	return address to record.
 *
 * Logged alongside: v_count, v_flag, cpuid() and current_pid().
 */
void
vn_trace_entry(vnode_t *vp, char *func, inst_t *ra)
{
	ktrace_enter(vp->v_trace, (void *)(__psint_t)VNODE_KTRACE_ENTRY,
		(void *)func, 0, (void *)(__psint_t)vp->v_count, (void *)ra,
		(void *)(__psunsigned_t)vp->v_flag, (void *)(__psint_t)cpuid(),
		(void *)(__psint_t)current_pid(), 0, 0, 0, 0, 0, 0, 0, 0);
}

/*
 * Record a VNODE_KTRACE_HOLD event in vp's trace buffer.
 *
 *	vp	vnode being traced.
 *	file	source file name to record (callers pass __FILE__).
 *	line	source line to record (callers pass __LINE__).
 *	ra	return address to record.
 *
 * Logged alongside: v_count, v_flag, cpuid() and current_pid().
 */
void
vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra)
{
	ktrace_enter(vp->v_trace, (void *)(__psint_t)VNODE_KTRACE_HOLD,
		(void *)file, (void *)(__psint_t)line,
		(void *)(__psint_t)vp->v_count, (void *)ra,
		(void *)(__psunsigned_t)vp->v_flag, (void *)(__psint_t)cpuid(),
		(void *)(__psint_t)current_pid(), 0, 0, 0, 0, 0, 0, 0, 0);
}

/*
 * Record a VNODE_KTRACE_REF event in vp's trace buffer.
 *
 *	vp	vnode being traced.
 *	file	source file name to record.
 *	line	source line to record.
 *	ra	return address to record.
 *
 * Logged alongside: v_count, v_flag, cpuid() and current_pid().
 */
void
vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra)
{
	ktrace_enter(vp->v_trace, (void *)(__psint_t)VNODE_KTRACE_REF,
		(void *)file, (void *)(__psint_t)line,
		(void *)(__psint_t)vp->v_count, (void *)ra,
		(void *)(__psunsigned_t)vp->v_flag, (void *)(__psint_t)cpuid(),
		(void *)(__psint_t)current_pid(), 0, 0, 0, 0, 0, 0, 0, 0);
}

/*
 * Record a VNODE_KTRACE_RELE event in vp's trace buffer.
 *
 *	vp	vnode being traced.
 *	file	source file name to record.
 *	line	source line to record.
 *	ra	return address to record.
 *
 * Logged alongside: v_count, v_flag, cpuid() and current_pid().
 */
void
vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra)
{
	ktrace_enter(vp->v_trace, (void *)(__psint_t)VNODE_KTRACE_RELE,
		(void *)file, (void *)(__psint_t)line,
		(void *)(__psint_t)vp->v_count, (void *)ra,
		(void *)(__psunsigned_t)vp->v_flag, (void *)(__psint_t)cpuid(),
		(void *)(__psint_t)current_pid(), 0, 0, 0, 0, 0, 0, 0, 0);
}
#endif /* VNODE_TRACING */