/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ /* All Rights Reserved */ /* THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF */ /* UNIX System Laboratories, Inc. */ /* The copyright notice above does not evidence any */ /* actual or intended publication of such source code. */ /* * +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ * PROPRIETARY NOTICE (Combined) * * This source code is unpublished proprietary information * constituting, or derived under license from AT&T's UNIX(r) System V. * In addition, portions of such source code were derived from Berkeley * 4.3 BSD under license from the Regents of the University of * California. * * * * Copyright Notice * * Notice of copyright on this source code product does not indicate * publication. * * (c) 1986,1987,1988,1989 Sun Microsystems, Inc * (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. * */ #ident "$Revision: 1.215 $" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _SHAREII #include #endif /* _SHAREII */ #ifdef VNODE_TRACING #include #endif /* * Managing the pool of allocated and free vnodes: * * Whenever a vnode is needed and the number of free vnodes is above * (vn_vnumber - vn_epoch), and ncsize, an attempt is made to * reclaim a vnode from a vnode freelist. Otherwise, or if a short search * of a freelist doesn't produce a reclaimable vnode, a vnode is * constructed from the heap. * * It is up to vn_shake to deconstruct free vnodes. */ /* * Internal data structures. */ /* * Vnode hash list bucket. */ typedef struct vhash_s { struct vnode *vh_vnode; lock_t vh_lock; } vhash_t; /* * Macros and defines. 
*/ #define VFREELIST(count) &vfreelist[count].vf_freelist #define LOCK_VFREELIST(list) mutex_spinlock(&vfreelist[list].vf_lock) #define UNLOCK_VFREELIST(l,s) mutex_spinunlock(&vfreelist[l].vf_lock, s) #define LOCK_VFP(listp) mutex_spinlock(&(listp)->vf_lock) #define UNLOCK_VFP(listp,s) mutex_spinunlock(&(listp)->vf_lock, s) #define NESTED_LOCK_VFP(listp) nested_spinlock(&(listp)->vf_lock) #define NESTED_UNLOCK_VFP(listp) nested_spinunlock(&(listp)->vf_lock) #define VHASHMASK 127 #define VHASH(vnumber) (&vhash[(vnumber) & VHASHMASK]) #define NVSYNC 37 /* prime */ #define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC]) /* * Vnode global data. */ static hotUint64Counter_t vn_generation; /* vnode generation number */ hotUlongCounter_t vn_vnumber; /* # of vnodes ever allocated */ hotIntCounter_t vn_nfree; /* # of free vnodes */ #if MP #pragma fill_symbol (vn_generation, 128) #pragma fill_symbol (vn_vnumber, 128) #pragma fill_symbol (vn_nfree, 128) #endif static zone_t *vn_zone; /* vnode heap zone */ int vn_epoch; /* # of vnodes freed */ /* vn_vnumber - vn_epoch == # current vnodes */ static int vn_minvn; /* minimum # vnodes before reclaiming */ static int vn_shaken; /* damper for vn_alloc */ static uint_t vn_coin; /* coin for vn_alloc */ static vhash_t *vhash; /* hash buckets for active vnodes */ vnode_t *rootdir; /* pointer to root vnode */ vfreelist_t *vfreelist; /* pointer to array of freelist structs */ static int vfreelistmask; /* number of free-lists - 1 */ /* * Following is global data that can't be cellularized until any given * vnode is accessed from only one cell. */ static sv_t vsync[NVSYNC]; /* vnode inactive/reclaim sync semaphores */ lock_t mreg_lock; /* spinlock protecting all vp->v_mreg */ #if MP #pragma align_symbol (mreg_lock, L2cacheline) #pragma fill_symbol (mreg_lock, L2cacheline) #endif /* MP */ /* * Imon data - must be here instead of imon.c so that we don't get * gp-relative link errors. 
*/
void (*imon_event)(struct vnode *, struct cred *cr, int);
void (*imon_hook)(struct vnode *, dev_t, ino_t);
void (*imon_broadcast)(dev_t, int);
int imon_enabled;

/*
 * Externs and static functions.
 */
static void vn_relink(vnlist_t *, vnode_t *, vnode_t *);
static int vn_shake(int);

/*
 * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 * numerical order of S_IFMT and vnode types.)
 */
enum vtype iftovt_tab[] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
};

u_short vttoif_tab[] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
};

/*
 * Vnode operations for a free or killed vnode.
 * The initializer is positional: each entry's cast names the operation
 * slot it fills, so the order here must not be changed.
 */
vnodeops_t dead_vnodeops = {
	BHV_IDENTITY_INIT_POSITION(VNODE_POSITION_INVALID),
	(vop_open_t)fs_nosys,
	(vop_close_t)fs_noerr,
	(vop_read_t)fs_nosys,
	(vop_write_t)fs_nosys,
	(vop_ioctl_t)fs_nosys,
	(vop_setfl_t)fs_nosys,
	(vop_getattr_t)fs_nosys,
	(vop_setattr_t)fs_nosys,
	(vop_access_t)fs_nosys,
	(vop_lookup_t)fs_nosys,
	(vop_create_t)fs_nosys,
	(vop_remove_t)fs_nosys,
	(vop_link_t)fs_nosys,
	(vop_rename_t)fs_nosys,
	(vop_mkdir_t)fs_nosys,
	(vop_rmdir_t)fs_nosys,
	(vop_readdir_t)fs_nosys,
	(vop_symlink_t)fs_nosys,
	(vop_readlink_t)fs_nosys,
	(vop_fsync_t)fs_nosys,
	(vop_inactive_t)fs_noerr,
	(vop_fid_t)fs_nosys,
	(vop_fid2_t)fs_nosys,
	(vop_rwlock_t)fs_noval,
	(vop_rwunlock_t)fs_noval,
	(vop_seek_t)fs_nosys,
	(vop_cmp_t)fs_nosys,
	(vop_frlock_t)fs_nosys,
	(vop_realvp_t)fs_nosys,
	(vop_bmap_t)fs_nosys,
	(vop_strategy_t)fs_noval,
	(vop_map_t)fs_nodev,
	(vop_addmap_t)fs_nosys,
	(vop_delmap_t)fs_nosys,
	(vop_poll_t)fs_nosys,
	(vop_dump_t)fs_nosys,
	(vop_pathconf_t)fs_nosys,
	(vop_allocstore_t)fs_nosys,
	(vop_fcntl_t)fs_nosys,
	(vop_reclaim_t)fs_noerr,
	(vop_attr_get_t)fs_nosys,
	(vop_attr_set_t)fs_nosys,
	(vop_attr_remove_t)fs_nosys,
	(vop_attr_list_t)fs_nosys,
	(vop_cover_t)fs_nosys,
	(vop_link_removed_t)fs_noval,
	(vop_vnode_change_t)fs_nosys,
	(vop_ptossvp_t)fs_noval,
	(vop_pflushinvalvp_t)fs_noval,
	(vop_pflushvp_t)fs_noval,
	(vop_pinvalfree_t)fs_noval,
	(vop_sethole_t)fs_noval,
	(vop_commit_t)fs_nosys,
	(vop_readbuf_t)fs_nosys,
	(vop_strgetmsg_t)fs_nosys,
	(vop_strputmsg_t)fs_nosys,
};

/*
 * One-time initialization of the vnode layer: builds the circular array
 * of freelists, creates the vnode zone, initializes mreg_lock, the vsync
 * sync variables and the vnode hash buckets, registers vn_shake with the
 * memory shake manager, and computes vn_minvn.
 */
void
vn_init(void)
{
	register vfreelist_t *vfp;
	register sv_t *svp;
	register int i;
	extern int nproc, ncsize;

	/*
	 * There are ``vfreelistmask + 1'' freelists --
	 * so multiple clients can allocate vnodes simultaneously, and
	 * to keep the individual lists reasonably short.
	 */
	i = MIN(numcpus, 16);
	/* Step down until at most one bit is set, so i - 1 is a usable mask. */
	while (i & (i-1))
		i--;

	vfreelistmask = i - 1;

	vfp = vfreelist = (vfreelist_t *)
			kmem_zalloc(i * sizeof(vfreelist_t), KM_SLEEP);

	for (i = 0; i <= vfreelistmask; i++) {
		vn_initlist(&vfp->vf_freelist);
		vfp->vf_next = vfp + 1;
		vfp->vf_listid = i;
		init_spinlock(&vfp->vf_lock, "vf_lock", i);
		vfp++;
	}
	/* Close the ring: the last freelist points back at the first. */
	vfreelist[vfreelistmask].vf_next = vfreelist;

	vn_zone = kmem_zone_init(sizeof(vnode_t), "Vnodes");

	spinlock_init(&mreg_lock, "mreg_lock");
	for (svp = vsync, i = 0; i < NVSYNC; i++, svp++)
		init_sv(svp, SV_DEFAULT, "vsy", i);

	repl_init();	/* vnode replication */

	shake_register(SHAKEMGR_MEMORY, vn_shake);

	vhash = (vhash_t *)kmem_zalloc((VHASHMASK+1) * sizeof(vhash_t),
				       KM_SLEEP);
	for (i = 0; i <= VHASHMASK; i++) {
		init_spinlock(&vhash[i].vh_lock, "vhash", i);
	}

	vn_minvn = MAX(nproc, ncsize);

	/* Just refer to vn_passthrup to force linker to bring in
	 * vn_passthru ops.
	 */
	if (!vn_passthrup){
		cmn_err_tag(139,CE_PANIC,"vnode pass through mode not initialized ?");
	}
}

/*
 * Find vnode in hash table with the given number.  If the vnode is
 * found then it must be the same as the one passed in.
 *
 * NOTE: must not dereference 'vp' because it's possible the memory that
 * vp refers to has been reallocated to some other use (because it was
 * freed by the vn_shake mechanism).
*/
/*
 * Returns the hashed vnode whose v_number equals 'number', or NULL if
 * the bucket holds no such vnode.  Panics if a match is found at an
 * address other than 'vp'.  The bucket spinlock is held for the walk.
 */
vnode_t *
vn_find(vnode_t *vp, vnumber_t number)
{
	register vhash_t *vhp = VHASH(number);
	register int s = mutex_spinlock(&vhp->vh_lock);
	register vnode_t *lvp;

	for (lvp = vhp->vh_vnode; lvp; lvp = lvp->v_hashn) {
		ASSERT(lvp->v_number != 0);
		if (lvp->v_number == number) {
			/* vp is only compared here, never dereferenced. */
			if (lvp != vp) {
				printf("vn_find: vp=0x%x lvp=0x%x\n", vp, lvp);
				panic("vn_find error");
			}
			break;
		}
	}
	mutex_spinunlock(&vhp->vh_lock, s);

	return(lvp);
}

/*
 * Put vnode in hash table.  Must have a non-zero v_number.
 * The vnode is pushed onto the head of its bucket's doubly linked chain.
 */
void
vn_hash(register vnode_t *vp)
{
	register vhash_t *vhp = VHASH(vp->v_number);
	register int s = mutex_spinlock(&vhp->vh_lock);
	register vnode_t **vpp = &vhp->vh_vnode;

	ASSERT(vp->v_number);
	vp->v_hashp = (vnode_t *)NULL;
	vp->v_hashn = *vpp;
	if (vp->v_hashn)
		vp->v_hashn->v_hashp = vp;
	*vpp = vp;
	mutex_spinunlock(&vhp->vh_lock, s);
}

/*
 * Remove vnode from hash table.  v_number is set to 0 so that vn_find
 * won't find it.
 */
void
vn_unhash(register vnode_t *vp)
{
	register vhash_t *vhp = VHASH(vp->v_number);
	register int s = mutex_spinlock(&vhp->vh_lock);
	register vnode_t *vnext = vp->v_hashn;
	register vnode_t *vprev = vp->v_hashp;

	/* No predecessor means vp was the bucket head. */
	if (vprev)
		vprev->v_hashn = vnext;
	else
		vhp->vh_vnode = vnext;

	if (vnext)
		vnext->v_hashp = vprev;
	mutex_spinunlock(&vhp->vh_lock, s);

	/* vp's own links and number are cleared after the bucket lock is dropped. */
	vp->v_hashp = vp->v_hashn = (vnode_t *)NULL;
	vp->v_number = 0;
}

/*
 * Clean a vnode of filesystem-specific data and prepare it for reuse.
 *
 * 'flag' is handed through to VOP_RECLAIM.  Returns 0 on success, or the
 * error reported by VOP_RECLAIM, in which case nothing further is done
 * to the vnode.
 */
static int
vn_reclaim(struct vnode *vp, int flag)
{
	int error, s;
#ifdef CKPT
	extern int ckpt_enabled;
#endif

	VOPINFO.vn_reclaim++;

	/*
	 * Only make the VOP_RECLAIM call if there are behaviors
	 * to call.
	 */
	if (vp->v_fbhv != NULL) {
		VOP_RECLAIM(vp, flag, error);
		if (error)
			return error;
	}
	ASSERT(vp->v_fbhv == NULL);

	/*
	 * File system erred somewhere along the line, and there
	 * are still pages/buffers associated with the object.
	 * Remove the debris and print a warning.
	 * XXX LONG_MAX won't work for 64-bit offsets!
	 *
	 * NOTE(review): the test below reads v_buf, while the warning and
	 * the assertion that follow use v_dbuf -- confirm this is intended.
	 */
	if (vp->v_pgcnt || vp->v_dpages || vp->v_buf) {
#ifdef DEBUG
		int i;

		if (vp->v_vfsp)
			i = vp->v_vfsp->vfs_fstype;
		else
			i = 0;
		cmn_err(CE_WARN, "vn_reclaim: vnode 0x%x fstype %d (%s) has unreclaimed data (pgcnt %d dbuf %d dpages 0x%x), flag:%x", vp, i, vfssw[i].vsw_name ? vfssw[i].vsw_name : "?", vp->v_pgcnt, vp->v_dbuf, vp->v_dpages, vp->v_flag);
#endif
		VOP_FLUSHINVAL_PAGES(vp, 0, LONG_MAX, FI_NONE);
	}

	ASSERT(vp->v_dpages == NULL && vp->v_dbuf == 0 && vp->v_pgcnt == 0);

	/*
	 * The v_pgcnt assertion will catch debug systems that screw up.
	 * Patch up v_pgcnt for non-debug systems -- v_pgcnt probably
	 * means accounting problem here, not hashed data.
	 */
	vp->v_pgcnt = 0;

	s = VN_LOCK(vp);
	vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);
	/* Unhashing zeroes v_number, so stale vmaps stop matching this vnode. */
	if (vp->v_number) {
		vn_unhash(vp);
	}
	ASSERT(vp->v_hashp == (vnode_t *)NULL);
	ASSERT(vp->v_hashn == (vnode_t *)NULL);

	/*
	 * Clear all flags except the ones relevant to the fact
	 * that it's being reclaimed.
	 */
	vp->v_flag &= (VRECLM|VWAIT|VSHARE|VLOCK);
	VN_UNLOCK(vp, s);

	vp->v_stream = NULL;
	vp->v_type = VNON;
	vp->v_fbhv = NULL;

	/*
	 * All locks should have been released by now, but
	 * the lock sema data structure needs to be taken care of.
	 */
	ASSERT(vp->v_filocks == NULL);

	/*
	 * Reclaim all page cache related data.
	 * This could sleep waiting to synchronize with the
	 * threads trying to recycle hashed pages..
	 */
	vnode_pcache_reclaim(vp);

#ifdef CKPT
	/*
	 * Free lookup info...
	 */
	if (ckpt_enabled)
		ckpt_vnode_free(vp);
#endif
	ASSERT(vp->v_mreg == (struct pregion *)vp);
	ASSERT(vp->v_intpcount == 0);
	return 0;
}

/*
 * Under the vnode lock: broadcast on the vnode's sync variable if VWAIT
 * is set, then clear both VRECLM and VWAIT.
 */
static void
vn_wakeup(struct vnode *vp)
{
	int s;

	s = VN_LOCK(vp);
	vn_trace_entry(vp, "vn_wakeup", (inst_t *)__return_address);
	if (vp->v_flag & VWAIT) {
		sv_broadcast_bounded(vptosync(vp));
	}
	vp->v_flag &= ~(VRECLM|VWAIT);
	VN_UNLOCK(vp, s);
}

/*
 * Allocate a vnode struct for filesystem usage.
 * Reclaim the oldest on the global freelist if there are any,
 * otherwise allocate another.
*/ struct vnode * vn_alloc(struct vfs *vfsp, enum vtype type, dev_t dev) { register struct vnode *vp; register int list; register vnlist_t *vlist; register int cnt, s, i; register u_long vnumber; long target; int alloced = 0; int error; #define VN_ALLOC_TRIES 4 VOPINFO.vn_alloc++; if (vfreelistmask) { list = ++private.p_hand & vfreelistmask; s = LOCK_VFREELIST(list); } else { s = LOCK_VFP(vfreelist); list = 0; } vlist = VFREELIST(list); vp = vlist->vl_next; /* * Easy cases: if list is empty, allocate a new vnode from the * heap; if first vnode on the list is empty, use it. */ if (vp == (struct vnode *)vlist) goto alloc; if (vp->v_fbhv == NULL) { ASSERT(!vp->v_dbuf && !vp->v_dpages && !vp->v_pgcnt); cnt = VN_ALLOC_TRIES; goto get; } #if VNODE_THRASH_TEST /* * Turn this ifdef on to force races with * vn_alloc/vn_reclaim/vn_get/dnlc_lookup_fast, etc. */ if (fetchIntHot(&vn_nfree) > 0) { cnt = VN_ALLOC_TRIES; goto get; } #endif /* * Allocate a minumum of vn_minvn vnodes. * XXX Do this from vn_init? */ vnumber = fetchUlongHot(&vn_vnumber) - vn_epoch; /* # of extant vnodes */ if (vnumber < vn_minvn) goto alloc; cnt = fetchIntHot(&vn_nfree); vnumber -= cnt; /* # of vnodes in use */ /* * If number of free vnode < number in-use, just alloc a new vnode. */ if (cnt < vnumber) goto alloc; /* * Calculate target # of total vnodes to have allocated. */ target = vnumber; /* * If number of free vnode < half of target, alloc a new vnode. */ if (cnt < target/2) goto alloc; /* * If below target # of free vnodes, devise the chance that * we'll manufacture a new vnode from the heap. The closer * we are to target, the more likely we'll just allocate from * the freelist -- don't want to manufacture vnodes willy-nilly * just to have vhand/vn_shake decommission them. 
*/ if (cnt < target) { vnumber = target / 16; i = 0xf; if (vn_shaken > 0) { vn_shaken--; vnumber <<= 1; } while (cnt < target - vnumber) { i >>= 1; vnumber <<= 1; } if (!(++vn_coin & i)) goto alloc; } /* * If a reclaimable vnode isn't found after searching a very * few vnodes, put those vnodes on the tail of the free list * and allocate a vnode from the heap. This shouldn't happen * often, and vn_shake will trim down the number of vnodes if * the count rises too high. */ cnt = VN_ALLOC_TRIES; again: for ( ; vp != (struct vnode *)vlist ; vp = vp->v_next) { ASSERT(vp->v_listid == list); if (vp->v_dbuf || vp->v_dpages || vp->v_pgcnt > 8) { VOPINFO.vn_afreeloops++; if (--cnt < 0) { if (vlist->vl_next == vp) { vn_unlink(vp); vn_append(vp, vlist); } else if (vp->v_next != (struct vnode *)vlist) { vn_relink(vlist, vp, vp->v_prev); } VOPINFO.vn_afreemiss++; break; } continue; } get: VOPINFO.vn_afreeloops++; NESTED_VN_LOCK(vp); ASSERT(vp->v_count == 0); ASSERT((vp->v_flag & VINACT) == 0); if ((vp->v_flag & VRECLM) == 0) { vp->v_flag |= VRECLM; NESTED_VN_UNLOCK(vp); if (vlist->vl_next != vp && vlist->vl_prev != vp) { vn_relink(vlist, vp->v_next, vp->v_prev); vp->v_next = vp->v_prev = vp; } else vn_unlink(vp); ASSERT(vlist->vl_next->v_prev == (struct vnode *)vlist); ASSERT(vlist->vl_prev->v_next == (struct vnode *)vlist); ASSERT(vp->v_listid == list); ASSERT(vfreelist[list].vf_lsize > 0); vfreelist[list].vf_lsize--; UNLOCK_VFREELIST(list, s); error = vn_reclaim(vp, 0); if (error) { /* * Freelist lock must be held before cvsema'ing * vnode. A vn_get could happen on this vnode: * just after this process releases vp, it gets * an interrupt; the vn_get process acquires * freelist lock and dequeues it from nowhere; * then this process puts it back on free list. 
*/ vn_trace_entry(vp, "REC FAIL1", (inst_t *)__return_address); s = LOCK_VFREELIST(list); vn_wakeup(vp); ASSERT(vp->v_listid == list); ASSERT(vfreelist[list].vf_lsize >= 0); vfreelist[list].vf_lsize++; vn_append(vp, vlist); vp = vlist->vl_next; vn_trace_entry(vp, "REC FAIL2", (inst_t *)__return_address); if (--cnt < 0) { break; } goto again; } else { vn_wakeup(vp); VOPINFO.vn_afree++; atomicAddIntHot(&vn_nfree, -1); ASSERT(!(vp->v_number)); #if defined(DEBUG) && defined(VNODE_INIT_BITLOCK) destroy_bitlock(&vp->v_flag); #endif vp->v_number = atomicAddUint64Hot(&vn_generation, 1); goto gotit; } } NESTED_VN_UNLOCK(vp); } alloc: UNLOCK_VFREELIST(list, s); VOPINFO.vn_aheap++; vp = kmem_zone_zalloc(vn_zone, KM_SLEEP); vp->v_number = atomicAddUint64Hot(&vn_generation, 1); alloced = 1; #ifdef VNODE_TRACING vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, 0); #endif vp->v_flag = VSHARE; (void) atomicAddUlongHot(&vn_vnumber, 1); init_bitlock(&vp->v_pcacheflag, VNODE_PCACHE_LOCKBIT, "v_pcache", (long)vp->v_number); # if defined(DEBUG) && defined(VNODE_INIT_BITLOCK) init_bitlock(&vp->v_flag, VLOCK, "vnode", (long)vp->v_number); # endif init_mutex(&vp->v_filocksem, MUTEX_DEFAULT, "vfl", (long)vp->v_number); init_mutex(&vp->v_buf_lock, MUTEX_DEFAULT, "vnbuf", (long)vp->v_number); vp->v_mreg = vp->v_mregb = (struct pregion *)vp; vnode_pcache_init(vp); gotit: vn_hash(vp); ASSERT(vp->v_count == 0); ASSERT(vp->v_dpages == NULL && vp->v_dbuf == 0 && vp->v_pgcnt == 0); ASSERT(vp->v_filocks == NULL); ASSERT(vp->v_intpcount == 0); ASSERT(vp->v_flag & VSHARE); /* * VLOCK may or may not be set, because other threads may be * trying to lock this vnode (in vn_get) in order to see if * it's the one they're looking for. They won't be able to * use the vnode though because the v_number won't match. 
*/ ASSERT(!(vp->v_flag & (VNOSWAP | VISSWAP | VREPLICABLE | /* VNONREPLICABLE | XXX uncomment this */ VFRLOCKS | VENF_LOCKING | VREMAPPING | VDOCMP | VDUP | VSEMAPHORE | VUSYNC | VINACT | VRECLM | VEVICT | VWAIT | VFLUSH | VLOCKHOLD | VINACTIVE_TEARDOWN | VROOT | VMOUNTING))); vnode_pcache_reinit(vp); /* Initialize the first behavior and the behavior chain head. */ if (!alloced) { ASSERT(VN_BHV_NOT_READ_LOCKED(VN_BHV_HEAD(vp)) && VN_BHV_NOT_WRITE_LOCKED(VN_BHV_HEAD(vp))); vn_bhv_head_reinit(VN_BHV_HEAD(vp)); } else vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode"); vp->v_count = 1; vp->v_vfsp = vfsp; vp->v_type = type; vp->v_rdev = dev; vp->v_next = vp->v_prev = vp; vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address); #ifdef CKPT ASSERT(vp->v_ckpt == NULL); #endif return vp; } /* * Free an isolated vnode, putting it at the front of a vfreelist. * The vnode must not have any other references. */ void vn_free(struct vnode *vp) { register vfreelist_t *vfp; register int mask; register int s; #ifdef CKPT extern int ckpt_enabled; #endif #ifdef DEBUG ASSERT(vp->v_count == 1); if (vp->v_intpcount) printf("vn_free: v_intpcount = %d\n", vp->v_intpcount); ASSERT(vp->v_intpcount == 0); #endif if (mask = vfreelistmask) { register vfreelist_t *tp; vfp = &vfreelist[private.p_hand++ & mask]; tp = vfp->vf_next; if (vfp->vf_lsize > tp->vf_lsize) vfp = tp; vp->v_listid = vfp->vf_listid; } else { vp->v_listid = 0; vfp = vfreelist; } vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address); vp->v_count = 0; vp->v_fbhv = NULL; vnode_pcache_free(vp); #ifdef CKPT if (ckpt_enabled) ckpt_vnode_free(vp); #endif s = LOCK_VFP(vfp); ASSERT(vp->v_listid == vfp->vf_listid); vfp->vf_lsize++; vn_insert(vp, &vfp->vf_freelist); UNLOCK_VFP(vfp, s); atomicAddIntHot(&vn_nfree, 1); VOPINFO.vn_rele++; } static int vn_shake_freelist(register int nfree) { register struct vnode *vp; register vfreelist_t *vfp = vfreelist; register vnlist_t *vlist; register int list; int s, error; int shaken = 
0; #ifdef DEBUG static int vn_shake_lock = 0; #endif again: if (vn_epoch == INT_MAX || nfree <= 0) return shaken; if (list = vfreelistmask) { register int count = vfp->vf_lsize; register vfreelist_t *tp = vfp; do { vfp = vfp->vf_next; if (vfp->vf_lsize > count) { count = vfp->vf_lsize; tp = vfp; } } while (--list > 0); vfp = tp; list = tp->vf_listid; } vlist = &vfp->vf_freelist; s = LOCK_VFP(vfp); for (vp = vlist->vl_next; vp != (struct vnode *)vlist; vp = vp->v_next) { ASSERT(vp->v_listid == list); if (vp->v_pgcnt) continue; NESTED_VN_LOCK(vp); ASSERT((vp->v_flag & VINACT) == 0); if ((vp->v_flag & VRECLM) == 0) { vp->v_flag |= VRECLM; ASSERT(vp->v_count == 0); NESTED_VN_UNLOCK(vp); vn_unlink(vp); ASSERT(vfp->vf_lsize > 0); vfp->vf_lsize--; UNLOCK_VFP(vfp, s); error = vn_reclaim(vp, 0); /* * Purge soft references to the vnode * from the name cache. */ if (!error) { dnlc_remove_vp(vp); ASSERT(fetchIntHot(&vn_nfree) > 0); atomicAddIntHot(&vn_nfree, -1); } else { /* * See comments in vn_alloc for explanation * why we lock vfreelist before cvsema in * error case. */ s = LOCK_VFP(vfp); vn_wakeup(vp); vp->v_listid = list; vn_append(vp, vlist); ASSERT(vfp->vf_lsize >= 0); vfp->vf_lsize++; UNLOCK_VFP(vfp, s); /* * Decrement nfree if we fail so we don't * get stuck here all day. */ if (nfree-- <= 0) return shaken; goto again; } s = LOCK_VFP(vfreelist); for (list = vfreelistmask, vfp = vfreelist->vf_next; list-- > 0; vfp = vfp->vf_next) { NESTED_LOCK_VFP(vfp); } #ifdef DEBUG vn_shake_lock = 1; #endif vn_wakeup(vp); /* * Define a new epoch in the history of vnodes... * Protected by having *every* freelist lock held. 
*/ vn_epoch++; vn_shaken += 4; /* dampen vn_alloc's desire to */ /* allocate vnodes from heap */ VOPINFO.vn_destroy++; for (list = vfreelistmask, vfp = vfreelist->vf_next; list-- > 0; vfp = vfp->vf_next) { NESTED_UNLOCK_VFP(vfp); } UNLOCK_VFP(vfreelist, s); #ifdef DEBUG ASSERT(vn_shake_lock == 1); vn_shake_lock = 0; #endif /* DEBUG */ #ifdef VNODE_TRACING ktrace_free(vp->v_trace); #endif /* VNODE_TRACING */ /* Teardown behavior chain state. */ vn_bhv_head_destroy(VN_BHV_HEAD(vp)); vp->v_flag = 0; /* debug */ destroy_bitlock(&vp->v_pcacheflag); #if defined(DEBUG) && defined(VNODE_INIT_BITLOCK) destroy_bitlock(&vp->v_flag); #endif mutex_destroy(&vp->v_filocksem); mutex_destroy(&vp->v_buf_lock); kmem_zone_free(vn_zone, vp); shaken++; if (nfree-- <= 0) return shaken; goto again; } NESTED_VN_UNLOCK(vp); } UNLOCK_VFP(vfp, s); return shaken; } /* ARGSUSED */ int vn_shake(int level) { int total_vnodes = fetchUlongHot(&vn_vnumber) - vn_epoch; int free = fetchIntHot(&vn_nfree); int num_to_free; int v; /* * If we're below our configured minimum number of * vnodes, then just get out. */ if (total_vnodes < vn_minvn) { return 0; } v = total_vnodes; v -= free; /* # of in-use vnodes */ ASSERT(v >= 0); if (free <= v) { /* don't steal any vnodes */ return 0; /* if free count <= inuse count */ } v = free - v; /* # over target */ if (v < 0) v = 0; else v = v / 16; /* take either 1/Nth of # over target */ free = free / 128; /* or 1/Mth of free vnodes... */ num_to_free = MAX(free, v); num_to_free = MAX(num_to_free, 512); /* but no more than 512 */ /* * Don't pull the number of extant vnodes below the desired * minimum. */ if ((total_vnodes - num_to_free) < vn_minvn) { num_to_free = total_vnodes - vn_minvn; } return vn_shake_freelist(num_to_free); } /* * Routine which 1) makes a vnode invisible to vn_get, and 2) wakes up * anyone that has already found the vnode via vn_get but is blocked * there on the VINACT flag. * * This routine may only be called during inactivation processing. 
* It's used by nfs_inactive to resolve a deadlock condition. */ void vn_gone(struct vnode *vp) { register int s; ASSERT(vp->v_count == 0); ASSERT(vp->v_flag & VINACT); s = VN_LOCK(vp); vn_trace_entry(vp, "vn_gone", (inst_t *)__return_address); if (vp->v_flag & VWAIT) { sv_broadcast_bounded(vptosync(vp)); vp->v_flag &= ~VWAIT; } vp->v_flag |= VGONE; VN_UNLOCK(vp, s); } /* * Based on the value of the vnode's reference count, attempt to set * a vnode's VEVICT flag. * * If v_count > 1, then do nothing and return 1. * If v_count == 1, then set the vnode's VEVICT flag and return 0. * * The VEVICT flag causes callers of vn_get to wait until the vnode is * inactivated, or the evict condition is otherwise cleared (currently, * there's no interface to clear an evict condition). * * vn_evict is used by a caller who wishes to prevent additional references * to a vnode iff it holds the only reference. */ int vn_evict(struct vnode *vp) { register int s; s = VN_LOCK(vp); vn_trace_entry(vp, "vn_evict", (inst_t *)__return_address); ASSERT(vp->v_count >= 1); if (vp->v_count == 1) { vp->v_flag |= VEVICT; VN_UNLOCK(vp, s); return 0; } VN_UNLOCK(vp, s); return 1; } /* * Get and reference a vnode, possibly removing it from the freelist. * If v_count is zero and VINACT is set, then vn_rele is inactivating * and we must wait for vp to go on the freelist, or to be reclaimed. * If v_count is zero and VRECLM is set, vn_alloc is reclaiming vp; * we must sleep till vp is reclaimed, then return false to our caller, * who will try again to hash vp's identifier in its filesystem cache. * If during the sleep on vfreelock we miss a reclaim, we will notice * that v_number has changed. 
*/
/*
 * Returns vp with one more reference, or NULL.  On the NULL paths
 * vmap->v_id tells the caller why:
 *	 0	vnode not in the hash, v_number mismatch, or no behaviors
 *	-1	vnode has VGONE set
 *	-2	VN_GET_NOWAIT was given and the vnode is busy
 * v_id is left unchanged when the freelist id itself is out of range.
 */
vnode_t *
vn_get(register struct vnode *vp, register vmap_t *vmap, uint flags)
{
	register int list;
	register vfreelist_t *vfp;
	register int s;

	list = vmap->v_id;

	VOPINFO.vn_get++;

	/*
	 * A note about v_number: it gets set to a non-zero value at
	 * vn_alloc() time and then reset to zero in vn_reclaim after
	 * the call to VOP_RECLAIM.  A file system calling vn_get will
	 * have snapshotted the v_number and put it in the vmap structure
	 * while the vnode was in its hash table: meaning it must have
	 * been done between the time vn_alloc() completed and the time
	 * VOP_RECLAIM completed.  Hence, vmap->v_number should always be
	 * non-zero
	 */
	if (vmap->v_number == 0) {
#pragma mips_frequency_hint NEVER
		printf("vn_get error: vp=0x%x vmap->v_number=%d\n", vp, vmap->v_number);
		panic("vn_get: vmap->v_number == 0");
	}

again:
	/*
	 * NOTE: must not dereference 'vp' until after verifying that
	 * it still refers to a vnode.
	 */
	if (list < 0 || list > vfreelistmask)
		goto fail;

	vfp = &vfreelist[list];
	s = LOCK_VFP(vfp);

	/*
	 * Check that the epoch of vnodes hasn't changed.  Epoch only
	 * changes when a vnode is deallocated, which means that sampled
	 * vnode pointers in filesystem caches may now be stale.  If the
	 * epoch has changed, search for the vnode in the vnode hash.
	 */
	if (vmap->v_epoch != vn_epoch) {
		/* if vn_find succeeds, it's guaranteed to find vp */
		if (vn_find(vp, vmap->v_number) == NULL) {
#pragma mips_frequency_hint NEVER
			UNLOCK_VFP(vfp, s);
			VOPINFO.vn_gchg++;
			vmap->v_id = 0;
			goto fail;
		}
	}

	/*
	 * Now it's ok to dereference 'vp'.
	 */
	vn_trace_entry(vp, "GET AGAIN", (inst_t *)__return_address);
#ifdef CKPT
	ASSERT(vp->v_ckpt != (ckpt_handle_t)-1L);
#endif

	NESTED_VN_LOCK(vp);
	vn_trace_entry(vp, "GET LOCKED", (inst_t *)__return_address);
	if (vp->v_number != vmap->v_number) {
#pragma mips_frequency_hint NEVER
		/*
		 * The freelist lock is dropped as the nested lock; the
		 * saved state 's' is consumed when the vnode lock goes.
		 */
		NESTED_UNLOCK_VFP(vfp);
		vmap->v_id = 0;
		vn_trace_entry(vp, "GET VERS", (inst_t *)__return_address);
		VN_UNLOCK(vp, s);
		VOPINFO.vn_gchg++;
		goto fail;
	}

	/*
	 * If the vnode is being inactivated, reclaimed, or evicted,
	 * then wait until the condition clears (unless VN_GET_NOWAIT
	 * is specified).  If the vnode has VGONE set, then return
	 * immediately.
	 */
	if (vp->v_flag & (VINACT|VRECLM|VGONE|VEVICT)) {
#pragma mips_frequency_hint NEVER
		ASSERT((vp->v_flag & VEVICT) ? vp->v_count <= 1 : 1);
		ASSERT((vp->v_flag & (VINACT|VRECLM|VGONE)) ?
		       vp->v_count == 0 : 1);

		if (vp->v_flag & VGONE) {
			vmap->v_id = -1;
			vn_trace_entry(vp, "GET GONE", (inst_t *)__return_address);
			NESTED_VN_UNLOCK(vp);
			UNLOCK_VFP(vfp, s);
			goto fail;
		}

		/*
		 * If the caller cannot get stuck waiting
		 * for the vnode to complete its inactive
		 * or reclaim routine, then return NULL.
		 * Set v_id to -2 to indicate that this is
		 * why NULL was returned.
		 */
		if (flags & VN_GET_NOWAIT) {
			vmap->v_id = -2;
			vn_trace_entry(vp, "GET NOWAIT", (inst_t *)__return_address);
			NESTED_VN_UNLOCK(vp);
			UNLOCK_VFP(vfp, s);
			goto fail;
		}

		NESTED_UNLOCK_VFP(vfp);
		vp->v_flag |= VWAIT;
		vn_trace_entry(vp, "GET RECL", (inst_t *)__return_address);
		/* Sleep on the vnode's sync variable, then start over. */
		sv_bitlock_wait(vptosync(vp), PINOD, &vp->v_flag, VLOCK, s);
		VOPINFO.vn_gchg++;
		goto again;
	}

	if (vp->v_count == 0) {
		/*
		 * vnode could have travelled from one freelist to
		 * another since it was sampled by caller.
		 */
		if (list != vp->v_listid) {
#pragma mips_frequency_hint NEVER
			list = vp->v_listid;
			vn_trace_entry(vp, "GET SWTCH", (inst_t *)__return_address);
			NESTED_VN_UNLOCK(vp);
			UNLOCK_VFP(vfp, s);
			VOPINFO.vn_gchg++;
			goto again;
		}

		/*
		 * If there are no behaviors attached to this vnode,
		 * there is no point in giving it back to the caller.
		 * This can happen if the behavior was detached in
		 * the filesystem's inactive routine.
		 */
		if (vp->v_fbhv == NULL) {
#pragma mips_frequency_hint NEVER
			vn_trace_entry(vp, "GET NO BHV", (inst_t *)__return_address);
			NESTED_VN_UNLOCK(vp);
			UNLOCK_VFP(vfp, s);
			vmap->v_id = 0;
			goto fail;
		}

		/*
		 * Give vp one reference for our caller and unlink it from
		 * the vnode freelist.
		 */
		vp->v_count = 1;
		vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address);
		NESTED_VN_UNLOCK(vp);

		ASSERT(vp->v_next != vp && vp->v_prev != vp);
		ASSERT(vp->v_flag & VSHARE);
		ASSERT(vp->v_filocks == NULL);

		vn_unlink(vp);
		ASSERT(vfp->vf_lsize > 0);
		vfp->vf_lsize--;
		UNLOCK_VFP(vfp, s);

		ASSERT(fetchIntHot(&vn_nfree) > 0);
		atomicAddIntHot(&vn_nfree, -1);
		VOPINFO.vn_gfree++;
	} else {
		/* Already referenced, so just add one more reference. */
		vp->v_count++;
		vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address);
		NESTED_VN_UNLOCK(vp);
		UNLOCK_VFP(vfp, s);
	}

#ifdef CKPT
	ASSERT(vp->v_ckpt != (ckpt_handle_t)-1L);
#endif
	return vp;

fail:
	return NULL;
}

/*
 * purge a vnode from the cache
 * At this point the vnode is guaranteed to have no references (v_count == 0)
 * The caller has to make sure that there are no ways someone could
 * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
 *
 * Returns silently, without reclaiming, if the freelist id is out of
 * range, the vnode can no longer be found, its v_number no longer
 * matches the vmap, or it has picked up a reference in the meantime.
 * Panics if vn_reclaim fails.
 */
void
vn_purge(struct vnode *vp, vmap_t *vmap)
{
	register vfreelist_t *vfp;
	register int list;
	register int s;

	list = vmap->v_id;

	/*
	 * See the note about v_number in vn_get.
	 */
	if (vmap->v_number == 0) {
		printf("vn_purge error: vp=0x%x vmap->v_number=%d\n", vp, vmap->v_number);
		panic("vn_purge: vmap->v_number == 0");
	}

again:
	if (list < 0 || list > vfreelistmask)
		return;

	vfp = &vfreelist[list];
	s = LOCK_VFP(vfp);

	/*
	 * Check that the epoch of vnodes hasn't changed.  Epoch only
	 * changes when a vnode is deallocated, which means that sampled
	 * vnode pointers in filesystem caches may now be stale.  If the
	 * epoch has changed, search for the vnode in the vnode hash.
	 */
	if (vmap->v_epoch != vn_epoch) {
		/* if vn_find succeeds, it's guaranteed to find vp */
		if (vn_find(vp, vmap->v_number) == NULL) {
			UNLOCK_VFP(vfp, s);
			VOPINFO.vn_gchg++;
			return;
		}
	}

	/* if you don't SHARE you don't get to play */
	ASSERT(vp->v_flag & VSHARE);

	/*
	 * Check whether vp has already been reclaimed since our caller
	 * sampled its version while holding a filesystem cache lock that
	 * its VOP_RECLAIM function acquires.
	 */
	NESTED_VN_LOCK(vp);
	vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
	if (vp->v_number != vmap->v_number) {
		NESTED_VN_UNLOCK(vp);
		UNLOCK_VFP(vfp, s);
		return;
	}

	/*
	 * If vp is being reclaimed or inactivated, wait until it is inert,
	 * then proceed.  Can't assume that vnode is actually reclaimed
	 * just because the reclaimed flag is asserted -- a vn_alloc
	 * reclaim can fail.
	 */
	if (vp->v_flag & (VINACT | VRECLM)) {
		ASSERT(vp->v_count == 0);
		NESTED_UNLOCK_VFP(vfp);
		vp->v_flag |= VWAIT;
		sv_bitlock_wait(vptosync(vp), PINOD, &vp->v_flag, VLOCK, s);
		goto again;
	}

	/*
	 * Another process could have raced in and gotten this vnode...
	 */
	if (vp->v_count > 0) {
		NESTED_UNLOCK_VFP(vfp);
		VN_UNLOCK(vp, s);
		return;
	}

	/*
	 * vnode could have travelled from one freelist to
	 * another since it was sampled by caller.
	 */
	if (list != vp->v_listid) {
		list = vp->v_listid;
		NESTED_UNLOCK_VFP(vfp);
		VN_UNLOCK(vp, s);
		VOPINFO.vn_gchg++;
		goto again;
	}

	/* Claim the vnode and take it off its freelist. */
	vp->v_flag |= VRECLM;
	NESTED_VN_UNLOCK(vp);

	vn_unlink(vp);

	/*
	 * XXX	There is no routine that relies on a freelist's vf_lsize
	 * XXX	exactly matching the number of free list entries.  Since
	 * XXX	this vnode is going right back on the same freelist, we
	 * XXX	won't bother to decrement, and later, increment vf_lsize.
	ASSERT(vfp->vf_lsize > 0);
	vfp->vf_lsize--;
	 */

	UNLOCK_VFP(vfp, s);

	/*
	 * Call VOP_RECLAIM and clean vp.  The FSYNC_INVAL flag tells
	 * vp's filesystem to flush and invalidate all cached resources.
	 * When vn_reclaim returns, vp should have no private data,
	 * either in a system cache or attached to the behavior chain.
	 */
	if (vn_reclaim(vp, FSYNC_INVAL) != 0)
		panic("vn_purge: cannot reclaim");

	/*
	 * Setting v_listid is protected by VRECLM flag above...
	vp->v_listid = list;
	 */
	s = LOCK_VFP(vfp);
	ASSERT(vp->v_listid == list);
	vn_insert(vp, &vfp->vf_freelist);

	/*
	 * XXX	See comments above about vf_lsize.
	ASSERT(vfp->vf_lsize >= 0);
	vfp->vf_lsize++;
	 */

	UNLOCK_VFP(vfp, s);

	/*
	 * Wakeup anyone waiting for vp to be reclaimed.
	 */
	vn_wakeup(vp);
}

/*
 * Cause a vnode to be inaccessible to everyone except those that
 * already have a reference.
 * The vnode must be a VSHARE character-device vnode with at least one
 * reference (asserted below).
 */
void
vn_kill(struct vnode *vp)
{
	register int s;
	/*REFERENCED*/
	int closerr;

	/* if you don't SHARE you don't get to play */
	ASSERT(vp->v_flag & VSHARE);
	ASSERT(vp->v_type == VCHR);
	ASSERT(vp->v_count > 0);

	s = VN_LOCK(vp);

	/*
	 * Add ref so don't race with vn_rele/vn_reclaim.
	 *
	 * XXX How does this prevent a race?  Since the VOP_CLOSE below
	 * doesn't cause a vn_rele, then whatever vn_rele we're worried
	 * about could equally as well happen prior to us bumping the
	 * count here.  Could this code be an artifact of the bug
	 * in vhangup where it was calling vn_kill on a session vnode
	 * that was already deallocated?
	 */
	vp->v_count++;
	vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address);
	VN_UNLOCK(vp, s);

	/*
	 * Clean out any file locks in the unlikely event there are any.
	 */
	if (vp->v_flag & VFRLOCKS)
		fs_cleanlock(vp, sys_flid, sys_cred);

	/*
	 * Pass special note to close routine - this won't really
	 * close anything but will prevent any further "gets" from
	 * succeeding on the vnode.
	 */
	VOP_CLOSE(vp, 0, FROM_VN_KILL, sys_cred, closerr);

	/* release our added ref */
	VN_RELE(vp);
}

/*
 * Add a reference to a referenced vnode.
 * Returns vp.  Panics when the count is negative, or when it is zero on
 * a VSHARE vnode.
 */
struct vnode *
vn_hold(struct vnode *vp)
{
	register int s = VN_LOCK(vp);

	/*
	 * Check validity of this request.  Because, the effects of vn_hold-ing
	 * a vnode on the free list are so dire, we always want to panic rather
	 * than doing this.  Note that non-VSHARE vnodes, which do not get
	 * into the vnode free lists, are allowed to do a vn_hold when the
	 * v_count is zero and that autofs takes advantage of this dubious
	 * privilege.
	 *
	 * Also note that autofs in the CELL_IRIX case has been modified not
	 * to do this (vn_hold a vnode with v_count equal to zero) so we
	 * could enforce stricter requirements in that case but do not
	 * bother since CELL_IRIX is a dead issue in the kudzu base.
	 * This will be an issue when this is merged into teak but by
	 * that time teak should be enforcing the stricter requirements
	 * in all configurations.  This will require hand-merging in any
	 * case.
	 */
	if (vp->v_count <= 0) {
#pragma mips_frequency_hint NEVER
		/*
		 * We are holding a vnode with no hold or a corrupted
		 * hold count.  This probably means the vnode is on the
		 * free list.  If VSHARE is set, panic.
		 * If VSHARE is not set, someone (autofs) can hold the vnode
		 * with a count of 0.  All others panic.
		 */
		if ((vp->v_flag & VSHARE) || (vp->v_count != 0))
			cmn_err_tag(140,CE_PANIC,
				    "holding vnode on free list %x(%x,%d)",
				    vp, vp->v_flag, vp->v_count);
	}
	vp->v_count++;
	VN_UNLOCK(vp, s);
	return vp;
}

/*
 * Release a vnode.  Decrements reference count and calls
 * VOP_INACTIVE on last reference.
 */
void
vn_rele(struct vnode *vp)
{
	register vfreelist_t *vfp;
	register int s;
	register int mask;
	int private_vnode;
	/*REFERENCED*/
	int cache;
	extern void usync_cleanup(caddr_t);

	VOPINFO.vn_rele++;

	s = VN_LOCK(vp);
	if (--vp->v_count <= 0) {

		/*
		 * Make sure that the count has not gone through
		 * zero.  If we were to let this go through unchecked,
		 * all kinds of terrible things could happen to the
		 * vnode lists and we wouldn't have a clue as we
		 * surveyed the wreckage.
*/ if (vp->v_count < 0) { #pragma mips_frequency_hint NEVER cmn_err_tag(141,CE_PANIC, "vnode ref count negative %x(%x,%d)", vp, vp->v_flag, vp->v_count); } /* * It is absolutely, positively the case that * the lock manager will not be releasing vnodes * without first having released all of its locks. */ ASSERT(!(vp->v_flag & VLOCKHOLD)); private_vnode = !(vp->v_flag & VSHARE); /* * As soon as we turn this on, noone can find us * in vn_get until we turn off VINACT or VRECLM */ vp->v_flag |= VINACT; VN_UNLOCK(vp, s); /* * Cleanup left over file locks. These can get here * due to the NLM interfaces that could keep the last * close logic from finding them. We need to do it * now instead of reclaim time because XFS can reuse * vnodes/inodes without going through reclaim. * * Note that the intent here is merely to clean up * state associated with this particular vnode, which * is why cleanlocks() is used instead of fs_cleanlock() * (the latter being used when distributed vnodes need * to be taken into account). */ if (vp->v_filocks != NULL) cleanlocks(vp, IGN_PID, 0L); ASSERT(vp->v_filocks == NULL); /* * If the address space backed by this vnode represents * any usync objects, clean them up. */ if (vp->v_flag & VUSYNC) { usync_cleanup((caddr_t) vp); VN_FLAGCLR(vp, VUSYNC); } /* * Do not make the VOP_INACTIVE call if there * are no behaviors attached to the vnode to call. */ if (vp->v_fbhv != NULL) { VOP_INACTIVE(vp, get_current_cred(), cache); ASSERT(private_vnode ? 1 : cache == VN_INACTIVE_CACHE ? 
!(vp->v_flag & VINACTIVE_TEARDOWN) : (vp->v_flag & VINACTIVE_TEARDOWN)); } /* * For filesystems that do not want to be part of * the global vnode cache, we must not touch the * vp after we have called inactive */ if (private_vnode) return; ASSERT(vp->v_next == vp && vp->v_prev == vp); if (mask = vfreelistmask) { register vfreelist_t *tp; vfp = &vfreelist[private.p_hand++ & mask]; tp = vfp->vf_next; if (vfp->vf_lsize > tp->vf_lsize) vfp = tp; vp->v_listid = vfp->vf_listid; } else { vp->v_listid = 0; vfp = vfreelist; } ASSERT(vp->v_listid <= vfreelistmask); s = LOCK_VFP(vfp); if (vp->v_fbhv == NULL) vn_insert(vp, &vfp->vf_freelist); else vn_append(vp, &vfp->vf_freelist); ASSERT(vfp->vf_lsize >= 0); vfp->vf_lsize++; /* * Must hold freelist lock here to prevent * vnode from being deallocated first. * * VRECLM is turned off too because it may * have veen vn_reclaim'd above (this stmt. * not true anymore). */ NESTED_VN_LOCK(vp); if (vp->v_flag & VWAIT) { vn_trace_entry(vp, "RELE WAKEUP", (inst_t *)__return_address); sv_broadcast_bounded(vptosync(vp)); } vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VGONE|VEVICT); /* * If not interposed for replication, and if the vnode * was opened for write, NONREPLICABLE bit is turned on. * Turn it off now. */ VN_CLRNONREPLICABLE(vp); NESTED_UNLOCK_VFP(vfp); VN_UNLOCK(vp, s); ASSERT(fetchIntHot(&vn_nfree) >= 0); atomicAddIntHot(&vn_nfree, 1); } else VN_UNLOCK(vp, s); } /* * Vnode list primitives. The callers must exclude one another. */ void vn_initlist(struct vnlist *vl) { vl->vl_next = vl->vl_prev = (struct vnode *)vl; } void vn_insert(struct vnode *vp, struct vnlist *vl) { vp->v_next = vl->vl_next; vp->v_prev = (struct vnode *)vl; vl->vl_next = vp; /* * imon depends on vp this... 
*/ vp->v_next->v_prev = vp; } void vn_unlink(register struct vnode *vp) { register struct vnode *next = vp->v_next; register struct vnode *prev = vp->v_prev; next->v_prev = prev; prev->v_next = next; vp->v_next = vp->v_prev = vp; } static void vn_relink( register vnlist_t *vlist, register vnode_t *next, register vnode_t *prev) { register struct vnode *N = vlist->vl_next; register struct vnode *P = vlist->vl_prev; next->v_prev = (struct vnode *)vlist; P->v_next = N; N->v_prev = P; vlist->vl_next = next; vlist->vl_prev = prev; prev->v_next = (struct vnode *)vlist; } /* * Read or write a vnode. Called from kernel code. */ int vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, size_t len, off_t offset, enum uio_seg seg, int ioflag, off_t ulimit, /* meaningful only if rw is UIO_WRITE */ cred_t *cr, ssize_t *residp, struct flid *fl) { struct uio uio; struct iovec iov; int error; if (rw == UIO_WRITE && (vp->v_vfsp->vfs_flag & VFS_RDONLY)) return EROFS; if ((ssize_t)len < 0) return EINVAL; iov.iov_base = base; iov.iov_len = len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_segflg = seg; uio.uio_resid = len; uio.uio_limit = ulimit; uio.uio_sigpipe = 0; uio.uio_pmp = NULL; uio.uio_pio = 0; uio.uio_readiolog = 0; uio.uio_writeiolog = 0; uio.uio_pbuf = 0; if (rw == UIO_WRITE) { uio.uio_fmode = FWRITE; VOP_WRITE(vp, &uio, ioflag, cr, fl, error); } else { uio.uio_fmode = FREAD; VOP_READ(vp, &uio, ioflag, cr, fl, error); } ASSERT(uio.uio_sigpipe == 0); if (residp) *residp = uio.uio_resid; else if (uio.uio_resid) error = EIO; return error; } /* * Open/create a vnode. * This may be callable by the kernel, the only known use * of user context being that the current user credentials * are used for permissions. crwhy is defined iff filemode & FCREAT. 
 */
int
vn_open(char *pnamep,
	enum uio_seg seg,
	register int filemode,
	mode_t createmode,
	struct vnode **vpp,
	enum create crwhy,
	int cflags,
	int *ckpt)
{
	struct vnode *vp, *tvp, *openvp;
	register int mode;	/* VREAD/VWRITE bits derived from filemode */
	register int error;
	struct vattr vattr;
#ifdef CKPT
	ckpt_handle_t lookup = NULL;
#endif

	/*
	 * tvp carries an extra hold on the looked-up/created vnode until
	 * VOP_OPEN has succeeded; the error path at "out" drops it.
	 */
	tvp = (struct vnode *)NULL;
	mode = 0;
	if (filemode & FREAD)
		mode |= VREAD;
	if (filemode & (FWRITE|FTRUNC))
		mode |= VWRITE;

	if (filemode & FCREAT) {
		/*
		 * Wish to create a file.
		 */
		vattr.va_type = VREG;
		vattr.va_mode = createmode;
		vattr.va_mask = AT_TYPE|AT_MODE;
		if (filemode & FTRUNC) {
			vattr.va_size = 0;
			vattr.va_mask |= AT_SIZE;
		}
		if (filemode & FEXCL)
			cflags |= VEXCL;
		/*
		 * Truncation and exclusivity are now expressed through
		 * vattr and cflags, so drop them from filemode.
		 */
		filemode &= ~(FTRUNC|FEXCL);
#ifdef _SHAREII
		/*
		 * Adjust mode for share umask.
		 */
		SHR_SETATTR(vattr.va_mask, &vattr.va_mode);
#endif /* _SHAREII */
		/*
		 * vn_create can take a while, so preempt.
		 */
		if (error = vn_create(pnamep, seg, &vattr, cflags, mode, &vp,
				      crwhy, ckpt))
			return error;

		tvp = vp;
		VN_HOLD(tvp);
		VOP_SETFL(vp, 0, filemode, get_current_cred(), error);
		if (error)
			goto out;
	} else {
		/*
		 * Wish to open a file.  Just look it up.
		 */
		if (error = lookupname(pnamep, seg, FOLLOW, NULLVPP, &vp,
#ifdef CKPT
					(ckpt)? &lookup : NULL))
#else
					NULL))
#endif
			return error;

		tvp = vp;
		VN_HOLD(tvp);
#ifdef CKPT
		if (ckpt) {
			if (lookup)
				*ckpt = ckpt_lookup_add(vp, lookup);
			else
				*ckpt = -1;
		}
#endif
		VOP_SETFL(vp, 0, filemode, get_current_cred(), error);
		if (error)
			goto out;

		/*
		 * Can't write directories, active texts, swap files, or
		 * read-only filesystems.  Can't truncate files
		 * on which mandatory locking is in effect.
		 */
		if (filemode & (FWRITE|FTRUNC)) {
			if (vp->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
			if (vp->v_vfsp->vfs_flag & VFS_RDONLY) {
				error = EROFS;
				goto out;
			}
			if ((vp->v_flag & VISSWAP) && vp->v_type == VREG) {
				error = EBUSY;
				goto out;
			}
			/*
			 * Can't truncate files on which mandatory locking
			 * is in effect and locks exist on the file.
			 */
			if ((filemode & FTRUNC) &&
			    (vp->v_flag & (VFRLOCKS|VENF_LOCKING)) ==
				(VFRLOCKS|VENF_LOCKING)) {
				error = EAGAIN;
			}
			if (error)
				goto out;
		}

		/* Check discretionary permissions.*/
		VOP_ACCESS(vp, mode, get_current_cred(), error);
		if (error)
			goto out;
	}

	if ((filemode & FWRITE) && !VN_ISREPLICABLE(vp)){
		VN_FLAGSET(vp, VNONREPLICABLE);
	}

	/*
	 * Do opening protocol.
	 * VOP_OPEN is handed &vp, so vp may differ from openvp afterwards.
	 */
	openvp = vp;
	VOP_OPEN(openvp, &vp, filemode, get_current_cred(), error);

	if (!error) {
		/*REFERENCED*/
		int unused;

		if (tvp) {
			VN_RELE(tvp);
			/* avoid extra VN_RELE in error case below */
			tvp = NULL;
		}

		/*
		 * Truncate if required.
		 */
		if ((filemode & FTRUNC) && vp->v_type == VREG) {
			vattr.va_size = 0;
			vattr.va_mask = AT_SIZE;
			VOP_SETATTR(vp, &vattr, 0, get_current_cred(), error);
			if (error)
				/*
				 * since the open never succeeded, there can't
				 * be any locks
				 */
				VOP_CLOSE(vp, filemode, L_TRUE,
					  get_current_cred(), unused);
		}
	}
out:
	/*
	 * On failure drop the extra hold (if still outstanding) and the
	 * reference on vp; on success hand vp to the caller.
	 */
	if (error) {
		if (tvp)
			VN_RELE(tvp);
		VN_RELE(vp);
	} else
		*vpp = vp;
	return error;
}

/*
 * Create a vnode (makenode).
 *
 * Looks up the parent directory of pnamep (and, for a non-exclusive
 * create, the target itself), then calls VOP_MKDIR or VOP_CREATE on
 * the parent.  On a newly created object the MAC label and the
 * directory default ACL are applied afterwards.  The resulting vnode
 * is returned through *vpp; the function's value is an errno, 0 on
 * success.
 */
/*ARGSUSED*/
int
vn_create(
	char *pnamep,
	enum uio_seg seg,
	vattr_t *vap,
	int flags,
	int mode,
	vnode_t **vpp,
	enum create why,
	int *ckpt)
{
	vnode_t *dvp;	/* ptr to parent dir vnode */
	pathname_t pn;
	int error;
	vpagg_t *vpag;
	int existing = 0;	/* set when the target was found by lookup */
	ckpt_handle_t *lookupp = NULL;
#ifdef CKPT
	ckpt_handle_t lookup = NULL;
#endif

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	/*
	 * VOP_CREATE/MKDIR needs the project id.
	 */
	VPROC_GETVPAGG(curvprocp, &vpag);
	vap->va_projid = VPAG_GETPRID(vpag);
	vap->va_mask |= AT_PROJID;

	/*
	 * Lookup directory.
	 * If new object is a file, call lower level to create it.
	 * Note that it is up to the lower level to enforce exclusive
	 * creation, if the file is already there.
	 * This allows the lower level to do whatever
	 * locking or protocol that is needed to prevent races.
	 * If the new object is directory call lower level to make
	 * the new directory, with "." and "..".
	 */
	if (error = pn_get(pnamep, seg, &pn))
		return error;
	_SAT_PN_SAVE(&pn, curuthread);
	dvp = NULL;
	*vpp = NULL;
#ifdef CKPT
	lookupp = (ckpt)? &lookup : NULL;
#endif
	/*
	 * lookup will find the parent directory for the vnode.
	 * When it is done the pn holds the name of the entry
	 * in the directory.
	 * If this is a non-exclusive create we also find the node itself.
	 */
	if (flags & VEXCL)
		error = lookuppn(&pn, NO_FOLLOW, &dvp, NULLVPP, lookupp);
	else
		error = lookuppn(&pn, FOLLOW, &dvp, vpp, lookupp);
	if (error) {
		pn_free(&pn);
		if (why == CRMKDIR && error == EINVAL)
			error = EEXIST;		/* SVID */
		return error;
	}

	ASSERT(dvp->v_count > 0);
	vn_trace_entry(dvp, "vn_create", (inst_t *)__return_address);
	if (*vpp)
		vn_trace_entry(*vpp, "vn_create:f", (inst_t *)__return_address);

	/* the VSVTX mode bit is honoured only for CRMKNOD requests */
	if (why != CRMKNOD)
		vap->va_mode &= ~VSVTX;

	/*
	 * Make sure filesystem is writeable.
	 */
	if (dvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		if (*vpp)
			VN_RELE(*vpp);
		error = EROFS;
	} else if (!(flags & VEXCL) && *vpp != NULL) {
		register struct vnode *vp = *vpp;

		/*
		 * File already exists.  If a mandatory lock has been
		 * applied, return EAGAIN.
		 */
		if ((vp->v_flag & (VFRLOCKS|VENF_LOCKING))
		     == (VFRLOCKS|VENF_LOCKING)) {
			error = EAGAIN;
			VN_RELE(vp);
			goto out;
		}

		/* do not permit truncating a swap file */
		if ((vp->v_flag & VISSWAP) && vp->v_type == VREG) {
			error = EBUSY;
			VN_RELE(vp);
			goto out;
		}

		/*
		 * If the file is the root of a VFS, we've crossed a
		 * mount point and the "containing" directory that we
		 * acquired above (dvp) is irrelevant because it's in
		 * a different file system.  We apply VOP_CREATE to the
		 * target itself instead of to the containing directory
		 * and supply a null path name to indicate (conventionally)
		 * the node itself as the "component" of interest.
		 *
		 * The intercession of the file system is necessary to
		 * ensure that the appropriate permission checks are
		 * done.
		 */
		if (vp->v_flag & VROOT) {
			/*
			 * lvp (NULL) is needed since VOP_CREATE now has vpp
			 * as an in and out parameter.
			 * It has special meaning if set.
			 */
			vnode_t *lvp = NULL;

			ASSERT(why != CRMKDIR);
			VOP_CREATE(vp, "", vap, flags, mode, &lvp,
				   get_current_cred(), error);
			/*
			 * If the create succeeded, it will have created
			 * a new reference to the vnode.  Give up the
			 * original reference.
			 */
			VN_RELE(vp);
			goto out;
		}

		/*
		 * Don't throw the vnode.  Give it to VOP_CREATE
		 * so it can prevent another lookup and then
		 * deal with it in a non-racy manner.
		 */
		ASSERT(*vpp == vp);
		if (why == CRMKDIR)
			/* Won't be going to VOP_CREATE */
			VN_RELE(vp);
		ASSERT(!error);
		existing = 1;
	}

	if (error == 0) {
		/*
		 * Call fs dependent mkdir() to create dir.  Otherwise, fs
		 * dependent create.
		 */
		if (why == CRMKDIR ||
		    (why == CRMKNOD && vap->va_type == VDIR)) {
			VOP_MKDIR(dvp, pn.pn_path, vap, vpp,
				  get_current_cred(), error);
		} else {
			VOP_CREATE(dvp, pn.pn_path, vap, flags, mode, vpp,
				   get_current_cred(), error);
			if (!error && *vpp) {
				IMON_EVENT(*vpp, get_current_cred(),
					   IMON_CONTENT);
			} else if ((error == ENOSYS) && *vpp) {
				VN_RELE(*vpp);
			}
		}
	}
out:

#ifdef CKPT
	/*
	 * Either register the lookup handle against the resulting vnode
	 * or free it; *ckpt is -1 whenever nothing was registered.
	 */
	if (lookup) {
		ASSERT(ckpt);
		if (!error && *vpp)
			*ckpt = ckpt_lookup_add(*vpp, lookup);
		else {
			*ckpt = -1;
			ckpt_lookup_free(lookup);
		}
	} else if (ckpt)
		*ckpt = -1;
#endif
	/*
	 * Set Trix extended attributes on the vnode if:
	 *	there was no previous error, and
	 *	the vnode is newly created, and
	 *	we have a handle on that vnode
	 *
	 * At this time, those attributes are
	 *	MAC label
	 *	directory default ACL
	 *
	 * If appropriate extended security attributes cannot
	 * be set on a filesystem object, it is removed.
	 */
	if (!error && !existing && *vpp) {
		/* MAC label
		 *
		 * No file/directory should ever get created with the
		 * moldy bit set by default: check to see if the process
		 * label has the moldy bit set and set the label without
		 * it.
		 */
		mac_label * label = get_current_cred()->cr_mac;

		if ( _MAC_IS_MOLDY ( label ) ) {
			/* the demolded copy is freed right after use */
			if ( label = _MAC_DEMLD ( label ) ) {
				error = _MAC_VSETLABEL(*vpp, label );
				kern_free ( label );
			}
			else
				error = ENOMEM;
		}
		else
			error = _MAC_VSETLABEL(*vpp, label );

		/* Directory default ACL */
		if ( error || (error = _ACL_INHERIT(dvp, *vpp, vap))) {
			cmn_err_tag(318,CE_NOTE, "vn_create: %s(%d)",
				    __FILE__, __LINE__);
			/*
			 * NOTE(review): VOP_REMOVE is handed "error" as its
			 * result argument, so it appears to overwrite the
			 * MAC/ACL failure code.  If the remove succeeds this
			 * function would return 0 with *vpp still set.
			 * Confirm whether that is intended.
			 */
			VOP_REMOVE(dvp, pn.pn_path, get_current_cred(), error);
		}
	}

	pn_free(&pn);
	VN_RELE(dvp);
	return error;
}

/*
 * Link.
 *
 * Creates the name "to" for the existing object "from" via VOP_LINK.
 * Fails with EXDEV when the source vnode and the target directory
 * report different va_fsid values, and with EROFS when the target
 * directory's vfs is read-only.  Returns an errno, 0 on success.
 */
int
vn_link(char *from, char *to, enum uio_seg seg, enum symfollow follow)
{
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname pn;
	register int error;
	struct vattr vattr;
	long fsid;			/* va_fsid of the source vnode */

	fvp = tdvp = NULL;
	if (error = pn_get(to, seg, &pn))
		return error;
	_SAT_PN_SAVE(&pn, curuthread);
	if (error = lookupname(from, seg, follow, NULLVPP, &fvp, NULL))
		goto out;
	if (error = lookuppn(&pn, FOLLOW, &tdvp, NULLVPP, NULL))
		goto out;
	/*
	 * Make sure both source vnode and target directory vnode are
	 * in the same vfs and that it is writeable.
	 */
	vattr.va_mask = AT_FSID;
	VOP_GETATTR(fvp, &vattr, 0, get_current_cred(), error);
	if (error)
		goto out;
	fsid = vattr.va_fsid;
	VOP_GETATTR(tdvp, &vattr, 0, get_current_cred(), error);
	if (error)
		goto out;
	if (fsid != vattr.va_fsid) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	if (!error)
		VOP_LINK(tdvp, fvp, pn.pn_path, get_current_cred(), error);
out:
	/* shared exit: release whatever was acquired, then audit */
	pn_free(&pn);
	if (fvp)
		VN_RELE(fvp);
	if (tdvp)
		VN_RELE(tdvp);
	_SAT_ACCESS2(SAT_FILE_CRT_DEL2, error);
	return error;
}

/*
 * Rename.
 *
 * Renames "from" to "to" via VOP_RENAME after looking up the source
 * entry, its directory and the target directory.  Fails with ENOENT
 * when the source does not exist, EBUSY for an active regular swap
 * file, EXDEV when the vfs checks below fail, and EROFS when the
 * target directory's vfs is read-only.  Returns an errno, 0 on success.
 */
int
vn_rename(char *from, char *to, enum uio_seg seg)
{
	struct vnode *fdvp;		/* from directory vnode ptr */
	struct vnode *fvp;		/* from vnode ptr */
	struct vnode *tdvp;		/* to directory vnode ptr */
	struct pathname fpn;		/* from pathname */
	struct pathname tpn;		/* to pathname */
	register int error;

	fdvp = tdvp = fvp = NULL;
	/*
	 * Get to and from pathnames.
	 */
	if (error = pn_get(from, seg, &fpn))
		return error;
	_SAT_PN_SAVE(&fpn, curuthread);
	if (error = pn_get(to, seg, &tpn)) {
		pn_free(&fpn);
		return error;
	}
	_SAT_PN_SAVE(&tpn, curuthread);
	/*
	 * Lookup to and from directories.
	 */
	if (error = lookuppn(&fpn, NO_FOLLOW, &fdvp, &fvp, NULL))
		goto out;
	vn_trace_entry(fdvp, "vn_rename:fd", (inst_t *)__return_address);
	/*
	 * Make sure there is an entry.
	 */
	if (fvp == NULL) {
		error = ENOENT;
		goto out;
	}
	vn_trace_entry(fvp, "vn_rename:f", (inst_t *)__return_address);

	/*
	 * Make sure we're not moving an active swap file.
	 * This prevents mv/reboot/rm from being able to
	 * remove the swap file.  Must do swap -d first.
	 */
	if (fvp->v_flag & VISSWAP && fvp->v_type == VREG) {
		error = EBUSY;
		goto out;
	}

	if (error = lookuppn(&tpn, NO_FOLLOW, &tdvp, NULLVPP, NULL))
		goto out;
	vn_trace_entry(tdvp, "vn_rename:td", (inst_t *)__return_address);
	/*
	 * Make sure that the from vnode and to directory are
	 * in the same vfs, or that the from vnode is not a
	 * mount point (for lofs renames), and that the from and
	 * to directories share the same vfs.
	 * Also make sure that the to directory is writable.
	 * XXX this traditional vnodes test differs from the va_fsid
	 * test used by vn_link
	 */
	if ((fvp->v_vfsp != tdvp->v_vfsp && (fvp->v_flag & VROOT) != 0)
	    || (fdvp->v_vfsp != tdvp->v_vfsp)) {
		error = EXDEV;
		goto out;
	}
	if (tdvp->v_vfsp->vfs_flag & VFS_RDONLY) {
		error = EROFS;
		goto out;
	}
	VOP_RENAME(fdvp, fpn.pn_path, tdvp, tpn.pn_path, &tpn,
		   get_current_cred(), error);

	/*
	 * Must explicitly post imon events because imon-vnode
	 * layer will never see it if fdvp isn't monitored.  K 6/22/94
	 */
	if (error == 0) {
		IMON_EVENT(tdvp, get_current_cred(), IMON_CONTENT);
		IMON_EVENT(fvp, get_current_cred(), IMON_RENAME);
	}
out:
	/* shared exit: free both pathnames, drop all held vnodes, audit */
	pn_free(&fpn);
	pn_free(&tpn);
	if (fvp)
		VN_RELE(fvp);
	if (fdvp)
		VN_RELE(fdvp);
	if (tdvp)
		VN_RELE(tdvp);
	_SAT_ACCESS2(SAT_FILE_CRT_DEL2, error);
	return error;
}

/*
 * Remove a file or directory.
*/ int vn_remove(char *fnamep, enum uio_seg seg, enum rm dirflag) { struct vnode *vp; /* entry vnode */ struct vnode *dvp; /* ptr to parent dir vnode */ struct pathname pn; /* name of entry */ enum vtype vtype; register int error; register struct vfs *vfsp; if (error = pn_get(fnamep, seg, &pn)) return error; _SAT_PN_SAVE(&pn, curuthread); vp = NULL; if (error = lookuppn(&pn, NO_FOLLOW, &dvp, &vp, NULL)) { pn_free(&pn); return error; } vn_trace_entry(dvp, "vn_remove", (inst_t *)__return_address); /* * Make sure there is an entry. */ if (vp == NULL) { error = ENOENT; goto out; } vn_trace_entry(vp, "vn_remove", (inst_t *)__return_address); vfsp = vp->v_vfsp; /* * If the named file is the root of a mounted filesystem, fail. */ if (vp->v_flag & VROOT) { error = EBUSY; goto out; } /* * Make sure filesystem is writeable. */ if (vfsp->vfs_flag & VFS_RDONLY) { error = EROFS; goto out; } /* * Make sure we're not removing an active swap file. */ if (vp->v_flag & VISSWAP && vp->v_type == VREG) { error = EBUSY; goto out; } /* * If vnode represents a named semaphore, * cleanup the kernel semaphore state. */ if (vp->v_flag & VSEMAPHORE) { if (error = psema_indirect_unlink(vp)) goto out; } /* * Release vnode before removing. */ vtype = vp->v_type; VN_RELE(vp); vp = NULL; /* * If caller is using rmdir(2), it can be applied only to directories. * Unlink(2) can be applied to anything. */ if (dirflag == RMDIRECTORY) { if (vtype != VDIR) { error = ENOTDIR; goto out; } VOP_RMDIR(dvp, pn.pn_path, curuthread->ut_cdir, get_current_cred(), error); } else VOP_REMOVE(dvp, pn.pn_path, get_current_cred(), error); out: pn_free(&pn); if (vp != NULL) VN_RELE(vp); VN_RELE(dvp); return error; } /* * Compare two vnodes. For now we use the ops of the base * behavior to decide if the VOP_CMP() call will make any * sense. 
*/ int vn_cmp(vnode_t *vp1, vnode_t *vp2) { bhv_desc_t *bdp1; bhv_desc_t *bdp2; int cmp; if (vp1 == vp2) { return 1; } if ((vp1 == NULL) || (vp2 == NULL) || (vp1->v_fbhv == NULL) || (vp2->v_fbhv == NULL)) { return 0; } bdp1 = vn_bhv_base_unlocked(VN_BHV_HEAD(vp1)); bdp2 = vn_bhv_base_unlocked(VN_BHV_HEAD(vp2)); if (BHV_OPS(bdp1) == BHV_OPS(bdp2)) { VOP_CMP(vp1, vp2, cmp); return cmp; } return 0; } /*ARGSUSED*/ pfd_t * vn_pfind(struct vnode *vp, pgno_t pageno, int ckey, void *pm) { pfd_t *pfd; pfd = vnode_pfind(vp, pageno, ckey); #if defined(NUMA_REPLICATION) /* * If we found a page, and vnode is a candidate for replication, * check with the replication module, if it's okay to return * this page. */ if (pfd && VN_ISREPLICABLE(vp)) pfd = repl_pfind(vp, pageno, ckey, pm, pfd); #endif /* defined(NUMA_REPLICATION) */ return pfd; } #ifdef VNODE_TRACING /* * Vnode tracing code. */ void vn_trace_entry(vnode_t *vp, char *func, inst_t *ra) { ktrace_enter(vp->v_trace, (void *)(__psint_t)VNODE_KTRACE_ENTRY, (void *)func, 0, (void *)(__psint_t)vp->v_count, (void *)ra, (void *)(__psunsigned_t)vp->v_flag, (void *)(__psint_t)cpuid(), (void *)(__psint_t)current_pid(), 0, 0, 0, 0, 0, 0, 0, 0); } void vn_trace_hold(vnode_t *vp, char *file, int line, inst_t *ra) { ktrace_enter(vp->v_trace, (void *)(__psint_t)VNODE_KTRACE_HOLD, (void *)file, (void *)(__psint_t)line, (void *)(__psint_t)vp->v_count, (void *)ra, (void *)(__psunsigned_t)vp->v_flag, (void *)(__psint_t)cpuid(), (void *)(__psint_t)current_pid(), 0, 0, 0, 0, 0, 0, 0, 0); } void vn_trace_ref(vnode_t *vp, char *file, int line, inst_t *ra) { ktrace_enter(vp->v_trace, (void *)(__psint_t)VNODE_KTRACE_REF, (void *)file, (void *)(__psint_t)line, (void *)(__psint_t)vp->v_count, (void *)ra, (void *)(__psunsigned_t)vp->v_flag, (void *)(__psint_t)cpuid(), (void *)(__psint_t)current_pid(), 0, 0, 0, 0, 0, 0, 0, 0); } void vn_trace_rele(vnode_t *vp, char *file, int line, inst_t *ra) { ktrace_enter(vp->v_trace, (void 
*)(__psint_t)VNODE_KTRACE_RELE, (void *)file, (void *)(__psint_t)line, (void *)(__psint_t)vp->v_count, (void *)ra, (void *)(__psunsigned_t)vp->v_flag, (void *)(__psint_t)cpuid(), (void *)(__psint_t)current_pid(), 0, 0, 0, 0, 0, 0, 0, 0); } #endif /* VNODE_TRACING */