1536 lines
38 KiB
C
1536 lines
38 KiB
C
/*
|
|
* os/vm/vnode_pcache.c
|
|
*
|
|
*
|
|
* Copyright 1995, Silicon Graphics, Inc.
|
|
* ALL RIGHTS RESERVED
|
|
*
|
|
* UNPUBLISHED -- Rights reserved under the copyright laws of the United
|
|
* States. Use of a copyright notice is precautionary only and does not
|
|
* imply publication or disclosure.
|
|
*
|
|
* U.S. GOVERNMENT RESTRICTED RIGHTS LEGEND:
|
|
* Use, duplication or disclosure by the Government is subject to restrictions
|
|
* as set forth in FAR 52.227.19(c)(2) or subparagraph (c)(1)(ii) of the Rights
|
|
* in Technical Data and Computer Software clause at DFARS 252.227-7013 and/or
|
|
* in similar or successor clauses in the FAR, or the DOD or NASA FAR
|
|
* Supplement. Contractor/manufacturer is Silicon Graphics, Inc.,
|
|
* 2011 N. Shoreline Blvd. Mountain View, CA 94039-7311.
|
|
*
|
|
* THE CONTENT OF THIS WORK CONTAINS CONFIDENTIAL AND PROPRIETARY
|
|
* INFORMATION OF SILICON GRAPHICS, INC. ANY DUPLICATION, MODIFICATION,
|
|
* DISTRIBUTION, OR DISCLOSURE IN ANY FORM, IN WHOLE, OR IN PART, IS STRICTLY
|
|
* PROHIBITED WITHOUT THE PRIOR EXPRESS WRITTEN PERMISSION OF SILICON
|
|
* GRAPHICS, INC.
|
|
*/
|
|
|
|
|
|
#ident "$Revision: 1.36 $"
|
|
|
|
#include <sys/types.h>
|
|
#include <sys/buf.h> /* chunktoss() */
|
|
#include <sys/cmn_err.h>
|
|
#include <sys/debug.h>
|
|
#include <sys/immu.h>
|
|
#include <sys/param.h>
|
|
#include <sys/pda.h>
|
|
#include <sys/pfdat.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/sbd.h>
|
|
#include <sys/sema.h>
|
|
#include <sys/swap.h>
|
|
#include <sys/sysinfo.h>
|
|
#include <sys/sysmacros.h>
|
|
#include <sys/systm.h> /* Misc stuff like kdebug */
|
|
#include <sys/vfs.h>
|
|
#include <sys/vnode.h>
|
|
#include <sys/page.h>
|
|
#include <sys/lpage.h>
|
|
#include <sys/atomic_ops.h>
|
|
#include <ksys/migr.h>
|
|
#include <ksys/rmap.h>
|
|
#include <sys/numa.h>
|
|
#include <ksys/pcache.h>
|
|
#include <ksys/vnode_pcache.h>
|
|
#include <sys/nodepda.h>
|
|
#include <sys/ktrace.h> /* Tracing functions */
|
|
#include <sys/idbgentry.h> /* idbg_addfunc */
|
|
|
|
/*
 * Debug-only printf wrapper.  Call sites pass a doubly parenthesized
 * argument, e.g. PRINTF(("message\n")), which is forwarded to printf().
 * Without DEBUG1 the macro expands to nothing.
 */
#ifdef DEBUG1
#define PRINTF(x)	printf((x))
#else
#define PRINTF(x)
#endif

#ifdef PGCACHEDEBUG
/* Page cache checking helpers, only declared when PGCACHEDEBUG is set. */
static void vnode_anypage_valid(struct vnode *);
static void vnode_oobpages(struct vnode *, pgno_t);
#endif /* PGCACHEDEBUG */

/*
 * Operation bits handed (cast to void *) to the page decommission
 * routines:
 *	VNODE_PAGEOP_BAD	mark the page P_BAD.
 *	VNODE_PAGEOP_NOASSOC	requeue the page with no association
 *				(only looked at by code that is currently
 *				compiled out under NOT_DEFINED).
 */
#define VNODE_PAGEOP_BAD	(0x1)
#define VNODE_PAGEOP_NOASSOC	(0x2)

/*
 * Both operation bits, already converted to the void * form the
 * decommission callbacks take.  The whole expansion is parenthesized so
 * the cast cannot bind unexpectedly to a neighbouring operator at the
 * point of use.
 */
#define VNODE_PAGEOP_TOSS	\
	((void *)(__psunsigned_t)(VNODE_PAGEOP_BAD|VNODE_PAGEOP_NOASSOC))

#ifdef DEBUG
/* Counts the wakeup broadcasts sent from vnode_release(). */
int vnode_broadcast_count;
#endif
|
|
|
|
/*
|
|
* Vnode pagecache Reference counting mechanism.
|
|
*
|
|
* This mechanism is used to synchronize the threads trying to
|
|
* reclaim a vnode (and hence the vnode page cache), with the threads coming
|
|
* down the pagealloc path and trying to steal pages hashed to a vnode.
|
|
*
|
|
* We need this synchronization so that the threads coming down the pagealloc
|
|
* path can pick the page out of free list, drop all the locks it's holding
|
|
* and then call into vnode page cache to free the page.
|
|
*
|
|
* Without this synchronization, page cache could vanish when the thread
|
|
* in pagealloc drops the locks.
|
|
*
|
|
* Synchronization mechanism:
|
|
*
|
|
* Threads coming down the pagealloc path (and trying to pick a page out
* of the freelist) check if the page is hashed and belongs to a vnode.
* If so, the thread sets the P_RECYCLE bit in the pfdat, and also
* atomically bumps a counter in the vnode pcache structure.
|
|
*
|
|
* If a thread is in the vnode reclaim path at the same time, and
|
|
* tossing away the vnode pages, at the end of tossing the pages,
|
|
* it checks for the counter (while holding the pagecache lock)
|
|
* and if non-zero, sets a bit indicating there is a sleeping process,
|
|
* and sleeps on this synchronization variable.
|
|
* (Note that by the time this thread completes tossing all its pages
|
|
* and decides to reclaim the vnode, all threads which could bump
|
|
* the reference counter in pagealloc path are already done with their
|
|
* work, and no one would be able to find a page still hashed to this
|
|
* vnode in the freelist.)
|
|
*
|
|
* Later the thread in the pagealloc() path, invokes vnode_page_recycle(),
|
|
* which takes the page out of hash. and decrements the counter.
|
|
* If counter gets to zero, it checks if there is anyone sleeping,
|
|
* If so, it clears the sleeping bit, and sends a broadcast
|
|
* message to the signal variable on which the vnode reclaim thread would
|
|
* be waiting on.
|
|
*
|
|
* All threads that receive the broadcast, wakeup, check if their
|
|
* sleeping bit is still set, and if not, they are free to reclaim
|
|
* the vnode. If their sleeping bit is still set, it's not their turn
|
|
* and they go back to sleeping mode.
|
|
*
|
|
* Having just one such variable for the entire system may be a concern.
|
|
* We need to measure how much of a bottleneck this is, and fix it.
|
|
* Possibilities are to have an array of these or one in each vnode..
|
|
*
|
|
* The race condition is supposed to happen only if vnodes are reclaimed,
|
|
* which happens only if we are running low on memory. It's a good
|
|
* chance that all the hashed pages are gone by then!!.
|
|
*/
|
|
|
|
/*
|
|
* VNODE_PCACHE_SYNC is the macro used to get to the synchronization
|
|
* variable given a vnode. This is used for communication between
|
|
* the threads trying to reclaim vndoe, and threads trying to hold vnode.
|
|
*
|
|
* Having a larger value for this variable would reduce the contention
|
|
* in signalling.
|
|
* But this contention is not observed to be very high (a few times in
|
|
* a stress test which ran for about 6 hours on a 36 cpu MP. So,
|
|
* just one signal variable should be sufficient for MP systems too.
|
|
*/
|
|
#define VNODE_PCACHE_SYNC(vp) (&vnode_pcache_sync)
|
|
|
|
/*
|
|
* Actual Sync Variable.
|
|
*/
|
|
sv_t vnode_pcache_sync;
|
|
|
|
|
|
/*
 * Preemption cookie: bundles what is known about the currently held
 * vnode page cache lock so that it can be handed around as one unit.
 * It is passed to pcache_remove() together with vnode_preempt() so the
 * lock can be preempted (allowing interrupts in) during long removal
 * operations.
 */
typedef struct preemption_cookie {
	struct vnode	*pc_vp;		/* vnode whose lock is held */
	int		pc_locktoken;	/* token from the vnode lock call */
} preempt_cookie_t;

/* Preemption callback given to pcache_remove(); takes a cookie pointer. */
static void vnode_preempt(void *);

#ifdef DEBUG
/* Trace buffer handed to every KTRACE_ENTER() call in this file. */
ktrace_t	*vnode_ktrace;
extern void	idbg_vnode_pcache_trace(__psunsigned_t);
#endif /* DEBUG */
|
|
|
|
/*
|
|
* init_vnode_pcache
|
|
* One time initialization routine called at system boot time.
|
|
*/
|
|
void
|
|
init_vnode_pcache(void)
|
|
{
|
|
|
|
sv_init(&vnode_pcache_sync, SV_DEFAULT, "v_pcsync");
|
|
|
|
#ifdef VNODE_PCACHE_TRACE
|
|
vnode_ktrace = ktrace_alloc(1024, 1);
|
|
idbg_addfunc("tracevp", idbg_vnode_pcache_trace);
|
|
#endif /* VNODE_PCACHE_TRACE */
|
|
}
|
|
|
|
/*
|
|
* vnode_pcache_init
|
|
* Routine that gets called when a vnode is allocated fresh.
|
|
*/
|
|
void
|
|
vnode_pcache_init(struct vnode *vp)
|
|
{
|
|
/*
|
|
* Vnode page cache related one time initialization.
|
|
*/
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_INIT, vp, 0, 0);
|
|
|
|
vp->v_pcacheref = 0;
|
|
pcache_init(&vp->v_pcache);
|
|
}
|
|
|
|
/*
|
|
* vnode_pcache_reinit
|
|
* Routine to reinitialize a page cache. (occurs when a vnode gets
|
|
* recycled.
|
|
*/
|
|
void
|
|
vnode_pcache_reinit(struct vnode *vp)
|
|
{
|
|
int s;
|
|
VNODE_PCACHE_LOCK(vp, s);
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_REINIT, vp, 0, 0);
|
|
|
|
vp->v_pcacheref = 0;
|
|
pcache_init(&vp->v_pcache);
|
|
VNODE_PCACHE_UNLOCK(vp, s);
|
|
}
|
|
|
|
/*
|
|
* Gets called when an isolated vnode gets freed, and gets placed in
|
|
* front of the vfreelist.
|
|
*/
|
|
void
|
|
vnode_pcache_free(struct vnode *vp)
|
|
{
|
|
int s;
|
|
/*
|
|
* release any hash buckets associated with the page cache.
|
|
* This has the side effect of initializing the
|
|
* page cache data structures to a known state.
|
|
*/
|
|
|
|
VNODE_PCACHE_LOCK(vp, s);
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_FREE, vp, 0, 0);
|
|
|
|
pcache_release(&vp->v_pcache);
|
|
VNODE_PCACHE_UNLOCK(vp, s);
|
|
}
|
|
|
|
/*
|
|
* Reclaim the page cache related data structure from vnode.
|
|
* In addition synchronize with threads who still have an
|
|
* extra reference on the pagecache of this vnode.
|
|
*/
|
|
void
|
|
vnode_pcache_reclaim(struct vnode *vp)
|
|
{
|
|
int s;
|
|
/*
|
|
* Before letting the vnode be reclaimed, see if there are any
|
|
* outstanding page cache waiters, and if so wait for them to
|
|
* finish.
|
|
*/
|
|
VNODE_PCACHE_LOCK(vp, s);
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_RECLAIM, vp, 0, 0);
|
|
|
|
while (vp->v_pcacheref) {
|
|
|
|
atomicFieldAssignUint(&vp->v_pcacheflag, VNODE_PCACHE_WAITBIT,
|
|
VNODE_PCACHE_WAITBIT);
|
|
sv_bitlock_wait(VNODE_PCACHE_SYNC(vp), PINOD,
|
|
&vp->v_pcacheflag, VNODE_PCACHE_LOCKBIT, s);
|
|
|
|
VNODE_PCACHE_LOCK(vp, s);
|
|
}
|
|
/* Cleanup the pagecache related fields */
|
|
|
|
pcache_release(&vp->v_pcache);
|
|
VNODE_PCACHE_UNLOCK(vp, s);
|
|
}
|
|
|
|
/*
|
|
* vnode_page_decommission
|
|
* Update the fields of pfdat to indicate it's out of hash.
|
|
* 'op' values passed in, indicate one the following.
|
|
* VNODE_PAGEOP_BAD: Mark the page as bad.
|
|
* VNODE_PAGEOP_NOASSOC: Requeue the page in freelist..
|
|
* If the op passed in indicate that the page should be marked
|
|
*
|
|
* This routine gets passed in as the function pointer to pcace_remove.
|
|
* which gets called either from vnode_pcache_remove() or as part
|
|
* of file system tossing pages.
|
|
*
|
|
* 'op' passed would have the right set of bits set to indicate who
|
|
* invoked this routine.
|
|
*
|
|
* It's possible to have two different set of routines. But the
|
|
* work done between the two is so similar that combining them into
|
|
* on does not result in much overhead for the other
|
|
*
|
|
* Two versions of this routine (one which holds the pfdat lock, and
|
|
* other which expects caller to hold the pfdat lock), are provided
|
|
* for convenience.
|
|
* If the calling routine is already holding the pfdat lock, there is
|
|
* no reason to let it go, before acquiring it back again in this
|
|
* routine.
|
|
* In other situations, this routine gets called multiple times from
|
|
* pcache_remove (once for each page found in page cache.). In this
|
|
* case vnode_page_decommission() which grabs the lock is called.
|
|
*/
|
|
static void
|
|
vnode_page_decommission_nolock(void *pageop, struct pfdat *pfd)
|
|
{
|
|
int op = (__psunsigned_t)pageop;
|
|
|
|
ASSERT(VNODE_PCACHE_ISLOCKED(pfd->pf_vp));
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PAGEOP, pfd->pf_vp, pfd, pageop);
|
|
ASSERT(pfd->pf_flags & P_HASH);
|
|
|
|
if (!(pfd->pf_flags & (P_BAD))) {
|
|
ASSERT(pfd->pf_vp->v_pgcnt > 0);
|
|
pfd->pf_vp->v_pgcnt--;
|
|
if (pfd_replicated(pfd)){
|
|
page_replica_remove(pfd);
|
|
}
|
|
}
|
|
|
|
pfd->pf_flags &= ~P_HASH;
|
|
|
|
if (pfd->pf_flags & P_QUEUE){
|
|
ADD_NODE_EMPTYMEM((pfdattocnode(pfd)), 1);
|
|
#ifdef NOT_DEFINED
|
|
/*
|
|
* Earlier we used to place this page back in queue
|
|
* with no association. This would have made the page
|
|
* avaiable earlier than otherwise.
|
|
* Does it still make sense to do it?
|
|
* If so, we need to fix the requeue routine to
|
|
* take the appropriate freelist lock.
|
|
*/
|
|
if (op & VNODE_PAGEOP_NOASSOC) {
|
|
nested_pfdat_unlock(pfd);
|
|
requeue(pfd, STALE_NOASSOC);
|
|
nested_pfdat_lock(pfd);
|
|
}
|
|
#endif /* NOT_DEFINED */
|
|
|
|
} else if ((op & VNODE_PAGEOP_BAD) && !(pfd->pf_flags & P_BAD)) {
|
|
pfd->pf_flags |= P_BAD;
|
|
}
|
|
}
|
|
|
|
/*
 * vnode_page_decommission
 *	Locking wrapper around vnode_page_decommission_nolock(): takes
 *	the pfdat lock, decommissions the page, drops the lock.
 *
 *	The caller should already hold the object (vnode page cache)
 *	lock, which is why the nested (no spl) form of the pfdat lock
 *	is sufficient.
 */
static void
vnode_page_decommission(void *pageop, struct pfdat *pfd)
{
	nested_pfdat_lock(pfd);
	vnode_page_decommission_nolock(pageop, pfd);
	nested_pfdat_unlock(pfd);
}
|
|
|
|
/*
|
|
* vnode_plookup
|
|
* Return the page that corresponds to the page number passed in.
|
|
*/
|
|
pfd_t *
|
|
vnode_plookup(vnode_t *vp, pgno_t pgno)
|
|
{
|
|
int s;
|
|
pfd_t *pfd;
|
|
|
|
VNODE_PCACHE_LOCK(vp, s);
|
|
pfd = pcache_find(&vp->v_pcache, pgno);
|
|
VNODE_PCACHE_UNLOCK(vp, s);
|
|
|
|
return pfd;
|
|
}
|
|
|
|
/*
|
|
* vnode_pnext:
|
|
* Given the pfd, find the next page that has the same
|
|
* page number as the pfdat passed in.
|
|
* Primiarily used to get to replicated pages.
|
|
*/
|
|
pfd_t *
|
|
vnode_pnext(vnode_t *vp, pfd_t *pfd)
|
|
{
|
|
int s;
|
|
VNODE_PCACHE_LOCK(vp, s);
|
|
pfd = pcache_next(&vp->v_pcache, pfd);
|
|
VNODE_PCACHE_UNLOCK(vp, s);
|
|
|
|
return pfd;
|
|
}
|
|
|
|
|
|
/*
|
|
* vnode_pagebad
|
|
* Mark a specified page as bad, and remove it from page cache ..
|
|
*/
|
|
void
|
|
vnode_pagebad(vnode_t *vp, pfd_t *pfd)
|
|
{
|
|
int locktoken;
|
|
off_t offset;
|
|
|
|
ASSERT((pfd->pf_flags & (P_QUEUE|P_SQUEUE|P_RECYCLE)) == 0);
|
|
ASSERT(pfd->pf_use > 0);
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PAGEBAD, vp, pfd, 0);
|
|
|
|
/* Hold pfdat_lock to prevent any changes to pf_flags/pf_pageno. */
|
|
|
|
nested_pfdat_lock(pfd);
|
|
if ((pfd->pf_flags & P_BAD) == 0){
|
|
ASSERT((pfd->pf_flags & (P_ANON|P_HASH)) == P_HASH);
|
|
offset = ctob(pfd->pf_pageno);
|
|
|
|
pcache_remove_pfdat(&vp->v_pcache, pfd);
|
|
vnode_page_decommission_nolock(VNODE_PAGEOP_TOSS, pfd);
|
|
|
|
nested_pfdat_unlock(pfd);
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
|
|
/* Make sure no buffer maps this page. */
|
|
chunktoss(vp, offset, offset + ctob(1) - 1);
|
|
return;
|
|
}
|
|
nested_pfdat_unlock(pfd);
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
}
|
|
|
|
/*
|
|
* vnode_page_attach
|
|
* pfd is in the page cache. This routine checks if the
|
|
* page is in the free list, and tries to take it out of
|
|
* the free list to return the page to caller.
|
|
*
|
|
* This has to take care of the race condition with a
|
|
* different thread trying to steal this page from the
|
|
* free list.
|
|
*
|
|
* This routine expects the caller to be holding the page cache lock.
|
|
*/
|
|
pfd_t *
|
|
vnode_page_attach(struct vnode *vp, pfd_t *pfd)
|
|
{
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PAGEATTACH, vp, pfd, 0);
|
|
|
|
nested_pfdat_lock(pfd);
|
|
|
|
ASSERT(VNODE_PCACHE_ISLOCKED(vp));
|
|
ASSERT((pfd->pf_flags & (P_BAD|P_ANON)) == 0);
|
|
ASSERT(pfd->pf_flags & P_HASH);
|
|
ASSERT(pfd->pf_vp == vp);
|
|
|
|
/*
|
|
* Check if the page is on the free list. If so,
|
|
* we could be racing with pagealloc trying to
|
|
* free this page and returning it for some other
|
|
* purpose.
|
|
*/
|
|
if (pfd->pf_flags & P_QUEUE) {
|
|
nested_pfdat_unlock(pfd);
|
|
pageunfree(pfd);
|
|
nested_pfdat_lock(pfd);
|
|
}
|
|
|
|
/*
|
|
* At this stage, either pageunfree was able
|
|
* to take the page out of free list, or
|
|
* the page is in recycle state.
|
|
* Since the vnode lock is still held, page cannot
|
|
* go out of the recylced state till we are done.
|
|
*/
|
|
|
|
if (pfd->pf_flags & P_RECYCLE) {
|
|
|
|
/*
|
|
* XXX
|
|
* Instead of trying to remove the page from pagecache
|
|
* right here, why not let vnode_page_recycle do it ?
|
|
*/
|
|
/*
|
|
* We are racing with a different cpu
|
|
* doing pagealloc which has already
|
|
* acquired this page and wishes to
|
|
* recycle it for its new use.
|
|
* When this race occurs, we let
|
|
* pagealloc win as it simplifies the
|
|
* locking. In this situation, unhash
|
|
* the page, and let the caller create
|
|
* a new page and insert.
|
|
*
|
|
* This should not happen very often though..
|
|
* (Yes it's bad that we have to let a
|
|
* page that has valid file data go.
|
|
* Complexity otherwise is too large).
|
|
*/
|
|
|
|
pcache_remove_pfdat(&vp->v_pcache, pfd);
|
|
vnode_page_decommission_nolock((void *)NULL, pfd);
|
|
|
|
pfd->pf_vp = NULL;
|
|
/*
|
|
* Don't turn off P_RECYCLE flag here. It will be
|
|
* turned off when vnode_page_recycle() gets called
|
|
* by the thread in pagealloc path.
|
|
* Otherwise, it would lead to a race condition, where
|
|
* thread in pagealloc path believes that recycling
|
|
* is not needed, and messes with the pfdat flag
|
|
* directly.
|
|
*/
|
|
pfd->pf_flags &= ~P_DONE;
|
|
nested_pfdat_unlock(pfd);
|
|
pfd = NULL;
|
|
} else {
|
|
|
|
/*
|
|
* Since we already hold pfdat lock, bump use count
|
|
* directly, release pfdat lock and return.
|
|
*/
|
|
ASSERT((pfd->pf_flags &
|
|
(P_HASH|P_RECYCLE)) == P_HASH);
|
|
pfdat_inc_usecnt(pfd);
|
|
nested_pfdat_unlock(pfd);
|
|
}
|
|
|
|
return pfd;
|
|
}
|
|
|
|
/*
|
|
* vnode_pfind:
|
|
* Look up the vnode's page cache for the page that corresponds
|
|
* to 'pgno'.
|
|
* If acquire says VM_ATTACH, get an extra reference on the page.
|
|
* returns:
|
|
* 0 -> can't find it
|
|
* pfd -> ptr to pfdat entry
|
|
*/
|
|
|
|
pfd_t *
|
|
vnode_pfind_nolock(vnode_t *vp, pgno_t pgno, int acquire)
|
|
{
|
|
|
|
pfd_t *pfd;
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PFIND, vp, pgno, 0);
|
|
|
|
/* Try and get first non bad page */
|
|
pfd = pcache_find(&vp->v_pcache, pgno);
|
|
|
|
/*
|
|
* If callers needs the page to be attached, do it.
|
|
*/
|
|
|
|
if (pfd && (acquire & VM_ATTACH)) {
|
|
ASSERT((pfd->pf_flags & P_BAD) == 0);
|
|
ASSERT(pfd->pf_vp == vp);
|
|
pfd = vnode_page_attach(vp, pfd);
|
|
}
|
|
|
|
return pfd;
|
|
|
|
}
|
|
|
|
pfd_t *
|
|
vnode_pfind(vnode_t *vp, pgno_t pgno, int acquire)
|
|
{
|
|
int s;
|
|
pfd_t *pfd;
|
|
|
|
VNODE_PCACHE_LOCK(vp, s);
|
|
pfd = vnode_pfind_nolock(vp, pgno, acquire);
|
|
VNODE_PCACHE_UNLOCK(vp, s);
|
|
|
|
return(pfd);
|
|
}
|
|
|
|
/*
|
|
* vnode_hold
|
|
*
|
|
* Hold a reference to given vnode. The caller must be holding the
|
|
* pfdat lock for a page hashed to this vnode. Bump the vnode
|
|
* reference count to reflect the hold.
|
|
*/
|
|
|
|
void
|
|
vnode_hold(vnode_t *vp)
|
|
{
|
|
ASSERT(vp);
|
|
ASSERT(pcache_pagecount(&vp->v_pcache) > 0);
|
|
|
|
VNODE_PCACHE_INCREF(vp);
|
|
}
|
|
|
|
/*
|
|
* vnode_release
|
|
*
|
|
* Release vnode hold. If there is any thread waiting to be woken up
|
|
* (Happens only in vnode reclaim path), do so.
|
|
*/
|
|
|
|
static void
|
|
vnode_release(vnode_t *vp)
|
|
{
|
|
ASSERT(VNODE_PCACHE_ISLOCKED(vp));
|
|
ASSERT(vp && vp->v_pcacheref);
|
|
|
|
VNODE_PCACHE_DECREF(vp);
|
|
|
|
if ((vp->v_pcacheref == 0) && VNODE_PCACHE_WAITING(vp)){
|
|
/*
|
|
* Reset the waiting bit.
|
|
*/
|
|
atomicFieldAssignUint(&vp->v_pcacheflag, VNODE_PCACHE_WAITBIT, 0);
|
|
sv_broadcast(VNODE_PCACHE_SYNC(vp));
|
|
|
|
#ifdef DEBUG
|
|
vnode_broadcast_count++; /* Count for debugging */
|
|
#endif /* DEBUG */
|
|
}
|
|
}
|
|
|
|
/*
|
|
* vnode_pagemigr
|
|
*
|
|
* Check whether the page represented by old_pfd is migratable and
|
|
* if so, copy its state to new_pfd, remove it from the pcache,
|
|
* and insert new_pfd.
|
|
*/
|
|
|
|
int
|
|
vnode_pagemigr(vnode_t *vp, pfd_t *old_pfd, pfd_t *new_pfd)
|
|
{
|
|
int locktoken;
|
|
int errcode;
|
|
|
|
ASSERT(vp && old_pfd && new_pfd);
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
|
|
nested_pfdat_lock(old_pfd);
|
|
|
|
/*
|
|
* Has the memory object remained constant?
|
|
*/
|
|
if (pfd_getmo(old_pfd) != vp) {
|
|
PRINTF(("[vnode_pagemigr]: memory object mutation\n"));
|
|
errcode = MIGRERR_MOMUTATION;
|
|
goto migr_fail0;
|
|
}
|
|
|
|
/*
|
|
* At this point we've obtained the vnode pcache lock
|
|
* and the old_pfdat lock in the right order, with
|
|
* the pfdat still being owned by the same memory
|
|
* object we detected in the migration engine (this
|
|
* function's caller).
|
|
*/
|
|
|
|
/*
|
|
* Check whether the page is in a migratable state
|
|
* and transfer pfdat state
|
|
*/
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_MIGR, vp, old_pfd, new_pfd);
|
|
|
|
if ((errcode = MIGR_PFDAT_XFER(old_pfd, new_pfd)) != 0) {
|
|
PRINTF(("[vnode_pagemigr]: failed pfdat_xfer\n"));
|
|
goto migr_fail0;
|
|
}
|
|
|
|
/*
|
|
* The old page is *not* freed; it's just removed from the cache
|
|
* The pf_pageno field in this pfdat must be intact.
|
|
*/
|
|
pcache_remove_pfdat(&vp->v_pcache, old_pfd);
|
|
|
|
/*
|
|
* We're completely done with the old pfdat.
|
|
* Release its pfdat lock
|
|
*/
|
|
nested_pfdat_unlock(old_pfd);
|
|
|
|
/*
|
|
* Now we insert the new pfdat into the pcache
|
|
*/
|
|
pcache_insert(&vp->v_pcache, new_pfd);
|
|
|
|
errcode = 0;
|
|
goto migr_success;
|
|
|
|
/*
|
|
* Exit levels.
|
|
*/
|
|
|
|
migr_fail0:
|
|
nested_pfdat_unlock(old_pfd);
|
|
|
|
migr_success:
|
|
|
|
/*
|
|
* Drop the extra reference the caller acquired on the vnode.
|
|
* This may cause the vnode vp points at to be freed, so we can't
|
|
* reference vp anymore after this call.
|
|
*/
|
|
|
|
vnode_release(vp);
|
|
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
|
|
return (errcode);
|
|
}
|
|
|
|
|
|
/*
|
|
* Conditionally insert pfd in hash list. If another page already
|
|
* exists that maps [tag,pgno], toss the passed page and return the
|
|
* found match.
|
|
*/
|
|
pfd_t *
|
|
vnode_pinsert_try(vnode_t *vp, pfd_t *pfd, pgno_t pgno)
|
|
{
|
|
register int locktoken;
|
|
register struct pfdat *pfd_hash;
|
|
void *pcache_token;
|
|
|
|
ASSERT((pfd->pf_flags & (P_HASH|P_SWAP|P_ANON|P_QUEUE|P_SQUEUE)) == 0);
|
|
|
|
pcache_token = pcache_getspace(&vp->v_pcache, 1);
|
|
|
|
if (pcache_token){
|
|
/*
|
|
* If we successfully allocated the space,
|
|
* we may as well go and grow the pagecache.
|
|
* It takes quite an effort to allocate space!!.
|
|
* If it's already grown to proper size, pcache_resize
|
|
* frees allocated space.
|
|
*/
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
pcache_token = pcache_resize(&vp->v_pcache, pcache_token);
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
pcache_resize_epilogue(pcache_token);
|
|
}
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PINS_TRY, vp, pfd, pgno);
|
|
|
|
pfd_hash = pcache_find(&vp->v_pcache, pgno);
|
|
if (pfd_hash) {
|
|
ASSERT((pfd_hash->pf_flags & P_BAD) == 0);
|
|
|
|
/*
|
|
* A page with "pgno" already exists in this cache.
|
|
* If we can successfully grab it, return it to
|
|
* user. vnode_page_attach tries to resolve the
|
|
* race with pagealloc().
|
|
*/
|
|
|
|
pfd_hash = vnode_page_attach(vp, pfd_hash);
|
|
if (pfd_hash) {
|
|
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
/*
|
|
* Free the page allocated earlier.
|
|
*/
|
|
pagefree(pfd);
|
|
return pfd_hash;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Could not find any page in the page cache. Time to
|
|
* insert the page passed in.
|
|
*/
|
|
|
|
/*
|
|
* These assignments are safe since page is not in use yet.
|
|
*/
|
|
pfd->pf_vp = vp;
|
|
pfd->pf_pageno = pgno;
|
|
pfd->pf_flags |= P_HASH;
|
|
|
|
pcache_insert(&vp->v_pcache, pfd);
|
|
|
|
#ifdef _VCE_AVOIDANCE
|
|
if (vce_avoidance) {
|
|
if (pfd_to_vcolor(pfd) < 0)
|
|
pfd_set_vcolor(pfd,vcache_color(pfd->pf_pageno));
|
|
}
|
|
#endif /* _VCE_AVOIDANCE */
|
|
ASSERT(vp->v_pgcnt >= 0);
|
|
vp->v_pgcnt++;
|
|
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
return pfd;
|
|
}
|
|
|
|
/*
 * vnode_page_replace
 *	Replace a page found in the page cache with a different page.
 *	This interface is for cells, where it is used to replace a page
 *	allocated from the local cell with a different page imported
 *	from a server.  The page being replaced is NOT in the P_DONE
 *	state.
 *
 *	npfd inherits the vnode, page number and P_HASH/P_BAD state of
 *	opfd.  opfd is then marked done and freed.  Returns npfd.
 */
#ifdef CELL
pfd_t *
vnode_page_replace(pfd_t *opfd, pfd_t *npfd)
{
	int	locktoken;
	vnode_t	*vp;

	ASSERT((opfd->pf_flags & (P_DONE|P_SWAP|P_ANON|P_QUEUE|P_SQUEUE)) == 0);
	ASSERT(opfd->pf_flags & (P_HASH|P_BAD));

	vp = opfd->pf_vp;
	ASSERT(vp);

	VNODE_PCACHE_LOCK(vp, locktoken);

	KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_REPLACE, vp, opfd, npfd);

	/*
	 * Copy the identity of the old page while holding its pfdat
	 * lock.  Writing to npfd is safe since the new page is not in
	 * use yet.  A bad page is never considered hashed.
	 */
	nested_pfdat_lock(opfd);
	npfd->pf_vp = opfd->pf_vp;
	npfd->pf_pageno = opfd->pf_pageno;
	npfd->pf_flags |= opfd->pf_flags & (P_HASH|P_BAD);
	if (npfd->pf_flags & P_BAD) {
		npfd->pf_flags &= ~P_HASH;
	}
	nested_pfdat_unlock(opfd);

	/*
	 * If the old page has not been marked P_BAD, remove it from
	 * the page cache.
	 */
	if ((opfd->pf_flags & P_BAD) == 0) {
		pcache_remove_pfdat(&vp->v_pcache, opfd);
		vnode_page_decommission(VNODE_PAGEOP_TOSS, opfd);
	}

	/* Hash the new page in, if it inherited the hashed state. */
	if (npfd->pf_flags & P_HASH) {
		pcache_insert(&vp->v_pcache, npfd);
		ASSERT(vp->v_pgcnt >= 0);
		vp->v_pgcnt++;
	}

	VNODE_PCACHE_UNLOCK(vp, locktoken);

	pagedone(opfd);
	pagefree(opfd);

	return npfd;
}
#endif /* CELL */
|
|
|
|
/*
|
|
* Insert page on hash chain.
|
|
* vp -> pointer to incore vnode
|
|
* pgno -> logical file page number
|
|
* Expects the caller to hold the the pager object lock
|
|
*/
|
|
void
|
|
vnode_pinsert(vnode_t *vp, pfd_t *pfd, pgno_t pgno, unsigned flag)
|
|
{
|
|
void *pcache_token;
|
|
int locktoken;
|
|
|
|
ASSERT((flag & (P_ANON|P_SWAP)) == 0);
|
|
|
|
pcache_token = pcache_getspace(&vp->v_pcache, 1);
|
|
|
|
if (pcache_token) {
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
pcache_token = pcache_resize(&vp->v_pcache, pcache_token);
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
pcache_resize_epilogue(pcache_token);
|
|
}
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PINSERT, vp, pfd, pgno);
|
|
|
|
/*
|
|
* These are safe since page is not in use yet.
|
|
*/
|
|
pfd->pf_vp = vp;
|
|
pfd->pf_pageno = pgno;
|
|
|
|
ASSERT((pfd->pf_flags & (P_HASH|P_SWAP|P_ANON|P_QUEUE|P_SQUEUE|P_RECYCLE)) == 0);
|
|
|
|
pfd->pf_flags |= (P_HASH | flag);
|
|
|
|
#ifdef DEBUG
|
|
/*
|
|
* check if the page already exists.
|
|
*/
|
|
|
|
ASSERT(!pcache_find(&vp->v_pcache, pgno));
|
|
#endif /* DEBUG */
|
|
|
|
pcache_insert(&vp->v_pcache, pfd);
|
|
|
|
ASSERT(vp->v_pgcnt >= 0);
|
|
vp->v_pgcnt++;
|
|
|
|
#ifdef _VCE_AVOIDANCE
|
|
ASSERT((pfd->pf_flags & P_LPG_INDEX) == 0); /* XXX */
|
|
if (vce_avoidance) {
|
|
if (pfd_to_vcolor(pfd) < 0)
|
|
pfd_set_vcolor(pfd,vcache_color(pfd->pf_pageno));
|
|
}
|
|
#endif /* _VCE_AVOIDANCE */
|
|
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
}
|
|
|
|
/*
|
|
* Exported pinsert interface with no locking.
|
|
* Caller must have the object lock.
|
|
* As it stands now, caller does not take the object lock.
|
|
* So, just call vnode_pinsert which does the right thing.
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
|
void
|
|
vnode_pinsert_nolock(
|
|
vnode_t *vp,
|
|
pfd_t *pfd,
|
|
pgno_t pgno,
|
|
unsigned flag,
|
|
int locktoken)
|
|
{
|
|
vnode_pinsert(vp, pfd, pgno, flag);
|
|
}
|
|
|
|
|
|
/*
|
|
* remove page from hash chain
|
|
* pfd -> page frame pointer
|
|
* Assumes the caller holds the pagecache lock.
|
|
* This routine holds the pfdat lock.
|
|
* Currently in use by the replication code trying to shootdown
|
|
* pages.
|
|
*/
|
|
|
|
void
|
|
vnode_premove_nolock(vnode_t *vp, pfd_t *pfd)
|
|
{
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PREMOVE, vp, pfd, 0);
|
|
|
|
pcache_remove_pfdat(&vp->v_pcache, pfd);
|
|
vnode_page_decommission((void *)NULL, pfd);
|
|
|
|
}
|
|
|
|
void
|
|
vnode_premove(vnode_t *vp, pfd_t *pfd)
|
|
{
|
|
int locktoken;
|
|
|
|
ASSERT((pfd->pf_flags & (P_ANON|P_SWAP)) == 0);
|
|
ASSERT(pfd->pf_flags & P_HASH);
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
vnode_premove_nolock(vp, pfd);
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
}
|
|
|
|
|
|
/*
|
|
* Renounce vnode pages [first, last].
|
|
* Mark pages in the range <first, last> as bad.
|
|
* Invoke the chunktoss, which marks the data in this range as
|
|
* stale. chunktoss() invokes chunkrelease() which is responsible
|
|
* for freeing the pages in the range <first, last>
|
|
* NOTE: chunkrelease() invokes vnode_pagesrelease to free the pages!!.
|
|
*/
|
|
|
|
|
|
void
|
|
vnode_tosspages(vnode_t *vp, off_t first, off_t last)
|
|
{
|
|
pfd_t *pfd;
|
|
int i, locktoken;
|
|
pgno_t pfirst, plast;
|
|
preempt_cookie_t preempt_cookie;
|
|
|
|
ASSERT(vp->v_pgcnt >= 0);
|
|
|
|
if (first > last)
|
|
return;
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PTOSS, vp, first, last);
|
|
|
|
if (vp->v_pgcnt) {
|
|
|
|
pfirst = offtoc(first);
|
|
plast = offtoct(last);
|
|
|
|
preempt_cookie.pc_vp = vp;
|
|
preempt_cookie.pc_locktoken = locktoken;
|
|
|
|
pcache_remove(&vp->v_pcache,
|
|
pfirst, (plast - pfirst + 1),
|
|
vnode_page_decommission,
|
|
VNODE_PAGEOP_TOSS,
|
|
vnode_preempt, (void *) &preempt_cookie);
|
|
} else {
|
|
#ifdef PGCACHEDEBUG
|
|
vnode_anypage_valid(vp);
|
|
#endif
|
|
}
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
|
|
/*
|
|
* This is a hack, but EFS and xFS protect against accessing
|
|
* beyond eof, so save time and only zero page-end for
|
|
* non-local (NFS) filesystems.
|
|
*/
|
|
if ((i = poff(first)) && !(vp->v_vfsp->vfs_flag & VFS_LOCAL)) {
|
|
pfd = vnode_pfind(vp, offtoct(first), VM_ATTACH);
|
|
if (pfd) {
|
|
page_zero(pfd, 0, i, NBPP-i);
|
|
pagefree(pfd);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* we have marked all pages as BAD and unhashed them.
|
|
* Now call chunktoss which will remove them (w/o writing)
|
|
* from the delwri queue if they are there, as well as STALE
|
|
* out any buffers containing pointers to these pages
|
|
*/
|
|
chunktoss(vp, first, last);
|
|
#ifdef PGCACHEDEBUG
|
|
vnode_oobpages(vp, offtoc(first));
|
|
#endif
|
|
|
|
}
|
|
|
|
#if 0
|
|
/*
|
|
* flush and unhash all pages associated with a filesystem
|
|
* vfsp -> mounted filesystem
|
|
* returns:
|
|
* none
|
|
* This routine is not needed anymore. It used to be called from
|
|
* efs while unmounting the root. All that was done here was to toss
|
|
* all pages that belongs to the vnode within the filesystem.
|
|
* It makes no sense to do that while shutting down the system!!.
|
|
*
|
|
* Prototype code is here just to show what we need to do if it's
|
|
* required to implement this routine.
|
|
*/
|
|
/*ARGSUSED*/
|
|
void
|
|
vfs_flushinval_pages(struct vfs *vfsp)
|
|
{
|
|
|
|
/* Flush out delayed-write pages first.
|
|
*
|
|
* binval is always called before pflushinvalvfsp, so any
|
|
* delwri buffers have already been pushed.
|
|
vfsppush(vfsp, B_STALE);
|
|
*/
|
|
|
|
ASSERT(0);
|
|
THIS CODE IS STILL TO BE FIXED. It would look like
|
|
for (vp = first_vp_in_vfs;
|
|
vp != last_vp_in_vfs;
|
|
vp = get_next_vp(vfs, vp))
|
|
vnode_flushinval_pages(vp, first, last);
|
|
|
|
}
|
|
#endif /* 0 */
|
|
|
|
/*
|
|
* Flush delayed writes for vp,
|
|
* then invalidate remaining used pages and unhash cached ones.
|
|
*
|
|
* returns: nothing
|
|
*/
|
|
void
|
|
vnode_flushinval_pages(struct vnode *vp, off_t start, off_t end)
|
|
{
|
|
register pgno_t pfirst, plast;
|
|
register int locktoken;
|
|
preempt_cookie_t preempt_cookie;
|
|
|
|
ASSERT(vp->v_pgcnt >= 0);
|
|
if (start > end)
|
|
return;
|
|
|
|
/*
|
|
* Always call chunkpush - its possible that we have dirty bad pages
|
|
* on vp - this can occur on file truncation of a mmaped file.
|
|
* These pages will have been unhashed so won't be represented by
|
|
* pgcnt.
|
|
*
|
|
* We don't need to map start/end out to page boundaries because
|
|
* pfirst/plast are mapped inward to page boundaries.
|
|
*/
|
|
(void) chunkpush(vp, start, end, B_STALE);
|
|
|
|
if (vp->v_pgcnt == 0)
|
|
return;
|
|
|
|
|
|
pfirst = offtoc(start);
|
|
plast = offtoct(end);
|
|
|
|
|
|
/*
|
|
* We have some valid pages to be flush-invalidated.
|
|
* Invoke pcache_remove on the range.
|
|
*/
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
|
|
KTRACE_ENTER(vnode_ktrace,
|
|
VNODE_PCACHE_FLUSHINVAL, vp, pfirst, plast);
|
|
|
|
preempt_cookie.pc_vp = vp;
|
|
preempt_cookie.pc_locktoken = locktoken;
|
|
|
|
pcache_remove(&vp->v_pcache,
|
|
pfirst,
|
|
(plast - pfirst + 1),
|
|
vnode_page_decommission,
|
|
VNODE_PAGEOP_TOSS,
|
|
vnode_preempt, (void *) &preempt_cookie);
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
|
|
|
|
#ifdef PGCACHEDEBUG
|
|
if (vp->v_pgcnt == 0)
|
|
vnode_anypage_valid(vp);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* flush and unhash all unused, cached pages associated with an vnode
|
|
* vp -> vnode
|
|
* returns: nothing
|
|
*/
|
|
void
|
|
vnode_invalfree_pages(vnode_t *vp, off_t filesize)
|
|
{
|
|
int locktoken;
|
|
pgno_t pgno;
|
|
|
|
if (filesize == 0)
|
|
return;
|
|
|
|
chunkinvalfree(vp);
|
|
|
|
ASSERT(vp->v_pgcnt >= 0);
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
|
|
for (pgno = 0; pgno <= offtoct(filesize); pgno++) {
|
|
pfd_t *pfd;
|
|
/* Got to make a single interface for following operations */
|
|
if (pfd = pcache_find(&vp->v_pcache, pgno)) {
|
|
|
|
/*
|
|
* Hold pfdat lock across the pagecache operation.
|
|
* This avoids the race condition between CPU A
|
|
* looking at pf_use field, and deciding to remove
|
|
* this page, and CPU B bumping the reference
|
|
* count on this. (But.. remapf() should have
|
|
* removed this race condition.. If that's the
|
|
* case, take this lock out..
|
|
*/
|
|
ASSERT(pfd->pf_vp == vp);
|
|
nested_pfdat_lock(pfd);
|
|
if (!(pfd->pf_flags & P_BAD) && (pfd->pf_use == 0)) {
|
|
KTRACE_ENTER(vnode_ktrace,
|
|
VNODE_PCACHE_INVALFREE,
|
|
vp, pfd, pgno);
|
|
|
|
pcache_remove_pfdat(&vp->v_pcache, pfd);
|
|
vnode_page_decommission_nolock(
|
|
VNODE_PAGEOP_TOSS, pfd);
|
|
}
|
|
nested_pfdat_unlock(pfd);
|
|
}
|
|
}
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
|
|
}
|
|
|
|
/*
 * vnode_flush_pages
 *	Flush delayed writes for vp.
 *
 *	Filesystems call this to write back all the pages that could be
 *	dirty.  The interface dates from before the unified page cache;
 *	now that there is one, chunkpush() does all the work.
 *
 * returns: 0 or an errno (0 right away when last is 0)
 */
int
vnode_flush_pages(struct vnode *vp, off_t first, off_t last, uint64_t bflags)
{
	int	error = 0;

	if (last != 0)
		error = chunkpush(vp, first, last, bflags);

	return error;
}
|
|
|
|
/*
|
|
* vnode_pagesrelease
|
|
* Release all the pages associated with a vnode.
|
|
* This gets invoked via chunkrelse routine and is responsible
|
|
* for freeing all the pages linked via the "special" field
|
|
* "pf_pchain" in pfdat. The caller holds the buffer lock
|
|
* protecting the chain itself.
|
|
*/
|
|
void
|
|
vnode_pagesrelease(vnode_t *vp, pfd_t *pfd, int count, uint64_t flags)
|
|
{
|
|
pfd_t *tpfd;
|
|
int locktoken;
|
|
|
|
for ( ; --count >= 0; pfd = tpfd) {
|
|
ASSERT(pfd);
|
|
|
|
KTRACE_ENTER(vnode_ktrace,
|
|
VNODE_PCACHE_PAGESRELEASE, vp, pfd, flags);
|
|
|
|
/*
|
|
* The pf_pchain field is protected by the buffer lock
|
|
* held by our caller.
|
|
*/
|
|
tpfd = pfd->pf_pchain;
|
|
|
|
ASSERT(pfd->pf_vp == vp);
|
|
ASSERT(!(pfd->pf_flags & (P_ANON|P_SWAP)));
|
|
|
|
if ((flags & B_ERROR) && !(pfd->pf_flags & P_BAD)) {
|
|
|
|
/*
|
|
* We acquire the pcache lock for each page so
|
|
* that we don't hold it for too long if we're
|
|
* releasing a long list of pages.
|
|
*/
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
/*
|
|
* The P_BAD flag is checked inside the PCACHE lock as
|
|
* it is set holding this lock.
|
|
*/
|
|
if (!(pfd->pf_flags & P_BAD)) {
|
|
pcache_remove_pfdat(&vp->v_pcache, pfd);
|
|
vnode_page_decommission(VNODE_PAGEOP_TOSS, pfd);
|
|
}
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
}
|
|
/*
|
|
* mark page as done, and wakeup anyone sleeping on it.
|
|
*/
|
|
pagedone(pfd);
|
|
|
|
/*
|
|
* free page back to its pool.
|
|
* This may be a lengthy operation (measured at over 50
|
|
* microseconds or so on 100 Mhz SN0), which is only a
|
|
* problem if we're in a loop releasing lots of pages.
|
|
*/
|
|
pagefree(pfd);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Set the P_HOLE bit in the pages of the given list.
|
|
*/
|
|
/*ARGSUSED*/
|
|
void
|
|
vnode_pages_sethole(vnode_t *vp, pfd_t *pfd, int count)
|
|
{
|
|
|
|
int locktoken;
|
|
pfd_t *pfd1;
|
|
|
|
ASSERT(pfd != NULL);
|
|
ASSERT(vp);
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
|
|
while (count > 0) {
|
|
ASSERT(pfd != NULL);
|
|
ASSERT(pfd->pf_vp == vp);
|
|
nested_pfdat_lock(pfd);
|
|
pfd_nolock_setflags(pfd, P_HOLE);
|
|
pfd1 = pfd->pf_pchain;
|
|
count--;
|
|
nested_pfdat_unlock(pfd);
|
|
pfd = pfd1;
|
|
}
|
|
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
}
|
|
|
|
|
|
/*
|
|
* vnode_page_recycle
|
|
* Recycle the page belonging to vnode for a different purpose.
|
|
* Gets invoked from the pagealloc() path if the page to be
|
|
* allocated still has P_HASH bit set, and belongs to a vnode.
|
|
*
|
|
* This routine is responsible for taking the page out of the
|
|
* page cache and returning it for use by the thread in pagealloc()
|
|
*
|
|
* If the page has P_HASH bit still set, then this routine removes the
|
|
* page from vnode page cache, turns off the appropriate flags, and
|
|
* returns it.
|
|
* If P_HASH is not set, some one else raced with us, and already
|
|
* removed the page from the pagecache. In this case, there is no
|
|
* work to be done.
|
|
*
|
|
* In addition, this routine has to synchronize with the threads trying
|
|
* to reclaim the vnode.
|
|
*
|
|
* Refer to the detailed description of how the threads are synchronized
|
|
* at the top of file.
|
|
*/
|
|
void
|
|
vnode_page_recycle(struct vnode *vp, struct pfdat *pfd)
|
|
{
|
|
int locktoken;
|
|
|
|
ASSERT(vp && vp->v_pcacheref);
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
ASSERT(vp && vp->v_pcacheref);
|
|
|
|
KTRACE_ENTER(vnode_ktrace, VNODE_PCACHE_PAGERECYCLE,
|
|
vp, pfd, vp->v_pcacheflag);
|
|
|
|
nested_pfdat_lock(pfd);
|
|
ASSERT((pfd->pf_flags & (P_ANON|P_SQUEUE)) == 0);
|
|
ASSERT(pfd->pf_use == 0);
|
|
|
|
/*
|
|
* Page in recycled state could have already been removed from
|
|
* the page cache by the thread that got this page while in
|
|
* pagealloc. So try to remove this page only if it's still
|
|
* on the page cache.
|
|
*/
|
|
|
|
if (pfd->pf_flags & P_HASH) {
|
|
ASSERT(pfd->pf_vp == vp);
|
|
ASSERT((pfd->pf_flags & (P_HASH|P_RECYCLE)) ==
|
|
(P_HASH|P_RECYCLE));
|
|
|
|
/*
|
|
* Turn off the hash and recycle flags, and let the
|
|
* page cache remove this page.
|
|
* Since we are holding the object lock, it's not
|
|
* a problem to turn off flags here.
|
|
*/
|
|
pcache_remove_pfdat(&vp->v_pcache, pfd);
|
|
vnode_page_decommission_nolock((void *)NULL, pfd);
|
|
|
|
pfd->pf_vp = NULL;
|
|
pfd->pf_flags &= ~P_RECYCLE;
|
|
} else {
|
|
/*
|
|
* Page has already been removed from page cache.
|
|
* This could happen either due to a second CPU
|
|
* trying to acquire this page, would look at the
|
|
* P_RECYCLE bit, and do the job for us.
|
|
* Second possibility is if a flushinval_ operation
|
|
* came in and decided to mark page as bad.
|
|
* So, just turn off all relavent bits.
|
|
*/
|
|
pfd->pf_flags &= ~(P_BAD|P_RECYCLE);
|
|
pfd->pf_vp = NULL;
|
|
}
|
|
nested_pfdat_unlock(pfd);
|
|
|
|
/*
|
|
* Decrement the page cache reference on the vnode.
|
|
* and if there is any thread waiting to be woken up
|
|
* (Happens only in vnode reclaim path), do so.
|
|
*/
|
|
vnode_release(vp);
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
/*
|
|
* Momentarily release the vnode pcache lock to allow any pending interrupts
|
|
* to occur.
|
|
*/
|
|
|
|
static void
|
|
vnode_preempt(void *cookie)
|
|
{
|
|
preempt_cookie_t *preempt_cookie;
|
|
|
|
preempt_cookie = (preempt_cookie_t *) cookie;
|
|
|
|
VNODE_PCACHE_UNLOCK(preempt_cookie->pc_vp, preempt_cookie->pc_locktoken);
|
|
delay_for_intr();
|
|
VNODE_PCACHE_LOCK(preempt_cookie->pc_vp, preempt_cookie->pc_locktoken);
|
|
}
|
|
|
|
|
|
#ifdef VNODE_PCACHE_TRACE
|
|
|
|
/*
 * Printable names of the vnode page cache trace operations.
 * idbg_vnode_pcache_trace() indexes this table with the operation
 * code recorded in val[0] of a trace entry.
 */
static char *vnode_pcache_ops[] = {
	"none ",		/*  0 */
	"init ",		/*  1 */
	"reinit ",		/*  2 */
	"pgfree ",		/*  3 */
	"reclaim ",		/*  4 */
	"pageop ",		/*  5 */
	"pagebad ",		/*  6 */
	"pgattach ",		/*  7 */
	"pfind ",		/*  8 */
	"pinstry ",		/*  9 */
	"pinsert ",		/* 10 */
	"pgremove ",		/* 11 */
	"ptoss ",		/* 12 */
	"pflushinval",		/* 13 */
	"pinvalfree ",		/* 14 */
	"pgsrelease ",		/* 15 */
	"pagerecycle",		/* 16 */
	"replinsert ",		/* 17 */
	"replfound ",		/* 18 */
	"replinsertd",		/* 19 */
	"pg_findnode",		/* 20 */
	"pg_foundnod",		/* 21 */
	"replshoot ",		/* 22 */
	"replsh_pg ",		/* 23 */
	"replsh_end ",		/* 24 */
	"pagefree ",		/* 25 */
	"pagefreanon",		/* 26 */
	"migr ",		/* 27 */
	"epilogue ",		/* 28 */
	"pg_ismigr ",		/* 29 */
	"preplace"		/* 30 */
};
|
|
|
|
|
|
void
|
|
idbg_vnode_pcache_trace(__psunsigned_t val)
|
|
{
|
|
struct vnode *vp = (struct vnode *)val;
|
|
ktrace_entry_t *kt;
|
|
ktrace_snap_t ktsnap;
|
|
|
|
if (vnode_ktrace == NULL)
|
|
return;
|
|
|
|
qprintf("Vnode pagecache operations for vnode 0x%x\n", vp);
|
|
kt = ktrace_first(vnode_ktrace, &ktsnap);
|
|
while (kt != NULL) {
|
|
if (val == -1)
|
|
qprintf("vp: 0x%x", kt->val[1]);
|
|
if ((val == -1) || ((vnode_t *)(kt->val[1]) == vp)) {
|
|
qprintf(" %s @%d RA:0x%x 0x%x 0x%x \n",
|
|
vnode_pcache_ops[(long)kt->val[0]],
|
|
kt->val[4],
|
|
kt->val[5],
|
|
kt->val[2], kt->val[3]);
|
|
}
|
|
kt = ktrace_next(vnode_ktrace, &ktsnap);
|
|
}
|
|
}
|
|
|
|
#endif /* VNODE_PCACHE_TRACE */
|
|
|
|
#if defined(PGCACHEDEBUG)
|
|
|
|
#if defined(NUMA_BASE)
|
|
static void
|
|
vnode_anypage_valid(vnode_t *vp)
|
|
{
|
|
int node, slot, slot_psize, base_pfn, pfn;
|
|
struct pfdat *pfd;
|
|
|
|
|
|
for (node=0; node < numnodes; node++){
|
|
for (slot = 0; slot < node_getnumslots(node); slot++){
|
|
slot_psize = slot_getsize(node,slot);
|
|
base_pfn = slot_getbasepfn(node,slot);
|
|
for (pfn = base_pfn; pfn < base_pfn + slot_psize; ++pfn) {
|
|
pfd = pfntopfdat(pfn);
|
|
/*
|
|
* We are looking without holding locks.
|
|
* So, be aware of the potential race
|
|
* conditions.
|
|
*/
|
|
|
|
if ((pfd->pf_vp == vp) &&
|
|
!(pfd->pf_flags & P_BAD) &&
|
|
(pfd->pf_flags & P_HASH)) {
|
|
|
|
printf("pfd 0x%x still hashed on vp 0x%x\n",
|
|
pfd, vp);
|
|
debug("");
|
|
}
|
|
}
|
|
} /* for slots */
|
|
} /* for node */
|
|
|
|
}
|
|
#else /* !(NUMA_BASE) */
|
|
static void
|
|
vnode_anypage_valid(vnode_t *vp)
|
|
{
|
|
int node;
|
|
|
|
for (node=0; node < numnodes; node++){
|
|
pfd_t *pfd;
|
|
|
|
for (pfd = PFD_LOW(node); pfd <= PFD_HIGH(node); pfd++){
|
|
if ((pfd->pf_vp == vp) &&
|
|
!(pfd->pf_flags & P_BAD) &&
|
|
(pfd->pf_flags & P_HASH)) {
|
|
printf("pfd 0x%x still hashed on vp 0x%x\n",
|
|
pfd, vp);
|
|
debug("");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
#endif /* NUMA_BASE */
|
|
|
|
static void
|
|
vnode_oobpages(vnode_t *vp, pgno_t pgno)
|
|
{
|
|
int locktoken;
|
|
struct pfdat *pfd;
|
|
|
|
VNODE_PCACHE_LOCK(vp, locktoken);
|
|
while (pfd = pcache_find(&vp->v_pcache, pgno++)){
|
|
if ((pfd->pf_flags & P_BAD) == 0){
|
|
cmn_err(CE_WARN, "residual cached pfd 0x%x for vp 0x%x",
|
|
pfd, vp);
|
|
debug(0);
|
|
}
|
|
}
|
|
VNODE_PCACHE_UNLOCK(vp, locktoken);
|
|
}
|
|
|
|
#endif /* PGCACHEDEBUG */
|