1
0
Files
irix-657m-src/irix/kern/os/swapalloc.c
2022-09-29 17:59:04 +03:00

2464 lines
56 KiB
C

/**************************************************************************
* *
* Copyright (C) 1986-1996 Silicon Graphics, Inc. *
* *
* These coded instructions, statements, and computer programs contain *
* unpublished proprietary information of Silicon Graphics, Inc., and *
* are protected by Federal copyright law. They may not be disclosed *
* to third parties or copied or duplicated in any form, in whole or *
* in part, without the prior written consent of Silicon Graphics, Inc. *
* *
**************************************************************************/
/* Copyright (c) 1984 AT&T */
/* All Rights Reserved */
/* THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF AT&T */
/* The copyright notice above does not evidence any */
/* actual or intended publication of such source code. */
#ident "$Revision: 3.176 $"
#include <string.h>
#include <sys/types.h>
#include <sys/anon.h>
#include <ksys/as.h>
#include <sys/atomic_ops.h>
#include <sys/buf.h>
#include <sys/capability.h>
#include <sys/cmn_err.h>
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/ddi.h>
#include <sys/debug.h>
#include <sys/errno.h>
#include <ksys/vfile.h>
#include <sys/flock.h>
#include <sys/getpages.h>
#include <sys/immu.h>
#include <sys/kabi.h>
#include <sys/kmem.h>
#include <sys/numa.h>
#include <sys/param.h>
#include <sys/page.h>
#include <sys/pfdat.h>
#include <os/as/pmap.h>
#include <sys/prctl.h>
#include <os/as/region.h>
#include <sys/resource.h>
#include <sys/sbd.h>
#include <sys/sema.h>
#include <sys/swap.h>
#include <sys/sysinfo.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/tuneable.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <ksys/vproc.h>
#include <ksys/vshm.h>
#include <ksys/vm_pool.h>
#include <ksys/vmmacros.h>
#include <sys/nodepda.h>
/*
 * Private status code (not an errno).  In this file it is compared
 * against the results of as_scan() and of unswap()/xswap(), where it
 * means a miser job's limit would be exceeded.
 * NOTE(review): the expansion is an unparenthesized negative constant.
 */
#define EMISER -2
/*
* XXX - a version of sm_free that already has lock??
*/
/*
 * Argument bundle handed to do_unswapreg() through vshm_iterate();
 * both fields are forwarded unchanged to unswapreg().
 */
struct delarg {
/* passed through as unswapreg()'s 'hard' argument */
int hard;
/* logical swap index; only swap handles on this device are touched */
int lswap;
};
/*
 * Forward declarations.  Everything here is private to this file except
 * swaptabsnap(), which is declared extern.
 */
static pgno_t swapfind(register swapinfo_t *st, register int size, int *fsize);
static int shake_swap(int);
static int swapseg(dev_t, struct buf *, int);
extern int swaptabsnap(swapinfo_t **si_snapp);
static int doswapdel(swapinfo_t *st);
static int unswap(pas_t *, pgno_t, preg_t *, int);
static int xswap(pgno_t, preg_t *, reg_t *, int);
static pgno_t _nfreeswap(void);
static void unswapsysreg(void);
static void unswapshm(struct delarg *);
/* vshm_iterate() callback; see unswapshm() */
static void do_unswapreg(vshm_t *, void *);
static lock_t swap_spinlock; /* mutex for all swap entries
* protects add/del from alloc/free
*/
static sema_t inswapmod; /* single thread add/deletes
* protects multiple users from
* adding/deleting at same time
*/
static mutex_t shake_swap_lock; /* serialize swap shake operations */
static swaptab_t swaptab[NSWAPPRI]; /* access via priority */
swapinfo_t *lswaptab[MAXLSWAP]; /* direct access via logical swap */
static int nswapfiles; /* current number configured swaps */
/*
 * Lock wrappers.
 *   swaplock()/swapunlock(s)       - take/drop swap_spinlock; swaplock()
 *                                    yields the token swapunlock() needs.
 *   swapmodlock()/swapmodunlock()  - take/drop the inswapmod semaphore.
 *   cpswapmodlock()                - cpsema() on the same semaphore.
 * NOTE(review): every macro except cpswapmodlock() already ends in ';',
 * so "if (x) swapunlock(s); else ..." would not parse.  Callers seen in
 * this file always use them as plain statements.
 */
#define swaplock() mp_mutex_spinlock(&swap_spinlock);
#define swapunlock(s) mp_mutex_spinunlock(&swap_spinlock, s);
#define swapmodlock() psema(&inswapmod, PZERO);
#define cpswapmodlock() cpsema(&inswapmod)
#define swapmodunlock() vsema(&inswapmod);
/*
* an internal swap handle
*/
/*
 * Overlays a (logical swap device, page number) pair on the opaque
 * sm_swaphandle_t so the two views can be converted by assignment.
 * NOTE(review): assumes sm_swaphandle_t is at least 32 bits wide and that
 * MAXLSWAP fits in the 8-bit device field - confirm against sys/swap.h.
 */
typedef union {
struct {
__uint32_t sid_lswap:8, /* logical swap device */
sid_pageno:24; /* page number on device */
} sid;
/* the same bits viewed as the externally visible handle */
sm_swaphandle_t sm_all;
} swaphandle_t;
/* shorthand accessors for the bitfield view */
#define sm_lswap sid.sid_lswap
#define sm_pageno sid.sid_pageno
#ifdef DEBUG
/*
 * DEBUG-only fault injection knobs.  They are not referenced in the part
 * of the file visible here; presumably the swap I/O path consults them -
 * TODO confirm how the *freq values are interpreted.
 */
int swapwrinject = 0; /* inject swap write errors */
int swapwrinjectfreq = 10;
int swaprdinject = 0; /* inject swap read errors */
int swaprdinjectfreq = 10;
#endif
/*
 * swapinit - create the synchronization objects used by the swap
 * allocator and register the swap shaker with the shake manager.
 */
void
swapinit(void)
{
	/* set up every lock before the shaker can possibly be called */
	spinlock_init(&swap_spinlock, "swap lock");
	mutex_init(&shake_swap_lock, MUTEX_DEFAULT, "swapshk");
	initnsema(&inswapmod, 1, "swapmod");

	shake_register(SHAKEMGR_SWAP, shake_swap);
}
#undef SWAPDEBUG
#ifdef SWAPDEBUG
static int doswapcksum = 1;
/*
 * swapcksum - checksum one page for the SWAPDEBUG consistency check.
 *
 * pfd	page frame to checksum; it is mapped in for the duration.
 * size	number of bytes to sum (callers pass NBPP).
 * rw	buf-style flags; B_READ additionally requests CACH_IO_COHERENCY
 *	on the first cache operation.
 *
 * The page is summed one long at a time into an int accumulator and the
 * result is returned as -(sum + 1).
 */
static long
swapcksum(pfd_t *pfd, int size, int rw)
{
	register int csum = 0;
	register int max = size / sizeof(long);
	register int i;
	register long *ptr, *sptr;

	sptr = ptr = (long *)page_mapin(pfd, 0, 0);
	/* make sure we read what is really in memory */
	cache_operation(sptr, size,
		CACH_DCACHE|CACH_WBACK|CACH_INVAL|CACH_LOCAL_ONLY|
		CACH_AVOID_VCES|
		((rw & B_READ) ? CACH_IO_COHERENCY : 0));
	for (i = 0; i < max; i++) {
		csum += *ptr;
		ptr++;
	}
	/* no one could have written to it yet, right?? */
	cache_operation(sptr, size,
		CACH_DCACHE|CACH_WBACK|CACH_INVAL|CACH_LOCAL_ONLY|
		CACH_AVOID_VCES);
	page_mapout((caddr_t)sptr);
	return(-(csum+1));
}
/*
 * swapsum - record (on write) or verify (on read) per-page checksums for
 * a run of 'size' pages starting at swap handle 'sw'.
 *
 * Does nothing when doswapcksum is zero.  On a read mismatch a warning
 * is logged and the debugger is entered.
 */
static void
swapsum(pglst_t *pglist, swaphandle_t sw, pgno_t size, int rw)
{
	register swapinfo_t *dev;
	register long *slot, cur;
	pgno_t first;
	int n;

	if (doswapcksum == 0)
		return;

	dev = lswaptab[sw.sm_lswap];
	/* index of the first page relative to the device's first page */
	first = sw.sm_pageno - dev->st_swppglo;
	slot = &dev->st_cksum[first];

	n = 0;
	while (n < size) {
		cur = swapcksum(pglist->gp_pfd, NBPP, rw);
		if (rw & B_READ) {
			if (*slot != cur) {
				cmn_err(CE_WARN,
					"wrong swapsum, pglist 0x%x was %d not %d",
					pglist, cur, *slot);
				debug(0);
			}
		} else {
			/* writing: remember what went out */
			*slot = cur;
		}
		n++;
		slot++;
		pglist++;
	}
}
#endif
/*
* swapctl(2)
*/
/* argument block for the swapctl() entry point */
struct swapcmda {
sysarg_t sc_cmd; /* command code for swapctl */
void *sc_arg; /* argument pointer for swapctl */
};
/*
 * int
 * swapctl(struct swapcmda *uap, rval_t *rvp)
 *	System call to add, delete, list, and total swap devices.
 *
 *	On success, zero is returned and the requested operation has
 *	been performed.  If (sc_cmd == SC_GETNSWP), then the number
 *	of swap devices will be returned in rvp->r_val1.
 *	If (sc_cmd == SC_LIST), then the requested information will have
 *	been copied out into the user's address space at the address
 *	indicated by ((swaptbl_t *)uap->sc_arg)->swt_ent and rvp->r_val1
 *	will be set to indicate how many entries were actually returned.
 *	If (sc_cmd == SC_KSGIADD), the logical swap index of the new
 *	device is returned in rvp->r_val1.
 *
 *	On failure, a non-zero errno is returned to indicate the failure
 *	mode.
 *
 * Remarks:
 *	When performing SC_ADD and SC_REMOVE requests, offset and length
 *	values passed in sr_start and sr_length are provided in terms
 *	of UBSIZE (512-byte) units and are converted before being used.
 *
 *	The SC_GET* size queries need no privilege; every add/remove
 *	request requires CAP_SWAP_MGT.  The pathname buffer allocated for
 *	add/remove requests is released on every exit path through the
 *	'out' label.
 */
int
swapctl(struct swapcmda *uap, rval_t *rvp)
{
	int error = 0;
	xswapres_t xsr;
	struct vnode *vp, *oldvp;
	int length;
	char *swapname;
	auto int lswap;
	auto pgno_t vswap, nswap;
	auto off_t nblks;
	int s;

	switch (uap->sc_cmd) {
	case SC_GETRESVSWAP:
	case SC_GETLSWAPTOT:
	case SC_GETFREESWAP:
	case SC_GETSWAPTOT:
	case SC_GETSWAPMAX:
	case SC_GETSWAPVIRT:
		/* all size queries share the copyout code below */
		switch (uap->sc_cmd) {
		case SC_GETRESVSWAP:
			/*
			 * take maximum logical space and subtract off the
			 * amount of virtual space left.
			 * This of course can be greater than the amount
			 * of physical swap.
			 */
			getmaxswap(&nswap, &vswap, NULL);
			nswap = maxmem + nswap + vswap - tune.t_minasmem;
			nswap = (signed)nswap - (signed)GLOBAL_AVAILSMEM();
			break;
		case SC_GETLSWAPTOT:
			/* return total logical swap possible
			 * (sum of physical memory plus max physical swap plus
			 * virtual swap)
			 */
			getmaxswap(&nswap, &vswap, NULL);
			nswap = maxmem + nswap + vswap - tune.t_minasmem;
			break;
		case SC_GETFREESWAP:
			s = swaplock();
			nswap = _nfreeswap();
			swapunlock(s);
			break;
		case SC_GETSWAPTOT:
			getmaxswap(NULL, NULL, &nswap);
			break;
		case SC_GETSWAPMAX:
			getmaxswap(&nswap, NULL, NULL);
			break;
		case SC_GETSWAPVIRT:
			getmaxswap(NULL, &nswap, NULL);
			break;
		}
		/* pages -> disk blocks, then out in the caller's ABI width */
		nblks = ptod(nswap);
#if (_MIPS_SIM == _MIPS_SIM_ABI64)
		if (ABI_IS_IRIX5_64(get_current_abi())) {
			if (copyout(&nblks, uap->sc_arg, sizeof(nblks)))
				return EFAULT;
		} else
#endif
		if (ABI_IS_IRIX5_N32(get_current_abi())) {
			if (copyout(&nblks, uap->sc_arg, sizeof(nblks)))
				return EFAULT;
		} else {
			irix5_off_t i5_nblks;

			i5_nblks = (irix5_off_t)nblks;
			if (copyout(&i5_nblks, uap->sc_arg, sizeof(i5_nblks)))
				return EFAULT;
		}
		return 0;

	case SC_GETNSWP:
		/*
		 * This data may be stale if it is ever passed back
		 * in a subsequent SC_LIST request.
		 */
		rvp->r_val1 = nswapfiles;
		return(0);

	case SC_LIST:
	{
		int i, nswapfiles_snap;
		register int cnt;
		swapinfo_t *si_snap, *st, *sip;
#if (_MIPS_SIM == _MIPS_SIM_ABI64)
		swapent_t *ust;
#endif
		irix5_n32_swapent_t *i5_n32_ust;
		irix5_swapent_t *i5_ust;

		/* first int of the user's table is its capacity in entries */
		if (copyin(uap->sc_arg, &length, sizeof(int)))
			return(EFAULT);
retry:
		nswapfiles_snap = nswapfiles;
		if (nswapfiles_snap == 0) {
			rvp->r_val1 = 0;
			return(0);
		}
		si_snap = kmem_alloc(nswapfiles_snap * sizeof(swapinfo_t),
					KM_SLEEP);
		swapmodlock();
		/*
		 * If more files were added while we blocked, free our
		 * old memory reservation and try again. We want to give
		 * as accurate a picture as possible.
		 */
		if (nswapfiles != nswapfiles_snap) {
			swapmodunlock();
			kmem_free(si_snap,
				sizeof(swapinfo_t) * nswapfiles_snap);
			goto retry;
		}
		/*
		 * Return an error if we don't have enough space
		 * for the whole table.
		 */
		if (length < nswapfiles) {
			swapmodunlock();
			kmem_free(si_snap,
				sizeof(swapinfo_t) * nswapfiles_snap);
			return(ENOMEM);
		}
		/*
		 * OK, we have enough room to copy the entire table.
		 * Skip any entries which are only in the process
		 * of being added.
		 */
		sip = si_snap;
		for (i = 0; i < NSWAPPRI; i++) {
			for (st = swaptab[i].sw_list; st; st = st->st_list) {
				if (!(st->st_flags & ST_NOTREADY)) {
					*sip = *st;
					sip++;
				}
			}
		}
		swapmodunlock();
		/*
		 * Shuffle out our local copy of the table to user mode.
		 * NOTE(review): the snapshot still shares each entry's
		 * st_name pointer with the live table, and doswapdel()
		 * frees st_name; a concurrent delete after the unlock
		 * above looks able to invalidate it - confirm.
		 */
#if (_MIPS_SIM == _MIPS_SIM_ABI64)
		if (ABI_IS_IRIX5_64(get_current_abi()))
			ust = (swapent_t *)((swaptbl_t *)uap->sc_arg)->swt_ent;
		else
#endif
		if (ABI_IS_IRIX5_N32(get_current_abi()))
			i5_n32_ust = ((irix5_n32_swaptbl_t *)uap->sc_arg)->swt_ent;
		else
			i5_ust = ((irix5_swaptbl_t *)uap->sc_arg)->swt_ent;
		error = 0;
		cnt = 0;
		/* walk the snapshot backwards, last entry collected first */
		for (--sip; sip >= si_snap; --sip, cnt++) {
			/*
			 * We copyin the user's structure first to get
			 * a hold of the ste_path pointer which we need
			 * to copyout si_name to.
			 */
#if (_MIPS_SIM == _MIPS_SIM_ABI64)
			if (ABI_IS_IRIX5_64(get_current_abi())) {
				swapent_t se;

				if (copyin(ust, &se, sizeof(swapent_t))) {
					error = EFAULT;
					break;
				}
				se.ste_length = sip->st_length;
				se.ste_start = sip->st_start;
				se.ste_pages = sip->st_npgs;
				se.ste_free = sip->st_nfpgs;
				se.ste_flags = sip->st_flags;
				se.ste_pri = sip->st_pri;
				se.ste_maxpages = sip->st_maxpgs;
				se.ste_vpages = sip->st_vpgs;
				se.ste_lswap = sip->st_lswap;
				if (copyout(&se, ust, sizeof(swapent_t))) {
					error = EFAULT;
					break;
				}
				/*
				 * Copyout st_name.
				 */
				ASSERT(sip->st_name != NULL);
				if (copyout(sip->st_name, se.ste_path,
						(strlen(sip->st_name) + 1))) {
					error = EFAULT;
					break;
				}
				ust++;
			} else
#endif
			if (ABI_IS_IRIX5_N32(get_current_abi())) {
				irix5_n32_swapent_t i5_n32_se;

				if (copyin(i5_n32_ust, &i5_n32_se,
						sizeof(irix5_n32_swapent_t))) {
					error = EFAULT;
					break;
				}
				i5_n32_se.ste_length =
					(irix5_n32_off_t)sip->st_length;
				i5_n32_se.ste_start =
					(irix5_n32_off_t)sip->st_start;
				i5_n32_se.ste_pages =
					(app32_long_t)sip->st_npgs;
				i5_n32_se.ste_free =
					(app32_long_t)sip->st_nfpgs;
				i5_n32_se.ste_flags =
					(app32_long_t)sip->st_flags;
				i5_n32_se.ste_pri = sip->st_pri;
				i5_n32_se.ste_maxpages =
					(app32_long_t)sip->st_maxpgs;
				i5_n32_se.ste_vpages =
					(app32_long_t)sip->st_vpgs;
				i5_n32_se.ste_lswap = sip->st_lswap;
				if (copyout(&i5_n32_se, i5_n32_ust,
						sizeof(irix5_n32_swapent_t))) {
					error = EFAULT;
					break;
				}
				/*
				 * Copyout st_name.
				 */
				ASSERT(sip->st_name != NULL);
				if (copyout(sip->st_name,
						(char *)(__psint_t)i5_n32_se.ste_path,
						(strlen(sip->st_name) + 1))) {
					error = EFAULT;
					break;
				}
				i5_n32_ust++;
			} else {
				irix5_swapent_t i5_se;

				if (copyin(i5_ust, &i5_se,
						sizeof(irix5_swapent_t))) {
					error = EFAULT;
					break;
				}
				i5_se.ste_length = (irix5_off_t)sip->st_length;
				i5_se.ste_start = (irix5_off_t)sip->st_start;
				i5_se.ste_pages = (app32_long_t)sip->st_npgs;
				i5_se.ste_free = (app32_long_t)sip->st_nfpgs;
				i5_se.ste_flags = (app32_long_t)sip->st_flags;
				i5_se.ste_pri = sip->st_pri;
				i5_se.ste_maxpages =
					(app32_long_t)sip->st_maxpgs;
				i5_se.ste_vpages = (app32_long_t)sip->st_vpgs;
				i5_se.ste_lswap = sip->st_lswap;
				if (copyout(&i5_se, i5_ust,
						sizeof(irix5_swapent_t))) {
					error = EFAULT;
					break;
				}
				/*
				 * Copyout st_name.
				 */
				ASSERT(sip->st_name != NULL);
				if (copyout(sip->st_name,
						(char *)(__psint_t)i5_se.ste_path,
						(strlen(sip->st_name) + 1))) {
					error = EFAULT;
					break;
				}
				i5_ust++;
			}
		}
		rvp->r_val1 = cnt;
		kmem_free(si_snap, nswapfiles_snap * sizeof(swapinfo_t));
		return (error);
	}

	case SC_SGIADD:
		/* extended add: caller supplies max/virtual length and pri */
#if (_MIPS_SIM == _MIPS_SIM_ABI64)
		if (ABI_IS_IRIX5_64(get_current_abi())) {
			if (copyin(uap->sc_arg, &xsr, sizeof(xswapres_t)))
				return(EFAULT);
		} else
#endif
		if (ABI_IS_IRIX5_N32(get_current_abi())) {
			irix5_n32_xswapres_t i5_n32_xsr;

			if (copyin(uap->sc_arg, &i5_n32_xsr,
					sizeof(irix5_n32_xswapres_t)))
				return(EFAULT);
			xsr.sr_name = (char *)(__psint_t)i5_n32_xsr.sr_name;
			xsr.sr_start = (off_t)i5_n32_xsr.sr_start;
			xsr.sr_length = (off_t)i5_n32_xsr.sr_length;
			xsr.sr_maxlength = (off_t)i5_n32_xsr.sr_maxlength;
			xsr.sr_vlength = (off_t)i5_n32_xsr.sr_vlength;
			xsr.sr_pri = i5_n32_xsr.sr_pri;
		} else {
			irix5_xswapres_t i5_xsr;

			if (copyin(uap->sc_arg, &i5_xsr,
					sizeof(irix5_xswapres_t)))
				return(EFAULT);
			xsr.sr_name = (char *)(__psint_t)i5_xsr.sr_name;
			xsr.sr_start = (off_t)i5_xsr.sr_start;
			xsr.sr_length = (off_t)i5_xsr.sr_length;
			xsr.sr_maxlength = (off_t)i5_xsr.sr_maxlength;
			xsr.sr_vlength = (off_t)i5_xsr.sr_vlength;
			xsr.sr_pri = i5_xsr.sr_pri;
		}
		break;

	case SC_ADD:
	case SC_REMOVE:
		/*
		 * Classic request: widen it to an xswapres_t with max and
		 * virtual length equal to the length and no priority.
		 */
#if (_MIPS_SIM == _MIPS_SIM_ABI64)
		if (ABI_IS_IRIX5_64(get_current_abi())) {
			swapres_t sr;

			if (copyin(uap->sc_arg, &sr, sizeof(swapres_t)))
				return(EFAULT);
			xsr.sr_name = sr.sr_name;
			xsr.sr_start = sr.sr_start;
			xsr.sr_length = sr.sr_length;
			xsr.sr_maxlength = sr.sr_length;
			xsr.sr_vlength = sr.sr_length;
			xsr.sr_pri = -1;
		} else
#endif
		if (ABI_IS_IRIX5_N32(get_current_abi())) {
			irix5_n32_swapres_t i5_n32_sr;

			if (copyin(uap->sc_arg, &i5_n32_sr,
					sizeof(irix5_n32_swapres_t)))
				return(EFAULT);
			xsr.sr_name = (char *)(__psint_t)i5_n32_sr.sr_name;
			xsr.sr_start = (off_t)i5_n32_sr.sr_start;
			xsr.sr_length = (off_t)i5_n32_sr.sr_length;
			xsr.sr_maxlength = (off_t)i5_n32_sr.sr_length;
			xsr.sr_vlength = (off_t)i5_n32_sr.sr_length;
			xsr.sr_pri = -1;
		} else {
			irix5_swapres_t i5_sr;

			if (copyin(uap->sc_arg, &i5_sr,
					sizeof(irix5_swapres_t)))
				return(EFAULT);
			xsr.sr_name = (char *)(__psint_t)i5_sr.sr_name;
			xsr.sr_start = (off_t)i5_sr.sr_start;
			xsr.sr_length = (off_t)i5_sr.sr_length;
			xsr.sr_maxlength = (off_t)i5_sr.sr_length;
			xsr.sr_vlength = (off_t)i5_sr.sr_length;
			xsr.sr_pri = -1;
		}
		break;

	case SC_KSGIADD:
		/*
		 * In-kernel add (see addkswap): sc_arg is a kernel pointer,
		 * so check privilege before dereferencing it.
		 */
		if (!_CAP_ABLE(CAP_SWAP_MGT))
			return(EPERM);
		xsr = *(xswapres_t *)uap->sc_arg;
		break;

	case SC_LREMOVE:	/* delete via lswap handle */
	{
		__psint_t lswap = (__psint_t)uap->sc_arg;
		swapinfo_t *st;

		if (!_CAP_ABLE(CAP_SWAP_MGT))
			return(EPERM);
		/*
		 * lswap is signed and comes straight from the caller, so
		 * both ends of the range must be checked before it is used
		 * to index lswaptab[].
		 */
		if (lswap < 0 || lswap >= MAXLSWAP)
			return EINVAL;
		swapmodlock();
		error = 0;
		if (((st = lswaptab[lswap]) == NULL) ||
		    (st->st_flags & (ST_INDEL|ST_NOTREADY)))
			error = EINVAL;
		if (!error)
			error = doswapdel(st);
		swapmodunlock();
		return error;
	}

	default:
		return(EINVAL);
	}

	/* only add/remove-by-name requests get this far */
	if (!_CAP_ABLE(CAP_SWAP_MGT))
		return(EPERM);
	/*
	 * Allocate the space to read in pathname.
	 * Note we don't wait for memory and take the
	 * chance that if we are short this request will
	 * wedge.
	 */
	if ((swapname = (char *)kmem_alloc(MAXPATHLEN, KM_NOSLEEP)) == NULL)
		return(ENOMEM);
	/* SC_KSGIADD names live in kernel space, all others in user space */
	if (uap->sc_cmd == SC_KSGIADD)
		error = copystr(xsr.sr_name, swapname, MAXPATHLEN, 0);
	else
		error = copyinstr(xsr.sr_name, swapname, MAXPATHLEN, 0);
	if (error)
		goto out;
	error = lookupname(swapname, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, NULL);
	if (error)
		goto out;
	/*
	 * XXX check that if swapvp == rootvp that swplo is set??
	 */
	if (vp->v_flag & VNOSWAP) {
		VN_RELE(vp);
		error = ENOSYS;
		goto out;
	}
	/*
	 * Check that the file is mappable - via a zero length mapping.
	 * If a new vp is returned, then it has a reference that is ours.
	 */
	oldvp = vp;
	VOP_MAP(vp, 0, 0, PROT_WRITE, MAP_PRIVATE, get_current_cred(), &vp,
		error);
	if (error) {
		VN_RELE(oldvp);
		if (error == ENODEV)
			error = ENOSYS;	/* to comply with swapctl() man page */
		/* leave through 'out' so the pathname buffer is freed */
		goto out;
	}
	if (oldvp != vp)
		VN_RELE(oldvp);	/* release the old reference */
	switch (vp->v_type) {
	case VBLK:
		break;
	case VREG:
		VOP_ACCESS(vp, VREAD|VWRITE, get_current_cred(), error);
		break;
	case VDIR:
		error = EISDIR;
		break;
	default:
		error = ENOSYS;
		break;
	}
	if (error == 0) {
		if (uap->sc_cmd == SC_REMOVE)
			error = swapdel(vp, xsr.sr_start);
		else {
			error = swapadd(vp, &xsr, swapname, &lswap);
			if (error == 0 && uap->sc_cmd == SC_KSGIADD)
				rvp->r_val1 = lswap;
		}
		/* VN_RELE handled by swapadd/swapdel */
	} else
		VN_RELE(vp);
out:
	kmem_free(swapname, MAXPATHLEN);
	return(error);
} /* swapctl */
/*
 * addkswap - add the first (boot) swap file from inside the kernel.
 *
 * Builds an SC_KSGIADD request for 'path' and pushes it through
 * swapctl().  A length of zero or less means "use the default" (-1).
 * On success *vp receives the vnode recorded for the new swap entry and
 * the entry is tagged ST_BOOTSWAP.  Returns 0 or the errno from swapctl().
 */
int
addkswap(char *path, off_t start, off_t length, vnode_t **vp)
{
	struct swapcmda req;
	rval_t rv;
	xswapres_t res;
	swapinfo_t *boot;
	int error;
#if CELL_IRIX
	char pn[32];

	/* each cell swaps to its own file, whatever the caller asked for */
	sprintf(pn,"/swap.cell.%d", cellid());
	path = pn;
	start = 0;
	length = -1;
	printf("addkswap: %s \n", path);
#endif
	res.sr_name = path;
	res.sr_start = start;
	if (length > 0)
		res.sr_length = length;
	else
		res.sr_length = -1;
	res.sr_maxlength = -1;
	res.sr_vlength = -1;
	res.sr_pri = -1;

	req.sc_cmd = SC_KSGIADD;
	req.sc_arg = &res;

	error = swapctl(&req, &rv);
	if (error != 0)
		return error;

	/* swapctl handed back the logical swap index in r_val1 */
	boot = lswaptab[rv.r_val1];
	*vp = boot->st_vp;
	boot->st_flags |= ST_BOOTSWAP;
	return 0;
}
/*
 * swapadd - add a new swap file.
 *
 * vp		vnode of the swap file, carrying the caller's reference.
 * xsr		parameters of the swap area; the -1 "default" markers in
 *		sr_pri, sr_length, sr_maxlength and sr_vlength are replaced
 *		in place with the values actually used.
 * swapname	name recorded in the new entry (copied).
 * plswap	receives the logical swap index of the new entry.
 *
 * Returns 0 on success, otherwise an errno (EINVAL, ENXIO, EBUSY,
 * ENOSPC, EEXIST, or whatever a VOP returned).  On success the vnode
 * stays open and referenced by the new swapinfo entry; on the failure
 * paths that reach 'bad'/'bad1' it is closed and released here.
 *
 * The whole add runs under swapmodlock(); the new entry is linked in
 * under swaplock() while still marked ST_NOTREADY, and only becomes
 * visible to allocators once that flag is cleared at the very end.
 */
int
swapadd(struct vnode *vp,	/* swap file vnode */
	xswapres_t *xsr,	/* parameters of swap file */
	char *swapname,		/* name of swap file */
	int *plswap)
{
	register swapinfo_t *st, *nst = NULL;
	register pgno_t maxpgs;
	register int i, lswap;
	off_t start, end;
	int s, error;
	int locktoken;
	/*REFERENCED*/
	int unused;
	struct vattr vattr;
	int local = 0;
	int setswapflag = 0;	/* non-zero once WE have set VISSWAP */
	vfssw_t *efs_vfs, *nfs_vfs, *specfs_vfs, *xfs_vfs;
	daddr_t swap_size;
	cred_t *crp = get_current_cred();
	extern int xfs_swappable(bhv_desc_t*);
	vnode_t *openvp;
	bhv_desc_t *bdp;

	/*
	 * some checks below are file system specific - find
	 * vfs info now
	 */
	efs_vfs = vfs_getvfssw("efs");
	nfs_vfs = vfs_getvfssw("nfs");
	specfs_vfs = vfs_getvfssw("specfs");
	xfs_vfs = vfs_getvfssw("xfs");
	/*
	 * if doesn't have a priority, assign one:
	 * BLK devices all get pri 0
	 * nfs gets 4
	 * everything else (efs, xfs, ...) gets 2
	 */
	if (xsr->sr_pri < 0) {
		bdp = vn_bhv_base_unlocked(VN_BHV_HEAD(vp));
		if (vp->v_type == VBLK)
			xsr->sr_pri = 0;
		else if (nfs_vfs &&
			 ((struct vnodeops *)BHV_OPS(bdp) ==
			  nfs_vfs->vsw_vnodeops))
			xsr->sr_pri = 4;
		else
			xsr->sr_pri = 2;
	} else if (xsr->sr_pri >= NSWAPPRI) {
		VN_RELE(vp);
		return EINVAL;
	}
	/*
	 * Open the swap file - might change vp ..
	 * NOTE(review): on failure we return without a VN_RELE although
	 * swapctl() relies on this routine to drop the lookup reference;
	 * confirm whether VOP_OPEN releases it on error.
	 */
	openvp = vp;
	VOP_OPEN(openvp, &vp, FREAD|FWRITE, crp, error);
	if (error)
		return error;
	bdp = vn_bhv_base_unlocked(VN_BHV_HEAD(vp));
	/* determine if local */
	if (efs_vfs &&
	    efs_vfs->vsw_vnodeops == (struct vnodeops *)BHV_OPS(bdp))
		local = 1;
	if (xfs_vfs &&
	    xfs_vfs->vsw_vnodeops == (struct vnodeops *)BHV_OPS(bdp))
		local = 1;
	if (specfs_vfs &&
	    specfs_vfs->vsw_vnodeops == (struct vnodeops *)BHV_OPS(bdp))
		local = 1;
	/* only do 1 add/delete at a time */
	swapmodlock();
	/* compute size of resource */
	if (vp->v_type == VREG) {
		struct bmapval iex;

		vattr.va_mask = AT_SIZE;
		VOP_GETATTR(vp, &vattr, 0, crp, error);
		if (error)
			goto bad;
		if (xfs_vfs &&
		    (xfs_vfs->vsw_vnodeops ==
		     (struct vnodeops *)BHV_OPS(bdp))) {
			error = xfs_swappable(bdp);
			if (error) {
				goto bad;
			}
		}
		/*
		 * kind of kludge - must get indirect extents in core -
		 * easiest way to do this is to call bmap
		 */
		if (vattr.va_size) {
			s = 1;
			VOP_RWLOCK(vp, VRWLOCK_WRITE);
			VOP_BMAP(vp, vattr.va_size-1, 1, B_READ,
				crp, &iex, &s, error);
			if (error) {
				VOP_RWUNLOCK(vp, VRWLOCK_WRITE);
				goto bad;
			}
			VOP_RWUNLOCK(vp, VRWLOCK_WRITE);
		}
		/* file size in bytes -> 512-byte sectors */
		swap_size = vattr.va_size >> SCTRSHFT;
	} else {
		/* stolen from specvnops.c */
		struct bdevsw *my_bdevsw;

		my_bdevsw = get_bdevsw(vp->v_rdev);
		if (!bdstatic(my_bdevsw)) {
			error = ENXIO;
			goto bad;
		}
		/* prefer the 64-bit size entry point when the driver has one */
		if ((int (*)(void))my_bdevsw->d_size64 != nulldev) {
			error = (*my_bdevsw->d_size64)(vp->v_rdev, &swap_size);
			if (error)
				goto bad;
		} else if ((int (*)(void))my_bdevsw->d_size != nulldev)
			swap_size = (*my_bdevsw->d_size)(vp->v_rdev);
		else {
			error = ENXIO;
			goto bad;
		}
	}
	/*
	 * A negative start, or a negative length other than the -1
	 * "use the default" marker, can never describe a real extent;
	 * reject it before it is turned into page counts and sizes.
	 */
	if (xsr->sr_start < 0 || xsr->sr_length < -1) {
		error = EINVAL;
		goto bad;
	}
	if (swap_size < xsr->sr_start) {
		error = EINVAL;
		goto bad;
	}
	/*
	 * Do defaults/sanity checking for length, maxlength, vlength
	 */
	if (xsr->sr_length == -1)
		xsr->sr_length = swap_size - xsr->sr_start;
	if (xsr->sr_maxlength == -1)
		xsr->sr_maxlength = xsr->sr_length;
	if (xsr->sr_vlength == -1)
		xsr->sr_vlength = xsr->sr_maxlength;
	maxpgs = dtopt(xsr->sr_maxlength);
	if (xsr->sr_maxlength < xsr->sr_length) {
		error = EINVAL;
		goto bad;
	}
	if (vp->v_type == VBLK && xsr->sr_maxlength != xsr->sr_length) {
		/* block devices do not grow */
		error = EINVAL;
		goto bad;
	}
	/* virtual length must be equal or larger than other lengths */
	if (xsr->sr_vlength < xsr->sr_length ||
	    xsr->sr_vlength < xsr->sr_maxlength) {
		error = EINVAL;
		goto bad;
	}
	/*
	 * XXX do not permit growing yet - if we do, then must
	 * handle sometime syncing meta-data associated with file out to
	 * disk ... currently VOP_SYNC skips ISSWAP files
	 * Also must watch losing buf that holds swapvp - then
	 * the iupdate may require memory to sync ..
	 */
	if (xsr->sr_length != xsr->sr_maxlength) {
		error = EINVAL;
		goto bad;
	}
	if (swap_size < (xsr->sr_start + xsr->sr_length)) {
		error = EINVAL;
		goto bad;
	}
	/* already a swap file: leave its flag alone and just back out */
	if (vp->v_flag & VISSWAP) {
		error = EBUSY;
		goto bad1;
	}
	VN_FLAGSET(vp, VISSWAP);
	setswapflag = 1;
	/*
	 * we need to be sure that no pages of this file are in the
	 * page cache since as soon as we start to swap the cache tags
	 * will be different (instead of vp it will be anon ptr).
	 * In addition we have set the V_ISSWAP flag to notify others
	 * that this file CANNOT be opened.
	 * Call VOP_FSYNC to make sure the swap file meta-data are properly
	 * syncd to disk
	 */
	VOP_FSYNC(vp, FSYNC_WAIT|FSYNC_INVAL, crp, (off_t)0, (off_t)-1,error);
	if (error)
		goto bad;
	/*
	 * allocate a new swapinfo struct, bitmap, and string space
	 */
	nst = kmem_zalloc(sizeof(*nst), KM_SLEEP);
	nst->st_name = kmem_alloc(strlen(swapname) + 1, KM_SLEEP);
	/* one bit per page */
	nst->st_bmap = kmem_zalloc((maxpgs / 8) + 1, KM_SLEEP);
#ifdef SWAPDEBUG
	nst->st_cksum = kmem_zalloc(maxpgs * sizeof(long), KM_SLEEP);
#endif
	/*
	 * find free logical swap id - don't use 0 since NULL sm_swaphandle
	 * is 0
	 */
	for (lswap = 1; lswap < MAXLSWAP; lswap++)
		if (lswaptab[lswap] == NULL)
			break;
	if (lswap >= MAXLSWAP) {
		error = ENOSPC;
		goto bad;
	}
	/*
	 * check for overlaps with existing swap files: an entry is
	 * harmless if it is on another vnode, or if the new extent lies
	 * entirely after it or entirely before it.
	 */
	start = xsr->sr_start;
	end = start + xsr->sr_length;
	for (i = 0; i < NSWAPPRI; i++) {
		for (st = swaptab[i].sw_list; st; st = st->st_list) {
			if (vp != st->st_vp ||
			    (start >= (st->st_start + st->st_length)) ||
			    (end <= st->st_start))
				continue;
			error = EEXIST;
			goto bad;
		}
	}
	/* Initialize the new entry */
	nst->st_vp = vp;
	nst->st_flags = ST_NOTREADY | (local ? ST_LOCAL_SWAP : 0);
	nst->st_pri = xsr->sr_pri;
	nst->st_lswap = lswap;
	*plswap = nst->st_lswap;
	nst->st_start = xsr->sr_start;
	nst->st_length = xsr->sr_length;
	nst->st_swppglo = dtop(xsr->sr_start);	/* round up */
	nst->st_npgs = dtopt(xsr->sr_length);
	nst->st_nfpgs = nst->st_npgs;
	nst->st_maxpgs = maxpgs;
	nst->st_allocs = 0;
	nst->st_gen = 0;
	/*
	 * record vpgs as the actual number of 'virtual' pages - those
	 * over and above any 'real' physical swap pages
	 */
	nst->st_vpgs = dtopt(xsr->sr_vlength) - maxpgs;
	bcopy(swapname, nst->st_name, strlen(swapname) + 1);
	/* add to logical swap table && priority list */
	locktoken = swaplock();
	lswaptab[lswap] = nst;
	nst->st_list = swaptab[nst->st_pri].sw_list;
	swaptab[nst->st_pri].sw_list = nst;
	if (swaptab[nst->st_pri].sw_next == NULL)
		swaptab[nst->st_pri].sw_next = nst;
	nswapfiles++;
	swapunlock(locktoken);
	/* Add the swap space to the total available space count. */
	reservemem(GLOBAL_POOL, -(nst->st_vpgs + maxpgs), 0, 0);
	/* Clearing the flags allows sm_alloc to find it */
	nst->st_flags &= ~ST_NOTREADY;
	swapmodunlock();
	return 0;
bad:
#ifdef SWAPDEBUG
	if (nst && nst->st_cksum)
		kmem_free(nst->st_cksum, maxpgs * sizeof(long));
#endif
	if (nst && nst->st_bmap)
		kmem_free(nst->st_bmap, (maxpgs / 8) + 1);
	if (nst && nst->st_name)
		kmem_free(nst->st_name, strlen(swapname) + 1);
	if (nst)
		kmem_free(nst, sizeof(*nst));
	/*
	 * we opened file, so we can't really have any file locks against it.
	 * Only undo VISSWAP if this call set it: failures detected before
	 * the VISSWAP test above may be on a vnode that is already an
	 * active swap file, whose flag must survive.
	 */
	if (setswapflag)
		VN_FLAGCLR(vp, VISSWAP);
bad1:
	VOP_CLOSE(vp, FREAD|FWRITE, L_TRUE, crp, unused);
	VN_RELE(vp);	/* one for lookup now */
	swapmodunlock();
	return error;
}
#ifdef DELDEBUG
/*
 * DELDEBUG event counters, printed by prdelstats() and zeroed by
 * initdelstats().  In the code visible here unswap() bumps nnomem,
 * npfnnonzero and nnotdone; the others are presumably bumped by
 * xswap()/unswap() further down the file - TODO confirm.
 */
int nxswap;
int nnotdone;
int npfnnonzero;
int nswaperr;
int nnomem;
int nnoshdlck;
int nkv;
int nnoswap;
int nswpchg;
/*
 * prdelstats - print the current DELDEBUG counters on two lines.
 */
static void
prdelstats(void)
{
printf("swapdel:xswap %d notdone %d pfnnon0 %d swaperr %d nswpchg %d\n",
nxswap, nnotdone, npfnnonzero, nswaperr, nswpchg);
printf(" nomem %d noshdlck %d nkv %d nnoswap %d\n",
nnomem, nnoshdlck, nkv, nnoswap);
}
/*
 * initdelstats - reset every DELDEBUG counter to zero.
 */
static void
initdelstats(void)
{
	nswpchg = 0;
	nnoswap = 0;
	nkv = 0;
	nnoshdlck = 0;
	nnomem = 0;
	nswaperr = 0;
	npfnnonzero = 0;
	nnotdone = 0;
	nxswap = 0;
}
#endif
/*
 * swapdel - delete a swap file, identified by its vnode and start offset.
 *
 * vp		vnode from the caller's lookup.  The caller (swapctl) relies
 *		on this routine to drop that reference, so it is released
 *		on every return path, including "not found".
 * start	starting offset the swap area was added with.
 *
 * Returns 0 on success, EINVAL if no matching ready entry exists, or the
 * error from doswapdel().
 */
int
swapdel(struct vnode *vp, off_t start)
{
	register swapinfo_t *st = NULL;
	int i, error;

	/* only do 1 add/delete at a time */
	swapmodlock();
	/*
	 * Find the swap file table entry for the file to
	 * be deleted.
	 * In progress adds and deletes are ignored (i.e. they give errors)
	 */
	for (i = 0; i < NSWAPPRI; i++) {
		for (st = swaptab[i].sw_list; st; st = st->st_list) {
			if (st->st_flags & (ST_INDEL|ST_NOTREADY))
				continue;
			if (st->st_vp == vp && st->st_start == start)
				break;
		}
		if (st)
			break;
	}
	if (st)
		error = doswapdel(st);
	else
		error = EINVAL;
	swapmodunlock();
	VN_RELE(vp);	/* remove reference from lookup */
	return error;
}
#if DEBUG
/* times doswapdel() found st_allocs non-zero and had to wait for it */
int swapdel_waited;
#endif
/*
 * Delete a swap file.
 * Can fail with ENOMEM if there are too few free pages to swap in all
 * the swap pages, or if it will exceed some miser job's rss limit.
 * Can fail with EBUSY if pages are still on the device after the retry
 * budget is used up, or if a miser limit is hit during the hard scans.
 *
 * Both callers in this file (swapdel and the SC_LREMOVE case of swapctl)
 * hold swapmodlock() across the call.
 *
 * On every failure the entry is put back in service: ST_INDEL is cleared
 * and, once it has been taken, the memory reservation is returned.
 * On success the entry is unlinked and freed, and the vnode is closed,
 * un-flagged (VISSWAP) and released.
 */
static int
doswapdel(swapinfo_t *st)
{
register swapinfo_t *pst, *tst;
swaptab_t *tabp;
register int passes;
pgno_t maxpgs, rpgs;
int ret;
int s;
vnode_t *vp = st->st_vp;
as_scan_t scanargs;
struct delarg da;
/* once tagged with INDEL, no one will look at it */
s = swaplock();
st->st_flags |= ST_INDEL;
swapunlock(s);
/*
 * Take back from the global pool what swapadd() contributed for this
 * device (virtual plus maximum physical pages).
 */
rpgs = st->st_vpgs + st->st_maxpgs;
if (reservemem(GLOBAL_POOL, rpgs, 0, 0)) {
st->st_flags &= ~ST_INDEL; /* still good! */
cmn_err(CE_NOTE, "swapdelete - too few free pages");
return ENOMEM;
}
#ifdef DELDEBUG
initdelstats();
#endif
/* first get rid of any unnecessary backing pages (soft pass) */
scanargs.as_scan_shake.as_shakeswap_lswap = st->st_lswap;
scanargs.as_scan_shake.as_shakeswap_hard = 0;
if (as_scan(AS_SWAPSCAN, 0, &scanargs) == EMISER) {
st->st_flags &= ~ST_INDEL; /* still good! */
cmn_err(CE_NOTE, "swapdelete - miser job limit reached");
unreservemem(GLOBAL_POOL, rpgs, 0, 0);
return ENOMEM;
}
#ifdef DELDEBUG
prdelstats();
initdelstats();
#endif
/*
* do a easy check to try to make sure that system won't totally
* go to pieces if we remove this swap device
* Is amount of swap we are currently using on the device greater
* than the amount of places (main memory + other swap devices)?
*/
if ((st->st_npgs - st->st_nfpgs) > (GLOBAL_FREEMEM() + _nfreeswap())) {
st->st_flags &= ~ST_INDEL;
unreservemem(GLOBAL_POOL, rpgs, 0, 0);
return ENOMEM;
}
/* get back pages from sysreg */
unswapsysreg();
/*
 * reclaim pages: repeat hard scans until every page on the device is
 * free.  Only scans that return 0 count against the pass budget.
 */
passes = 0;
while (st->st_nfpgs < st->st_npgs) {
if (passes > 10) {
st->st_flags &= ~ST_INDEL;
unreservemem(GLOBAL_POOL, rpgs, 0, 0);
return EBUSY;
}
scanargs.as_scan_shake.as_shakeswap_lswap = st->st_lswap;
scanargs.as_scan_shake.as_shakeswap_hard = 1;
ret = as_scan(AS_SWAPSCAN, 0, &scanargs);
if (ret == EMISER) {
st->st_flags &= ~ST_INDEL;
unreservemem(GLOBAL_POOL, rpgs, 0, 0);
return EBUSY;
} else if (ret) {
/* ran out of memory! - go sxbrk and try again */
setsxbrk();
} else {
delay(10);
passes++;
}
/* shared memory segments are handled separately from as_scan */
da.lswap = st->st_lswap;
da.hard = 1;
unswapshm(&da);
#ifdef DELDEBUG
prdelstats();
initdelstats();
#endif
}
/*
* Someone might have been in sm_alloc doing the bit map search
* while we started deleting the swap device. They should be
* out of there by now, but check here just in case.
*/
if (st->st_allocs != 0) {
cmn_err(CE_WARN, "st_allocs not zero at end of swap deletion. Waiting...");
#if DEBUG
swapdel_waited++;
#endif
while (st->st_allocs != 0)
delay(HZ);
}
/* all done - unhook the entry from both tables under the spinlock */
s = swaplock();
ASSERT(nswapfiles > 0);
ASSERT(lswaptab[st->st_lswap] == st);
nswapfiles--;
lswaptab[st->st_lswap] = NULL;
tabp = &swaptab[st->st_pri];
for (pst = NULL, tst = tabp->sw_list; tst;
pst = tst, tst = tst->st_list) {
if (tst == st) {
if (pst)
pst->st_list = st->st_list;
else
tabp->sw_list = st->st_list;
break;
}
}
/* restart this priority's sw_next cursor at the head of the list */
tabp->sw_next = tabp->sw_list;
ASSERT(tst);
swapunlock(s);
/* free with the same sizes swapadd() allocated with */
maxpgs = st->st_maxpgs;
if (st->st_cksum)
kmem_free(st->st_cksum, maxpgs * sizeof(long));
if (st->st_bmap)
kmem_free(st->st_bmap, (maxpgs / 8) + 1);
if (st->st_name)
kmem_free(st->st_name, strlen(st->st_name) + 1);
kmem_free(st, sizeof(*st));
/* vp was captured at entry, so it is still valid after st is freed */
VOP_CLOSE(vp, FREAD|FWRITE, L_TRUE, get_current_cred(), ret);
VN_FLAGCLR(vp, VISSWAP);
VN_RELE(vp); /* we held one while a swap file */
return 0;
}
/*
* free up swap space associated with shared memory segments.
*
* Loop through all shared memory segments, and unswap the regions
* associated with them.  The iteration itself is done by
* vshm_iterate(), which is handed do_unswapreg() as the per-segment
* callback and 'da' as its argument.
*/
static void
unswapshm(struct delarg *da)
{
vshm_iterate(do_unswapreg, da);
}
/*
 * do_unswapreg - per-segment callback for unswapshm().
 *
 * Fetches the region behind the shared memory segment, locks it and asks
 * unswapreg() to pull its pages off the swap device named in the
 * struct delarg passed as 'arg'.  The region lock is dropped here only
 * when unswapreg() returns 0.
 */
void
do_unswapreg(vshm_t *vshm, void *arg)
{
	struct delarg *req = (struct delarg *)arg;
	as_mohandle_t handle;
	struct region *reg;

	VSHM_GETMO(vshm, &handle);
	reg = handle.as_mohandle;
	if (reg == NULL)
		return;

	reglock(reg);
	if (unswapreg(0, reg, req->hard, req->lswap, 0) != 0)
		return;
	regrele(reg);
}
/*
* Pull the region back from the swap disk. If we know where to put it
* in memory, try to leave it there. Otherwise, just exchange swap
* handles.
*
* pas, prp	address space and pregion when called for a mapped region;
*		both NULL/0 when the whole region is to be processed.
* rp		the region; do_unswapreg() locks it before calling.
* hard		passed through to xswap()/unswap().
* lswap	only pages whose swap handle is on this logical device are
*		touched.
*
* Returns 0 when the scan completed; do_unswapreg() releases the region
* lock itself in that case.  Non-zero returns are 1 (helper reported out
* of memory), EMISER, or ENOMEM (more than 100 helper errors).
* NOTE(review): on the non-zero returns the lock appears to have been
* dropped already by the helper (unswap() documents this; xswap() is not
* visible here) - confirm.
*/
int
unswapreg(pas_t *pas, reg_t *rp, int hard, int lswap, preg_t *prp)
{
register pgno_t rpn, apn, spn, epn;
sm_swaphandle_t sh;
swaphandle_t sw;
void *id;
int result, errs = 0;
/* only anonymous regions have swap-backed pages */
if (!(rp->r_flags & RG_ANON)) {
ASSERT(rp->r_anon == NULL);
return 0;
}
(void) anon_shake_tree(rp->r_anon);
/*
* Set up the start and end page numbers based on the pregion
* that we're provided. It's possible that the pregion is only
* mapping part of the region so we need to be careful how
* much we unmap. If no pregion is provided, we assume that we
* will need to unmap the entire region.
*
* NOTE(review): with a pregion the loop runs from p_offset up to
* p_pglen; if p_pglen is a length rather than an end index, the
* end should presumably be p_offset + p_pglen - confirm against
* the pregion definition.
*/
if (prp) {
spn = prp->p_offset;
epn = prp->p_pglen;
ASSERT(pas);
} else {
spn = 0;
epn = rp->r_pgsz;
ASSERT(pas == 0);
}
for (rpn = spn; rpn < epn; rpn++) {
/* anon page number: translated via the pregion when there is one */
apn = prp ? rpntoapn(prp, rpn) : rpn;
if ((id = anon_lookup(rp->r_anon, apn, &sh)) != NULL) {
if (sh) {
sw.sm_all = sh;
/* page lives on some other swap device - not our concern */
if (sw.sm_lswap != lswap)
continue;
/*
 * Without a pregion, or when the page belongs to an interior
 * node of a non-degenerate anon tree, go through xswap();
 * otherwise go through unswap().
 */
if (prp == NULL ||
(rp->r_anon != (anon_hdl)id &&
!anon_isdegenerate(rp->r_anon)))
result = xswap(apn, prp, rp, hard);
else
result = unswap(pas, rpn, prp, hard);
switch(result)
{
case 0: /* success, keep going */
break;
case -1: /* out of memory, stop here */
return 1;
case EMISER:
/* Only possible in the shake case */
ASSERT(prp && pas);
return(EMISER);
default: /* random error, relock and continue */
/* well if we get too many, let's
* not get into an infinite loop
*/
if (++errs > 100)
return ENOMEM;
reglock(rp);
break;
}
}
}
}
return 0;
}
/*
 * shake_swap - swap space shaker, registered with the shake manager by
 * swapinit().  Called when swap space runs out, to try to recover unused
 * space from the anonymous manager.
 *
 * The algorithm is all or nothing: it runs through all address spaces
 * and all anon trees, and never sleeps on an address space lock.  Only
 * one shake runs at a time; if another is already in progress this call
 * simply returns.  Always returns 0.
 */
/* ARGSUSED */
static int
shake_swap(int level)
{
	ASSERT(level == SHAKEMGR_SWAP);

	if (mutex_trylock(&shake_swap_lock) != 0) {
		/* XXX should this be local only?? */
		as_scan(AS_ANONSCAN, 0, NULL);
		mutex_unlock(&shake_swap_lock);
	}
	return(0);
}
/*
* unswap - attempt to release a single page of swap
* if things get difficult, we just return and hope that next
* time life will be easier
*
* Returns - 0 if cleared swap w/o releasing region lock or just skipping page
* -1 if no mem
* -2/EMISER indicates miser job's limit would be exceeded
* >0 error
* XXX sysreg??
* Releases region lock unless returns 0
*/
/*
 * Tracing knobs read by unswap() in DELDEBUG builds: traceunswap turns
 * on its diagnostic printf for pages found on interior anon nodes,
 * traceunswapall extends that to every page.
 */
int traceunswap = 0;
int traceunswapall = 0;
static int
unswap(pas_t *pas, pgno_t rpn, preg_t *prp, int hard)
{
pfd_t *pfd;
pfd_t *swappfd = NULL;
pglst_t swaplst[1];
reg_t *rp = prp->p_reg;
uint cachekey;
pgno_t apn = rpntoapn(prp, rpn);
caddr_t vaddr;
/*REFERENCED*/
attr_t *attr;
/*REFERENCED*/
struct pm *pm;
register pde_t *pd;
sm_swaphandle_t newsh = NULL;
auto void *id;
auto sm_swaphandle_t sp;
/*
* The only safe cases to deal with are :
* 1) leaf anon nodes (id == r_anon)
* 2) an interior node in a degenerate anon tree (ones with no branches)
*
* In either case we can guarantee that all (namely the one pregion/pmap
* we have locked) are referenced in a pmap. Only if they
* are can we remove the backing swap handle
*/
ASSERT(pas && prp);
vaddr = rpntov(prp, rpn);
again:
if ((pd = pmap_pte(pas, prp->p_pmap, vaddr, VM_NOSLEEP)) == NULL) {
regrele(rp);
#ifdef DELDEBUG
nnomem++;
#endif
return -1;
}
/*
* The pm we get here can be used safely because we
* have a reference from the region prp, which is
* locked and therefore cannot be modified.
*/
attr = findattr(prp, vaddr);
pm = attr_pm_get(attr);
ASSERT(pg_getpgi(pd) != SHRD_SENTINEL);
if (!pg_isvalid(pd) && pg_getpfn(pd) != 0) {
/* someone else in process of faulting in */
#ifdef DELDEBUG
npfnnonzero++;
#endif
if (swappfd) {
pagefree(swappfd);
VPAG_UPDATE_VM_RSS(PAS_TO_VPAGG(pas), JOBRSS_INC_BLIND,
0, -1);
}
return(0);
}
if (pfd = anon_pfind(rp->r_anon, apn, &id, &sp)) {
if (swappfd) {
pagefree(swappfd);
VPAG_UPDATE_VM_RSS(PAS_TO_VPAGG(pas), JOBRSS_INC_BLIND,
0, -1);
}
/*
* already in cache - interior nodes present a problem.
* The basic invariant that must be maintained is that either
* a page is referenced in ALL the pmap(s) or is out on swap.
* The only exception to this is if a shared region is
* marked HASSANON. Note that HASSANON only works for a single
* region since once a region is freed is removes all SANON
* pages.
*/
if (!(pfd->pf_flags & P_DONE)) {
if (rp->r_flags & RG_HASSANON)
pagefreesanon(pfd, 0);
else
anon_pagefree_and_cache(pfd);
#ifdef DELDEBUG
nnotdone++;
if (traceunswap && (id != rp->r_anon || traceunswapall))
printf("int apn %d id 0x%x swap 0x%x not done\n", apn, id, sp);
#endif
return(0);
}
ASSERT((pfd->pf_flags & (P_ANON|P_SWAP)) == (P_ANON|P_SWAP));
ASSERT(pfd->pf_pchain == NULL);
if (!pg_isvalid(pd)) {
int error;
/*
* We need to explicitly hold the pfdat
* because we're calling handlepd after
* rmap_addmap.
*/
pfdat_hold(pfd);
VPAG_UPDATE_RMAP_ADDMAP_RET(PAS_TO_VPAGG(pas),
JOBRSS_INC_FOR_PFD, pfd, pd, pm, error);
if (error) {
pfdat_release(pfd);
if (rp->r_flags & RG_HASSANON)
pagefreesanon(pfd, 0);
else
anon_pagefree_and_cache(pfd);
return (EMISER);
}
anon_clrswap(id, apn, NULL);
MINFO.cache++;
pg_setpfn(pd, pfdattopfn(pfd));
pg_setccuc(pd, attr->attr_cc, attr->attr_uc);
prp->p_nvalid++;
(void) handlepd(vaddr, pfd, pd, prp, 1);
pfdat_release(pfd);
} else {
anon_clrswap(id, apn, NULL);
if (rp->r_flags & RG_HASSANON)
pagefreesanon(pfd, 0);
else
anon_pagefree_and_cache(pfd);
}
#ifdef DELDEBUG
if (traceunswapall)
printf("apn %d id 0x%x swap 0x%x pfd 0x%x cnt %d hashed\n",
apn, id, sp, pfd, pfd->pf_use);
#endif
ASSERT(pfd->pf_use >= 1);
return(0);
}
if (!hard) {
return(0);
}
if (swappfd == NULL) {
/*
* first time through alloc page and swap in
*/
if (VPAG_UPDATE_VM_RSS(PAS_TO_VPAGG(pas), JOBRSS_INC_BLIND,
0, 1)) {
return(EMISER);
}
cachekey = vcache2(prp, attr, apn);
if ((swappfd = pagealloc(cachekey, 0)) == NULL) {
/* no memory */
regrele(rp);
VPAG_UPDATE_VM_RSS(PAS_TO_VPAGG(pas), JOBRSS_INC_BLIND,
0, -1);
#ifdef DELDEBUG
nnomem++;
#endif
return(-1);
}
regrele(rp);
swaplst[0].gp_pfd = swappfd;
ASSERT(swappfd->pf_pchain == NULL);
if (sm_swap(swaplst, sp, 1, B_READ, NULL)) {
pagefree(swappfd);
VPAG_UPDATE_VM_RSS(PAS_TO_VPAGG(pas), JOBRSS_INC_BLIND,
0, -1);
#ifdef DELDEBUG
nswaperr++;
#endif
return(EIO);
}
ASSERT(swappfd->pf_pchain == NULL);
/*
* now have page successfully read in - check that while things
* were unlocked that the page wasn't faulted in or
* that the anon/swap handle changed
* Note that region can't have gone away or changed size
* since we have aspacelock
*/
reglock(rp);
if ((id != anon_lookup(rp->r_anon, apn, &newsh)) ||
newsh != sp) {
pagefree(swappfd);
VPAG_UPDATE_VM_RSS(PAS_TO_VPAGG(pas), JOBRSS_INC_BLIND,
0, -1);
return(0);
}
goto again;
}
/*
* have a replacement page!
*/
if (anon_swapin(id, swappfd, apn) == DUPLICATE) {
pagefree(swappfd);
swappfd = NULL;
goto again;
}
anon_clrswap(id, apn, NULL);
ASSERT(!pg_isvalid(pd));
MINFO.swap++;
/*
* We have to explicitly hold the pfdat
* because we're calling handlepd after adding
* the reverse map link.
*/
pfdat_hold(swappfd);
pg_setpfn(pd, pfdattopfn(swappfd));
pg_setccuc(pd, attr->attr_cc, attr->attr_uc);
prp->p_nvalid++;
VPAG_UPDATE_RMAP_ADDMAP(PAS_TO_VPAGG(pas), JOBRSS_INS_PFD, swappfd,
pd, pm);
(void) handlepd(vaddr, swappfd, pd, prp, 1);
pagedone(swappfd);
pfdat_release(swappfd);
#ifdef DELDEBUG
if (traceunswapall)
printf("apn %d id 0x%x swap 0x%x pfd 0x%x cnt %d unswapped\n",
apn, id, sp, swappfd, swappfd->pf_use);
#endif
return(0);
}
/*
* xswap - exchange swap handles
*
* Since we hold the region locked, and guarantee that anon_lookup
* on this locked region continues to point to the swap handle in question
* we need not worry about other regions looking up and potentially
* swapping in the same page. It does no harm to pinsert the new page
* if we swapped it in.
*/
/*
* Arguments as used below:
*	apn  - anon page number handed to anon_pfind/anon_lookup/anon_swapin
*	prp  - pregion, may be NULL (vcache() on rp->r_gen is then used for
*	       the page cache key instead of vcache2())
*	rp   - region whose r_anon is searched; released and relocked
*	       around the swap I/O
*	hard - when zero nothing is attempted and 0 is returned
*
* Returns 0 when the handle was exchanged or the page was skipped,
* -1 when pagealloc fails, EIO when the swap read or write fails.
* The -1 and EIO returns happen after regrele(rp) with no matching
* reglock(rp); every 0 return that follows a regrele(rp) has relocked first.
*/
static int
xswap(pgno_t apn, preg_t *prp, reg_t *rp, int hard)
{
pfd_t *pfd;
pfd_t *swappfd = NULL;
pglst_t swaplst[1];
uint cachekey;
auto sm_swaphandle_t newsh;
auto sm_swaphandle_t tsh;
auto int npgs;
auto void *id;
auto sm_swaphandle_t sp;
if (!hard)
return 0;
/* reserve the replacement swap page up front; give up quietly if none */
npgs = 1;
newsh = sm_alloc(&npgs, 0);
if (npgs != 1) {
#ifdef DELDEBUG
nnoswap++;
#endif
return 0;
}
again:
if (pfd = anon_pfind(rp->r_anon, apn, &id, &sp)) {
/* page is in the cache: any copy we read ourselves is redundant */
if (swappfd) {
pagefree(swappfd);
swappfd = NULL;
}
if (!(pfd->pf_flags & P_DONE)) {
/* page not marked P_DONE: release it and the new handle, skip */
if (rp->r_flags & RG_HASSANON)
pagefreesanon(pfd, 0);
else
anon_pagefree_and_cache(pfd);
sm_free(&newsh, 1);
#ifdef DELDEBUG
nnotdone++;
#endif
return(0);
}
ASSERT((pfd->pf_flags & (P_ANON|P_SWAP)) == (P_ANON|P_SWAP));
ASSERT(pfd->pf_pchain == NULL);
} else if (swappfd == NULL) {
/* region stays unlocked while we allocate and read the page */
regrele(rp);
/*
* first time through alloc page and swap in
*/
if (prp)
cachekey = vcache2(prp, &prp->p_attrs, apn);
else
cachekey = vcache(rp->r_gen, apn);
if ((swappfd = pagealloc(cachekey, 0)) == NULL) {
/* no memory */
sm_free(&newsh, 1);
#ifdef DELDEBUG
nnomem++;
#endif
return(-1);
}
swaplst[0].gp_pfd = swappfd;
ASSERT(swappfd->pf_pchain == NULL);
/* read from the old handle (sp was filled in by anon_pfind above) */
if (sm_swap(swaplst, sp, 1, B_READ, NULL)) {
pagefree(swappfd);
sm_free(&newsh, 1);
#ifdef DELDEBUG
nswaperr++;
#endif
return(EIO);
}
ASSERT(swappfd->pf_pchain == NULL);
/*
* now have page successfully read in - check that while things
* were unlocked that the page wasn't faulted in or
* that the anon/swap handle changed
* Note that region can't have gone away or changed size
* since we have aspacelock
*/
reglock(rp);
if ((id != anon_lookup(rp->r_anon, apn, &tsh)) || tsh != sp) {
#ifdef DELDEBUG
nswpchg++;
#endif
pagefree(swappfd);
sm_free(&newsh, 1);
return(0);
}
goto again;
}
if (swappfd) {
/* enter page into hash */
if (anon_swapin(id, swappfd, apn) == DUPLICATE) {
/* lost a race with another insert; retry using the cached page */
pagefree(swappfd);
swappfd = NULL;
goto again;
}
pagedone(swappfd);
} else {
/* no private copy: write out the page found in the cache */
swappfd = pfd;
}
/*
* At this point, swappfd has contents of page we wish to exchange
* swap on & newsh has a new swap handle
*/
regrele(rp);
swaplst[0].gp_pfd = swappfd;
ASSERT(swappfd->pf_pchain == NULL);
if (sm_swap(swaplst, newsh, 1, B_WRITE, NULL)) {
/* write failed: region is left unlocked on this return */
if (rp->r_flags & RG_HASSANON)
pagefreesanon(swappfd, 0);
else
pagefree(swappfd);
sm_free(&newsh, 1);
#ifdef DELDEBUG
nswaperr++;
#endif
return(EIO);
}
ASSERT(swappfd->pf_pchain == NULL);
/* one last time - check that everything is still correct */
reglock(rp);
if ((id != anon_lookup(rp->r_anon, apn, &tsh)) || tsh != sp) {
#ifdef DELDEBUG
nswpchg++;
#endif
if (rp->r_flags & RG_HASSANON)
pagefreesanon(swappfd, 0);
else
anon_pagefree_and_cache(swappfd);
sm_free(&newsh, 1);
return(0);
}
/* install the new handle in place of the old one, then release the page */
anon_clrswap(id, apn, newsh);
if (rp->r_flags & RG_HASSANON)
pagefreesanon(swappfd, 0);
else
anon_pagefree_and_cache(swappfd);
#ifdef DELDEBUG
nxswap++;
if (traceunswapall)
printf("apn %d id 0x%x swap 0x%x pfd 0x%x cnt %d xswapped\n",
apn, id, sp, swappfd, swappfd->pf_use);
#endif
return(0);
}
/*
* unswapsysreg - empty placeholder; the body contains no statements.
*/
static void
unswapsysreg(void)
{
}
/*
* swapfind - find best fit
*/
static pgno_t
swapfind(swapinfo_t *sp, int size, int *fsize)
{
	bitnum_t cur, limit, span;
	bitnum_t best_at = 0;
	bitlen_t run;
	bitlen_t best_run = 0;
	int pass;

	/*
	 * Two sweeps over the bit map: first from st_next to the end,
	 * then from the beginning up to st_next.
	 */
	for (pass = 0; pass < 2; pass++) {
		if (pass == 0) {
			cur = sp->st_next;
			limit = (bitnum_t)sp->st_npgs;
		} else {
			cur = 0;
			limit = sp->st_next;
		}

		while (cur < limit) {
			span = MIN(size, limit - cur);
			run = bftstclr(sp->st_bmap, cur, span);
			ASSERT(run <= size);

			if (run == size) {
				/* exact fit - take it immediately */
				*fsize = size;
				return((pgno_t)cur);
			}
			if (run > best_run) {
				/* remember the longest partial run so far */
				best_at = cur;
				best_run = run;
			}

			/* step past the clear bits just examined ... */
			cur += run;
			/* ... and then past the set bits that follow them */
			cur += bftstset(sp->st_bmap, cur, span - run);
			ASSERT(cur <= limit);
		}
	}

	if (best_run == 0) {
		/* nothing free at all */
		*fsize = 0;
		return(-1);
	}

	/* no full-size run; hand back the largest one seen */
	*fsize = best_run;
	return((pgno_t)best_at);
}
/*
* isswapdeleted - return true if swap device is going away
*/
int
isswapdeleted(sm_swaphandle_t sh)
{
	swaphandle_t handle;
	swapinfo_t *info;

	/* decode the handle to find the logical swap entry it refers to */
	handle.sm_all = sh;
	info = lswaptab[handle.sm_lswap];

	/* non-zero exactly when the entry carries ST_OFFLINE */
	return(info->st_flags & ST_OFFLINE);
}
/*
* nfreeswap - return amount of freeswap (in disk blocks)
* Note - called from interrupt level so can't grab semaphores ..
*/
int
nfreeswap(ulong *freeblocks)
{
	/*
	 * cpswapmodlock() returning 0 means we did not get the lock;
	 * report -1 and leave *freeblocks untouched in that case.
	 */
	if (cpswapmodlock() != 0) {
		/* _nfreeswap() counts pages; ptod() converts to disk blocks */
		*freeblocks = ptod(_nfreeswap());
		swapmodunlock();
		return 0;
	}
	return -1;
}
/*
* _nfreeswap - return amount of freeswap (in pages)
* Must be called with swapmodlock set
*/
static pgno_t
_nfreeswap(void)
{
register swapinfo_t *st;
register pgno_t tfreeswap = 0;
int i;
for (i = 0; i < NSWAPPRI; i++) {
for (st = swaptab[i].sw_list; st; st = st->st_list) {
if (!(st->st_flags & ST_OFFLINE))
tfreeswap += st->st_nfpgs;
}
}
return tfreeswap;
}
/*
* getmaxswap - return amount of swap (in pages)
*/
void
getmaxswap(pgno_t *max, pgno_t *vmax, pgno_t *tot)
{
	pgno_t sum_max = 0;
	pgno_t sum_virt = 0;
	pgno_t sum_cur = 0;
	swapinfo_t *info;
	int pri;

	/* totals are gathered under swapmodlock, over on-line entries only */
	swapmodlock();
	for (pri = 0; pri < NSWAPPRI; pri++) {
		for (info = swaptab[pri].sw_list; info != NULL;
		     info = info->st_list) {
			if (info->st_flags & ST_OFFLINE)
				continue;
			sum_max += info->st_maxpgs;
			sum_virt += info->st_vpgs;
			sum_cur += info->st_npgs;
		}
	}
	swapmodunlock();

	/* each output pointer is optional */
	if (max != NULL)
		*max = sum_max;
	if (vmax != NULL)
		*vmax = sum_virt;
	if (tot != NULL)
		*tot = sum_cur;
}
/*
* Swap Manager
*/
#if DEBUG
/*
 * Event counters bumped by sm_alloc(): device went offline during the
 * search, bit map generation changed during the search, and allocation
 * failed outright.
 */
int alloc_offline, alloc_gen, alloc_out;
#endif

/*
 * sm_alloc - allocate contiguous swap pages
 *
 *	npgs  - in: number of pages wanted; out: number of pages actually
 *		allocated (may be fewer than asked for, 0 if none)
 *	local - when non-zero only entries flagged ST_LOCAL_SWAP are used
 *
 * Returns the swap handle of the first allocated page; the handle is 0
 * when nothing could be allocated.  If no space is found the swap shaker
 * is run once and the search is repeated before giving up.
 */
sm_swaphandle_t
sm_alloc(int *npgs, int local)
{
	register bitnum_t bitnum;
	register int i;
	register pgno_t swappg;
	auto int fsize = 0;	/* found # of pages */
	swaphandle_t sw;
	swapinfo_t *st, *stop;
	int s;
	int shaken = 0;
	ushort_t gen;

try_again:
	sw.sm_all = 0;
	swappg = -1;

	/*
	 * Search, in priority order, for swap space
	 */
	s = swaplock();
	for (i = 0; i < NSWAPPRI; i++) {
		/*
		 * Start looking where we left off last time through.  We
		 * stop when we loop around the last back to where we
		 * started from.  The stopping point will be set the
		 * first time through.
		 */
		stop = NULL;
		for (st = swaptab[i].sw_next; st != stop ; st = st->st_list) {
			/*
			 * First time through, remember our stopping point.
			 */
			if (stop == NULL)
				stop = swaptab[i].sw_next;

			/*
			 * If we hit the end of the list, then loop back to
			 * check the swap areas at the beginning of the list.
			 */
			if (st == NULL) {
				st = swaptab[i].sw_list;
				/*
				 * If we've been here already, then we're
				 * done with this list.
				 */
				if (st == stop)
					break;
			}

			if (st->st_flags & ST_OFFLINE)
				continue;
			if (local && ((st->st_flags & ST_LOCAL_SWAP) == 0))
				continue;

			/*
			 * Unlock the swaplock while we search for space
			 * since this can take awhile if the bit map is
			 * large and fragmented.  When we come back,
			 * re-check to make sure the swap device hasn't
			 * been deleted.  The st_allocs counter is used
			 * to let doswapdel() know we're using the bit
			 * map.
			 */
find_again:
			st->st_allocs++;
			gen = st->st_gen;
			swapunlock(s);
			swappg = swapfind(st, *npgs, &fsize);
			s = swaplock();
			st->st_allocs--;

			/*
			 * Swap device deleted?  Go try another one.
			 */
			if (st->st_flags & ST_OFFLINE) {
#if DEBUG
				alloc_offline++;
#endif
				/*
				 * Throw away what swapfind() reported for
				 * this device.  Otherwise, if no further
				 * device gets searched, the stale fsize
				 * would end the search and space would be
				 * taken from whatever st points at using
				 * a bit number from this device's map.
				 */
				fsize = 0;
				swappg = -1;
				continue;
			}

			/*
			 * Someone else modified the bit map while we were
			 * looking.  Redo the search in case things have
			 * changed.
			 */
			if (st->st_gen != gen) {
#if DEBUG
				alloc_gen++;
#endif
				goto find_again;
			}

			if (fsize > 0)
				/* XXX search for best??? */
				break;
		}
		if (fsize > 0)
			break;
	}

	if (fsize == 0) {
		swapunlock(s);
		if (!shaken) {
			/* try to reclaim swap once, then search again */
			(void) shake_shake(SHAKEMGR_SWAP);
			shaken = 1;
			goto try_again;
		}
		*npgs = 0;
#if DEBUG
		alloc_out++;
#endif
		return((sm_swaphandle_t)sw.sm_all);
	}

	ASSERT(fsize > 0);
	bitnum = (bitnum_t)swappg;

	/* set swappg to physical pageno on swap device */
	swappg = st->st_swppglo + swappg;
#ifdef _VCE_AVOIDANCE
	if (vce_avoidance) {
		/* On mapped files we limit the swap size - XXX */
		if (st->st_vp->v_type != VBLK)
			fsize = 1;
	}
#endif
	st->st_next = bitnum + fsize;
	st->st_nfpgs -= fsize;
	ASSERT(st->st_nfpgs >= 0);

	/* next allocation at this priority starts with the following entry */
	swaptab[st->st_pri].sw_next = st->st_list ? st->st_list : swaptab[st->st_pri].sw_list;
	sw.sm_lswap = st->st_lswap;
	sw.sm_pageno = swappg;

	/* set in use bits */
	ASSERT(bftstclr(st->st_bmap, bitnum, fsize) == fsize);
	bfset(st->st_bmap, bitnum, (bitlen_t)fsize);
	/* tell concurrent searchers that the bit map changed */
	st->st_gen++;
	swapunlock(s);

	*npgs = fsize;
	return((sm_swaphandle_t)sw.sm_all);
}
/*
* sm_dealloc - free list of swap pages
*/
void
sm_dealloc(sm_swaphandle_t sh, int npgs)
{
	swaphandle_t handle;
	swapinfo_t *info;
	pgno_t first;
	int s;

	s = swaplock();

	/* decode the handle into its swap entry and bit map position */
	handle.sm_all = sh;
	ASSERT(handle.sm_lswap < MAXLSWAP);
	info = lswaptab[handle.sm_lswap];
	first = handle.sm_pageno - info->st_swppglo;

	/* clear npgs in-use bits starting at 'first' and credit them back */
	ASSERT(bftstset(info->st_bmap, (bitnum_t)first, npgs));
	bfclr(info->st_bmap, (bitnum_t)first, npgs);
	info->st_nfpgs += npgs;
	ASSERT(info->st_nfpgs <= info->st_npgs);

	swapunlock(s);
}
/*
* sm_getlswap - return logical swap # (for error messages mostly)
*/
short
sm_getlswap(sm_swaphandle_t sh)
{
	swaphandle_t handle;

	/* the logical swap number is one field of the packed handle */
	handle.sm_all = sh;
	return(handle.sm_lswap);
}
/*
* sm_free - free list of swap pages
*/
void
sm_free(sm_swaphandle_t *sh, int npgs)
{
	sm_swaphandle_t *cur;
	swaphandle_t handle;
	swapinfo_t *info;
	pgno_t slot;
	int left;
	int s;

	s = swaplock();
	for (cur = sh, left = npgs; left > 0; left--, cur++) {
		/* empty (zero) handles own no swap page */
		if (*cur) {
			handle.sm_all = *cur;
			ASSERT(handle.sm_lswap < MAXLSWAP);
			info = lswaptab[handle.sm_lswap];
			slot = handle.sm_pageno - info->st_swppglo;

			/* release the single page this handle names */
			ASSERT(btst(info->st_bmap, (bitnum_t)slot));
			bclr(info->st_bmap, (bitnum_t)slot);
			info->st_nfpgs++;
			ASSERT(info->st_nfpgs <= info->st_npgs);
		}
#ifdef DEBUG
		/* every slot is overwritten with -1 in DEBUG kernels */
		*cur = -1;
#endif
	}
	swapunlock(s);
}
/*
* sm_makeswaphandle - construct a swap handle given a dev/blkno pair
* This is only used for async xfers to block devices..
* Note that this does NOT work for anything but block devices!
*/
sm_swaphandle_t
sm_makeswaphandle(dev_t dev, daddr_t blkno)
{
	swaphandle_t handle;
	swapinfo_t *info;
	int pri;

	/* a zero handle is returned when no block device entry matches */
	handle.sm_all = 0;

	for (pri = 0; pri < NSWAPPRI; pri++) {
		info = swaptab[pri].sw_list;
		while (info != NULL) {
			/* only block-device swap entries are candidates */
			if (info->st_vp->v_type == VBLK &&
			    info->st_vp->v_rdev == dev) {
				handle.sm_lswap = info->st_lswap;
				handle.sm_pageno = dtop(blkno);
				/*
				 * Stop scanning this priority list only;
				 * the remaining priorities are still
				 * examined and a later match overrides
				 * this one.
				 */
				break;
			}
			info = info->st_list;
		}
	}
	return((sm_swaphandle_t)handle.sm_all);
}
/*
* swap I/O
*
* XXX it would be really nice if we could, for READS, simply read
* the page into the page cache like pageins do. this would simplify
* the vfault logic and permit read-ahead - a win for nfs swapping
* The only problem is VBLK swap devices since we would have to
* alloc the page and insert it in the hash.
*/
/*
* sm_swap - transfer npgs pages between memory and the swap area named by sh.
*
*	pglptr - array of npgs page list entries; their pfdats are chained
*		 together through pf_pchain and hung off the buffer
*	sh     - swap handle; decoded into a logical swap entry and page number
*	npgs   - number of pages, must be > 0
*	rw     - OR'd into the buffer flags; B_READ selects a read, otherwise
*		 a write is done; B_ASYNC requires a non-NULL 'done'
*	done   - optional completion function taking the buffer
*
* Returns 0 on success, otherwise the error from swapseg() or
* VOP_READ/VOP_WRITE after logging it and releasing the buffer.
* For swap entries that are not block devices B_ASYNC is cleared, so the
* transfer is synchronous and 'done' (if any) is called before returning.
*/
int
sm_swap(register pglst_t *pglptr, sm_swaphandle_t sh, int npgs,
int rw, void (*done)(buf_t *))
{
register buf_t *bp;
register swapinfo_t *st;
register int i;
register pfd_t *npfd, *pfd;
register dev_t dev;
swaphandle_t sw;
struct vnode *vp;
int error;
int s;
/* REFERENCED */
#ifdef SWAPDEBUG
pglst_t *spg = pglptr;
#endif
ASSERT(npgs > 0);
ASSERT((rw & B_ASYNC) == 0 || done);
/* decode the handle and check the page lies within the swap entry */
sw.sm_all = sh;
st = lswaptab[sw.sm_lswap];
ASSERT(sw.sm_pageno >= st->st_swppglo);
ASSERT(sw.sm_pageno < (st->st_npgs + st->st_swppglo));
vp = st->st_vp;
dev = vp->v_rdev;
/* set up a physical I/O buffer describing the whole transfer */
bp = getphysbuf(dev);
ASSERT(bp->b_un.b_addr == NULL);
ASSERT(bp->b_remain == 0);
ASSERT(bp->b_resid == 0);
bp->b_flags = B_PAGEIO | B_BUSY | B_PHYS | B_SWAP | rw;
bp->b_blkno = ptod(sw.sm_pageno);
bp->b_bcount = ctob(npgs);
bp->b_bufsize = bp->b_bcount;
/*
* getpages relies on the fact that pchain is
* non-null to keep from chaining
* pfds on the list twice therefore
* be sure NOT to null out last page
* Note that this means that one MUST use the buffer size NOT just
* chain through list
*/
/*
* NOTE(review): with npgs == 1 the adjusted pglptr points one entry
* before the array; only (pglptr+1) is dereferenced and the loop
* below does not execute in that case.
*/
pglptr += (npgs - 2); /* points to next to last */
pfd = npfd = (pglptr+1)->gp_pfd; /* last page */
ASSERT(pfd->pf_pchain == NULL || pfd->pf_pchain == (pfd_t *)0xaddL);
/* walk backwards linking each page to its successor */
for (i = npgs-1; i; i--, pglptr--) {
pfd = pglptr->gp_pfd;
ASSERT(pfd->pf_pchain == NULL || pfd->pf_pchain == (pfd_t *)0xaddL);
pfd->pf_pchain = npfd;
npfd = pfd;
}
/* pfd is now the first page of the chain */
bp->b_pages = pfd;
/* accounting */
if (rw & B_READ) {
/* sar, osview, etc. convert from pages to disk blocks */
SYSINFO.swapin++;
SYSINFO.bswapin += npgs;
} else {
SYSINFO.swapout++;
SYSINFO.bswapout += npgs;
}
if (vp->v_type == VBLK) {
/* block device: hand the buffer to the driver via swapseg() */
if (rw & B_ASYNC)
bp->b_relse = done;
error = swapseg(dev, bp, rw);
} else {
/* swap to file! */
uio_t auio;
uio_t *uio = &auio;
iovec_t aiovec;
/* describe the mapped buffer as a single-segment uio */
aiovec.iov_base = bp_mapin(bp);
uio->uio_iov = &aiovec;
uio->uio_iovcnt = 1;
uio->uio_resid = aiovec.iov_len = bp->b_bcount;
uio->uio_offset = ctob(sw.sm_pageno);
uio->uio_segflg = UIO_NOSPACE; /* no copying!!! */
uio->uio_limit = RLIM_INFINITY;
uio->uio_fmode = 0;
uio->uio_sigpipe = 0;
uio->uio_pmp = NULL;
uio->uio_pio = 0;
uio->uio_readiolog = 0;
uio->uio_writeiolog = 0;
uio->uio_pbuf = 0;
/*
* no async file xfers since have no way to tell when
* they're done
* Note that its important that the VOP_WRITE doesn't require
* any memory - that could wedge vhand
*/
rw &= ~B_ASYNC;
if (rw & B_READ) {
VOP_READ(vp, uio, IO_DIRECT|IO_IGNCACHE|IO_SYNC|IO_PFUSE_SAFE,
sys_cred, sys_flid, error);
}
else {
VOP_WRITE(vp, uio, IO_DIRECT|IO_IGNCACHE|IO_SYNC|IO_PFUSE_SAFE,
sys_cred, sys_flid, error);
}
ASSERT(uio->uio_sigpipe == 0);
#ifdef DEBUG
/* DEBUG only: countdown-driven injection of artificial EIO errors */
if (!(rw & B_READ) && swapwrinject > 0 && --swapwrinject == 0) {
swapwrinject = swapwrinjectfreq;
error = EIO;
}
if ((rw & B_READ) && swaprdinject > 0 && --swaprdinject == 0) {
swaprdinject = swaprdinjectfreq;
error = EIO;
}
#endif
if (error) {
/* record the kind of failure on the swap entry, under swaplock */
s = swaplock();
if (error == ESTALE)
st->st_flags |= ST_STALE;
else if (error == EACCES)
st->st_flags |= ST_EACCES;
else
st->st_flags |= ST_IOERR;
swapunlock(s);
}
ASSERT(error || uio->uio_resid == 0);
}
if (error) {
/* log, release the buffer and propagate; 'done' is not called here */
cmn_err_tag(132,CE_ALERT, "Swap %s failed on logical swap %d blkno 0x%x for process [ %s ]",
(rw & B_READ) ? "in" : "out", sm_getlswap(sh),
bp->b_blkno, get_current_name());
putphysbuf(bp);
return(error);
}
#ifdef SWAPDEBUG
swapsum(spg, sw, npgs, rw);
#endif
/* async transfer started: the buffer is not released here */
if (rw & B_ASYNC)
return 0;
/*
* a sync xfer - if they provided a 'done' function - call it
* else just free up bp and return
*/
if (done)
(*done)(bp);
else
putphysbuf(bp);
return 0;
}
/*
 * swapseg - issue one swap buffer to the block driver for 'dev'.
 *
 * Returns EIO if the buffer carries B_ERROR right after the strategy call
 * or, for synchronous requests, after waiting for completion; else 0.
 * Asynchronous requests (B_ASYNC in rw) return without waiting.
 */
static int
swapseg(dev_t dev, struct buf *bp, int rw)
{
	struct bdevsw *devsw;

#ifdef DEBUG
	/* DEBUG only: countdown-driven injection of artificial I/O errors */
	if (!(rw & B_READ) && swapwrinject > 0 && --swapwrinject == 0) {
		swapwrinject = swapwrinjectfreq;
		bp->b_flags |= B_ERROR;
		return EIO;
	}
	if ((rw & B_READ) && swaprdinject > 0 && --swaprdinject == 0) {
		swaprdinject = swaprdinjectfreq;
		bp->b_flags |= B_ERROR;
		return EIO;
	}
#endif

	devsw = get_bdevsw(dev);
	ASSERT(devsw != NULL);
	bdstrat(devsw, bp);

	/* error flagged straight away by the strategy call */
	if (bp->b_flags & B_ERROR)
		return(EIO);

	if (!(rw & B_ASYNC)) {
		/* wait for transaction */
		ADD_SYSWAIT(swap);
		psema(&bp->b_iodonesema, PRIBIO|TIMER_SHIFT(AS_PHYSIO_WAIT));
		SUB_SYSWAIT(swap);

		if (bp->b_flags & B_ERROR)
			return(EIO);
	}
	return(0);
}
/*
* int
* swaptabsnap(swapinfo_t **si_snapp)
* Function call to a snapshot of the swap devices
*
* The length (number of swap entries) is returned.
* The information will have been copied to a newly allocated
* kernel address space and the passed in pointer (*si_snapp)
* will point to the information. The caller is responsible
* for calling kmem_free().
*/
int						/* number of entries in snapshot */
swaptabsnap (swapinfo_t **si_snapp)
{
	swapinfo_t *snap, *dst, *src;
	int count, pri;

	/*
	 * Size the buffer from an unlocked read of nswapfiles, then take
	 * swapmodlock and make sure the count is still the same.  If it
	 * changed while we were allocating, drop everything and start
	 * over so the snapshot is as accurate as possible.
	 */
	for (;;) {
		count = nswapfiles;
		snap = kmem_alloc(count * sizeof(swapinfo_t),
					KM_SLEEP);
		swapmodlock();
		if (nswapfiles == count)
			break;
		swapmodunlock();
		kmem_free(snap,
			sizeof(swapinfo_t) * count);
	}

	/*
	 * Copy every entry of every priority list, still under the lock.
	 */
	dst = snap;
	for (pri = 0; pri < NSWAPPRI; pri++) {
		src = swaptab[pri].sw_list;
		while (src != NULL) {
			*dst = *src;
			dst++;
			src = src->st_list;
		}
	}
	swapmodunlock();

	/* caller owns the buffer from here on */
	*si_snapp = snap;
	return (count);
}