1227 lines
29 KiB
C
1227 lines
29 KiB
C
/*************************************************************************
|
|
* *
|
|
* Copyright (C) 1986-1996 Silicon Graphics, Inc. *
|
|
* *
|
|
* These coded instructions, statements, and computer programs contain *
|
|
* unpublished proprietary information of Silicon Graphics, Inc., and *
|
|
* are protected by Federal copyright law. They may not be disclosed *
|
|
* to third parties or copied or duplicated in any form, in whole or *
|
|
* in part, without the prior written consent of Silicon Graphics, Inc. *
|
|
* *
|
|
**************************************************************************/
|
|
|
|
/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
|
|
/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
|
|
/* All Rights Reserved */
|
|
|
|
/* THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF */
|
|
/* UNIX System Laboratories, Inc. */
|
|
/* The copyright notice above does not evidence any */
|
|
/* actual or intended publication of such source code. */
|
|
|
|
#ident "$Revision: 3.355 $"
|
|
|
|
#include <sys/types.h>
|
|
#include <sys/acct.h>
|
|
#include <ksys/as.h>
|
|
#include <sys/atomic_ops.h>
|
|
#include <sys/buf.h>
|
|
#include <sys/capability.h>
|
|
#include <sys/cmn_err.h>
|
|
#include <sys/cred.h>
|
|
#include <sys/debug.h>
|
|
#include <sys/errno.h>
|
|
#include <ksys/exception.h>
|
|
#include <sys/exec.h>
|
|
#include <ksys/fdt.h>
|
|
#include <sys/kabi.h>
|
|
#include <sys/kmem.h>
|
|
#include <sys/ksignal.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/param.h>
|
|
#include <sys/pathname.h>
|
|
#include <sys/prctl.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/resource.h>
|
|
#include <sys/runq.h>
|
|
#include <sys/sema.h>
|
|
#include <sys/sat.h>
|
|
#include <sys/attributes.h>
|
|
#include <sys/signal.h>
|
|
#include <sys/sysinfo.h>
|
|
#include <sys/sysmacros.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/uio.h>
|
|
#include <ksys/vfile.h>
|
|
#include <sys/vfs.h>
|
|
#include <sys/vnode.h>
|
|
#include <ksys/vproc.h>
|
|
#include <string.h>
|
|
#include <sys/rtmon.h>
|
|
#include <sys/par.h>
|
|
#include <sys/imon.h>
|
|
#include <procfs/prsystm.h>
|
|
#include <sys/numa.h>
|
|
#include <sys/ckpt.h>
|
|
#ifdef _SYSTEM_SIMULATION
|
|
#include <sys/kopt.h>
|
|
#endif
|
|
#include "os/proc/pproc_private.h" /* XXX bogus */
|
|
#if CELL_IRIX
|
|
#include <ksys/cell.h>
|
|
#include <ksys/cell/membership.h>
|
|
#endif
|
|
#include <sys/dmi.h>
|
|
#include <sys/dmi_kern.h>
|
|
#ifdef R10000
|
|
#include <sys/hwperftypes.h>
|
|
#include <sys/hwperfmacros.h>
|
|
#endif
|
|
|
|
/* these are dynamically changable */
|
|
extern int ncargs;
|
|
extern int reset_limits_on_exec;
|
|
|
|
extern sv_t nexit;
|
|
|
|
extern int gfx_exit(void);
|
|
|
|
/*
 * Lengths of the two parts of the "/dev/fd/XXXXXXX" name that
 * remove_proc() patches: the prefix, and the placeholder suffix that
 * follows it.
 */
#define	DEV_FD_PREFIX_LEN	8
#define	DEV_FD_SUFFIX_LEN	8
|
|
|
|
/*
|
|
* The exec switch table. Called in order to try and exec a
|
|
* particular a.out type.
|
|
*/
|
|
int (*execsw[])(vnode_t *, vattr_t *, struct uarg *, int) = {
|
|
elfexec,
|
|
intpexec,
|
|
};
|
|
|
|
int nexectype = sizeof(execsw) / sizeof(int (*)());
|
|
|
|
struct rexeca {
|
|
sysarg_t cell;
|
|
char *fname;
|
|
char **argp;
|
|
char **envp;
|
|
};
|
|
|
|
struct execa {
|
|
char *fname;
|
|
char **argp;
|
|
char **envp;
|
|
};
|
|
static int iexec(char *fname, char **argp, char **envp, cell_t cell);
|
|
|
|
#if CELL_IRIX
|
|
/*
|
|
* TESTING - Set exec_rotor_low/exec_rotor_high to the min & max cells
|
|
* you want to run on. The default is all cells. Set do_rexec to
|
|
* turn it on using a combination of the REXEC bit flags.
|
|
*/
|
|
int exec_rotor = 0;
|
|
int exec_rotor_low=0, exec_rotor_high=MAX_CELLS - 1;
|
|
|
|
#define REXEC_ON 0x1
|
|
#define REXEC_SKIP_GOLDEN 0x2
|
|
#define REXEC_RANDOM 0x4
|
|
|
|
int do_rexec = 0;
|
|
|
|
static cell_t
|
|
pick_cell(void)
|
|
{
|
|
/* pick a cell to run on */
|
|
|
|
if (!do_rexec) {
|
|
return(cellid());
|
|
}
|
|
while (do_rexec & REXEC_ON) {
|
|
if (do_rexec & REXEC_RANDOM) {
|
|
exec_rotor = get_timestamp() %
|
|
((cell_membership >> 1) + 1);
|
|
}
|
|
else {
|
|
|
|
/* round-robin */
|
|
|
|
exec_rotor++;
|
|
if (exec_rotor > exec_rotor_high) {
|
|
exec_rotor = exec_rotor_low;
|
|
}
|
|
if (exec_rotor == cellid())
|
|
continue;
|
|
}
|
|
if (cell_in_membership(exec_rotor)) {
|
|
if ((do_rexec & REXEC_SKIP_GOLDEN) &&
|
|
exec_rotor == golden_cell) {
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
return(exec_rotor);
|
|
}
|
|
#else
|
|
#define pick_cell() cellid()
|
|
#endif
|
|
|
|
/* ARGSUSED */
|
|
int
|
|
exec(struct execa *uap, rval_t *rvp)
|
|
{
|
|
cell_t cell;
|
|
|
|
cell = pick_cell();
|
|
return (iexec(uap->fname, uap->argp, NULL, cell));
|
|
}
|
|
|
|
/* ARGSUSED */
|
|
int
|
|
exece(struct execa *uap, rval_t *rvp)
|
|
{
|
|
cell_t cell;
|
|
|
|
cell = pick_cell();
|
|
return (iexec(uap->fname, uap->argp, uap->envp, cell));
|
|
}
|
|
|
|
/*
|
|
* rexec - new process executes on specified cell
|
|
*/
|
|
/* ARGSUSED */
|
|
int
|
|
rexec(struct rexeca *uap, rval_t *rvp)
|
|
{
|
|
cell_t cell;
|
|
|
|
cell = uap->cell;
|
|
if (cell < 0)
|
|
return EINVAL;
|
|
#if CELL_IRIX
|
|
if (!cell_in_membership(cell))
|
|
return ECELLDOWN;
|
|
#else
|
|
if (cell != 0)
|
|
return EINVAL;
|
|
#endif
|
|
return (iexec(uap->fname, uap->argp, uap->envp, cell));
|
|
}
|
|
|
|
#ifdef DEBUG
/*
 * Debug knob read by iexec(): -1 logs every exec, any other value logs
 * only execs done by the process whose pid equals it.
 */
int exectrace = 0;
#endif
|
|
|
|
static int
|
|
iexec(char *fname, char **argp, char **envp, cell_t cell)
|
|
{
|
|
int error = 0;
|
|
vnode_t *vp;
|
|
struct pathname pn;
|
|
int snapncargs = ncargs;
|
|
struct uarg args;
|
|
ckpt_handle_t *ckptp = NULL;
|
|
#ifdef CKPT
|
|
ckpt_handle_t ckpt = NULL;
|
|
|
|
if (ckpt_enabled)
|
|
ckptp = &ckpt;
|
|
#endif
|
|
SYSINFO.sysexec++;
|
|
|
|
/*
|
|
* Lookup path name and remember last component for later.
|
|
*/
|
|
if (error = pn_get(fname, UIO_USERSPACE, &pn))
|
|
return error;
|
|
_SAT_PN_SAVE(&pn, curuthread);
|
|
#ifdef DEBUG
|
|
{
|
|
proc_t *p = curprocp;
|
|
if (exectrace == -1 || exectrace == p->p_pid)
|
|
cmn_err(CE_CONT, "exec of (pid=%d) %s by %s\n",
|
|
p->p_pid, pn.pn_buf, get_current_name());
|
|
}
|
|
#endif
|
|
bzero (&args, sizeof(args));
|
|
|
|
_SAT_PN_BOOK(SAT_EXEC, curuthread);
|
|
if (error = lookuppn(&pn, FOLLOW, NULLVPP, &vp, ckptp)) {
|
|
pn_free(&pn);
|
|
return error;
|
|
}
|
|
/*
|
|
* Save audit information about the old state.
|
|
* When the record's generated for real it'll be
|
|
* after the exec has changed attributes.
|
|
*/
|
|
_SAT_SAVE_ATTR(SAT_CAP_SET_TOKEN, curuthread);
|
|
_SAT_SAVE_ATTR(SAT_UGID_TOKEN, curuthread);
|
|
|
|
strncpy(args.ua_exec_file, pn.pn_path, PSCOMSIZ);
|
|
args.ua_exec_file[PSCOMSIZ - 1] = '\0';
|
|
|
|
args.ua_argp = argp;
|
|
args.ua_envp = envp;
|
|
args.ua_fname = fname;
|
|
args.ua_fnameseg = UIO_USERSPACE;
|
|
args.ua_ncargs = snapncargs;
|
|
args.ua_cell = cell;
|
|
args.ua_exec_vp = vp;
|
|
#ifdef CKPT
|
|
args.ua_ckpt = (ckptp && *ckptp)? ckpt_lookup_add(vp, *ckptp) : -1;
|
|
#endif
|
|
|
|
#ifdef _SYSTEM_SIMULATION
|
|
{
|
|
proc_t *p = curprocp;
|
|
if (is_enabled(arg_sableexectrace))
|
|
printf("Exec of %s (PID %d) by %s\n", pn.pn_path, p->p_pid,
|
|
p->p_comm);
|
|
}
|
|
#endif
|
|
pn_free(&pn);
|
|
|
|
/* We have state that needs to be cleaned up on error */
|
|
args.ua_exec_cleanup = 1;
|
|
|
|
error = gexec(&vp, &args, 0);
|
|
|
|
if (error == 0) {
|
|
/*
|
|
* Tell rtmond our new name. This actually generates more
|
|
* events than we typically want since this will cause
|
|
* events to be generated for every process in the system
|
|
* when we often are just tracing a small handful of
|
|
* processes. We may need to think about ways of trying to
|
|
* avoid unneeded events ...
|
|
*/
|
|
if (IS_TSTAMP_EVENT_ACTIVE(RTMON_PIDAWARE)) {
|
|
#pragma mips_frequency_hint NEVER
|
|
fawlty_exec(args.ua_exec_file);
|
|
}
|
|
} else if (args.ua_exec_cleanup) { /* handle error */
|
|
#pragma mips_frequency_hint NEVER
|
|
if (error == EAGAIN)
|
|
nomemmsg("exec");
|
|
|
|
VN_RELE(args.ua_exec_vp);
|
|
}
|
|
_SAT_EXEC(error);
|
|
|
|
return(error);
|
|
}
|
|
|
|
/*
|
|
* Get executable permissions (and capabilities if requested) of a vnode.
|
|
* Check for various access/execute/etc. issues and return an appropriate
|
|
* error if things are amiss.
|
|
*/
|
|
int
|
|
execpermissions(struct vnode *vp,
|
|
struct vattr *vattrp,
|
|
struct uarg *args)
|
|
{
|
|
int error;
|
|
uthread_t *ut = curuthread;
|
|
proc_t *p = UT_TO_PROC(ut);
|
|
|
|
vattrp->va_mask = AT_MODE|AT_UID|AT_GID|AT_SIZE;
|
|
|
|
VOP_GETATTR(vp, vattrp, ATTR_EXEC, ut->ut_cred, error);
|
|
if (error)
|
|
return error;
|
|
|
|
/*
|
|
* Check the access mode.
|
|
*/
|
|
VOP_ACCESS(vp, VEXEC, ut->ut_cred, error);
|
|
if (error != 0
|
|
|| vp->v_type != VREG
|
|
|| (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
|
|
if (error == 0)
|
|
error = EACCES;
|
|
return error;
|
|
}
|
|
|
|
if (p->p_trace || PTRACED(p)) {
|
|
#pragma mips_frequency_hint NEVER
|
|
/*
|
|
* If we have read access then it's okay to let the exec()
|
|
* happen.
|
|
*/
|
|
VOP_ACCESS(vp, VREAD, ut->ut_cred, error);
|
|
if (!error)
|
|
return 0;
|
|
/*
|
|
* If process is traced via ptrace(2), fail the exec(2).
|
|
*/
|
|
if (p->p_flag & STRC)
|
|
return ENOEXEC;
|
|
/*
|
|
* Process is traced via /proc.
|
|
* Arrange to invalidate the /proc vnode.
|
|
*/
|
|
args->ua_traceinval = 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Returns with args->ua_setid[level] set to:
|
|
* 0 - not a setuid/setgid/setcap program or setuid/setgid/setcap disallowed
|
|
* !0 - a permitted setuid/setgid/setcap program
|
|
*
|
|
* Also reads any capabilities attached to the executable into args->ua_cap.
|
|
* We need them here to determine if the image has attached capabilities and
|
|
* later on we'll need them to recalculate our capabilities. If an image does
|
|
* not have attached capabilities, we mark args->ua_cap as invalid for that
|
|
* capability recalculation.
|
|
*
|
|
* If the image is a permitted setuid/setgid/setcap program, then we also set
|
|
* args->ua_uid and args->ua_gid.
|
|
*
|
|
* If there are no errors, 0 is returned; otherwise an error code is returned.
|
|
*/
|
|
static int
|
|
execsetid(vnode_t *vp, vattr_t *vattrp, struct uarg *args)
|
|
{
|
|
uthread_t *ut = curuthread;
|
|
proc_t *p = UT_TO_PROC(ut);
|
|
cap_set_t *acap = &args->ua_cap;
|
|
int setid, capsize, error;
|
|
uid_t uid;
|
|
gid_t gid;
|
|
|
|
/*
|
|
* Grab any capabilities attached to the executable.
|
|
*/
|
|
capsize = sizeof(cap_set_t);
|
|
VOP_ATTR_GET(vp, SGI_CAP_FILE, (char *)acap, &capsize,
|
|
ATTR_ROOT, sys_cred, error);
|
|
if (error ||
|
|
(acap->cap_effective & CAP_INVALID) ||
|
|
(acap->cap_permitted & CAP_INVALID) ||
|
|
(acap->cap_inheritable & CAP_INVALID)) {
|
|
#pragma mips_frequency_hint FREQUENT
|
|
/*
|
|
* A non-zero error indicates that there is either no
|
|
* capability set on the file or that if there was one, we
|
|
* couldn't get it. Mark as an invalid capability set to
|
|
* keep track of the fact the file had no capabilities
|
|
* attached to it. Treat an invalid capability set as a
|
|
* missing one.
|
|
*/
|
|
acap->cap_effective = CAP_INVALID;
|
|
acap->cap_permitted = CAP_INVALID;
|
|
acap->cap_inheritable = CAP_INVALID;
|
|
}
|
|
|
|
/*
|
|
* If neither SUID, SGID or SCAP simply return successfully. Also
|
|
* silently ignore SUID/SGID/SCAP for file systems mounted with
|
|
* "nosuid" option.
|
|
*/
|
|
if (((vattrp->va_mode & (VSUID|VSGID)) == 0 &&
|
|
(acap->cap_effective & CAP_INVALID)) ||
|
|
(vp->v_vfsp->vfs_flag & VFS_NOSUID)) {
|
|
#pragma mips_frequency_hint FREQUENT
|
|
args->ua_setid = 0;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Compute proposed execution credentials.
|
|
*/
|
|
setid = 0;
|
|
uid = ut->ut_cred->cr_uid;
|
|
gid = ut->ut_cred->cr_gid;
|
|
if (vattrp->va_mode & VSUID && vattrp->va_uid != uid) {
|
|
uid = vattrp->va_uid;
|
|
setid = 1;
|
|
}
|
|
if (vattrp->va_mode & VSGID && vattrp->va_gid != gid) {
|
|
gid = vattrp->va_gid;
|
|
setid = 1;
|
|
}
|
|
if (!(acap->cap_effective & CAP_INVALID)) {
|
|
cap_set_t *ucapp = &ut->ut_cred->cr_cap;
|
|
if (acap->cap_effective != ucapp->cap_effective ||
|
|
acap->cap_permitted != ucapp->cap_permitted ||
|
|
acap->cap_inheritable != ucapp->cap_inheritable) {
|
|
setid = 1;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Set setuid/setgid/setcap protections, if not tracing. If the
|
|
* process is being debuged (STRC) we never allow SUID/SGID/SCAP.
|
|
* If the process has system call and/or context switch tracing
|
|
* enabled, we allow the tracing to remain active if the process is
|
|
* privileged (CAP_PROC_MGT) or if it's being traced by a privileged
|
|
* tracer (SPARPRIV).
|
|
*/
|
|
if (setid) {
|
|
if ((p->p_flag & (STRC|SPARSYS|SPARSWTCH)) &&
|
|
!(p->p_flag & SPARPRIV) && !_CAP_ABLE(CAP_PROC_MGT)) {
|
|
#pragma mips_frequency_hint NEVER
|
|
int s;
|
|
if (p->p_flag & STRC)
|
|
return EPERM;
|
|
s = p_lock(p);
|
|
p->p_flag &= ~(SPARSYS|SPARSWTCH|SPARINH);
|
|
p->p_parcookie = 0;
|
|
p_unlock(p, s);
|
|
}
|
|
args->ua_uid = uid;
|
|
args->ua_gid = gid;
|
|
args->ua_setid = 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
gexec(struct vnode **vpp, struct uarg *args, int level)
|
|
{
|
|
proc_t *pp = curprocp;
|
|
uthread_t *ut = curuthread;
|
|
int i, error;
|
|
struct vnode *vp;
|
|
struct vattr vattr;
|
|
|
|
vp = *vpp;
|
|
if ((error = execpermissions(vp, &vattr, args)) != 0)
|
|
return error;
|
|
|
|
VOP_OPEN(vp, vpp, FREAD, ut->ut_cred, error);
|
|
if (error)
|
|
return error;
|
|
|
|
vp = *vpp;
|
|
args->ua_prev_script[level] = pp->p_script;
|
|
args->ua_level = level;
|
|
pp->p_script = NULL;
|
|
|
|
repl_interpose(vp, "ReplDefault");
|
|
|
|
if (vattr.va_size < MAGIC_SIZE) {
|
|
#pragma mips_frequency_hint NEVER
|
|
error = ENOEXEC;
|
|
goto closevp;
|
|
}
|
|
|
|
if (level == 0) {
|
|
#pragma mips_frequency_hint FREQUENT
|
|
/*
|
|
* We only check for suid/sgid/scap on the thing being
|
|
* exec()'d. In particular this means that for script
|
|
* execution, we only check the script and not its
|
|
* interpreter.
|
|
*/
|
|
error = execsetid(vp, &vattr, args);
|
|
if (error)
|
|
goto closevp;
|
|
}
|
|
if (!(ut->ut_pproxy->prxy_fp.pfp_fpflags & P_FP_PRESERVE)) {
|
|
#if TFP
|
|
ut->ut_pproxy->prxy_fp.pfp_fpflags = P_FP_IMPRECISE_EXCP;
|
|
#else
|
|
ut->ut_pproxy->prxy_fp.pfp_fpflags = 0;
|
|
#endif
|
|
}
|
|
|
|
error = check_dmapi_file(vp);
|
|
if (error)
|
|
goto closevp;
|
|
|
|
/*
|
|
* Loop through the switch table looking for the module that
|
|
* can handle this executable.
|
|
*/
|
|
for (i = 0; i < nexectype; i++) {
|
|
error = (*execsw[i]) (vp, &vattr, args, level);
|
|
if (error != ENOEXEC)
|
|
break;
|
|
}
|
|
|
|
if (!error) {
|
|
#pragma mips_frequency_hint FREQUENT
|
|
pp->p_exec_cnt++;
|
|
return 0;
|
|
}
|
|
|
|
if (!(args->ua_exec_cleanup)) {
|
|
/* don't handle error */
|
|
return error;
|
|
}
|
|
|
|
closevp:
|
|
ASSERT(error);
|
|
VOP_CLOSE(vp, FREAD, L_TRUE, ut->ut_cred, i);
|
|
|
|
/*
|
|
* Decrement use of new script, restore old one.
|
|
*/
|
|
if (pp->p_script) {
|
|
int s = VN_LOCK(pp->p_script);
|
|
int c = --pp->p_script->v_intpcount;
|
|
ASSERT(c >= 0);
|
|
VN_UNLOCK(pp->p_script, s);
|
|
if (!c)
|
|
IMON_EVENT(pp->p_script, ut->ut_cred, IMON_EXIT);
|
|
VN_RELE(pp->p_script);
|
|
}
|
|
pp->p_script = args->ua_prev_script[level];
|
|
|
|
return error;
|
|
}
|
|
|
|
int
|
|
exrdhead(struct vnode *vp, off_t off, size_t len, caddr_t *addrp)
|
|
{
|
|
ssize_t resid;
|
|
|
|
*addrp = kern_malloc(len);
|
|
|
|
if (vn_rdwr(UIO_READ, vp, *addrp, len, off, UIO_SYSSPACE, 0, 0L,
|
|
get_current_cred(), &resid, &curuthread->ut_flid) != 0
|
|
|| resid) {
|
|
return ENOEXEC;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void
|
|
exhead_free(caddr_t addr)
|
|
{
|
|
kern_free(addr);
|
|
}
|
|
|
|
int
|
|
execmap(vnode_t *vp,
|
|
caddr_t addr,
|
|
size_t len,
|
|
size_t zfodlen,
|
|
off_t offset,
|
|
int prot,
|
|
int flags,
|
|
vasid_t vasid,
|
|
int ckpt)
|
|
{
|
|
caddr_t oldaddr;
|
|
size_t oldlen;
|
|
off_t oldoffset;
|
|
vnode_t *oldvp;
|
|
as_addspace_t asadd;
|
|
/* REFERENCED */
|
|
as_addspaceres_t asres;
|
|
int error = 0;
|
|
int ismappable = 0;
|
|
|
|
oldvp = vp; /* for VN_RELE check at the end */
|
|
|
|
oldaddr = addr;
|
|
addr = (caddr_t)((long)addr & ~POFFMASK);
|
|
oldlen = len;
|
|
len += ((size_t)oldaddr - (size_t)addr);
|
|
oldoffset = offset;
|
|
offset = (off_t)((long)offset & ~POFFMASK);
|
|
|
|
flags |= MAP_FIXED;
|
|
if (poff(oldoffset) == poff(oldaddr) && len) {
|
|
/* potentially map-able */
|
|
ismappable = 1;
|
|
/*
|
|
* Non-writable mappings are assumed to be text. For these,
|
|
* we set them up as MAP_SHARED so that we don't have to
|
|
* allocate smem. If we need to write to them later on,
|
|
* like when setting a breakpoint, we convert them to
|
|
* MAP_PRIVATE. The MAP_TEXT flag tells us when to do this.
|
|
*/
|
|
if ((vp->v_type == VCHR) || (vp->v_type == VBLK))
|
|
/* always PRIVATE */
|
|
flags |= MAP_PRIVATE;
|
|
else if (vp->v_type == VREG) {
|
|
if (prot & PROT_WRITE)
|
|
flags |= MAP_PRIVATE;
|
|
else
|
|
flags |= MAP_SHARED|MAP_TEXT;
|
|
} else
|
|
return ENODEV;
|
|
|
|
/*
|
|
* check if ok with file system to be mapped
|
|
* We could get back a different vp ... (lofs)
|
|
*
|
|
* If a new vp is returned, then it has a reference
|
|
* that we must VN_RELE.
|
|
*/
|
|
VOP_MAP(vp, offset, len, prot, flags, get_current_cred(),
|
|
&vp, error);
|
|
if (error)
|
|
return error;
|
|
}
|
|
|
|
if (ismappable) {
|
|
/*
|
|
* this can occur when rld maps /dev/zero ...
|
|
*/
|
|
if ((vp->v_type == VCHR) || (vp->v_type == VBLK)) {
|
|
asadd.as_op = AS_ADD_MMAPDEV;
|
|
asadd.as_addr = addr;
|
|
asadd.as_length = len;
|
|
asadd.as_prot = prot;
|
|
asadd.as_maxprot = PROT_ALL;
|
|
asadd.as_mmap_off = offset;
|
|
asadd.as_mmap_vp = vp;
|
|
asadd.as_mmap_flags = flags;
|
|
asadd.as_mmap_ckpt = ckpt;
|
|
/* XXX really shouldn't have to do this -
|
|
* we shouldn't be called with aspacelock held
|
|
*/
|
|
VAS_UNLOCK(vasid);
|
|
error = VAS_ADDSPACE(vasid, &asadd, &asres);
|
|
ASSERT(error || asres.as_addr == addr);
|
|
VAS_LOCK(vasid, AS_EXCL);
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* If vnode is being mapped for execution, mark it for
|
|
* replication also.
|
|
* This is done here instead of at gexec() time to allow
|
|
* mapping libraries for replication.
|
|
* If it's a one-node system, don't interpose the
|
|
* replication layer on vnode.
|
|
*/
|
|
if (((prot & (PROT_WRITE|PROT_EXEC)) == PROT_EXEC) &&
|
|
curprocp->p_shaddr == 0 && numnodes > 1) {
|
|
repl_interpose(vp, "ReplDefault");
|
|
}
|
|
|
|
asadd.as_op = AS_ADD_EXEC;
|
|
asadd.as_addr = addr;
|
|
asadd.as_length = len;
|
|
asadd.as_prot = prot;
|
|
asadd.as_maxprot = PROT_ALL;
|
|
asadd.as_exec_off = offset;
|
|
asadd.as_exec_vp = vp;
|
|
asadd.as_exec_flags = flags;
|
|
asadd.as_exec_ckpt = ckpt;
|
|
asadd.as_exec_zfodlen = zfodlen;
|
|
error = VAS_ADDSPACE(vasid, &asadd, &asres);
|
|
|
|
if (error) {
|
|
/*
|
|
* don't repl_dispose - there could be others using
|
|
* it since nothing here is locked. Shouldn't
|
|
* really do any harm...
|
|
*/
|
|
goto out;
|
|
}
|
|
ASSERT(asres.as_addr == addr);
|
|
} else {
|
|
if (vp->v_type != VREG) {
|
|
error = ENODEV;
|
|
goto out;
|
|
}
|
|
asadd.as_op = AS_ADD_LOAD;
|
|
asadd.as_addr = addr;
|
|
asadd.as_length = len;
|
|
asadd.as_prot = prot;
|
|
asadd.as_maxprot = PROT_ALL;
|
|
asadd.as_load_off = oldoffset;
|
|
asadd.as_load_vp = vp;
|
|
asadd.as_load_flags = flags;
|
|
asadd.as_load_ckpt = ckpt;
|
|
asadd.as_load_laddr = oldaddr;
|
|
asadd.as_load_llength = oldlen;
|
|
asadd.as_load_zfodlen = zfodlen;
|
|
error = VAS_ADDSPACE(vasid, &asadd, &asres);
|
|
if (error)
|
|
goto out;
|
|
ASSERT(asres.as_addr == addr);
|
|
}
|
|
|
|
if (zfodlen) {
|
|
/*
|
|
* space already set up - all we have to do is zero the
|
|
* portion from the end of the load data to the end of the
|
|
* page
|
|
*/
|
|
size_t end;
|
|
|
|
ASSERT(error == 0);
|
|
end = (size_t)addr + len;
|
|
if (poff(end)) {
|
|
/* need to unlock so can fault */
|
|
VAS_UNLOCK(vasid);
|
|
if (uzero((caddr_t)end, NBPP - poff(end)) != 0)
|
|
error = EFAULT;
|
|
VAS_LOCK(vasid, AS_EXCL);
|
|
if (error)
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
out:
|
|
if (oldvp != vp)
|
|
VN_RELE(vp); /* release ref. from VOP_MAP */
|
|
|
|
return error;
|
|
}
|
|
|
|
/*
|
|
* Machine independent final setup goes here
|
|
*/
|
|
void
|
|
setexecenv(struct vnode *vp)
|
|
{
|
|
int i;
|
|
uthread_t *ut = curuthread;
|
|
struct proc *p = curprocp;
|
|
proc_proxy_t *prxy = ut->ut_pproxy;
|
|
sigvec_t *sigvp = &p->p_sigvec;
|
|
|
|
mrlock(&p->p_who, MR_UPDATE, PZERO);
|
|
ASSERT(p->p_exec == NULL);
|
|
p->p_exec = vp;
|
|
if (p->p_exec)
|
|
VN_HOLD(p->p_exec); /* in with the new */
|
|
mrunlock(&p->p_who);
|
|
|
|
prxy->prxy_oldcontext = 0;
|
|
sigemptyset(&prxy->prxy_sigonstack);
|
|
prxy->prxy_ssflags = 0;
|
|
prxy->prxy_sigsp = 0;
|
|
prxy->prxy_siglb = (caddr_t)0;
|
|
|
|
/*
|
|
* In 1003.1b, section 3.1.2.2:
|
|
*
|
|
* "Signals set to the default action (SIG_DFL) in the calling
|
|
* process shall be set to the default action in the new process
|
|
* image. Signals set to be ignored (SIG_IGN) by the calling
|
|
* process image shall be set to be ignored by the new process
|
|
* image. Signals set to be caught by the calling process image
|
|
* shall be set to the default action in the new process image."
|
|
*
|
|
* XXX Push all this into sig.c!
|
|
*/
|
|
sigvec_lock(&p->p_sigvec);
|
|
for (i = 0; i < NUMSIGS; i++) {
|
|
if (sigvp->sv_hndlr[i] != SIG_DFL &&
|
|
sigvp->sv_hndlr[i] != SIG_IGN) {
|
|
sigvp->sv_hndlr[i] = SIG_DFL;
|
|
sigemptyset(&sigvp->sv_sigmasks[i]);
|
|
|
|
if (!sigvp->sv_sigpend.s_sigqueue &&
|
|
!ut->ut_sigpend.s_sigqueue)
|
|
continue;
|
|
if (sigismember(&ignoredefault, i + 1)) {
|
|
sigdelq(&sigvp->sv_sigqueue, i + 1, sigvp);
|
|
sigdelq(&ut->ut_sigqueue, i + 1, sigvp);
|
|
} else
|
|
if (sigvp->sv_sigpend.s_sigqueue) {
|
|
sigqueue_t *sqp;
|
|
while (sqp = sigdeq(&sigvp->sv_sigpend,
|
|
i+1, sigvp))
|
|
{
|
|
ASSERT(sqp->sq_info.si_signo == i + 1);
|
|
sigaddq(&ut->ut_sigqueue, sqp, sigvp);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Don't clear sv_sigrestart. It should be inherited from the
|
|
* parent.
|
|
*/
|
|
sigorset(&sigvp->sv_sigign, &ignoredefault);
|
|
sigemptyset(&sigvp->sv_sigcatch);
|
|
sigemptyset(&sigvp->sv_signodefer);
|
|
sigemptyset(&sigvp->sv_sigresethand);
|
|
sigdiffset(&sigvp->sv_sainfo, &ignoredefault);
|
|
|
|
/*
|
|
* If the user asked for NOCLDSTOP, the should still get it. We
|
|
* turn off SNOWAIT, though, since in general we want children to
|
|
* zombify (POSIX assumes this behavior). If the process wants
|
|
* SNOWAIT on, it will need to explicitly set it via
|
|
* sigaction().
|
|
*/
|
|
sigvp->sv_flags &= ~SNOWAIT;
|
|
|
|
/*
|
|
* Pending signals remain pending and held signals remain held, so
|
|
* don't clear p_phold or p_sig. We should clear out any 'default
|
|
* == ignore' signals from p_sig, though.
|
|
*/
|
|
sigorset(&ut->ut_sig, &sigvp->sv_sigpend.s_sig);
|
|
sigemptyset(&sigvp->sv_sigpend.s_sig);
|
|
sigdiffset(&ut->ut_sig, &ignoredefault);
|
|
sigvec_unlock(&p->p_sigvec);
|
|
}
|
|
|
|
/*
|
|
* remove old process address space.
|
|
* At this point, the exec no longer 'fails' the new process
|
|
* is simply killed
|
|
*/
|
|
/*ARGSUSED4*/
|
|
int
|
|
remove_proc(
|
|
struct proc *p,
|
|
struct uarg *args,
|
|
struct vnode *vp,
|
|
int rmp)
|
|
{
|
|
int s;
|
|
int c, level;
|
|
int error;
|
|
vasid_t vasid;
|
|
as_deletespace_t asd;
|
|
uthread_t *ut = curuthread;
|
|
vproc_t *vpr;
|
|
int oldf;
|
|
|
|
ASSERT(p == curprocp);
|
|
|
|
/*
|
|
* If the process is a graphics process, call the
|
|
* graphics exit routine since we are giving up graphics.
|
|
*/
|
|
if (UT_TO_KT(ut)->k_runcond & RQF_GFX)
|
|
gfx_exit();
|
|
|
|
/*
|
|
* Kill off other uthreads if this is pthreaded app.
|
|
* This discards pshare structure, too.
|
|
*/
|
|
if (ut->ut_flags & UT_PTHREAD) {
|
|
vpr = UT_TO_VPROC(ut);
|
|
VPROC_HOLD(vpr);
|
|
VPROC_THREAD_STATE(vpr, THRD_EXEC, s);
|
|
VPROC_RELE(vpr);
|
|
|
|
if (s)
|
|
return EBUSY;
|
|
}
|
|
|
|
/* if unblock on exec/exit flag is set, do that now */
|
|
if (p->p_unblkonexecpid) {
|
|
vpr = VPROC_LOOKUP(p->p_unblkonexecpid);
|
|
|
|
if (vpr != NULL) {
|
|
VPROC_UNBLKPID(vpr);
|
|
VPROC_RELE(vpr);
|
|
}
|
|
p->p_unblkonexecpid = 0;
|
|
}
|
|
|
|
/*
|
|
* certain other things like profiling and single stepping are not
|
|
* held across an exec
|
|
*/
|
|
#if R10000
|
|
/*
|
|
* If we have a hardware event counter CPU monitoring structure
|
|
* attached to the uthread and it is set up for PC profiling, disable
|
|
* it because we're going to be tearing down the p_profp array below
|
|
* which PC profiling depends on. We don't bother freeing it up here
|
|
* since the eventual exit path for the process/uthread will handle
|
|
* that and there are some complications with the tear down because
|
|
* the last uthread in a process is torn down after the proc/proxy
|
|
* are.
|
|
*/
|
|
if (ut->ut_cpumon && (ut->ut_cpumon->cm_flags & HWPERF_CM_PROFILING))
|
|
hwperf_disable_counters(ut->ut_cpumon);
|
|
#endif
|
|
s = p_lock(p);
|
|
/*
|
|
* POSIX says to inherit all non-specified attributes.
|
|
* XXX FIXADE is of marginal use to inherit..
|
|
*/
|
|
if (p->p_flag & SPROFFAST)
|
|
stopprfclk();
|
|
if (p->p_flag & SABORTSIG)
|
|
p->p_exitsig = 0;
|
|
p->p_flag &= ~(SPROF|SPROF32|SPROFFAST|SABORTSIG);
|
|
p->p_flag |= SEXECED; /* for setpgid() */
|
|
p_unlock(p, s);
|
|
|
|
prxy_flagclr(&p->p_proxy, PRXY_USERVME);
|
|
|
|
s = ut_lock(ut);
|
|
oldf = ut->ut_flags;
|
|
ut->ut_flags &= ~(UT_STEP|UT_SRIGHT|UT_PTHREAD|UT_PTPSCOPE|UT_OWEUPC);
|
|
ut_unlock(ut, s);
|
|
if (oldf & UT_SRIGHT)
|
|
prsright_release(&p->p_proxy);
|
|
|
|
if (p->p_profp) {
|
|
(void) kern_free(p->p_profp);
|
|
p->p_profp = NULL;
|
|
p->p_profn = 0;
|
|
} else {
|
|
ASSERT(p->p_profn == 0);
|
|
}
|
|
|
|
/*
|
|
* If ut_sighold points to prda, change back to kernel space.
|
|
*/
|
|
if (ut->ut_sighold != &p->p_sigvec.sv_sighold) {
|
|
s = ut_lock(ut);
|
|
p->p_sigvec.sv_sighold = *ut->ut_sighold;
|
|
ut->ut_sighold = &p->p_sigvec.sv_sighold;
|
|
ut_unlock(ut, s);
|
|
#if (_MIPS_SIM != _ABIO32)
|
|
ut->ut_prda->t_sys.t_flags &= ~T_HOLD_VALID;
|
|
#else
|
|
ut->ut_prda->t_sys.t_flags &= ~(T_HOLD_VALID|T_HOLD_KSIG_O32);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* if part of a share group - get rid of that
|
|
*/
|
|
if (IS_SPROC(&p->p_proxy)) {
|
|
/* Notify scheduler that we are leaving share group */
|
|
leaveshaddrRunq(ut);
|
|
asd.as_exec_detachstk = detachshaddr(p, SHDEXEC);
|
|
} else
|
|
asd.as_exec_detachstk = 0;
|
|
|
|
/*
|
|
* Now that we've detached from the share group, we can check
|
|
* to see if this is an intp suid exec. If so, we can do the
|
|
* open and adjust the arg list appropriately.
|
|
*/
|
|
if (args->ua_intpvp) {
|
|
/*
|
|
* Note that in fuexarg(), the args->ua_intpstkloc has been
|
|
* set up to point at a location that contains args->ua_fname.
|
|
* In intp, we set up args->ua_name to be "/dev/fd/XXXXXXX".
|
|
*
|
|
* Things would be a lot easier if we set up the
|
|
* stack after sloughing the share group.
|
|
*/
|
|
int fd;
|
|
char fdnum[32];
|
|
int len;
|
|
|
|
if (error = execopen(&args->ua_intpvp, &fd)) {
|
|
/*
|
|
* Caller will kill the process.
|
|
*/
|
|
return error;
|
|
}
|
|
#ifdef CKPT
|
|
if (args->ua_ckpt >= 0)
|
|
ckpt_setfd(fd, args->ua_ckpt);
|
|
#endif
|
|
/*
|
|
* Substitute the name of the /dev/fd node into
|
|
* the args structure.
|
|
*/
|
|
ASSERT(args->ua_intpstkloc);
|
|
|
|
/*
|
|
* Get the fd value.
|
|
*/
|
|
numtos(fd, fdnum);
|
|
|
|
/*
|
|
* Wipe out the "XXXXXXX" in the stack location.
|
|
*/
|
|
bzero(&args->ua_intpstkloc[DEV_FD_PREFIX_LEN],
|
|
DEV_FD_SUFFIX_LEN);
|
|
|
|
/*
|
|
* Now insert the new descriptor value into the stack
|
|
* location.
|
|
*/
|
|
strcpy(&args->ua_intpstkloc[DEV_FD_PREFIX_LEN], fdnum);
|
|
|
|
/*
|
|
* Fix up the p_psargs location. We know that this must
|
|
* be the last argument in the list according to how
|
|
* intp sets up arguments.
|
|
*/
|
|
if (args->ua_intppsloc) {
|
|
/*
|
|
* We need to make sure we don't stomp off the end
|
|
* of the psargs array when copying the fd number.
|
|
*/
|
|
len = ((PSARGSZ - 1) -
|
|
(&args->ua_intppsloc[DEV_FD_PREFIX_LEN] -
|
|
&p->p_psargs[0]));
|
|
if (len > 0) {
|
|
len = MIN(strlen(fdnum), len);
|
|
strncpy(&args->ua_intppsloc[DEV_FD_PREFIX_LEN],
|
|
fdnum, len);
|
|
args->ua_intppsloc[DEV_FD_PREFIX_LEN+len] = '\0';
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Perform exec processing, including close-on-exec.
|
|
*
|
|
* Note that for sprocs the fdt_exec logic assumes that
|
|
* detachshaddr (and hence fdt_detach_shaddr) has already
|
|
* been called. This ensures that close-on-exec
|
|
* processing occurs for a sproc after it has gotten
|
|
* its own copy of the fdt.
|
|
*/
|
|
fdt_exec();
|
|
|
|
/* remove old exec image */
|
|
if (p->p_exec) {
|
|
mrlock(&p->p_who, MR_UPDATE, PZERO);
|
|
VN_RELE(p->p_exec); /* out with the old */
|
|
p->p_exec = NULL;
|
|
mrunlock(&p->p_who);
|
|
}
|
|
|
|
/*
|
|
* release old scripts before rexec - ok to do here because
|
|
* any failures from this point on cause the process
|
|
* to be killed
|
|
*/
|
|
|
|
for (level = 0; level < 2; level++) {
|
|
if (args->ua_prev_script[level]) {
|
|
s = VN_LOCK(args->ua_prev_script[level]);
|
|
c = --args->ua_prev_script[level]->v_intpcount;
|
|
ASSERT(c >= 0);
|
|
VN_UNLOCK(args->ua_prev_script[level], s);
|
|
if (!c)
|
|
IMON_EVENT(args->ua_prev_script[level],
|
|
ut->ut_cred, IMON_EXIT);
|
|
VN_RELE(args->ua_prev_script[level]);
|
|
args->ua_prev_script[level] = NULL;
|
|
}
|
|
}
|
|
|
|
/* Remove old address space */
|
|
asd.as_op = AS_DEL_EXEC;
|
|
asd.as_exec_rmp = rmp;
|
|
asd.as_exec_prda = ut->ut_prda;
|
|
ut->ut_prda = 0;
|
|
as_lookup_current(&vasid);
|
|
VAS_DELETESPACE(vasid, &asd, NULL);
|
|
|
|
ASSERT(ut->ut_sharena == NULL);
|
|
|
|
/* The new tlbpid is needed after we're done with the
|
|
* old process and before we need anything for the new one.
|
|
* This effectively flushes the tlb of any pages that
|
|
* were released by execbld.
|
|
* Getxfile may read in the text of the new process and
|
|
* will need to use the new tlbpid.
|
|
*/
|
|
new_tlbpid(&ut->ut_as, VM_TLBINVAL);
|
|
|
|
/*
|
|
* Initialized the wired tlb entries for the new process.
|
|
*/
|
|
setup_wired_tlb(1);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
execopen(struct vnode **vpp, int *fdp)
|
|
{
|
|
struct vnode *vp = *vpp;
|
|
struct vnode *openvp = vp;
|
|
vfile_t *fp;
|
|
int error = 0;
|
|
int filemode = FREAD;
|
|
|
|
VN_HOLD(vp); /* open reference */
|
|
if (error = vfile_alloc(filemode, &fp, fdp)) {
|
|
VN_RELE(vp);
|
|
*fdp = -1; /* just in case vfile_alloc changed value */
|
|
return error;
|
|
}
|
|
|
|
VOP_OPEN(openvp, &vp, filemode, sys_cred, error);
|
|
if (error) {
|
|
VN_RELE(vp);
|
|
vfile_alloc_undo(*fdp, fp);
|
|
*fdp = -1;
|
|
return error;
|
|
}
|
|
vfile_ready(fp, vp);
|
|
|
|
*vpp = vp; /* vnode should not have changed */
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
execclose(int fd)
|
|
{
|
|
int error;
|
|
auto vfile_t *fp;
|
|
|
|
if (error = closefd(fd, &fp))
|
|
return error;
|
|
return vfile_close(fp);
|
|
}
|
|
|
|
/*
|
|
* Check if the file has DMAPI managed regions/events. If so,
|
|
* generate a DMAPI read event for the entire file.
|
|
*
|
|
* Only a "read" event will be generated since check_dmapi_file
|
|
* is intended to be called only for gexec() and elfmap() files.
|
|
* In these cases, if the process later uses the mprotect() syscall
|
|
* to upgrade the page protection to include PROT_WRITE, the mapping
|
|
* type is changed to private.
|
|
*
|
|
* Note that the VOP_FCNTL used here returns an error if the
|
|
* underlying file system is unaware of the F_DMAPI subfunction
|
|
* being used. This causes no problems, since a non-zero return
|
|
* status is simply ignored. Only in the case of a zero return status
|
|
* can we be sure that the VOP_FCNTL F_DMAPI subfunction
|
|
* DM_FCNTL_MAPEVENT is implemented for this file system, and then
|
|
* interpret the maprq.error field.
|
|
*/
|
|
/* ARGSUSED */
|
|
int
|
|
check_dmapi_file(vnode_t *vp)
|
|
{
|
|
#ifdef CELL_IRIX
|
|
/*
|
|
* This code doesn't work with cells for the following reasons:
|
|
* 1) idl cannot tolerate null rval pointer
|
|
* 2) stack variable dmfcntl is pass as an in/out param; server cell
|
|
* accesses it directly (read and write); this will fail on
|
|
* sn0 due to the firewall protections
|
|
*/
|
|
return(0);
|
|
#else
|
|
int error;
|
|
dm_fcntl_t dmfcntl;
|
|
|
|
dmfcntl.dmfc_subfunc = DM_FCNTL_MAPEVENT;
|
|
dmfcntl.u_fcntl.maprq.length = 0; /* length = 0 for whole file */
|
|
dmfcntl.u_fcntl.maprq.max_event = DM_EVENT_READ;
|
|
|
|
VOP_FCNTL(vp, F_DMAPI, &dmfcntl, 0, (off_t)0, sys_cred, NULL, error);
|
|
if (error == 0) {
|
|
if ((error = dmfcntl.u_fcntl.maprq.error) != 0)
|
|
return error;
|
|
}
|
|
return 0;
|
|
#endif
|
|
}
|