1
0
Files
2022-09-29 17:59:04 +03:00

451 lines
12 KiB
C

/*
** File: onlinediag.c
** ------------------
** Test cpus to see if they run simple diags correctly. Normally run as
** a cron job.
*/
#include <ctype.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <time.h>
#include <sys/sysmp.h>
#include <sys/pda.h>
#include <sys/fcntl.h>
#include <sys/wait.h>
#include <syslog.h>
#include <sys/capability.h>
#include <sys/resource.h>
#include <sys/stat.h>
/* Size of the scratch buffers used for messages, config lines and the
** per-binary argument vector. */
#define BUFFSIZE 512
/* Number of slots in the global binaryfiles[] table. */
#define NUM_FILES 64
/* Log file used when -l is not given. */
#define DEFAULT_LOGFILE "/var/adm/onlinediag.logfile"
/* Config file (list of diag binaries) used when -c is not given. */
#define DEFAULT_CONFIG "/etc/config/onlinediag.config"
/* Value handed to setpriority() when -P is not given. */
#define DEFAULT_PRIORITY 10
/* Option summary printed in the usage message; lists every option that
** main() parses, including -P. */
#define PARAMETERS "[-p procnum] [-c config] [-l logfile] [-P priority] [-rdkf]"
/*
** Status codes describing why a diag could not be run to completion, as
** opposed to the diag itself reporting a failure.  They are numbered
** consecutively starting just above EXIT_FAILURE.
*/
enum {
    MUSTRUN_FAILURE = EXIT_FAILURE + 1, /* child could not be bound to the cpu */
    EXEC_FAILURE,                       /* child could not exec the binary */
    WAIT_FAILURE,                       /* waitpid() on the child failed */
    FORK_FAILURE,                       /* fork() failed */
    FINISH_FAILURE                      /* child did not exit normally */
};
/* Argument vector for one diagnostic: args[0] is the binary's name, the
** following entries are its arguments, and the list is NULL terminated so
** it can be passed straight to execvp(). */
typedef struct {
char *args[BUFFSIZE];
} argsT;
/* helper functions */
/* Note: "restrict" is a keyword since C99, so it is not used as a
** parameter name here. */
static void run_diags(int logfd, int cpu_num, int *restrict_cpu,
                      int *disable, struct pda_stat *p, int forced);
static int check_diags(int logfd, int cpu_num);
static int check_exit_error(int logfd, char *binary, int cpu, int status);
static void log_cpu_failure(int cpu_num, int restrict_cpu, int disable);
static int read_config_file(char *config_file, char *program);
static void free_binaryfiles(void);
/* globals */
/* Nonzero when the temporary log is to be renamed to the real log file at
** exit instead of being removed; set by -k and by check_exit_error() for
** any nonzero diag status. */
int keep_logfile = 0;
/* NULL-terminated table of the diagnostics (binary plus arguments) loaded
** by read_config_file() and released by free_binaryfiles(). */
argsT *binaryfiles[NUM_FILES];
int main(int argc, char *argv[])
{
int c, err = 0, r;
int logfd;
char err_string[BUFFSIZE];
int cpu_num = -1, restrict = 0, disable = 0, forced = 0;
int num_processors;
struct pda_stat *pstatus;
extern errno;
char *config_file = DEFAULT_CONFIG;
char *logfile = DEFAULT_LOGFILE;
char *tmp_logfile;
int priority = DEFAULT_PRIORITY;
if((num_processors = sysmp(MP_NPROCS)) < 1) {
fprintf(stderr, "%s: sysmp(MP_NPROCS) failed\n", argv[0]);
exit(1);
}
while ((c = getopt(argc, argv, "p:rdc:l:kP:")) != EOF) {
switch (c) {
case 'p': /* processor number to run diags on */
if ((cpu_num = strtol(optarg, (char **) 0, 0)) < 0 ||
cpu_num >= num_processors) {
fprintf(stderr, "%s: invalid processor number\n", argv[0]);
exit(1);
}
break;
case 'r': /* restrict the scheduling of any cpu found faulty */
restrict = 1;
break;
case 'd': /* disable a cpu on reboot if found faulty */
disable = 1;
break;
case 'c': /* use the specified config file for list of binaries */
config_file = optarg;
break;
case 'l': /* print out into the specified log file */
logfile = optarg;
break;
case 'k': /* keep the log file regardless of failure or not */
keep_logfile = 1;
break;
case 'f': /* force to run tests on _all_ cpus even, restricted
and isolated ones */
forced = 1;
break;
case 'P': /* sets priority of that this program runs at. */
priority = strtol(optarg, NULL, 0);
break;
case '?':
err++;
break;
}
}
if (err) {
fprintf(stderr, "usage: %s %s\n", argv[0], PARAMETERS);
exit(1);
}
/* get status of processors */
pstatus = (struct pda_stat *)
calloc(num_processors, sizeof(struct pda_stat));
if(!pstatus) {
fprintf(stderr, "%s: malloc failed\n", argv[0]);
exit(1);
}
if(sysmp(MP_STAT, pstatus) < 0) {
sprintf(err_string, "%s: sysmp(MP_STAT) failed", argv[0]);
perror(err_string);
exit(1);
}
if (setpriority(PRIO_PROCESS, getpid(), priority) < 0) {
sprintf(err_string, "%s: setpriority failed", argv[0]);
perror(err_string);
exit(1);
}
/* deal with config file */
if((r = read_config_file(config_file, argv[0])) <= 0) {
if(r == -1)
sprintf(err_string, "%s: can't open config file\n", argv[0]);
else sprintf(err_string, "%s: out of memory", argv[0]);
perror(err_string);
exit(1);
}
/* deal with temp log */
tmp_logfile = malloc(strlen(logfile) + 15);
sprintf(tmp_logfile, "%s_%d", logfile, getpid());
if((logfd = open(tmp_logfile, O_WRONLY | O_CREAT | O_TRUNC,
S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH)) < 0) {
sprintf(err_string, "%s: can't open temporary logfile \"%s\"",
argv[0], tmp_logfile);
perror(err_string);
exit(1);
}
/* check out the cpus */
if(cpu_num >= 0) {
run_diags(logfd, cpu_num, &restrict, &disable, &(pstatus[cpu_num]),
forced);
} else {
for(cpu_num = num_processors -1; cpu_num >= 0; cpu_num--) {
run_diags(logfd, cpu_num, &restrict, &disable,
&(pstatus[cpu_num]), forced);
}
}
/* if we have an error, keep the log file */
close(logfd);
if(keep_logfile) rename(tmp_logfile, logfile);
else unlink(tmp_logfile);
free(pstatus);
free(tmp_logfile);
free_binaryfiles();
return 0;
}
/* run_diags
** ---------
** Run the diags on one cpu via check_diags() and, if any diag failed,
** optionally restrict and/or disable that cpu and log the failure.
**
** logfd        - descriptor of the log the diags write to
** cpu_num      - cpu to test
** restrict_cpu - points at the "restrict faulty cpus" option (read only)
** disable      - points at the "disable faulty cpus" option (read only)
** p            - status entry for cpu_num
** forced       - nonzero to test the cpu even when its flags say it is
**                not enabled or is isolated
**
** The state reported to syslog is tracked in locals.  The caller's
** options are never written, so the outcome for one cpu cannot change
** what is done to the cpus tested after it.
*/
static void run_diags(int logfd, int cpu_num, int *restrict_cpu,
                      int *disable, struct pda_stat *p, int forced)
{
    cap_t ocap;
    cap_value_t cap_value;
    int restricted, disabled;

    /* unless forced, only enabled and non-isolated cpus are tested */
    if (!forced && !((p->p_flags & PDAF_ENABLED) &&
                     !(p->p_flags & PDAF_ISOLATED)))
        return;

    if (!check_diags(logfd, cpu_num))
        return;

    /* work out the restrict state for the syslog entry */
    restricted = *restrict_cpu;
    if ((p->p_flags & PDAF_ENABLED) && restricted) {
        cap_value = CAP_SCHED_MGT;
        ocap = cap_acquire(1, &cap_value);
        if (sysmp(MP_RESTRICT, cpu_num) < 0)
            restricted = 0;     /* asked to, but could not */
        cap_surrender(ocap);
    } else if (~(p->p_flags) & PDAF_ENABLED) {
        restricted = 1;         /* not enabled: report it as restricted */
    }

    /* work out the disable state for the syslog entry */
    disabled = *disable;
    if ((~(p->p_flags) & PDAF_DISABLE_CPU) && disabled) {
        cap_value = CAP_NVRAM_MGT;
        ocap = cap_acquire(1, &cap_value);
        if (sysmp(MP_DISABLE_CPU, cpu_num) < 0)
            disabled = 0;       /* asked to, but could not */
        cap_surrender(ocap);
    } else if (p->p_flags & PDAF_DISABLE_CPU) {
        disabled = 1;           /* flag already set for this cpu */
    }

    log_cpu_failure(cpu_num, restricted, disabled);
}
/* check_diags
** -----------
** fork a child for every binary in our list and run the binary on the
** specified cpu number. Make sure that the output of the binary goes
** to the log file and call check_exit_error to print out any errors
** that occur for the binary. Return failed as true if any of the
** binaries return a EXIT_FAILURE and a false for failed if not.
*/
static int check_diags(int logfd, int cpu_num)
{
int status = 0, failure = 0;
int pid;
int i;
char error_str[BUFFSIZE];
time_t time_val;
char *now;
for(i = 0; binaryfiles[i]; i++) {
(void)time(&time_val);
now = ctime(&time_val);
sprintf(error_str,
"\n\n************* RUNNING %s FOR CPU #%d at %s\n",
binaryfiles[i]->args[0], cpu_num, now);
write(logfd, error_str, strlen(error_str));
switch((pid = fork())) {
case -1: /* fork failed */
status = WAIT_FAILURE;
break;
case 0: /* child execs binary */
dup2(logfd, STDOUT_FILENO);
dup2(logfd, STDERR_FILENO);
close(logfd);
if(sysmp(MP_MUSTRUN, cpu_num) < 0) exit(MUSTRUN_FAILURE);
execvp(binaryfiles[i]->args[0], binaryfiles[i]->args);
exit(EXEC_FAILURE);
break;
default: /* parent waits for child */
if(waitpid(pid, &status, 0) == -1) status = WAIT_FAILURE;
else {
if(!(WIFEXITED(status))) status = FINISH_FAILURE;
else status = WEXITSTATUS(status);
}
break;
}
failure += check_exit_error(logfd, binaryfiles[i]->args[0],
cpu_num, status);
sprintf(error_str,
"\n------------- DONE WITH %s FOR CPU #%d\n",
binaryfiles[i]->args[0], cpu_num);
write(logfd, error_str, strlen(error_str));
}
return(failure);
}
/* check_exit_error
** ----------------
** Interpret the status of one diag run.  A zero status means success and
** nothing is done.  Any nonzero status gets a message written to the log
** and sets keep_logfile so the log survives.
**
** logfd  - descriptor of the log file
** binary - name of the diag that was run
** cpu    - cpu it was run on
** status - 0, one of the *_FAILURE codes, or the diag's own exit status
**
** Returns 1 when the diag itself failed (EXIT_FAILURE or any status that
** is not one of the *_FAILURE codes), 0 otherwise.  Problems in getting
** the diag to run are logged but not counted against the cpu.
*/
static int check_exit_error(int logfd, char *binary, int cpu, int status)
{
    int failure = 0;
    char error_str[BUFFSIZE];

    if (!status)
        return 0;

    /* snprintf: binary may be almost BUFFSIZE characters long */
    switch (status) {
    case MUSTRUN_FAILURE:
        snprintf(error_str, sizeof error_str,
                 "ERROR: Can't run binary %s as mustrun "
                 "process on cpu #%d\n", binary, cpu);
        break;
    case EXEC_FAILURE:
        snprintf(error_str, sizeof error_str,
                 "ERROR: Can't exec %s for cpu #%d\n", binary, cpu);
        break;
    case FORK_FAILURE:
        snprintf(error_str, sizeof error_str,
                 "ERROR: Can't fork process for binary %s "
                 "cpu #%d\n", binary, cpu);
        break;
    case WAIT_FAILURE:
        snprintf(error_str, sizeof error_str,
                 "ERROR: waitpid failed for binary %s "
                 "cpu #%d\n", binary, cpu);
        break;
    case FINISH_FAILURE:
        snprintf(error_str, sizeof error_str,
                 "ERROR: Didn't finish executing binary %s "
                 "cpu #%d\n", binary, cpu);
        break;
    case EXIT_FAILURE:
    default:
        snprintf(error_str, sizeof error_str,
                 "ERROR: Diag %s for cpu #%d failed\n", binary, cpu);
        failure = 1;
        break;
    }
    write(logfd, error_str, strlen(error_str));
    keep_logfile = 1;
    return failure;
}
/* log_cpu_failure
** ---------------
** A cpu failed one of the tests: add a LOG_CRIT note to the SYSLOG file.
**
** cpu_num      - the failing cpu
** restrict_cpu - nonzero if the cpu is reported as restricted
** disable      - nonzero if the cpu is reported as disabled on reboot
**
** The message starts with an Availmon tag, followed by the cpu number and,
** when sysmp(MP_CPU_PATH) succeeds, the path it returned.
*/
static void log_cpu_failure(int cpu_num, int restrict_cpu, int disable)
{
    char hw_path[BUFFSIZE];
    char preamble[80];
    /* plain string literals: nothing to allocate and nothing to free */
    const char *restrict_pre = restrict_cpu ? "" : "not ";
    const char *disable_pre = disable ? "" : "not ";

    if (sysmp(MP_CPU_PATH, cpu_num, hw_path, BUFFSIZE) < 0)
        hw_path[0] = '\0';
    else
        strncat(hw_path, ": ", (BUFFSIZE - strlen(hw_path)) - 1);

    openlog("onlinediag", LOG_PID | LOG_CONS, LOG_USER);

    /* These tags are for Availmon */
    /* Let the user know that the system has a bad FPU ... */
    strcpy(preamble, "(MAINT-NEEDED");
    /* If we restricted usage of this proc, notify the user ... */
    if (restrict_cpu)
        strcat(preamble, ":SYS-DEGRADED");
#ifdef COMMENT
    /* If we disabled this proc, let the user know we took this action ... */
    if (disable)
        strcat(preamble, ":TOOK-ACTION");
#endif
    strcat(preamble, "):");

    syslog(LOG_CRIT | LOG_USER,
           "%scpu %d: %sfails floating point diagnostics; "
           "it is %srestricted and will %sbe disabled on reboot\n",
           preamble, cpu_num, hw_path, restrict_pre, disable_pre);
    closelog();
}
/* read_config_file
** ----------------
** Read through the config file ignoring white space and everything on a
** line after a '#' character.  Each remaining line is split on blanks and
** tabs into a binary name plus arguments and stored in the global
** binaryfiles array, which is always left NULL terminated.
**
** config_file - path of the file to read
** program     - unused
**
** At most NUM_FILES-1 entries are stored so that the terminating NULL
** always fits in the array; further lines are ignored.  Lines longer than
** BUFFSIZE-1 characters are split at that length.
**
** Returns 1 on success, -1 if the file can't be opened and 0 if an
** allocation fails (entries read so far stay in binaryfiles).
*/
static int read_config_file(char *config_file, char *program)
{
    char name[BUFFSIZE];
    int ch, i, filenum = 0;
    int result = 1;
    char *arg;
    FILE *file;

    (void) program;

    if (!(file = fopen(config_file, "r")))
        return -1;

    ch = getc(file);
    /* stop one short of NUM_FILES: the last slot holds the terminator */
    while (ch != EOF && filenum < NUM_FILES - 1) {
        /* ignore white space */
        while (ch != EOF && isspace(ch))
            ch = getc(file);

        /* ignore till end of line if we see a '#' */
        if (ch == EOF || ch == '#') {
            while (ch != EOF && ch != '\n')
                ch = getc(file);
            continue;
        }

        /* get the binary name and its arguments */
        for (i = 0; ch != EOF && ch != '\n' && ch != '#' && i < BUFFSIZE - 1;
             i++) {
            name[i] = (char) ch;
            ch = getc(file);
        }
        name[i] = '\0';

        /* add binary name and args to list of binaries */
        arg = strtok(name, " \t");
        binaryfiles[filenum] = malloc(sizeof *binaryfiles[filenum]);
        if (!binaryfiles[filenum]) {
            result = 0;
            break;
        }
        for (i = 0; arg && i < BUFFSIZE - 1; i++) {
            binaryfiles[filenum]->args[i] = malloc(strlen(arg) + 1);
            if (!binaryfiles[filenum]->args[i]) {
                result = 0;
                break;
            }
            strcpy(binaryfiles[filenum]->args[i], arg);
            arg = strtok(NULL, " \t");
        }
        /* on allocation failure args[i] is already NULL, so the entry is
           terminated either way */
        binaryfiles[filenum]->args[i] = NULL;
        filenum++;
        if (!result)
            break;
    }
    binaryfiles[filenum] = NULL;
    fclose(file);
    return result;
}
/* free_binaryfiles
** ----------------
** free the memory associated with the binary files array
*/
static void free_binaryfiles(void)
{
int i, j;
for(i = 0; binaryfiles[i]; i++) {
for(j = 0; binaryfiles[i]->args[j]; j++) {
free(binaryfiles[i]->args[j]);
}
free(binaryfiles[i]);
}
}