1
0
Files
irix-657m-src/irix/cmd/icrash/cmds/cmd_sbe.c
2022-09-29 17:59:04 +03:00

495 lines
17 KiB
C

/*
* sbe.c
*
* Single bit error code for MC3 systems. This permits us to go through
* each MC3 board on an IP19/21/25 system and look for any particular
* single bit errors that might be logged. Some of the comments below
* come directly from Alex Petruncola's brief document on single bit
* error logging:
*
* 'icrash' reads through the mc3_errcount array in order to dump out
* single bit error statistics. Look in irix/kern/ml/EVEREST/everror.c
* to find this variable declaration. Inside of it, you'll find that
* each bank maintains its own error count, including simm counts inside
* of it.
*
* Note that we do *not* look at any everror_t or everror_ext_t data,
* so we can see what is going on while the system is active.
*
* Alex's information:
*
* By default, the kernel has logging turned off. There are two
* controls of interest to customer systems. Each is manipulated with
* 'systune' utility.
*
* 1. sbe_log_errors, which enables SYSLOG messages on single bit
* correction occurrence.
*
* 2. sbe_mfr_override, which causes each event to get a log message.
* Otherwise the occurrence of a second correctable on the same bank
* will disable further reports for 1 hour, in order to avoid flooding
* the log. The disable is only for reporting on the particular bank,
* other banks will continue to make 2 reports before also being
* disabled for an hour.
*
* To enable error logging, login as root, and
*
* # systune -i
* Updates will be made to running system and /unix.install
*
* systune-> sbe_log_errors = 1
* sbe_log_errors = 0 (0x0)
* Do you really want to change sbe_log_errors to 1 (0x1)? (y/n) y
*
* systune-> quit
*
* To disable error logging, enter systune, but instead of entering:
*
* systune-> sbe_log_errors = 1
*
* instead, enter:
*
* systune-> sbe_log_errors = 0
*
* The kernel is immediately updated, and a new /unix.install is created
* so that a future reboot will continue to have the new setting.
*
*
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
*
* A memory board which has recoverable (single bit) errors is NOT
* considered defective, and will not be replaced. The purpose of this
* information is in evaluating certain memory problems, and does not in
* itself indicate there is any failure. The occurrence of single bit
* correctable messages DOES NOT identify a board requiring repairs.
*
* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
*
*
* There are two classes of memory boards, which I will call "old" and
* "new". A "new" board is considered to be one whose MC3_REVLEVEL is
* set to 1 and whose MC3_BISTRESULT value has bits 16 & 17 set. The SW
* refers to this as "rev 5".
*
* Older revision MC3s cannot report correctable errors. There is no
* update or jumper change which can be made to an older MC3. Some
* field people are aware of a jumper on older MC3s which disabled
* correctable error reporting. --> This jumper may NOT be altered or
* data corruption will result. The customer will experience corrupted
* data files and panics and hangs. Absolutely no jumper changes may be
* made. <--
*
* Older revision MC3s do single bit error correcting without making any
* report to software. This is an intended design feature of those
* boards.
*
* Default IRIX operation:
*
* When an SBE is reported, an error count is incremented. Individual
* error counts are maintained for each bank of each memory board. No
* messages are ever logged for SBEs but the counts can be examined at
* any time by a command (TBD).
*
* In addition, when an SBE occurs, the kernel leaves SBEs disabled for
* a while. If this is the first error on this bank, then it is left
* disabled for 1 second. If more than one error has been seen, it is
* left disabled for 60 minutes (this behaviour can be overridden by
* setting the systunable parameter "sbe_mfr_override", in which case it
* is always left disabled for only one second).
*
* In addition, the code will attempt to recover the SBE data by
* determining if the data error was "transient" (i.e. flushing the
* cache and re-reading did not result in a subsequent error) or whether
* "scrubbing" is required (i.e. flushes the cache, dirties the line,
* flushes the cache and then determines if the error recurs).
*
* Other IRIX operation:
*
* By setting the systunable parameter sbe_log_errors, additional
* information is logged to the SYSLOG whenever an SBE is reported. If
* sbe_report_cons is also set, then the information is also reported to
* the console.
*
* The recovery algorithm mentioned above is still performed in exactly
* the same sequence. The only difference is that the type of error
* ("transient" or "scrubbed") is also reported. And the algorithm
* mentioned above for disabling subsequent SBE interrupts is also the
* same.
*
* MC3 SIMM labeling:
*
* TOP VIEW
*
* EBus midplane connector
* =================================================================
* +---------+ +---------+ +---------+ +---------+ +---------+
* | | | | | | | | | |
* | MD | | MD | | MA | | MD | | MD |
* | | | | | | | | | |
* +---------+ +---------+ +---------+ +---------+ +---------+
*
* AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
* CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
* EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE
* GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
*
* BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
* DDDDDDDDDDDDDDDDDDDDDDDDDDDDDD DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
* FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
* HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
*
* AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
* CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
* EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE
* GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
*
* BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB
* DDDDDDDDDDDDDDDDDDDDDDDDDDDDDD DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
* FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
* HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH
*
* Rows of letters are SIMMs. They're lettered on the MC3 board.
* The letters A,C,E,G are leaf 0, and letters B,D,F,H are leaf 1.
*
* "bank n" numbers the banks 0..7 in the order ACEGBDFH.
*
* The format for 'icrash' messages if a bank/simm error is found
* will be:
*
* MC3 In Slot %d:
* Bank %d: Error Count: %d
* First Error Time: %s
* Last Error Time: %s
* Last Log Time: %s
*
* Simm %d: Error Count %d
*
* The error log times will be printed out in ctime() format. The
* rest is obvious.
* ---------------------------------------------------------------------------
* ORIGIN SYSTEMS SBE COUNTS (from Curt McDowell):
*
* Each node has an sbe_info structure pointed to by the nodepda. The
* sbe_info_t structure contains
*
* disabled True if node monitoring has become disabled
* because there were too many errors in a bank.
* Statistics are no longer being gathered.
*
* log_cnt Number of valid entries in log array below.
* If log_cnt reaches the maximum SBE_EVENTS_PER_NODE
* then it is assumed many errors are accumulating
* rapidly in all different pages, probably a stuck
* data line, and disables monitoring for that bank
* after printing a warning. Monitoring is disabled
* to avoid taking constant error interrupts.
*
* log List of SBE events that have not expired yet.
* Each entry has the pfn of the error and a repeat
* count. If an error happens in the same page,
* the repeat count increments. Entries that have
* not repeated within SBE_TIMEOUT (60 sec) are
* removed from the list. Entries that repeat
* SBE_MAX_PER_PAGE times try to take the page out
* of service (good luck!) Entries that repeat
* SBE_MAXOUT times per page cause bank monitoring
* to be disabled.
*
* bank_cnt Simple array of counts of SBE events per bank
* since reboot.
*/
#ident "$Header: /proj/irix6.5.7m/isms/irix/cmd/icrash/cmds/RCS/cmd_sbe.c,v 1.16 1999/05/25 19:21:38 tjm Exp $"
#ifndef _KERNEL
#define _KERNEL 1
#define _K64U64 1
#define _PAGESZ 16384
#include <sys/types.h>
#include <sys/immu.h>
#undef _KERNEL
#include <sys/EVEREST/IP19addrs.h>
#else
#include <sys/types.h>
#include <sys/cpu.h>
#endif /* _KERNEL */
#include <sys/EVEREST/everest.h>
#include <sys/EVEREST/everror.h>
#include <sys/EVEREST/evconfig.h>
#include <sys/EVEREST/mc3.h>
#include <sys/EVEREST/gda.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <klib/klib.h>
#include "icrash.h"
/*
* Included directly from the ml/SN0/memerror.c code found in the
* latest 6.4 kernel roll-up patch.
*
*/
#define MD_MEM_BANKS 8 /* 8 banks of memory max in M mode */
#define SBE_EVENTS_PER_NODE 16 /* Storage space allocated */
#define SBE_TIMEOUT 60 /* Seconds */
#define SBE_MAXOUT 10 /* Max times for same page error */
#define SBE_MAX_PER_PAGE 4 /* Errors before page removed */
#define SBE_DISCARD_TIMEOUT 600 /* Time between discard attempts */
#define SBE_EVENT_DIR 0x01 /* Flags */
#define SBE_MAX_INTR_PER_MIN 600
typedef struct sbe_event_s {
pfn_t pfn; /* Page of error */
int flags;
int repcnt;
time_t expire; /* Time until sbe is forgotten */
} sbe_event_t;
typedef struct sbe_info_s {
int disabled; /* True if polling is disabled */
int log_cnt; /* Num. log array entries in use */
sbe_event_t log[SBE_EVENTS_PER_NODE];
int bank_cnt[MD_MEM_BANKS]; /* Per-bank error count */
int intr_ct; /* Safety threshold intr count */
time_t intr_tm; /* Safety threshold reset time */
} sbe_info_t;
/*
* challenge_sbe_cmd() -- Obtain single bit error output on Challenge systems.
*/
int
challenge_sbe_cmd(int flags, FILE *ofp)
{
int merror = 0;
evbrdinfo_t *eb;
mc3error_t *mc3err;
mc3_bank_err_t *bnk;
struct syment *tsym;
mc3_array_err_t mc3_errcount[EV_MAX_MC3S];
uint i, j, k, slot, mbid, leaf_num, bank_num, simm_num;
evcfginfo_t *ecbuf = (evcfginfo_t *)NULL;
if ((tsym = kl_get_sym("sbe_log_errors", K_TEMP)) ==
(struct syment *)NULL) {
fprintf(ofp,
"sbe: could not retrieve sbe_log_errors kernel value\n");
return(1);
}
if (tsym->n_value == 0) {
fprintf(ofp,
"sbe: single bit error logging turned off (sbe_log_errors = 0)\n");
kl_free_sym(tsym);
return(1);
}
/* Get evconfig
*/
ecbuf = (evcfginfo_t *)kl_alloc_block(sizeof(evcfginfo_t), K_TEMP);
if (!kl_get_block((kaddr_t)EVCFGINFO,
sizeof(evcfginfo_t), ecbuf, "evcfg")) {
fprintf(ofp, "sbe: could not read evconfiginfo structure\n");
kl_free_block((k_ptr_t)ecbuf);
kl_free_sym(tsym);
return(1);
}
kl_free_sym(tsym);
if (!(tsym = kl_get_sym("mc3_errcount", K_TEMP))) {
fprintf(ofp,
"sbe: could not retrieve mc3 bank/simm error count symbol\n");
kl_free_block((k_ptr_t)ecbuf);
return(1);
}
if (!(kl_get_block(tsym->n_value, sizeof(mc3_errcount),
mc3_errcount, "mc3_errcount"))) {
fprintf(ofp, "sbe: could not retrieve mc3 bank/simm error counts\n");
kl_free_block((k_ptr_t)ecbuf);
kl_free_sym(tsym);
return(1);
}
kl_free_sym(tsym);
for (slot = 1; slot < EV_MAX_SLOTS; slot++) {
/* Scan through all boards, and for each one that is an MC3, check
* through all the leaves and see if there are memory errors logged.
* If so, report them. In the cases where we have a partial write
* multiple bit error and a single bit read error, analyze the MC3
* single-bit error information and find out which bank/leaf/address
* we had the error at.
*/
eb = &(ecbuf->ecfg_board[slot]);
if (eb->eb_type == EVTYPE_MC3) {
/* Now go through all the memory banks and see if there are
* any error counts for each.
*/
merror = 0;
mbid = eb->eb_mem.eb_mc3num;
fprintf(ofp, "\nMC3 In Slot %d:\n", slot);
if (mbid && (mbid < EV_MAX_MC3S)) {
if (mc3_errcount[mbid].m_unk_bank_errcount) {
fprintf(ofp,
" MA Unresolved To Bank Error Count: %d\n",
mc3_errcount[mbid].m_unk_bank_errcount);
merror = 1;
}
for (j = 0; j < MC3_NUM_BANKS; j++) {
bnk = &(mc3_errcount[mbid].m_bank_errinfo[j]);
if (bnk->m_bank_errcount) {
fprintf(ofp,
" Bank %d: Error Count: %d\n"
" First Error Time: %s\n"
" Last Error Time: %s\n"
" Last Log Time: %s\n",
j, bnk->m_bank_errcount,
ctime((time_t *)&(bnk->m_first_err_time)),
ctime((time_t *)&(bnk->m_last_err_time)),
ctime((time_t *)&(bnk->m_last_log_time)));
merror = 1;
}
for (k = 0; k < MC3_SIMMS_PER_BANK; k++) {
if (bnk->m_simm_errcount[k]) {
fprintf(ofp,
"\n Simm %d: Error Count %d\n",
k, bnk->m_simm_errcount[k]);
merror = 1;
}
}
}
}
if (!merror) {
fprintf(ofp,
" No Bank, Simm, or Memory Errors Found "
"For This Board\n");
}
}
}
kl_free_block((k_ptr_t)ecbuf);
return(0);
}
/*
* origin_sbe_cmd() -- Obtain single bit error output on Origin systems.
*/
int
origin_sbe_cmd(int flags, FILE *ofp)
{
k_ptr_t npdap;
int i, count = 0, node, bank, started = FALSE;
kaddr_t value, nodepdaval, sbeinfoval;
sbe_info_t *sbebuf;
npdap = kl_alloc_block(NODEPDA_S_SIZE, K_TEMP);
/*
* Now walk through all of the nodes, being sure to dump out any
* and all errors for all banks. Be very specific as to which node
* has had its memory disabled.
*/
for (i = 0; i < K_NUMNODES; i++) {
value = (K_NODEPDAINDR + (i * K_NBPW));
kl_get_kaddr(value, &nodepdaval, "nodepda_s");
if (KL_ERROR) {
continue;
}
kl_get_struct(nodepdaval, NODEPDA_S_SIZE, npdap, "nodepda_s");
if (KL_ERROR) {
continue;
}
sbeinfoval = kl_kaddr(npdap, "nodepda_s", "sbe_info");
if (sbeinfoval) {
sbebuf = (sbe_info_t *)kl_alloc_block(sizeof(sbe_info_t), K_TEMP);
if (!kl_get_block(sbeinfoval, sizeof(sbe_info_t),
sbebuf, "sbe_info_t")) {
continue;
}
for (bank = 0; bank < 8; bank++) {
if (sbebuf->bank_cnt[bank] > 0) {
if (started == FALSE) {
if (report_flag) {
fprintf(ofp, "\n");
}
fprintf(ofp,
"NODE BANK # BANK ERRORS DISABLED (Y/N)\n");
fprintf(ofp,
"=============================================\n");
started = TRUE;
}
fprintf(ofp, "%4d %4d %12d %s\n",
i, bank, sbebuf->bank_cnt[bank],
(sbebuf->disabled ?
" Y" : " N"));
count++;
}
}
}
}
if (started == FALSE) {
if (report_flag) {
fprintf(ofp,
"\nNo single-bit errors found on any node\n");
} else {
fprintf(ofp,
"sbe: No single-bit errors found for any node\n");
}
} else {
fprintf(ofp,
"=============================================\n"
"%d node%s with sbe errors found\n",
count, (count != 1) ? "s" : "");
}
return(0);
}
/*
* sbe_cmd() -- Run the 'sbe' command.
*/
int
sbe_cmd(command_t cmd)
{
if ((K_IP == 19) || (K_IP == 21) || (K_IP == 25)) {
return(challenge_sbe_cmd(cmd.flags, cmd.ofp));
} else if ((K_IP == 27) || (K_IP == 29)) {
return(origin_sbe_cmd(cmd.flags, cmd.ofp));
}
fprintf(cmd.ofp,
"sbe: this command is only valid on Challenge and Origin systems\n");
return (1);
}
#define _SBE_USAGE "[-w outfile]"
/*
* sbe_usage() -- Print the usage string for the 'sbe' command.
*/
void
sbe_usage(command_t cmd)
{
CMD_USAGE(cmd, _SBE_USAGE);
}
/*
* sbe_help() -- Print the help information for the 'sbe' command.
*/
void
sbe_help(command_t cmd)
{
CMD_HELP(cmd, _SBE_USAGE,
"Print out the single bit error information for Challenge or Origin "
"systems. If the flags for single bit error logging in the kernel "
"are not turned on, the command will return no results.");
}
/*
* sbe_parse() -- Parse the command line arguments for 'sbe'.
*/
int
sbe_parse(command_t cmd)
{
return (C_FALSE|C_WRITE);
}