powerpc: Rework pseries machine check handler

Rework pseries machine check handler:

- If MSR_RI isn't set, we cannot recover, even if the platform reports the
  machine check as fully recovered

- Rename nonfatal to recovered

- Handle RTAS_DISP_LIMITED_RECOVERY

- Use BUS_MCEERR_AR instead of BUS_ADRERR

- Don't check all the RTAS error log fields when receiving a synchronous
  machine check. Recent versions of the pseries firmware do not fill them
  in during a machine check and instead send a follow-up error log with
  the detailed information. If we see a synchronous machine check and we
  came from userspace, then kill the task.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
This commit is contained in:
Anton Blanchard 2011-01-11 19:49:19 +00:00 committed by Benjamin Herrenschmidt
parent e49b1fae0b
commit d47d1d8af5

View File

@@ -259,31 +259,43 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
  * Return 1 if corrected (or delivered a signal).
  * Return 0 if there is nothing we can do.
  */
-static int recover_mce(struct pt_regs *regs, struct rtas_error_log * err)
+static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
 {
-	int nonfatal = 0;
+	int recovered = 0;
 
-	if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
+	if (!(regs->msr & MSR_RI)) {
+		/* If MSR_RI isn't set, we cannot recover */
+		recovered = 0;
+
+	} else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) {
 		/* Platform corrected itself */
-		nonfatal = 1;
-	} else if ((regs->msr & MSR_RI) &&
-		   user_mode(regs) &&
-		   err->severity == RTAS_SEVERITY_ERROR_SYNC &&
-		   err->disposition == RTAS_DISP_NOT_RECOVERED &&
-		   err->target == RTAS_TARGET_MEMORY &&
-		   err->type == RTAS_TYPE_ECC_UNCORR &&
-		   !(current->pid == 0 || is_global_init(current))) {
-		/* Kill off a user process with an ECC error */
-		printk(KERN_ERR "MCE: uncorrectable ecc error for pid %d\n",
-		       current->pid);
-		/* XXX something better for ECC error? */
-		_exception(SIGBUS, regs, BUS_ADRERR, regs->nip);
-		nonfatal = 1;
+		recovered = 1;
+
+	} else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) {
+		/* Platform corrected itself but could be degraded */
+		printk(KERN_ERR "MCE: limited recovery, system may "
+		       "be degraded\n");
+		recovered = 1;
+
+	} else if (user_mode(regs) && !is_global_init(current) &&
+		   err->severity == RTAS_SEVERITY_ERROR_SYNC) {
+
+		/*
+		 * If we received a synchronous error when in userspace
+		 * kill the task. Firmware may report details of the fail
+		 * asynchronously, so we can't rely on the target and type
+		 * fields being valid here.
+		 */
+		printk(KERN_ERR "MCE: uncorrectable error, killing task "
+		       "%s:%d\n", current->comm, current->pid);
+
+		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+		recovered = 1;
 	}
 
 	log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);
 
-	return nonfatal;
+	return recovered;
 }
 
 /*