Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/bp/bp: (21 commits)
  EDAC, MCE: Fix shift warning on 32-bit
  EDAC, MCE: Add a BIT_64() macro
  EDAC, MCE: Enable MCE decoding on F12h
  EDAC, MCE: Add F12h NB MCE decoder
  EDAC, MCE: Add F12h IC MCE decoder
  EDAC, MCE: Add F12h DC MCE decoder
  EDAC, MCE: Add support for F11h MCEs
  EDAC, MCE: Enable MCE decoding on F14h
  EDAC, MCE: Fix FR MCEs decoding
  EDAC, MCE: Complete NB MCE decoders
  EDAC, MCE: Warn about LS MCEs on F14h
  EDAC, MCE: Adjust IC decoders to F14h
  EDAC, MCE: Adjust DC decoders to F14h
  EDAC, MCE: Rename files
  EDAC, MCE: Rework MCE injection
  EDAC: Export edac sysfs class to users.
  EDAC, MCE: Pass complete MCE info to decoders
  EDAC, MCE: Sanitize error codes
  EDAC, MCE: Remove unused function parameter
  EDAC, MCE: Add HW_ERR prefix
  ...
This commit is contained in:
Linus Torvalds 2010-10-21 14:04:58 -07:00
commit c029e405bd
16 changed files with 1018 additions and 758 deletions

View File

@ -39,7 +39,7 @@ config EDAC_DEBUG
there are four debug levels (x=0,1,2,3 from low to high).
Usually you should select 'N'.
config EDAC_DECODE_MCE
config EDAC_DECODE_MCE
tristate "Decode MCEs in human-readable form (only on AMD for now)"
depends on CPU_SUP_AMD && X86_MCE
default y
@ -51,6 +51,16 @@ config EDAC_DEBUG
which occur really early upon boot, before the module infrastructure
has been initialized.
config EDAC_MCE_INJ
tristate "Simple MCE injection interface over /sysfs"
depends on EDAC_DECODE_MCE
default n
help
This is a simple interface to inject MCEs over /sysfs and test
the MCE decoding code in EDAC.
This is currently AMD-only.
config EDAC_MM_EDAC
tristate "Main Memory EDAC (Error Detection And Correction) reporting"
help
@ -72,7 +82,7 @@ config EDAC_AMD64
Families of Memory Controllers (K8, F10h and F11h)
config EDAC_AMD64_ERROR_INJECTION
bool "Sysfs Error Injection facilities"
bool "Sysfs HW Error injection facilities"
depends on EDAC_AMD64
help
Recent Opterons (Family 10h and later) provide for Memory Error

View File

@ -17,6 +17,9 @@ ifdef CONFIG_PCI
edac_core-objs += edac_pci.o edac_pci_sysfs.o
endif
obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o
edac_mce_amd-objs := mce_amd.o
obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o
obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o

View File

@ -2073,11 +2073,18 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
amd64_handle_ue(mci, info);
}
void amd64_decode_bus_error(int node_id, struct err_regs *regs)
void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg)
{
struct mem_ctl_info *mci = mci_lookup[node_id];
struct err_regs regs;
__amd64_decode_bus_error(mci, regs);
regs.nbsl = (u32) m->status;
regs.nbsh = (u32)(m->status >> 32);
regs.nbeal = (u32) m->addr;
regs.nbeah = (u32)(m->addr >> 32);
regs.nbcfg = nbcfg;
__amd64_decode_bus_error(mci, &regs);
/*
* Check the UE bit of the NB status high register, if set generate some
@ -2086,7 +2093,7 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
*
* FIXME: this should go somewhere else, if at all.
*/
if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
edac_mc_handle_ue_no_info(mci, "UE bit is set");
}

View File

@ -72,7 +72,7 @@
#include <linux/edac.h>
#include <asm/msr.h>
#include "edac_core.h"
#include "edac_mce_amd.h"
#include "mce_amd.h"
#define amd64_printk(level, fmt, arg...) \
edac_printk(level, "amd64", fmt, ##arg)
@ -482,11 +482,10 @@ extern const char *rrrr_msgs[16];
extern const char *to_msgs[2];
extern const char *pp_msgs[4];
extern const char *ii_msgs[4];
extern const char *ext_msgs[32];
extern const char *htlink_msgs[8];
#ifdef CONFIG_EDAC_DEBUG
#define NUM_DBG_ATTRS 9
#define NUM_DBG_ATTRS 5
#else
#define NUM_DBG_ATTRS 0
#endif

View File

@ -1,167 +1,16 @@
#include "amd64_edac.h"
/*
* accept a hex value and store it into the virtual error register file, field:
* nbeal and nbeah. Assume virtual error values have already been set for: NBSL,
* NBSH and NBCFG. Then proceed to map the error values to a MC, CSROW and
* CHANNEL
*/
static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
size_t count)
{
struct amd64_pvt *pvt = mci->pvt_info;
unsigned long long value;
int ret = 0;
ret = strict_strtoull(data, 16, &value);
if (ret != -EINVAL) {
debugf0("received NBEA= 0x%llx\n", value);
/* place the value into the virtual error packet */
pvt->ctl_error_info.nbeal = (u32) value;
value >>= 32;
pvt->ctl_error_info.nbeah = (u32) value;
/* Process the Mapping request */
/* TODO: Add race prevention */
amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
return count;
}
return ret;
#define EDAC_DCT_ATTR_SHOW(reg) \
static ssize_t amd64_##reg##_show(struct mem_ctl_info *mci, char *data) \
{ \
struct amd64_pvt *pvt = mci->pvt_info; \
return sprintf(data, "0x%016llx\n", (u64)pvt->reg); \
}
/* display back what the last NBEA (MCA NB Address (MC4_ADDR)) was written */
static ssize_t amd64_nbea_show(struct mem_ctl_info *mci, char *data)
{
struct amd64_pvt *pvt = mci->pvt_info;
u64 value;
value = pvt->ctl_error_info.nbeah;
value <<= 32;
value |= pvt->ctl_error_info.nbeal;
return sprintf(data, "%llx\n", value);
}
/* store the NBSL (MCA NB Status Low (MC4_STATUS)) value user desires */
static ssize_t amd64_nbsl_store(struct mem_ctl_info *mci, const char *data,
size_t count)
{
struct amd64_pvt *pvt = mci->pvt_info;
unsigned long value;
int ret = 0;
ret = strict_strtoul(data, 16, &value);
if (ret != -EINVAL) {
debugf0("received NBSL= 0x%lx\n", value);
pvt->ctl_error_info.nbsl = (u32) value;
return count;
}
return ret;
}
/* display back what the last NBSL value written */
static ssize_t amd64_nbsl_show(struct mem_ctl_info *mci, char *data)
{
struct amd64_pvt *pvt = mci->pvt_info;
u32 value;
value = pvt->ctl_error_info.nbsl;
return sprintf(data, "%x\n", value);
}
/* store the NBSH (MCA NB Status High) value user desires */
static ssize_t amd64_nbsh_store(struct mem_ctl_info *mci, const char *data,
size_t count)
{
struct amd64_pvt *pvt = mci->pvt_info;
unsigned long value;
int ret = 0;
ret = strict_strtoul(data, 16, &value);
if (ret != -EINVAL) {
debugf0("received NBSH= 0x%lx\n", value);
pvt->ctl_error_info.nbsh = (u32) value;
return count;
}
return ret;
}
/* display back what the last NBSH value written */
static ssize_t amd64_nbsh_show(struct mem_ctl_info *mci, char *data)
{
struct amd64_pvt *pvt = mci->pvt_info;
u32 value;
value = pvt->ctl_error_info.nbsh;
return sprintf(data, "%x\n", value);
}
/* accept and store the NBCFG (MCA NB Configuration) value user desires */
static ssize_t amd64_nbcfg_store(struct mem_ctl_info *mci,
const char *data, size_t count)
{
struct amd64_pvt *pvt = mci->pvt_info;
unsigned long value;
int ret = 0;
ret = strict_strtoul(data, 16, &value);
if (ret != -EINVAL) {
debugf0("received NBCFG= 0x%lx\n", value);
pvt->ctl_error_info.nbcfg = (u32) value;
return count;
}
return ret;
}
/* various show routines for the controls of a MCI */
static ssize_t amd64_nbcfg_show(struct mem_ctl_info *mci, char *data)
{
struct amd64_pvt *pvt = mci->pvt_info;
return sprintf(data, "%x\n", pvt->ctl_error_info.nbcfg);
}
static ssize_t amd64_dhar_show(struct mem_ctl_info *mci, char *data)
{
struct amd64_pvt *pvt = mci->pvt_info;
return sprintf(data, "%x\n", pvt->dhar);
}
static ssize_t amd64_dbam_show(struct mem_ctl_info *mci, char *data)
{
struct amd64_pvt *pvt = mci->pvt_info;
return sprintf(data, "%x\n", pvt->dbam0);
}
static ssize_t amd64_topmem_show(struct mem_ctl_info *mci, char *data)
{
struct amd64_pvt *pvt = mci->pvt_info;
return sprintf(data, "%llx\n", pvt->top_mem);
}
static ssize_t amd64_topmem2_show(struct mem_ctl_info *mci, char *data)
{
struct amd64_pvt *pvt = mci->pvt_info;
return sprintf(data, "%llx\n", pvt->top_mem2);
}
EDAC_DCT_ATTR_SHOW(dhar);
EDAC_DCT_ATTR_SHOW(dbam0);
EDAC_DCT_ATTR_SHOW(top_mem);
EDAC_DCT_ATTR_SHOW(top_mem2);
static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
{
@ -180,38 +29,6 @@ static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
*/
struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
{
.attr = {
.name = "nbea_ctl",
.mode = (S_IRUGO | S_IWUSR)
},
.show = amd64_nbea_show,
.store = amd64_nbea_store,
},
{
.attr = {
.name = "nbsl_ctl",
.mode = (S_IRUGO | S_IWUSR)
},
.show = amd64_nbsl_show,
.store = amd64_nbsl_store,
},
{
.attr = {
.name = "nbsh_ctl",
.mode = (S_IRUGO | S_IWUSR)
},
.show = amd64_nbsh_show,
.store = amd64_nbsh_store,
},
{
.attr = {
.name = "nbcfg_ctl",
.mode = (S_IRUGO | S_IWUSR)
},
.show = amd64_nbcfg_show,
.store = amd64_nbcfg_store,
},
{
.attr = {
.name = "dhar",
@ -225,7 +42,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
.name = "dbam",
.mode = (S_IRUGO)
},
.show = amd64_dbam_show,
.show = amd64_dbam0_show,
.store = NULL,
},
{
@ -233,7 +50,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
.name = "topmem",
.mode = (S_IRUGO)
},
.show = amd64_topmem_show,
.show = amd64_top_mem_show,
.store = NULL,
},
{
@ -241,7 +58,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
.name = "topmem2",
.mode = (S_IRUGO)
},
.show = amd64_topmem2_show,
.show = amd64_top_mem2_show,
.store = NULL,
},
{

View File

@ -13,6 +13,7 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/edac.h>
#include "edac_core.h"
#include "edac_module.h"
@ -235,7 +236,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
debugf1("%s()\n", __func__);
/* get the /sys/devices/system/edac reference */
edac_class = edac_get_edac_class();
edac_class = edac_get_sysfs_class();
if (edac_class == NULL) {
debugf1("%s() no edac_class error\n", __func__);
err = -ENODEV;
@ -255,7 +256,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
if (!try_module_get(edac_dev->owner)) {
err = -ENODEV;
goto err_out;
goto err_mod_get;
}
/* register */
@ -282,6 +283,9 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
err_kobj_reg:
module_put(edac_dev->owner);
err_mod_get:
edac_put_sysfs_class();
err_out:
return err;
}
@ -290,12 +294,11 @@ err_out:
* edac_device_unregister_sysfs_main_kobj:
* the '..../edac/<name>' kobject
*/
void edac_device_unregister_sysfs_main_kobj(
struct edac_device_ctl_info *edac_dev)
void edac_device_unregister_sysfs_main_kobj(struct edac_device_ctl_info *dev)
{
debugf0("%s()\n", __func__);
debugf4("%s() name of kobject is: %s\n",
__func__, kobject_name(&edac_dev->kobj));
__func__, kobject_name(&dev->kobj));
/*
* Unregister the edac device's kobject and
@ -304,7 +307,8 @@ void edac_device_unregister_sysfs_main_kobj(
* a) module_put() this module
* b) 'kfree' the memory
*/
kobject_put(&edac_dev->kobj);
kobject_put(&dev->kobj);
edac_put_sysfs_class();
}
/* edac_dev -> instance information */

View File

@ -11,6 +11,7 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/edac.h>
#include <linux/bug.h>
#include "edac_core.h"
@ -1011,13 +1012,13 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
*/
int edac_sysfs_setup_mc_kset(void)
{
int err = 0;
int err = -EINVAL;
struct sysdev_class *edac_class;
debugf1("%s()\n", __func__);
/* get the /sys/devices/system/edac class reference */
edac_class = edac_get_edac_class();
edac_class = edac_get_sysfs_class();
if (edac_class == NULL) {
debugf1("%s() no edac_class error=%d\n", __func__, err);
goto fail_out;
@ -1028,15 +1029,16 @@ int edac_sysfs_setup_mc_kset(void)
if (!mc_kset) {
err = -ENOMEM;
debugf1("%s() Failed to register '.../edac/mc'\n", __func__);
goto fail_out;
goto fail_kset;
}
debugf1("%s() Registered '.../edac/mc' kobject\n", __func__);
return 0;
fail_kset:
edac_put_sysfs_class();
/* error unwind stack */
fail_out:
return err;
}
@ -1049,5 +1051,6 @@ fail_out:
void edac_sysfs_teardown_mc_kset(void)
{
kset_unregister(mc_kset);
edac_put_sysfs_class();
}

View File

@ -1,452 +0,0 @@
#include <linux/module.h>
#include "edac_mce_amd.h"
static bool report_gart_errors;
static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
/*
 * Set the file-local report_gart_errors flag to @v. The flag is consulted
 * by amd_decode_nb_mce() before it reports a TLB error.
 */
void amd_report_gart_errors(bool v)
{
report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * Install @f as the callback stored in nb_bus_decoder. Any previously
 * stored callback is overwritten without a check.
 */
void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
{
nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * Clear the registered NB bus-error decoder, if there is one. Passing an
 * @f that differs from the registered callback triggers a WARN, but the
 * callback is cleared regardless.
 */
void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
{
	if (!nb_bus_decoder)
		return;

	WARN_ON(nb_bus_decoder != f);
	nb_bus_decoder = NULL;
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
/*
* string representation for the different MCA reported error types, see F3x48
* or MSR0000_0411.
*/
/* transaction type strings, one per 2-bit encoding */
const char *tt_msgs[] = {
	[0] = "instruction",
	[1] = "data",
	[2] = "generic",
	[3] = "reserved",
};
EXPORT_SYMBOL_GPL(tt_msgs);
/* cache level strings, one per 2-bit encoding */
const char *ll_msgs[] = {
	[0] = "L0",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/generic",
};
EXPORT_SYMBOL_GPL(ll_msgs);
/* memory transaction type strings; encodings 9..15 are reserved */
const char *rrrr_msgs[] = {
	[0x0] = "generic",
	[0x1] = "generic read",
	[0x2] = "generic write",
	[0x3] = "data read",
	[0x4] = "data write",
	[0x5] = "inst fetch",
	[0x6] = "prefetch",
	[0x7] = "evict",
	[0x8] = "snoop",
	[0x9] = "reserved RRRR= 9",
	[0xa] = "reserved RRRR= 10",
	[0xb] = "reserved RRRR= 11",
	[0xc] = "reserved RRRR= 12",
	[0xd] = "reserved RRRR= 13",
	[0xe] = "reserved RRRR= 14",
	[0xf] = "reserved RRRR= 15",
};
EXPORT_SYMBOL_GPL(rrrr_msgs);
/* participating processor strings, one per 2-bit encoding */
const char *pp_msgs[] = {
	[0] = "local node originated (SRC)",
	[1] = "local node responded to request (RES)",
	[2] = "local node observed as 3rd party (OBS)",
	[3] = "generic",
};
EXPORT_SYMBOL_GPL(pp_msgs);
/* request timeout strings: index 0 = no timeout, index 1 = timed out */
const char *to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
EXPORT_SYMBOL_GPL(to_msgs);
/* memory or i/o strings, one per 2-bit encoding */
const char *ii_msgs[] = {
	[0] = "mem access",
	[1] = "reserved",
	[2] = "i/o access",
	[3] = "generic",
};
EXPORT_SYMBOL_GPL(ii_msgs);
/*
 * Map the 4 or 5 (family-specific) bits of Extended Error code to the
 * string table.
 *
 * The table must hold exactly 32 entries, one per 5-bit encoding. Every
 * entry needs its own trailing comma: two adjacent string literals without
 * a comma are concatenated by the compiler, which silently drops an entry
 * and shifts all following ones.
 */
const char *ext_msgs[] = {
	"K8 ECC error",					/* 0_0000b */
	"CRC error on link",				/* 0_0001b */
	"Sync error packets on link",			/* 0_0010b */
	"Master Abort during link operation",		/* 0_0011b */
	"Target Abort during link operation",		/* 0_0100b */
	"Invalid GART PTE entry during table walk",	/* 0_0101b */
	"Unsupported atomic RMW command received",	/* 0_0110b */
	"WDT error: NB transaction timeout",		/* 0_0111b */
	"ECC/ChipKill ECC error",			/* 0_1000b */
	"SVM DEV Error",				/* 0_1001b */
	"Link Data error",				/* 0_1010b */
	"Link/L3/Probe Filter Protocol error",		/* 0_1011b */
	"NB Internal Arrays Parity error",		/* 0_1100b */
	"DRAM Address/Control Parity error",		/* 0_1101b */
	"Link Transmission error",			/* 0_1110b */
	"GART/DEV Table Walk Data error",		/* 0_1111b */
	"Res 0x100 error",				/* 1_0000b */
	"Res 0x101 error",				/* 1_0001b */
	"Res 0x102 error",				/* 1_0010b */
	"Res 0x103 error",				/* 1_0011b */
	"Res 0x104 error",				/* 1_0100b */
	"Res 0x105 error",				/* 1_0101b */
	"Res 0x106 error",				/* 1_0110b */
	"Res 0x107 error",				/* 1_0111b */
	"Res 0x108 error",				/* 1_1000b */
	"Res 0x109 error",				/* 1_1001b */
	"Res 0x10A error",				/* 1_1010b */
	"Res 0x10B error",				/* 1_1011b */
	"ECC error in L3 Cache Data",			/* 1_1100b */
	"L3 Cache Tag error",				/* 1_1101b */
	"L3 Cache LRU Parity error",			/* 1_1110b */
	"Probe Filter error"				/* 1_1111b */
};
EXPORT_SYMBOL_GPL(ext_msgs);
/*
 * Decode a Data Cache MCE from the raw status value @mc0_status.
 *
 * Opens the log line with "Data Cache Error" and completes it with
 * pr_cont() according to the error code (status bits 15:0) and the
 * extended error code (status bits 19:16). Any signature that matches
 * none of the branches below jumps to wrong_dc_mce.
 */
static void amd_decode_dc_mce(u64 mc0_status)
{
u32 ec = mc0_status & 0xffff;
u32 xec = (mc0_status >> 16) & 0xf;
pr_emerg("Data Cache Error");
if (xec == 1 && TLB_ERROR(ec))
pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
else if (xec == 0) {
/* status bit 40 set: reported as an error during data scrub */
if (mc0_status & (1ULL << 40))
pr_cont(" during Data Scrub.\n");
else if (TLB_ERROR(ec))
pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
else if (MEM_ERROR(ec)) {
/* sub-fields of the error code: LL = bits 1:0, TT = bits 3:2, RRRR = bits 7:4 */
u8 ll = ec & 0x3;
u8 tt = (ec >> 2) & 0x3;
u8 rrrr = (ec >> 4) & 0xf;
/* see F10h BKDG (31116), Table 92. */
if (ll == 0x1) {
if (tt != 0x1)
goto wrong_dc_mce;
pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
} else if (ll == 0x2 && rrrr == 0x3)
pr_cont(" during L1 linefill from L2.\n");
else
goto wrong_dc_mce;
/* bus errors are only accepted here on family 0xf */
} else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
pr_cont(" during system linefill.\n");
else
goto wrong_dc_mce;
} else
goto wrong_dc_mce;
return;
wrong_dc_mce:
pr_warning("Corrupted DC MCE info?\n");
}
/*
 * Decode an Instruction Cache MCE from the raw status value @mc1_status.
 *
 * Opens the log line with "Instruction Cache Error" and completes it with
 * pr_cont() according to the error code (status bits 15:0) and the
 * extended error code (status bits 19:16). Every signature that is not
 * recognized ends up at wrong_ic_mce, so the line is never left
 * unterminated and an unknown signature is never silently accepted.
 */
static void amd_decode_ic_mce(u64 mc1_status)
{
	u32 ec = mc1_status & 0xffff;
	u32 xec = (mc1_status >> 16) & 0xf;

	pr_emerg("Instruction Cache Error");

	if (xec == 1 && TLB_ERROR(ec))
		pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
	else if (xec == 0) {
		if (TLB_ERROR(ec))
			pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
		else if (BUS_ERROR(ec)) {
			if (boot_cpu_data.x86 == 0xf &&
			    (mc1_status & (1ULL << 58)))
				pr_cont(" during system linefill.\n");
			else
				pr_cont(" during attempted NB data read.\n");
		} else if (MEM_ERROR(ec)) {
			u8 ll = ec & 0x3;
			u8 rrrr = (ec >> 4) & 0xf;

			if (ll == 0x2)
				pr_cont(" during a linefill from L2.\n");
			else if (ll == 0x1) {
				switch (rrrr) {
				case 0x5:
					pr_cont(": Parity error during "
						"data load.\n");
					break;
				case 0x7:
					pr_cont(": Copyback Parity/Victim"
						" error.\n");
					break;
				case 0x8:
					pr_cont(": Tag Snoop error.\n");
					break;
				default:
					goto wrong_ic_mce;
				}
			} else {
				/*
				 * Only cache levels 1 and 2 are decoded above;
				 * anything else is not a known IC signature.
				 */
				goto wrong_ic_mce;
			}
		} else
			goto wrong_ic_mce;
	} else
		goto wrong_ic_mce;

	return;

wrong_ic_mce:
	pr_warning("Corrupted IC MCE info?\n");
}
/*
 * Decode a Bus Unit MCE from the raw status value @mc2_status.
 *
 * Opens the log line with "Bus Unit Error" and completes it with pr_cont()
 * based on the extended error code (status bits 19:16) and the error code
 * (status bits 15:0). Unrecognized combinations jump to wrong_bu_mce.
 */
static void amd_decode_bu_mce(u64 mc2_status)
{
u32 ec = mc2_status & 0xffff;
u32 xec = (mc2_status >> 16) & 0xf;
pr_emerg("Bus Unit Error");
if (xec == 0x1)
pr_cont(" in the write data buffers.\n");
else if (xec == 0x3)
pr_cont(" in the victim data buffers.\n");
else if (xec == 0x2 && MEM_ERROR(ec))
pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
else if (xec == 0x0) {
if (TLB_ERROR(ec))
pr_cont(": %s error in a Page Descriptor Cache or "
"Guest TLB.\n", TT_MSG(ec));
else if (BUS_ERROR(ec))
pr_cont(": %s/ECC error in data read from NB: %s.\n",
RRRR_MSG(ec), PP_MSG(ec));
else if (MEM_ERROR(ec)) {
/* RRRR field: bits 7:4 of the error code */
u8 rrrr = (ec >> 4) & 0xf;
/* rrrr values 2..6 are not accepted for a memory error here */
if (rrrr >= 0x7)
pr_cont(": %s error during data copyback.\n",
RRRR_MSG(ec));
else if (rrrr <= 0x1)
pr_cont(": %s parity/ECC error during data "
"access from L2.\n", RRRR_MSG(ec));
else
goto wrong_bu_mce;
} else
goto wrong_bu_mce;
} else
goto wrong_bu_mce;
return;
wrong_bu_mce:
pr_warning("Corrupted BU MCE info?\n");
}
/*
 * Decode a Load Store MCE from the raw status value @mc3_status.
 *
 * The only accepted signature is extended error code 0 with a bus error
 * whose RRRR field (error code bits 7:4) is 0x3 or 0x4. Everything else,
 * including a non-zero extended error code, is reported as possibly
 * corrupted so that the "Load Store Error" line is always terminated.
 */
static void amd_decode_ls_mce(u64 mc3_status)
{
	u32 ec = mc3_status & 0xffff;
	u32 xec = (mc3_status >> 16) & 0xf;

	pr_emerg("Load Store Error");

	if (xec == 0x0) {
		u8 rrrr = (ec >> 4) & 0xf;

		if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
			goto wrong_ls_mce;

		pr_cont(" during %s.\n", RRRR_MSG(ec));
	} else {
		/* no signature with a non-zero extended error code is known */
		goto wrong_ls_mce;
	}

	return;

wrong_ls_mce:
	pr_warning("Corrupted LS MCE info?\n");
}
/*
 * Decode a Northbridge MCE.
 *
 * @node_id:       node number, printed and handed to the bus decoder
 * @regs:          NB status register snapshot (nbsl/nbsh are read here)
 * @handle_errors: when zero, the function returns without decoding
 *
 * Prints the node (and core, when available) followed by the extended
 * error message, then forwards bus errors to the registered
 * nb_bus_decoder callback, if any.
 */
void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
{
	u32 ec = ERROR_CODE(regs->nbsl);

	if (!handle_errors)
		return;

	/*
	 * GART TLB error reporting is disabled by default. Bail out early.
	 */
	if (TLB_ERROR(ec) && !report_gart_errors)
		return;

	pr_emerg("Northbridge Error, node %d", node_id);

	/*
	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
	 * value encoding has changed so interpret those differently
	 */
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
		if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u", (u8)(regs->nbsh & 0xf));
	} else {
		u8 assoc_cpus = regs->nbsh & 0xf;

		if (assoc_cpus > 0)
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
	}

	/* terminate the line on every path, with or without a core number */
	pr_cont("\n");

	pr_emerg("%s.\n", EXT_ERR_MSG(regs->nbsl));

	if (BUS_ERROR(ec) && nb_bus_decoder)
		nb_bus_decoder(node_id, regs);
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
/*
 * Decode an FR MCE from the raw status value @mc5_status. Only one
 * signature exists, so the whole 16-bit error code is compared at once.
 */
static void amd_decode_fr_mce(u64 mc5_status)
{
	u64 signature = mc5_status & 0xffff;

	if (signature != 0x0f0f) {
		pr_warning("Corrupted FR MCE info?\n");
		return;
	}

	pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
}
/*
 * Print the generic decoding of the 16-bit MCA error code @ec. Which
 * fields are printed depends on whether @ec is a TLB, memory or bus
 * error. Every message is a complete, newline-terminated line.
 */
static inline void amd_decode_err_code(unsigned int ec)
{
	if (TLB_ERROR(ec)) {
		pr_emerg("Transaction: %s, Cache Level %s\n",
			 TT_MSG(ec), LL_MSG(ec));
	} else if (MEM_ERROR(ec)) {
		/* trailing newline added to match the sibling branches */
		pr_emerg("Transaction: %s, Type: %s, Cache Level: %s\n",
			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
	} else if (BUS_ERROR(ec)) {
		pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
			 "Participating Processor: %s\n",
			 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
			 PP_MSG(ec));
	} else
		pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
}
/*
 * Notifier callback that decodes one MCE record.
 *
 * @nb:   notifier block (unused here)
 * @val:  notifier value (unused here)
 * @data: pointer to the struct mce to decode
 *
 * Prints a summary of the status bits, dispatches to the per-bank decoder
 * selected by m->bank, then prints the generic error-code decoding.
 * Always returns NOTIFY_STOP.
 */
static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
struct err_regs regs;
int node, ecc;
pr_emerg("MC%d_STATUS: ", m->bank);
pr_cont("%sorrected error, other errors lost: %s, "
"CPU context corrupt: %s",
((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
/*
 * Status bits 46:45 (bits [14:13] of the upper 32 bits) are read
 * together: 2 prints "CECC Error", any other non-zero value "UECC Error".
 */
ecc = (m->status >> 45) & 0x3;
if (ecc)
pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
pr_cont("\n");
switch (m->bank) {
case 0:
amd_decode_dc_mce(m->status);
break;
case 1:
amd_decode_ic_mce(m->status);
break;
case 2:
amd_decode_bu_mce(m->status);
break;
case 3:
amd_decode_ls_mce(m->status);
break;
case 4:
/* split the 64-bit status/addr values into the 32-bit halves of err_regs */
regs.nbsl = (u32) m->status;
regs.nbsh = (u32)(m->status >> 32);
regs.nbeal = (u32) m->addr;
regs.nbeah = (u32)(m->addr >> 32);
/* NOTE(review): regs.nbcfg is not assigned on this path - confirm no callee reads it */
node = amd_get_nb_id(m->extcpu);
amd_decode_nb_mce(node, &regs, 1);
break;
case 5:
amd_decode_fr_mce(m->status);
break;
default:
break;
}
amd_decode_err_code(m->status & 0xffff);
return NOTIFY_STOP;
}
/* notifier block that routes MCE records to amd_decode_mce() */
static struct notifier_block amd_mce_dec_nb = {
.notifier_call = amd_decode_mce,
};
/*
 * Hook the decoder into x86_mce_decoder_chain, but only on AMD CPUs of
 * families 0xf through 0x11 (K8, F10h and F11h). Always returns 0, even
 * when the CPU is not supported and nothing is registered.
 */
static int __init mce_amd_init(void)
{
	bool is_amd = boot_cpu_data.x86_vendor == X86_VENDOR_AMD;
	bool family_ok = boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x11;

	if (is_amd && family_ok)
		atomic_notifier_chain_register(&x86_mce_decoder_chain,
					       &amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);
#ifdef MODULE
/* module exit: remove amd_mce_dec_nb from x86_mce_decoder_chain */
static void __exit mce_amd_exit(void)
{
atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
}
MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif

View File

@ -26,15 +26,6 @@ EXPORT_SYMBOL_GPL(edac_debug_level);
/* scope is to module level only */
struct workqueue_struct *edac_workqueue;
/*
* sysfs object: /sys/devices/system/edac
* need to export to other files in this modules
*/
static struct sysdev_class edac_class = {
.name = "edac",
};
static int edac_class_valid;
/*
* edac_op_state_to_string()
*/
@ -54,60 +45,6 @@ char *edac_op_state_to_string(int opstate)
return "UNKNOWN";
}
/*
* edac_get_edac_class()
*
* return pointer to the edac class of 'edac'
*/
struct sysdev_class *edac_get_edac_class(void)
{
struct sysdev_class *classptr = NULL;
if (edac_class_valid)
classptr = &edac_class;
return classptr;
}
/*
* edac_register_sysfs_edac_name()
*
* register the 'edac' into /sys/devices/system
*
* return:
* 0 success
* !0 error
*/
static int edac_register_sysfs_edac_name(void)
{
int err;
/* create the /sys/devices/system/edac directory */
err = sysdev_class_register(&edac_class);
if (err) {
debugf1("%s() error=%d\n", __func__, err);
return err;
}
edac_class_valid = 1;
return 0;
}
/*
* sysdev_class_unregister()
*
* unregister the 'edac' from /sys/devices/system
*/
static void edac_unregister_sysfs_edac_name(void)
{
/* only if currently registered, then unregister it */
if (edac_class_valid)
sysdev_class_unregister(&edac_class);
edac_class_valid = 0;
}
/*
* edac_workqueue_setup
* initialize the edac work queue for polling operations
@ -153,22 +90,12 @@ static int __init edac_init(void)
*/
edac_pci_clear_parity_errors();
/*
* perform the registration of the /sys/devices/system/edac class object
*/
if (edac_register_sysfs_edac_name()) {
edac_printk(KERN_ERR, EDAC_MC,
"Error initializing 'edac' kobject\n");
err = -ENODEV;
goto error;
}
/*
* now set up the mc_kset under the edac class object
*/
err = edac_sysfs_setup_mc_kset();
if (err)
goto sysfs_setup_fail;
goto error;
/* Setup/Initialize the workq for this core */
err = edac_workqueue_setup();
@ -183,9 +110,6 @@ static int __init edac_init(void)
workq_fail:
edac_sysfs_teardown_mc_kset();
sysfs_setup_fail:
edac_unregister_sysfs_edac_name();
error:
return err;
}
@ -201,7 +125,6 @@ static void __exit edac_exit(void)
/* tear down the various subsystems */
edac_workqueue_teardown();
edac_sysfs_teardown_mc_kset();
edac_unregister_sysfs_edac_name();
}
/*

View File

@ -42,7 +42,6 @@ extern void edac_device_unregister_sysfs_main_kobj(
struct edac_device_ctl_info *edac_dev);
extern int edac_device_create_sysfs(struct edac_device_ctl_info *edac_dev);
extern void edac_device_remove_sysfs(struct edac_device_ctl_info *edac_dev);
extern struct sysdev_class *edac_get_edac_class(void);
/* edac core workqueue: single CPU mode */
extern struct workqueue_struct *edac_workqueue;

View File

@ -7,7 +7,7 @@
*
*/
#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/edac.h>
#include <linux/slab.h>
#include <linux/ctype.h>
@ -354,7 +354,7 @@ static int edac_pci_main_kobj_setup(void)
/* First time, so create the main kobject and its
* controls and attributes
*/
edac_class = edac_get_edac_class();
edac_class = edac_get_sysfs_class();
if (edac_class == NULL) {
debugf1("%s() no edac_class\n", __func__);
err = -ENODEV;
@ -368,7 +368,7 @@ static int edac_pci_main_kobj_setup(void)
if (!try_module_get(THIS_MODULE)) {
debugf1("%s() try_module_get() failed\n", __func__);
err = -ENODEV;
goto decrement_count_fail;
goto mod_get_fail;
}
edac_pci_top_main_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
@ -403,6 +403,9 @@ kobject_init_and_add_fail:
kzalloc_fail:
module_put(THIS_MODULE);
mod_get_fail:
edac_put_sysfs_class();
decrement_count_fail:
/* if we are on this error exit, nothing to tear down */
atomic_dec(&edac_pci_sysfs_refcount);
@ -429,6 +432,7 @@ static void edac_pci_main_kobj_teardown(void)
__func__);
kobject_put(edac_pci_top_main_kobj);
}
edac_put_sysfs_class();
}
/*

View File

@ -3,10 +3,13 @@
*
* Author: Dave Jiang <djiang@mvista.com>
*
* 2007 (c) MontaVista Software, Inc. This file is licensed under
* the terms of the GNU General Public License version 2. This program
* is licensed "as is" without any warranty of any kind, whether express
* or implied.
* 2007 (c) MontaVista Software, Inc.
* 2010 (c) Advanced Micro Devices Inc.
* Borislav Petkov <borislav.petkov@amd.com>
*
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
* warranty of any kind, whether express or implied.
*
*/
#include <linux/module.h>
@ -23,6 +26,8 @@ EXPORT_SYMBOL_GPL(edac_handlers);
int edac_err_assert = 0;
EXPORT_SYMBOL_GPL(edac_err_assert);
static atomic_t edac_class_valid = ATOMIC_INIT(0);
/*
* called to determine if there is an EDAC driver interested in
* knowing an event (such as NMI) occurred
@ -44,3 +49,41 @@ void edac_atomic_assert_error(void)
edac_err_assert++;
}
EXPORT_SYMBOL_GPL(edac_atomic_assert_error);
/*
 * sysfs object: /sys/devices/system/edac
 *
 * Not static because it needs to be visible to other files; it is
 * registered on demand by edac_get_sysfs_class() and unregistered by
 * edac_put_sysfs_class().
 */
struct sysdev_class edac_class = {
.name = "edac",
};
EXPORT_SYMBOL_GPL(edac_class);
/*
 * Return a pointer to the 'edac' node in sysfs, registering the sysdev
 * class first when edac_class_valid reads as zero. Each successful call
 * increments edac_class_valid.
 *
 * Returns &edac_class, or NULL when sysdev_class_register() fails.
 *
 * NOTE(review): the atomic_read() test and the registration are separate
 * steps; presumably callers are serialized - confirm.
 */
struct sysdev_class *edac_get_sysfs_class(void)
{
	if (!atomic_read(&edac_class_valid)) {
		/* create the /sys/devices/system/edac directory */
		int rc = sysdev_class_register(&edac_class);

		if (rc) {
			printk(KERN_ERR "Error registering toplevel EDAC sysfs dir\n");
			return NULL;
		}
	}

	atomic_inc(&edac_class_valid);

	return &edac_class;
}
EXPORT_SYMBOL_GPL(edac_get_sysfs_class);
/*
 * Decrement edac_class_valid and unregister the 'edac' sysdev class when
 * the counter reaches zero.
 */
void edac_put_sysfs_class(void)
{
/* last user unregisters it */
if (atomic_dec_and_test(&edac_class_valid))
sysdev_class_unregister(&edac_class);
}
EXPORT_SYMBOL_GPL(edac_put_sysfs_class);

680
drivers/edac/mce_amd.c Normal file
View File

@ -0,0 +1,680 @@
#include <linux/module.h>
#include <linux/slab.h>
#include "mce_amd.h"
static struct amd_decoder_ops *fam_ops;
static u8 nb_err_cpumask = 0xf;
static bool report_gart_errors;
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
/* Set the file-local report_gart_errors flag to @v. */
void amd_report_gart_errors(bool v)
{
report_gart_errors = v;
}
EXPORT_SYMBOL_GPL(amd_report_gart_errors);
/*
 * Install @f as the callback stored in nb_bus_decoder. Any previously
 * stored callback is overwritten without a check.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
{
nb_bus_decoder = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
/*
 * Clear the registered NB bus-error decoder, if there is one. Passing an
 * @f that differs from the registered callback triggers a WARN, but the
 * callback is cleared regardless.
 */
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
{
	if (!nb_bus_decoder)
		return;

	WARN_ON(nb_bus_decoder != f);
	nb_bus_decoder = NULL;
}
EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
/*
* string representation for the different MCA reported error types, see F3x48
* or MSR0000_0411.
*/
/* transaction type, one string per 2-bit encoding */
const char *tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};
EXPORT_SYMBOL_GPL(tt_msgs);
/* cache level, one string per 2-bit encoding */
const char *ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};
EXPORT_SYMBOL_GPL(ll_msgs);
/*
 * memory transaction type
 *
 * The decoders in this file extract RRRR as a 4-bit field
 * ((ec >> 4) & 0xf), so the table carries all 16 encodings. The encodings
 * without a defined name map to "RESV" so that printing one of them never
 * reads past the end of the table.
 */
const char *rrrr_msgs[16] = {
	"GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP",
	"RESV", "RESV", "RESV", "RESV", "RESV", "RESV", "RESV"
};
EXPORT_SYMBOL_GPL(rrrr_msgs);
/* participating processor, one string per 2-bit encoding */
const char *pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
/* request timeout: index 0 = no timeout, index 1 = timed out */
const char *to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
EXPORT_SYMBOL_GPL(to_msgs);
/* memory or i/o, one string per 2-bit encoding */
const char *ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(ii_msgs);
/* F10h NB MCE descriptions; eight entries in table order */
static const char *f10h_nb_mce_desc[] = {
	[0] = "HT link data error",
	[1] = "Protocol error (link, L3, probe filter, etc.)",
	[2] = "Parity error in NB-internal arrays",
	[3] = "Link Retry due to IO link transmission error",
	[4] = "L3 ECC data cache error",
	[5] = "ECC error in L3 cache tag",
	[6] = "L3 LRU parity bits error",
	[7] = "ECC Error in the Probe Filter directory",
};
/*
 * F12h data cache decoder: finishes the already-opened log line for a
 * memory error at cache level L1 or L2.
 *
 * Returns true when a message was printed, false for any other signature.
 */
static bool f12h_dc_mce(u16 ec)
{
	u8 level;

	if (!MEM_ERROR(ec))
		return false;

	level = ec & 0x3;

	if (level == LL_L2) {
		pr_cont("during L1 linefill from L2.\n");
		return true;
	}

	if (level == LL_L1) {
		pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
		return true;
	}

	return false;
}
/*
 * F10h data cache decoder: recognizes the data-scrub signature (RRRR ==
 * R4_GEN at cache level L1) and hands everything else to f12h_dc_mce().
 *
 * Returns true when a message was printed.
 */
static bool f10h_dc_mce(u16 ec)
{
	bool scrub = ((ec >> 4) & 0xf) == R4_GEN && (ec & 0x3) == LL_L1;

	if (!scrub)
		return f12h_dc_mce(ec);

	pr_cont("during data scrub.\n");
	return true;
}
/*
 * K8 data cache decoder: bus errors are reported as system linefill
 * errors, everything else is handed to f10h_dc_mce().
 *
 * Returns true when a message was printed.
 */
static bool k8_dc_mce(u16 ec)
{
	if (!BUS_ERROR(ec))
		return f10h_dc_mce(ec);

	pr_cont("during system linefill.\n");
	return true;
}
/*
 * F14h data cache decoder: finishes the already-opened log line for
 * memory and bus error signatures.
 *
 * Returns true when the signature was recognized, false otherwise. Note
 * that on the bus-error path the "System read data error on a " prefix
 * has already been printed when an unknown RRRR value makes it return
 * false.
 */
static bool f14h_dc_mce(u16 ec)
{
/* error code sub-fields: RRRR = bits 7:4, LL = bits 1:0, TT = bits 3:2 */
u8 r4 = (ec >> 4) & 0xf;
u8 ll = ec & 0x3;
u8 tt = (ec >> 2) & 0x3;
/* the same bits 3:2 are compared against the II_* values for bus errors */
u8 ii = tt;
bool ret = true;
if (MEM_ERROR(ec)) {
/* only data transactions at cache level L1 are decoded */
if (tt != TT_DATA || ll != LL_L1)
return false;
switch (r4) {
case R4_DRD:
case R4_DWR:
pr_cont("Data/Tag parity error due to %s.\n",
(r4 == R4_DRD ? "load/hw prf" : "store"));
break;
case R4_EVICT:
pr_cont("Copyback parity error on a tag miss.\n");
break;
case R4_SNOOP:
pr_cont("Tag parity error during snoop.\n");
break;
default:
ret = false;
}
} else if (BUS_ERROR(ec)) {
/* only memory or IO accesses with LL == LL_LG are decoded */
if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
return false;
pr_cont("System read data error on a ");
switch (r4) {
case R4_RD:
pr_cont("TLB reload.\n");
break;
case R4_DWR:
pr_cont("store.\n");
break;
case R4_DRD:
pr_cont("load.\n");
break;
default:
ret = false;
}
} else {
ret = false;
}
return ret;
}
static void amd_decode_dc_mce(struct mce *m)
{
u16 ec = m->status & 0xffff;
u8 xec = (m->status >> 16) & 0xf;
pr_emerg(HW_ERR "Data Cache Error: ");
/* TLB error signatures are the same across families */
if (TLB_ERROR(ec)) {
u8 tt = (ec >> 2) & 0x3;
if (tt == TT_DATA) {
pr_cont("%s TLB %s.\n", LL_MSG(ec),
(xec ? "multimatch" : "parity error"));
return;
}
else
goto wrong_dc_mce;
}
if (!fam_ops->dc_mce(ec))
goto wrong_dc_mce;
return;
wrong_dc_mce:
pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
}
/*
 * K8 instruction cache decoder: finishes the already-opened log line for
 * memory errors at cache level 2 or 1.
 *
 * Returns true when a message was printed, false for any other signature.
 */
static bool k8_ic_mce(u16 ec)
{
	u8 level = ec & 0x3;
	u8 rrrr = (ec >> 4) & 0xf;

	if (!MEM_ERROR(ec))
		return false;

	if (level == 0x2) {
		pr_cont("during a linefill from L2.\n");
		return true;
	}

	if (level != 0x1)
		return false;

	switch (rrrr) {
	case R4_IRD:
		pr_cont("Parity error during data load.\n");
		return true;
	case R4_EVICT:
		pr_cont("Copyback Parity/Victim error.\n");
		return true;
	case R4_SNOOP:
		pr_cont("Tag Snoop error.\n");
		return true;
	default:
		return false;
	}
}
/*
 * F14h instruction cache decoder: finishes the already-opened log line
 * for a memory error with TT == 0 and LL == 1.
 *
 * Returns true only when a message was printed. Every other signature
 * returns false before anything is printed, so the caller's "corrupted"
 * report never follows an apparently successful decode, and non-memory
 * errors are rejected the same way k8_ic_mce() rejects them.
 */
static bool f14h_ic_mce(u16 ec)
{
	u8 ll = ec & 0x3;
	u8 tt = (ec >> 2) & 0x3;
	u8 r4 = (ec >> 4) & 0xf;

	if (!MEM_ERROR(ec))
		return false;

	/* validate the signature before printing anything */
	if (tt != 0 || ll != 1)
		return false;

	if (r4 == R4_IRD)
		pr_cont("Data/tag array parity error for a tag hit.\n");
	else if (r4 == R4_SNOOP)
		pr_cont("Tag error during snoop/victimization.\n");
	else
		return false;

	return true;
}
/*
 * Decode an instruction cache machine check.
 *
 * TLB and bus error codes are described here directly; anything else goes
 * to the per-family ic_mce callback, and a "Corrupted IC MCE info?" line is
 * emitted when that callback does not recognize the error code.
 */
static void amd_decode_ic_mce(struct mce *m)
{
	u16 err_code = m->status & 0xffff;
	u8 ext_code = (m->status >> 16) & 0xf;

	pr_emerg(HW_ERR "Instruction Cache Error: ");

	if (TLB_ERROR(err_code)) {
		pr_cont("%s TLB %s.\n", LL_MSG(err_code),
			(ext_code ? "multimatch" : "parity error"));
		return;
	}

	if (BUS_ERROR(err_code)) {
		/* Status bit 58 selects the wording on family 0xf only. */
		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));

		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
		return;
	}

	if (!fam_ops->ic_mce(err_code))
		pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
}
/*
 * Decode a bus unit machine check.
 *
 * The extended error code selects the affected structure; for extended
 * code 0 the error code class (TLB/bus/memory) refines the description.
 * Any combination not matched below is reported as corrupted MCE info.
 */
static void amd_decode_bu_mce(struct mce *m)
{
	u32 ec = m->status & 0xffff;
	u32 xec = (m->status >> 16) & 0xf;

	pr_emerg(HW_ERR "Bus Unit Error");

	switch (xec) {
	case 0x1:
		pr_cont(" in the write data buffers.\n");
		return;

	case 0x3:
		pr_cont(" in the victim data buffers.\n");
		return;

	case 0x2:
		if (!MEM_ERROR(ec))
			break;
		pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
		return;

	case 0x0:
		if (TLB_ERROR(ec)) {
			pr_cont(": %s error in a Page Descriptor Cache or "
				"Guest TLB.\n", TT_MSG(ec));
			return;
		}

		if (BUS_ERROR(ec)) {
			pr_cont(": %s/ECC error in data read from NB: %s.\n",
				RRRR_MSG(ec), PP_MSG(ec));
			return;
		}

		if (MEM_ERROR(ec)) {
			u8 mem_trans = (ec >> 4) & 0xf;

			if (mem_trans >= 0x7) {
				pr_cont(": %s error during data copyback.\n",
					RRRR_MSG(ec));
				return;
			}

			if (mem_trans <= 0x1) {
				pr_cont(": %s parity/ECC error during data "
					"access from L2.\n", RRRR_MSG(ec));
				return;
			}
		}
		break;

	default:
		break;
	}

	pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
}
/*
 * Decode a load/store unit machine check.
 *
 * On family 0x14 only a "please report" notice is printed. Otherwise the
 * single accepted signature is extended code 0 with a bus error code whose
 * transaction is a data read or data write; everything else is reported as
 * corrupted MCE info.
 */
static void amd_decode_ls_mce(struct mce *m)
{
	u16 err_code = m->status & 0xffff;
	u8 ext_code = (m->status >> 16) & 0xf;

	if (boot_cpu_data.x86 == 0x14) {
		pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
			 " please report on LKML.\n");
		return;
	}

	pr_emerg(HW_ERR "Load Store Error");

	if (ext_code == 0x0) {
		u8 mem_trans = (err_code >> 4) & 0xf;

		if (BUS_ERROR(err_code) &&
		    (mem_trans == R4_DRD || mem_trans == R4_DWR)) {
			pr_cont(" during %s.\n", RRRR_MSG(err_code));
			return;
		}
	}

	pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
}
/*
 * Northbridge decoder for the K8-style extended error codes.
 *
 * Only @xec is examined; @ec is accepted to fit the nb_mce callback
 * signature. Returns true when a description was appended to the current
 * log line, false when the extended code is not handled here.
 */
static bool k8_nb_mce(u16 ec, u8 xec)
{
	switch (xec) {
	case 0x1:
		pr_cont("CRC error detected on HT link.\n");
		return true;

	case 0x5:
		pr_cont("Invalid GART PTE entry during GART table walk.\n");
		return true;

	case 0x6:
		pr_cont("Unsupported atomic RMW received from an IO link.\n");
		return true;

	case 0x0:
	case 0x8:
		/* These two codes are rejected on family 0x11. */
		if (boot_cpu_data.x86 == 0x11)
			return false;

		pr_cont("DRAM ECC error detected on the NB.\n");
		return true;

	case 0xd:
		pr_cont("Parity error on the DRAM addr/ctl signals.\n");
		return true;

	default:
		break;
	}

	return false;
}
/*
 * Northbridge decoder for F10h-style extended error codes.
 *
 * The K8 decoder gets the first try. The remaining codes are either looked
 * up in f10h_nb_mce_desc[] (the per-range base subtracted from @xec gives
 * the table index) or, for 0xf, distinguished by the error code class.
 * Returns true when a description was printed, false otherwise.
 */
static bool f10h_nb_mce(u16 ec, u8 xec)
{
	u8 desc_base;

	if (k8_nb_mce(ec, xec))
		return true;

	switch (xec) {
	case 0xa ... 0xc:
		desc_base = 10;
		break;

	case 0xe:
		desc_base = 11;
		break;

	case 0xf:
		if (TLB_ERROR(ec)) {
			pr_cont("GART Table Walk data error.\n");
			return true;
		}

		if (BUS_ERROR(ec)) {
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
			return true;
		}

		return false;

	case 0x1c ... 0x1f:
		desc_base = 24;
		break;

	default:
		return false;
	}

	pr_cont("%s.\n", f10h_nb_mce_desc[xec - desc_base]);

	return true;
}
/*
 * nb_mce callback that recognizes nothing: it ignores both arguments and
 * always returns false. mce_amd_init() installs it for families 0x12 and
 * 0x14.
 */
static bool nb_noop_mce(u16 ec, u8 xec)
{
return false;
}
/*
 * Decode a northbridge machine check.
 *
 * @node_id: node number, printed in the banner and handed to nb_bus_decoder.
 * @m:       the MCE record; only m->status is examined here.
 * @nbcfg:   passed through unchanged to nb_bus_decoder.
 *
 * The banner is built incrementally: "Northbridge Error, node N", then an
 * optional ", core: C", and only then the ": " separator, so that the core
 * information does not end up after the colon and glued to the description.
 */
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
	u8 xec = (m->status >> 16) & 0x1f;
	u16 ec = m->status & 0xffff;
	u32 nbsh = (u32)(m->status >> 32);

	pr_emerg(HW_ERR "Northbridge Error, node %d", node_id);

	/*
	 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
	 * value encoding has changed so interpret those differently
	 */
	if ((boot_cpu_data.x86 == 0x10) &&
	    (boot_cpu_data.x86_model > 7)) {
		if (nbsh & K8_NBSH_ERR_CPU_VAL)
			pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
	} else {
		u8 assoc_cpus = nbsh & nb_err_cpumask;

		if (assoc_cpus > 0)
			pr_cont(", core: %d", fls(assoc_cpus) - 1);
	}

	/* Terminate the banner once node and (optional) core are printed. */
	pr_cont(": ");

	/* Extended codes decoded the same way regardless of family. */
	switch (xec) {
	case 0x2:
		pr_cont("Sync error (sync packets on HT link detected).\n");
		return;

	case 0x3:
		pr_cont("HT Master abort.\n");
		return;

	case 0x4:
		pr_cont("HT Target abort.\n");
		return;

	case 0x7:
		pr_cont("NB Watchdog timeout.\n");
		return;

	case 0x9:
		pr_cont("SVM DMA Exclusion Vector error.\n");
		return;

	default:
		break;
	}

	if (!fam_ops->nb_mce(ec, xec))
		goto wrong_nb_mce;

	/* Hand xec 0x0/0x8 to the registered bus decoder, if there is one. */
	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) {
		if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
			nb_bus_decoder(node_id, m, nbcfg);
	}

	return;

wrong_nb_mce:
	pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
static void amd_decode_fr_mce(struct mce *m)
{
if (boot_cpu_data.x86 == 0xf ||
boot_cpu_data.x86 == 0x11)
goto wrong_fr_mce;
/* we have only one error signature so match all fields at once. */
if ((m->status & 0xffff) == 0x0f0f) {
pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
return;
}
wrong_fr_mce:
pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
}
/*
 * Print the generic breakdown of a 16-bit error code. Which fields are
 * shown depends on whether the code is a TLB, memory or bus error; a code
 * matching none of those classes is logged as unknown.
 */
static inline void amd_decode_err_code(u16 ec)
{
	if (TLB_ERROR(ec)) {
		pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
			 TT_MSG(ec), LL_MSG(ec));
		return;
	}

	if (MEM_ERROR(ec)) {
		pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
			 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
		return;
	}

	if (BUS_ERROR(ec)) {
		pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
			 "Participating Processor: %s\n",
			 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
			 PP_MSG(ec));
		return;
	}

	pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
}
/*
 * Decide whether an MCE should be dropped without being decoded.
 *
 * Returns true for signatures that must not be reported.
 */
static bool amd_filter_mce(struct mce *m)
{
	u8 ext_code = (m->status >> 16) & 0x1f;
	bool gart_tlb_err = (m->bank == 4 && ext_code == 0x5);

	/* NB GART TLB error reporting is disabled by default. */
	return gart_tlb_err && !report_gart_errors;
}
/*
 * Notifier callback that decodes one MCE record into human-readable form.
 *
 * @nb and @val are not used; @data points to the struct mce to decode.
 * Prints a status summary line, dispatches to the per-bank decoder and
 * finally prints the generic error code breakdown. Always returns
 * NOTIFY_STOP, including when the record is filtered out.
 */
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	const char *severity, *overflow, *pcc;
	int ecc_bits;

	if (amd_filter_mce(m))
		return NOTIFY_STOP;

	severity = (m->status & MCI_STATUS_UC) ? "Unc" : "C";
	overflow = (m->status & MCI_STATUS_OVER) ? "yes" : "no";
	pcc = (m->status & MCI_STATUS_PCC) ? "yes" : "no";

	pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);

	pr_cont("%sorrected error, other errors lost: %s, "
		"CPU context corrupt: %s",
		severity, overflow, pcc);

	/* do the two bits[14:13] together */
	ecc_bits = (m->status >> 45) & 0x3;
	if (ecc_bits)
		pr_cont(", %sECC Error", ((ecc_bits == 2) ? "C" : "U"));

	pr_cont("\n");

	switch (m->bank) {
	case 0:
		amd_decode_dc_mce(m);
		break;

	case 1:
		amd_decode_ic_mce(m);
		break;

	case 2:
		amd_decode_bu_mce(m);
		break;

	case 3:
		amd_decode_ls_mce(m);
		break;

	case 4:
		amd_decode_nb_mce(amd_get_nb_id(m->extcpu), m, 0);
		break;

	case 5:
		amd_decode_fr_mce(m);
		break;

	default:
		break;
	}

	amd_decode_err_code(m->status & 0xffff);

	return NOTIFY_STOP;
}
EXPORT_SYMBOL_GPL(amd_decode_mce);
/*
 * Notifier block that routes x86_mce_decoder_chain events to
 * amd_decode_mce(); registered in mce_amd_init().
 */
static struct notifier_block amd_mce_dec_nb = {
.notifier_call = amd_decode_mce,
};
/*
 * Set up in-kernel MCE decoding.
 *
 * Does nothing (and returns 0) on non-AMD CPUs and on families outside
 * 0xf..0x12 and 0x14 (models up to 0xf). Otherwise allocates the per-family
 * decoder ops, fills them in and registers the decoder on
 * x86_mce_decoder_chain.
 *
 * Returns 0 on success or when decoding is not applicable, -ENOMEM if the
 * ops structure cannot be allocated, -EINVAL on an unexpected family.
 */
static int __init mce_amd_init(void)
{
	bool fam_f_to_12h, fam_14h_low_models;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
		return 0;

	fam_f_to_12h = boot_cpu_data.x86 >= 0xf &&
		       boot_cpu_data.x86 <= 0x12;
	fam_14h_low_models = boot_cpu_data.x86 == 0x14 &&
			     boot_cpu_data.x86_model <= 0xf;

	if (!fam_f_to_12h && !fam_14h_low_models)
		return 0;

	fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
	if (!fam_ops)
		return -ENOMEM;

	switch (boot_cpu_data.x86) {
	case 0xf:
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = k8_nb_mce;
		break;

	case 0x10:
		fam_ops->dc_mce = f10h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	case 0x11:
		fam_ops->dc_mce = k8_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = f10h_nb_mce;
		break;

	case 0x12:
		fam_ops->dc_mce = f12h_dc_mce;
		fam_ops->ic_mce = k8_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;
		break;

	case 0x14:
		/* Family 0x14 also narrows the NB core mask. */
		nb_err_cpumask = 0x3;
		fam_ops->dc_mce = f14h_dc_mce;
		fam_ops->ic_mce = f14h_ic_mce;
		fam_ops->nb_mce = nb_noop_mce;
		break;

	default:
		printk(KERN_WARNING "Huh? What family is that: %d?!\n",
		       boot_cpu_data.x86);
		kfree(fam_ops);
		return -EINVAL;
	}

	pr_info("MCE: In-kernel MCE decoding enabled.\n");

	atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);

	return 0;
}
early_initcall(mce_amd_init);
#ifdef MODULE
/*
 * Module unload: take the decoder off x86_mce_decoder_chain first, then
 * free the per-family ops allocated in mce_amd_init().
 */
static void __exit mce_amd_exit(void)
{
atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
kfree(fam_ops);
}
MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif

View File

@ -1,11 +1,14 @@
#ifndef _EDAC_MCE_AMD_H
#define _EDAC_MCE_AMD_H
#include <linux/notifier.h>
#include <asm/mce.h>
/* Mask with only bit n set, built by shifting U64_C(1). */
#define BIT_64(n) (U64_C(1) << (n))
/* Low 16 bits of x. */
#define ERROR_CODE(x) ((x) & 0xffff)
/* 5-bit field of x starting at bit 16. */
#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
/* Message string for the extended error code of x, looked up in ext_msgs[]. */
#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
/* 8-bit syndrome fields of x, starting at bit 15 and bit 24 respectively. */
#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
@ -20,13 +23,14 @@
#define II_MSG(x) ii_msgs[II(x)]
#define LL(x) (((x) >> 0) & 0x3)
#define LL_MSG(x) ll_msgs[LL(x)]
#define RRRR(x) (((x) >> 4) & 0xf)
#define RRRR_MSG(x) rrrr_msgs[RRRR(x)]
#define TO(x) (((x) >> 8) & 0x1)
#define TO_MSG(x) to_msgs[TO(x)]
#define PP(x) (((x) >> 9) & 0x3)
#define PP_MSG(x) pp_msgs[PP(x)]
#define RRRR(x) (((x) >> 4) & 0xf)
#define RRRR_MSG(x) ((RRRR(x) < 9) ? rrrr_msgs[RRRR(x)] : "Wrong R4!")
#define K8_NBSH 0x4C
#define K8_NBSH_VALID_BIT BIT(31)
@ -41,13 +45,45 @@
#define K8_NBSH_UECC BIT(13)
#define K8_NBSH_ERR_SCRUBER BIT(8)
/*
 * Values of the 2-bit transaction type field of an error code; the decoders
 * extract it as (ec >> 2) & 0x3.
 */
enum tt_ids {
TT_INSTR = 0,
TT_DATA,
TT_GEN,
TT_RESV,
};
/*
 * Values of the 2-bit cache level field of an error code (bits [1:0], see
 * the LL() macro).
 */
enum ll_ids {
LL_RESV = 0,
LL_L1,
LL_L2,
LL_LG,
};
/*
 * Values of the "ii" field of a bus error code.
 * NOTE(review): the bit position is defined by the II() macro, which is not
 * visible in this hunk - confirm there.
 */
enum ii_ids {
II_MEM = 0,
II_RESV,
II_IO,
II_GEN,
};
/*
 * Values of the 4-bit memory transaction field of an error code, extracted
 * as (ec >> 4) & 0xf (see the RRRR() macro). Only the nine values 0..8 are
 * named here.
 */
enum rrrr_ids {
R4_GEN = 0,
R4_RD,
R4_WR,
R4_DRD,
R4_DWR,
R4_IRD,
R4_PREF,
R4_EVICT,
R4_SNOOP,
};
extern const char *tt_msgs[];
extern const char *ll_msgs[];
extern const char *rrrr_msgs[];
extern const char *pp_msgs[];
extern const char *to_msgs[];
extern const char *ii_msgs[];
extern const char *ext_msgs[];
/*
* relevant NB regs
@ -60,10 +96,19 @@ struct err_regs {
u32 nbeal;
};
/*
 * per-family decoder ops
 *
 * Each callback receives the 16-bit error code (nb_mce additionally the
 * extended error code) and returns true when it recognized the signature;
 * the callers print a "Corrupted ... MCE info?" line on false.
 */
struct amd_decoder_ops {
bool (*dc_mce)(u16);
bool (*ic_mce)(u16);
bool (*nb_mce)(u16, u8);
};
void amd_report_gart_errors(bool);
void amd_register_ecc_decoder(void (*f)(int, struct err_regs *));
void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *));
void amd_decode_nb_mce(int, struct err_regs *, int);
void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32));
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32));
void amd_decode_nb_mce(int, struct mce *, u32);
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data);
#endif /* _EDAC_MCE_AMD_H */

171
drivers/edac/mce_amd_inj.c Normal file
View File

@ -0,0 +1,171 @@
/*
* A simple MCE injection facility for testing the MCE decoding code. This
* driver should be built as module so that it can be loaded on production
* kernels for testing purposes.
*
* This file may be distributed under the terms of the GNU General Public
* License version 2.
*
* Copyright (c) 2010: Borislav Petkov <borislav.petkov@amd.com>
* Advanced Micro Devices Inc.
*/
#include <linux/kobject.h>
#include <linux/sysdev.h>
#include <linux/edac.h>
#include <asm/mce.h>
#include "mce_amd.h"
/*
 * A sysfs attribute of the MCE injection interface, pairing the generic
 * struct attribute with the show/store handlers defined in this file.
 */
struct edac_mce_attr {
struct attribute attr;
ssize_t (*show) (struct kobject *kobj, struct edac_mce_attr *attr, char *buf);
ssize_t (*store)(struct kobject *kobj, struct edac_mce_attr *attr,
const char *buf, size_t count);
};
/* Define a static struct edac_mce_attr named mce_attr_<_name> via __ATTR(). */
#define EDAC_MCE_ATTR(_name, _mode, _show, _store) \
static struct edac_mce_attr mce_attr_##_name = __ATTR(_name, _mode, _show, _store)
/* The "mce" kobject created at module init; parent of all attribute files. */
static struct kobject *mce_kobj;
/*
 * Collect all the MCi_XXX settings
 *
 * The store handlers fill in this record; writing the bank attribute passes
 * it to amd_decode_mce().
 */
static struct mce i_mce;
/*
 * Generate edac_inject_<reg>_store(), the sysfs store handler that parses a
 * hexadecimal value and saves it in i_mce.<reg>.
 *
 * The generated handler returns @count on success. On a parse error it logs
 * a message and returns the error without touching i_mce.<reg>, so that bad
 * input neither clobbers the previous setting nor is reported to the writer
 * as a successful write.
 */
#define MCE_INJECT_STORE(reg)						\
static ssize_t edac_inject_##reg##_store(struct kobject *kobj,		\
					 struct edac_mce_attr *attr,	\
					 const char *data, size_t count)\
{									\
	int ret = 0;							\
	unsigned long value;						\
									\
	ret = strict_strtoul(data, 16, &value);				\
	if (ret < 0) {							\
		printk(KERN_ERR "Error writing MCE " #reg " field.\n");	\
		return ret;						\
	}								\
									\
	i_mce.reg = value;						\
									\
	return count;							\
}

MCE_INJECT_STORE(status);
MCE_INJECT_STORE(misc);
MCE_INJECT_STORE(addr);
/*
 * Generate edac_inject_<reg>_show(), the sysfs show handler that prints
 * i_mce.<reg> as a zero-padded 16-digit hexadecimal number.
 */
#define MCE_INJECT_SHOW(reg) \
static ssize_t edac_inject_##reg##_show(struct kobject *kobj, \
struct edac_mce_attr *attr, \
char *buf) \
{ \
return sprintf(buf, "0x%016llx\n", i_mce.reg); \
}
MCE_INJECT_SHOW(status);
MCE_INJECT_SHOW(misc);
MCE_INJECT_SHOW(addr);
/* Mode 0644 attributes for the status, misc and addr fields. */
EDAC_MCE_ATTR(status, 0644, edac_inject_status_show, edac_inject_status_store);
EDAC_MCE_ATTR(misc, 0644, edac_inject_misc_show, edac_inject_misc_store);
EDAC_MCE_ATTR(addr, 0644, edac_inject_addr_show, edac_inject_addr_store);
/*
 * Store handler for the bank attribute.
 *
 * Writing a decimal bank number (0..5) both selects the bank to inject into
 * and triggers the injection: the collected record is handed straight to
 * amd_decode_mce(). Returns @count on success, -EINVAL on unparsable input
 * or a bank number above 5.
 */
static ssize_t edac_inject_bank_store(struct kobject *kobj,
				      struct edac_mce_attr *attr,
				      const char *data, size_t count)
{
	unsigned long bank;

	if (strict_strtoul(data, 10, &bank) < 0) {
		printk(KERN_ERR "Invalid bank value!\n");
		return -EINVAL;
	}

	if (bank > 5) {
		printk(KERN_ERR "Non-existant MCE bank: %lu\n", bank);
		return -EINVAL;
	}

	i_mce.bank = bank;

	amd_decode_mce(NULL, 0, &i_mce);

	return count;
}
/* Show handler for the bank attribute: prints i_mce.bank in decimal. */
static ssize_t edac_inject_bank_show(struct kobject *kobj,
struct edac_mce_attr *attr, char *buf)
{
return sprintf(buf, "%d\n", i_mce.bank);
}
EDAC_MCE_ATTR(bank, 0644, edac_inject_bank_show, edac_inject_bank_store);
/*
 * All attribute files of the injection interface; created one by one at
 * module init and removed again at module exit.
 */
static struct edac_mce_attr *sysfs_attrs[] = { &mce_attr_status, &mce_attr_misc,
&mce_attr_addr, &mce_attr_bank
};
/*
 * Module init: create the "mce" kobject below the EDAC sysfs class and
 * populate it with the injection attribute files.
 *
 * Returns 0 on success, -EINVAL if the EDAC class is unavailable, -ENOMEM
 * if the kobject cannot be created, or the sysfs_create_file() error. On
 * failure everything set up so far is torn down again.
 */
static int __init edac_init_mce_inject(void)
{
	struct sysdev_class *edac_class = NULL;
	int i, err = 0;

	edac_class = edac_get_sysfs_class();
	if (!edac_class)
		return -EINVAL;

	mce_kobj = kobject_create_and_add("mce", &edac_class->kset.kobj);
	if (!mce_kobj) {
		printk(KERN_ERR "Error creating a mce kset.\n");
		err = -ENOMEM;
		goto err_mce_kobj;
	}

	for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++) {
		err = sysfs_create_file(mce_kobj, &sysfs_attrs[i]->attr);
		if (err) {
			printk(KERN_ERR "Error creating %s in sysfs.\n",
			       sysfs_attrs[i]->attr.name);
			goto err_sysfs_create;
		}
	}
	return 0;

err_sysfs_create:
	/*
	 * i is the index of the file that failed to be created; remove only
	 * the ones before it. Pre-decrementing keeps the index from ever
	 * reaching -1 inside the loop body.
	 */
	while (--i >= 0)
		sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr);

	/*
	 * Drop the reference from kobject_create_and_add() so the kobject is
	 * freed; kobject_del() alone would only unlink it from sysfs.
	 */
	kobject_put(mce_kobj);

err_mce_kobj:
	edac_put_sysfs_class();

	return err;
}
/*
 * Module exit: remove the attribute files, release the "mce" kobject and
 * give back the reference on the EDAC sysfs class.
 */
static void __exit edac_exit_mce_inject(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++)
		sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr);

	/*
	 * Drop the reference from kobject_create_and_add() so the kobject is
	 * freed; kobject_del() alone would only unlink it from sysfs.
	 */
	kobject_put(mce_kobj);

	edac_put_sysfs_class();
}
module_init(edac_init_mce_inject);
module_exit(edac_exit_mce_inject);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Borislav Petkov <borislav.petkov@amd.com>");
MODULE_AUTHOR("AMD Inc.");
MODULE_DESCRIPTION("MCE injection facility for testing MCE decoding");

View File

@ -13,6 +13,7 @@
#define _LINUX_EDAC_H_
#include <asm/atomic.h>
#include <linux/sysdev.h>
#define EDAC_OPSTATE_INVAL -1
#define EDAC_OPSTATE_POLL 0
@ -22,9 +23,12 @@
extern int edac_op_state;
extern int edac_err_assert;
extern atomic_t edac_handlers;
extern struct sysdev_class edac_class;
extern int edac_handler_set(void);
extern void edac_atomic_assert_error(void);
extern struct sysdev_class *edac_get_sysfs_class(void);
extern void edac_put_sysfs_class(void);
static inline void opstate_init(void)
{