x86/mce: Add an AMD severities-grading function
Add a severities function that caters to AMD processors. This allows us to do some vendor-specific work within the function if necessary. Also, introduce a vendor flag bitfield for vendor-specific settings. The severities code uses this to define error scope based on the presence of the flags field. This is based off of work by Boris Petkov. Testing details: Fam10h, Model 9h (Greyhound) Fam15h: Models 0h-0fh (Orochi), 30h-3fh (Kaveri) and 60h-6fh (Carrizo), Fam16h Model 00h-0fh (Kabini) Boris: Intel SNB AMD K8 (JH-E0) Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com> Acked-by: Tony Luck <tony.luck@intel.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@kernel.org> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Chen Yucong <slaoub@gmail.com> Cc: Andy Lutomirski <luto@amacapital.net> Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/1427125373-2918-2-git-send-email-Aravind.Gopalakrishnan@amd.com [ Fixup build, clean up comments. ] Signed-off-by: Borislav Petkov <bp@suse.de>
This commit is contained in:
		
							parent
							
								
									c9ce871283
								
							
						
					
					
						commit
						bf80bbd7dc
					
				| @ -116,6 +116,12 @@ struct mca_config { | |||||||
| 	u32 rip_msr; | 	u32 rip_msr; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | struct mce_vendor_flags { | ||||||
|  | 	__u64		overflow_recov	: 1, /* cpuid_ebx(80000007) */ | ||||||
|  | 			__reserved_0	: 63; | ||||||
|  | }; | ||||||
|  | extern struct mce_vendor_flags mce_flags; | ||||||
|  | 
 | ||||||
| extern struct mca_config mca_cfg; | extern struct mca_config mca_cfg; | ||||||
| extern void mce_register_decode_chain(struct notifier_block *nb); | extern void mce_register_decode_chain(struct notifier_block *nb); | ||||||
| extern void mce_unregister_decode_chain(struct notifier_block *nb); | extern void mce_unregister_decode_chain(struct notifier_block *nb); | ||||||
|  | |||||||
| @ -186,12 +186,68 @@ static int error_context(struct mce *m) | |||||||
| 	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; | 	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * See AMD Error Scope Hierarchy table in a newer BKDG. For example | ||||||
|  |  * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" | ||||||
|  |  */ | ||||||
|  | static int mce_severity_amd(struct mce *m, enum context ctx) | ||||||
|  | { | ||||||
|  | 	/* Processor Context Corrupt, no need to fumble too much, die! */ | ||||||
|  | 	if (m->status & MCI_STATUS_PCC) | ||||||
|  | 		return MCE_PANIC_SEVERITY; | ||||||
|  | 
 | ||||||
|  | 	if (m->status & MCI_STATUS_UC) { | ||||||
|  | 
 | ||||||
|  | 		/*
 | ||||||
|  | 		 * On older systems where overflow_recov flag is not present, we | ||||||
|  | 		 * should simply panic if an error overflow occurs. If | ||||||
|  | 		 * overflow_recov flag is present and set, then software can try | ||||||
|  | 		 * to at least kill process to prolong system operation. | ||||||
|  | 		 */ | ||||||
|  | 		if (mce_flags.overflow_recov) { | ||||||
|  | 			/* software can try to contain */ | ||||||
|  | 			if (!(m->mcgstatus & MCG_STATUS_RIPV)) | ||||||
|  | 				if (ctx == IN_KERNEL) | ||||||
|  | 					return MCE_PANIC_SEVERITY; | ||||||
|  | 
 | ||||||
|  | 				/* kill current process */ | ||||||
|  | 				return MCE_AR_SEVERITY; | ||||||
|  | 		} else { | ||||||
|  | 			/* at least one error was not logged */ | ||||||
|  | 			if (m->status & MCI_STATUS_OVER) | ||||||
|  | 				return MCE_PANIC_SEVERITY; | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		/*
 | ||||||
|  | 		 * For any other case, return MCE_UC_SEVERITY so that we log the | ||||||
|  | 		 * error and exit #MC handler. | ||||||
|  | 		 */ | ||||||
|  | 		return MCE_UC_SEVERITY; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * deferred error: poll handler catches these and adds to mce_ring so | ||||||
|  | 	 * memory-failure can take recovery actions. | ||||||
|  | 	 */ | ||||||
|  | 	if (m->status & MCI_STATUS_DEFERRED) | ||||||
|  | 		return MCE_DEFERRED_SEVERITY; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * corrected error: poll handler catches these and passes responsibility | ||||||
|  | 	 * of decoding the error to EDAC | ||||||
|  | 	 */ | ||||||
|  | 	return MCE_KEEP_SEVERITY; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) | int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) | ||||||
| { | { | ||||||
| 	enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); | 	enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); | ||||||
| 	enum context ctx = error_context(m); | 	enum context ctx = error_context(m); | ||||||
| 	struct severity *s; | 	struct severity *s; | ||||||
| 
 | 
 | ||||||
|  | 	if (m->cpuvendor == X86_VENDOR_AMD) | ||||||
|  | 		return mce_severity_amd(m, ctx); | ||||||
|  | 
 | ||||||
| 	for (s = severities;; s++) { | 	for (s = severities;; s++) { | ||||||
| 		if ((m->status & s->mask) != s->result) | 		if ((m->status & s->mask) != s->result) | ||||||
| 			continue; | 			continue; | ||||||
|  | |||||||
| @ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); | |||||||
| DEFINE_PER_CPU(unsigned, mce_exception_count); | DEFINE_PER_CPU(unsigned, mce_exception_count); | ||||||
| 
 | 
 | ||||||
| struct mce_bank *mce_banks __read_mostly; | struct mce_bank *mce_banks __read_mostly; | ||||||
|  | struct mce_vendor_flags mce_flags __read_mostly; | ||||||
| 
 | 
 | ||||||
| struct mca_config mca_cfg __read_mostly = { | struct mca_config mca_cfg __read_mostly = { | ||||||
| 	.bootlog  = -1, | 	.bootlog  = -1, | ||||||
| @ -1534,6 +1535,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) | |||||||
| 		if (c->x86 == 6 && cfg->banks > 0) | 		if (c->x86 == 6 && cfg->banks > 0) | ||||||
| 			mce_banks[0].ctl = 0; | 			mce_banks[0].ctl = 0; | ||||||
| 
 | 
 | ||||||
|  | 		/*
 | ||||||
|  | 		 * overflow_recov is supported for F15h Models 00h-0fh | ||||||
|  | 		 * even though we don't have a CPUID bit for it. | ||||||
|  | 		 */ | ||||||
|  | 		if (c->x86 == 0x15 && c->x86_model <= 0xf) | ||||||
|  | 			mce_flags.overflow_recov = 1; | ||||||
|  | 
 | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * Turn off MC4_MISC thresholding banks on those models since | 		 * Turn off MC4_MISC thresholding banks on those models since | ||||||
| 		 * they're not supported there. | 		 * they're not supported there. | ||||||
| @ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) | |||||||
| 		break; | 		break; | ||||||
| 	case X86_VENDOR_AMD: | 	case X86_VENDOR_AMD: | ||||||
| 		mce_amd_feature_init(c); | 		mce_amd_feature_init(c); | ||||||
|  | 		mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; | ||||||
| 		break; | 		break; | ||||||
| 	default: | 	default: | ||||||
| 		break; | 		break; | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user