x86/MCE: Correct TSC timestamping of error records
We did have logic in the MCE code which would TSC-timestamp an error record only when it is exact - i.e., when it wasn't detected by polling. This isn't the case anymore. So let's fix that: We have a valid TSC timestamp in the error record only when it has been a precise detection, i.e., either in the #MC handler or in one of the interrupt handlers (thresholding, deferred, ...). All other error records still have mce.time which contains the wall time in order to be able to place the error record in time at least approximately. Also, this fixes another bug where machine_check_poll() would clear mce.tsc unconditionally even if we requested precise MCP_TIMESTAMP logging. The proper fix would be to generate timestamp only when it has been requested and not always. But that would require a more thorough code audit of all mce_gather_info/mce_setup() users. Add a FIXME for now. Signed-off-by: Borislav Petkov <bp@suse.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Tony <tony.luck@intel.com> Cc: Tony Luck <tony.luck@intel.com> Cc: kernel test robot <xiaolong.ye@intel.com> Cc: linux-edac <linux-edac@vger.kernel.org> Cc: lkp@01.org Link: http://lkml.kernel.org/r/20161110131053.kybsijfs5venpjnf@pd.tnic Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
c09a8c40e0
commit
54467353a9
@ -706,6 +706,15 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
|
|||||||
|
|
||||||
mce_gather_info(&m, NULL);
|
mce_gather_info(&m, NULL);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* m.tsc was set in mce_setup(). Clear it if not requested.
|
||||||
|
*
|
||||||
|
* FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid
|
||||||
|
* that dance.
|
||||||
|
*/
|
||||||
|
if (!(flags & MCP_TIMESTAMP))
|
||||||
|
m.tsc = 0;
|
||||||
|
|
||||||
for (i = 0; i < mca_cfg.banks; i++) {
|
for (i = 0; i < mca_cfg.banks; i++) {
|
||||||
if (!mce_banks[i].ctl || !test_bit(i, *b))
|
if (!mce_banks[i].ctl || !test_bit(i, *b))
|
||||||
continue;
|
continue;
|
||||||
@ -713,14 +722,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
|
|||||||
m.misc = 0;
|
m.misc = 0;
|
||||||
m.addr = 0;
|
m.addr = 0;
|
||||||
m.bank = i;
|
m.bank = i;
|
||||||
m.tsc = 0;
|
|
||||||
|
|
||||||
barrier();
|
barrier();
|
||||||
m.status = mce_rdmsrl(msr_ops.status(i));
|
m.status = mce_rdmsrl(msr_ops.status(i));
|
||||||
if (!(m.status & MCI_STATUS_VAL))
|
if (!(m.status & MCI_STATUS_VAL))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Uncorrected or signalled events are handled by the exception
|
* Uncorrected or signalled events are handled by the exception
|
||||||
* handler when it is enabled, so don't process those here.
|
* handler when it is enabled, so don't process those here.
|
||||||
@ -735,9 +742,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
|
|||||||
|
|
||||||
mce_read_aux(&m, i);
|
mce_read_aux(&m, i);
|
||||||
|
|
||||||
if (!(flags & MCP_TIMESTAMP))
|
|
||||||
m.tsc = 0;
|
|
||||||
|
|
||||||
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
|
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
|
||||||
|
|
||||||
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
|
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
|
||||||
@ -1394,7 +1398,7 @@ static void mce_timer_fn(unsigned long data)
|
|||||||
iv = __this_cpu_read(mce_next_interval);
|
iv = __this_cpu_read(mce_next_interval);
|
||||||
|
|
||||||
if (mce_available(this_cpu_ptr(&cpu_info))) {
|
if (mce_available(this_cpu_ptr(&cpu_info))) {
|
||||||
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
|
machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
|
||||||
|
|
||||||
if (mce_intel_cmci_poll()) {
|
if (mce_intel_cmci_poll()) {
|
||||||
iv = mce_adjust_timer(iv);
|
iv = mce_adjust_timer(iv);
|
||||||
|
@ -130,7 +130,7 @@ bool mce_intel_cmci_poll(void)
|
|||||||
* Reset the counter if we've logged an error in the last poll
|
* Reset the counter if we've logged an error in the last poll
|
||||||
* during the storm.
|
* during the storm.
|
||||||
*/
|
*/
|
||||||
if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
|
if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
|
||||||
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
|
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
|
||||||
else
|
else
|
||||||
this_cpu_dec(cmci_backoff_cnt);
|
this_cpu_dec(cmci_backoff_cnt);
|
||||||
@ -342,7 +342,7 @@ void cmci_recheck(void)
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
local_irq_save(flags);
|
local_irq_save(flags);
|
||||||
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
|
machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
|
||||||
local_irq_restore(flags);
|
local_irq_restore(flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user