mainlining shenanigans
Triggering a GPU reset for one engine affects another, notably
corrupting the context status buffer (CSB) effectively losing track of
inflight requests.
Adding a few printks:
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ad41836fa5e5..a969456bc0fa 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1953,6 +1953,7 @@ int i915_reset_engine(struct intel_engine_cs *engine)
goto out;
}
+ pr_err("Resetting %s\n", engine->name);
ret = intel_gpu_reset(engine->i915, intel_engine_flag(engine));
if (ret) {
/* If we fail here, we expect to fallback to a global reset */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 716e5c9ea222..a72bc35d0870 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -355,6 +355,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
port_set(&port[n], port_pack(rq, count));
desc = execlists_update_context(rq);
+ pr_err("%s: in (rq=%x) ctx=%d\n", engine->name, rq->global_seqno, upper_32_bits(desc));
GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
} else {
GEM_BUG_ON(!n);
@@ -594,9 +595,23 @@ static void intel_lrc_irq_handler(unsigned long data)
if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
continue;
+ pr_err("%s: out CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n",
+ engine->name,
+ readl(csb_mmio),
+ head, tail,
+ readl(buf+2*head+1),
+ port->context_id);
+
/* Check the context/desc id for this event matches */
- GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
- port->context_id);
+ if (readl(buf + 2 * head + 1) != port->context_id) {
+ pr_err("%s: BUG CSB (%x head=%d, tail=%d), ctx=%d, rq=%d\n",
+ engine->name,
+ readl(csb_mmio),
+ head, tail,
+ readl(buf+2*head+1),
+ port->context_id);
+ BUG();
+ }
rq = port_unpack(port, &count);
GEM_BUG_ON(count == 0);
Results in:
[ 6423.006602] Resetting rcs0
[ 6423.009080] rcs0: in (rq=fffffe70) ctx=1
[ 6423.009216] rcs0: in (rq=fffffe6f) ctx=3
[ 6423.009542] rcs0: out CSB (2 head=1, tail=2), ctx=3, rq=3
[ 6423.009619] Resetting bcs0
[ 6423.009980] rcs0: BUG CSB (0 head=1, tail=2), ctx=0, rq=3
Note that this bug may be affect all machines and not just Broxton,
Broxton is just the first machine on which I have confirmed this bug.
Fixes:
|
||
---|---|---|
arch | ||
block | ||
certs | ||
crypto | ||
Documentation | ||
drivers | ||
firmware | ||
fs | ||
include | ||
init | ||
ipc | ||
kernel | ||
lib | ||
mm | ||
net | ||
samples | ||
scripts | ||
security | ||
sound | ||
tools | ||
usr | ||
virt | ||
.cocciconfig | ||
.get_maintainer.ignore | ||
.gitattributes | ||
.gitignore | ||
.mailmap | ||
COPYING | ||
CREDITS | ||
Kbuild | ||
Kconfig | ||
MAINTAINERS | ||
Makefile | ||
README |
Linux kernel ============ This file was moved to Documentation/admin-guide/README.rst Please notice that there are several guides for kernel developers and users. These guides can be rendered in a number of formats, like HTML and PDF. In order to build the documentation, use ``make htmldocs`` or ``make pdfdocs``. There are various text files in the Documentation/ subdirectory, several of them using the Restructured Text markup notation. See Documentation/00-INDEX for a list of what is contained in each file. Please read the Documentation/process/changes.rst file, as it contains the requirements for building and running the kernel, and information about the problems which may result by upgrading your kernel.