// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include <linux/circ_buf.h>

#include "gem/i915_gem_context.h"
#include "gt/gen8_engine_cs.h"
#include "gt/intel_breadcrumbs.h"
#include "gt/intel_context.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_pm.h"
#include "gt/intel_lrc.h"
#include "gt/intel_mocs.h"
#include "gt/intel_ring.h"

#include "intel_guc_submission.h"

#include "i915_drv.h"
#include "i915_trace.h"

/**
 * DOC: GuC-based command submission
 *
 * IMPORTANT NOTE: GuC submission is currently not supported in i915. The GuC
 * firmware is moving to an updated submission interface and we plan to
 * turn submission back on when that lands. The below documentation (and
 * related code) matches the old submission model and will be updated as part
 * of the upgrade to the new flow.
 *
 * GuC stage descriptor:
 * During initialization, the driver allocates a static pool of 1024 such
 * descriptors, and shares them with the GuC. Currently, we only use one
 * descriptor. This stage descriptor lets the GuC know about the workqueue and
 * process descriptor. Theoretically, it also lets the GuC know about our HW
 * contexts (context ID, etc...), but we actually employ a kind of submission
 * where the GuC uses the LRCA sent via the work item instead. This is called
 * a "proxy" submission.
 *
 * The Scratch registers:
 * There are 16 MMIO-based registers starting from 0xC180. The kernel driver
 * writes a value to the action register (SOFT_SCRATCH_0) along with any data.
 * It then triggers an interrupt on the GuC via another register write
 * (0xC4C8). Firmware writes a success/fail code back to the action register
 * after processing the request. The kernel driver polls waiting for this
 * update and then proceeds. A sketch of this handshake follows this comment.
 *
 * Work Items:
 * There are several types of work items that the host may place into a
 * workqueue, each with its own requirements and limitations. Currently only
 * WQ_TYPE_INORDER is needed to support legacy submission via GuC, which
 * represents an in-order queue. The kernel driver packs the ring tail pointer
 * and an ELSP context descriptor dword into a Work Item.
 * See guc_add_request() and the sketch that follows it.
 */
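
/*
 * Illustrative sketch of the scratch-register handshake described in the
 * "The Scratch registers" section above. This is not the driver's actual
 * send path; the accessor calls and loop structure below are assumptions
 * used only to make the documented sequence concrete:
 *
 *	for (i = 0; i < len; i++)
 *		intel_uncore_write(uncore, SOFT_SCRATCH(i), action[i]);
 *	intel_uncore_write(uncore, GUC_SEND_INTERRUPT, GUC_SEND_TRIGGER);
 *	// poll until the firmware writes a success/fail code back to
 *	// SOFT_SCRATCH(0), then read any response data from the remaining
 *	// scratch registers
 */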

#define GUC_REQUEST_SIZE 64 /* bytes */

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
{
	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;

	return &base[id];
}

static int guc_stage_desc_pool_create(struct intel_guc *guc)
{
	u32 size = PAGE_ALIGN(sizeof(struct guc_stage_desc) *
			      GUC_MAX_STAGE_DESCRIPTORS);

	return intel_guc_allocate_and_map_vma(guc, size, &guc->stage_desc_pool,
					      &guc->stage_desc_pool_vaddr);
}

static void guc_stage_desc_pool_destroy(struct intel_guc *guc)
{
	i915_vma_unpin_and_release(&guc->stage_desc_pool, I915_VMA_RELEASE_MAP);
}

/*
 * Initialise/clear the stage descriptor shared with the GuC firmware.
 *
 * This descriptor tells the GuC where (in GGTT space) to find the important
 * data structures related to work submission (process descriptor, write queue,
 * etc).
 */
static void guc_stage_desc_init(struct intel_guc *guc)
{
	struct guc_stage_desc *desc;

	/* we only use 1 stage desc, so hardcode it to 0 */
	desc = __get_stage_desc(guc, 0);
	memset(desc, 0, sizeof(*desc));

	desc->attribute = GUC_STAGE_DESC_ATTR_ACTIVE |
			  GUC_STAGE_DESC_ATTR_KERNEL;

	desc->stage_id = 0;
	desc->priority = GUC_CLIENT_PRIORITY_KMD_NORMAL;

	desc->wq_size = GUC_WQ_SIZE;
}

static void guc_stage_desc_fini(struct intel_guc *guc)
{
	struct guc_stage_desc *desc;

	desc = __get_stage_desc(guc, 0);
	memset(desc, 0, sizeof(*desc));
}

static void guc_add_request(struct intel_guc *guc, struct i915_request *rq)
{
	/* Leaving stub as this function will be used in future patches */
}
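
/*
 * For reference, a rough sketch of how a work item for the in-order
 * workqueue (see the "Work Items" section in the DOC comment above) was
 * packed by the old submission flow. The structure and macro names are
 * taken from earlier versions of this code and may not match the upcoming
 * interface; treat this purely as an illustration of the layout:
 *
 *	struct guc_wq_item *wqi = <next free slot in the work queue>;
 *
 *	wqi->header = WQ_TYPE_INORDER |
 *		      (wqi_len << WQ_LEN_SHIFT) |
 *		      (engine->guc_id << WQ_TARGET_SHIFT) |
 *		      WQ_NO_WCFLUSH_WAIT;
 *	wqi->context_desc = <lower 32 bits of the lrc descriptor (LRCA)>;
 *	wqi->submit_element_info = ring_tail << WQ_RING_TAIL_SHIFT;
 *	wqi->fence_id = rq->fence.seqno;
 */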

/*
 * When we're doing submissions using the regular execlists backend, writing
 * to ELSP from the CPU side is enough to make sure that writes to ringbuffer
 * pages pinned in the mappable aperture portion of the GGTT are visible to
 * the command streamer. Writes done by the GuC on our behalf do not
 * guarantee such ordering, therefore, to ensure the flush, we issue a
 * POSTING READ.
 */
static void flush_ggtt_writes(struct i915_vma *vma)
{
	if (i915_vma_is_map_and_fenceable(vma))
		intel_uncore_posting_read_fw(vma->vm->gt->uncore,
					     GUC_STATUS);
}

static void guc_submit(struct intel_engine_cs *engine,
		       struct i915_request **out,
		       struct i915_request **end)
{
	struct intel_guc *guc = &engine->gt->uc.guc;

	do {
		struct i915_request *rq = *out++;

		flush_ggtt_writes(rq->ring->vma);
		guc_add_request(guc, rq);
	} while (out != end);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return rq->sched.attr.priority;
}

static struct i915_request *schedule_in(struct i915_request *rq, int idx)
{
	trace_i915_request_in(rq, idx);

	/*
	 * Currently we are not tracking the rq->context being inflight
	 * (ce->inflight = rq->engine). It is only used by the execlists
	 * backend at the moment, a similar counting strategy would be
	 * required if we generalise the inflight tracking.
	 */

	__intel_gt_pm_get(rq->engine->gt);
	return i915_request_get(rq);
}

static void schedule_out(struct i915_request *rq)
{
	trace_i915_request_out(rq);

	intel_gt_pm_put_async(rq->engine->gt);
	i915_request_put(rq);
}
|
|
|
|
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
static void __guc_dequeue(struct intel_engine_cs *engine)
|
2017-03-16 12:56:18 +00:00
|
|
|
{
|
2017-09-22 12:43:03 +00:00
|
|
|
struct intel_engine_execlists * const execlists = &engine->execlists;
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
struct i915_request **first = execlists->inflight;
|
|
|
|
struct i915_request ** const last_port = first + execlists->port_mask;
|
|
|
|
struct i915_request *last = first[0];
|
|
|
|
struct i915_request **port;
|
2017-03-16 12:56:18 +00:00
|
|
|
bool submit = false;
|
2017-09-14 08:32:13 +00:00
|
|
|
struct rb_node *rb;
|
|
|
|
|
2019-06-14 16:46:06 +00:00
|
|
|
lockdep_assert_held(&engine->active.lock);
|
2018-05-08 21:03:18 +00:00
|
|
|
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
if (last) {
|
|
|
|
if (*++first)
|
|
|
|
return;
|
|
|
|
|
|
|
|
last = NULL;
|
drm/i915/guc: Preemption! With GuC
Pretty similar to what we have on execlists.
We're reusing most of the GEM code, however, due to GuC quirks we need a
couple of extra bits.
Preemption is implemented as GuC action, and actions can be pretty slow.
Because of that, we're using a mutex to serialize them. Since we're
requesting preemption from the tasklet, the task of creating a workitem
and wrapping it in GuC action is delegated to a worker.
To distinguish that preemption has finished, we're using additional
piece of HWSP, and since we're not getting context switch interrupts,
we're also adding a user interrupt.
The fact that our special preempt context has completed unfortunately
doesn't mean that we're ready to submit new work. We also need to wait
for GuC to finish its own processing.
v2: Don't compile out the wait for GuC, handle workqueue flush on reset,
no need for ordered workqueue, put on a reviewer hat when looking at my own
patches (Chris)
Move struct work around in intel_guc, move user interruput outside of
conditional (Michał)
Keep ring around rather than chase though intel_context
v3: Extract WA for flushing ggtt writes to a helper (Chris)
Keep work_struct in intel_guc rather than engine (Michał)
Use ordered workqueue for inject_preempt worker to avoid GuC quirks.
v4: Drop now unused INTEL_GUC_PREEMPT_OPTION_IMMEDIATE (Daniele)
Drop stray newlines, use container_of for intel_guc in worker,
check for presence of workqueue when flushing it, rather than
enable_guc_submission modparam, reorder preempt postprocessing (Chris)
v5: Make wq NULL after destroying it
v6: Swap struct guc_preempt_work members (Michał)
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jeff McGee <jeff.mcgee@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Oscar Mateo <oscar.mateo@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20171026133558.19580-1-michal.winiarski@intel.com
2017-10-26 13:35:58 +00:00
|
|
|
}
|
|
|
|
|
2019-08-12 20:36:26 +00:00
|
|
|
/*
|
|
|
|
* We write directly into the execlists->inflight queue and don't use
|
|
|
|
* the execlists->pending queue, as we don't have a distinct switch
|
|
|
|
* event.
|
|
|
|
*/
|
drm/i915/execlists: Preempt-to-busy
When using a global seqno, we required a precise stop-the-workd event to
handle preemption and unwind the global seqno counter. To accomplish
this, we would preempt to a special out-of-band context and wait for the
machine to report that it was idle. Given an idle machine, we could very
precisely see which requests had completed and which we needed to feed
back into the run queue.
However, now that we have scrapped the global seqno, we no longer need
to precisely unwind the global counter and only track requests by their
per-context seqno. This allows us to loosely unwind inflight requests
while scheduling a preemption, with the enormous caveat that the
requests we put back on the run queue are still _inflight_ (until the
preemption request is complete). This makes request tracking much more
messy, as at any point then we can see a completed request that we
believe is not currently scheduled for execution. We also have to be
careful not to rewind RING_TAIL past RING_HEAD on preempting to the
running context, and for this we use a semaphore to prevent completion
of the request before continuing.
To accomplish this feat, we change how we track requests scheduled to
the HW. Instead of appending our requests onto a single list as we
submit, we track each submission to ELSP as its own block. Then upon
receiving the CS preemption event, we promote the pending block to the
inflight block (discarding what was previously being tracked). As normal
CS completion events arrive, we then remove stale entries from the
inflight tracker.
v2: Be a tinge paranoid and ensure we flush the write into the HWS page
for the GPU semaphore to pick in a timely fashion.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190620142052.19311-1-chris@chris-wilson.co.uk
2019-06-20 14:20:51 +00:00
|
|
|
port = first;
|
2018-06-29 07:53:20 +00:00
|
|
|
while ((rb = rb_first_cached(&execlists->queue))) {
|
2018-02-22 14:22:29 +00:00
|
|
|
struct i915_priolist *p = to_priolist(rb);
|
2018-02-21 09:56:36 +00:00
|
|
|
struct i915_request *rq, *rn;
|
2018-10-01 12:32:04 +00:00
|
|
|
int i;
|
drm/i915: Split execlist priority queue into rbtree + linked list
All the requests at the same priority are executed in FIFO order. They
do not need to be stored in the rbtree themselves, as they are a simple
list within a level. If we move the requests at one priority into a list,
we can then reduce the rbtree to the set of priorities. This should keep
the height of the rbtree small, as the number of active priorities can not
exceed the number of active requests and should be typically only a few.
Currently, we have ~2k possible different priority levels, that may
increase to allow even more fine grained selection. Allocating those in
advance seems a waste (and may be impossible), so we opt for allocating
upon first use, and freeing after its requests are depleted. To avoid
the possibility of an allocation failure causing us to lose a request,
we preallocate the default priority (0) and bump any request to that
priority if we fail to allocate it the appropriate plist. Having a
request (that is ready to run, so not leading to corruption) execute
out-of-order is better than leaking the request (and its dependency
tree) entirely.
There should be a benefit to reducing execlists_dequeue() to principally
using a simple list (and reducing the frequency of both rbtree iteration
and balancing on erase) but for typical workloads, request coalescing
should be small enough that we don't notice any change. The main gain is
from improving PI calls to schedule, and the explicit list within a
level should make request unwinding simpler (we just need to insert at
the head of the list rather than the tail and not have to make the
rbtree search more complicated).
v2: Avoid use-after-free when deleting a depleted priolist
v3: Michał found the solution to handling the allocation failure
gracefully. If we disable all priority scheduling following the
allocation failure, those requests will be executed in FIFO order and we will
ensure that this request and its dependencies are in strict FIFO order (even
when it doesn't realise it is only a single list). Normal scheduling is
restored once we know the device is idle, until the next failure!
Suggested-by: Michał Wajdeczko <michal.wajdeczko@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Reviewed-by: Michał Winiarski <michal.winiarski@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170517121007.27224-8-chris@chris-wilson.co.uk
2017-05-17 12:10:03 +00:00
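A minimal sketch of the split queue described above, again with toy names rather
than the driver's actual types: the engine keeps an rbtree keyed only by
priority, and each node carries a FIFO list of the requests at that level, so
unwinding a preempted request is a head insertion into its level rather than a
deeper tree operation.

#include <linux/list.h>
#include <linux/rbtree.h>

/* Toy mirror of the split queue: one node per active priority level. */
struct toy_priolist {
	struct rb_node node;		/* keyed by ->priority in the engine's rbtree */
	int priority;
	struct list_head requests;	/* FIFO of requests at this priority */
};

/*
 * New work is appended at the tail (cf. queue_request() below); unwinding a
 * preempted request re-inserts it at the head so it runs first at its level.
 */
static void toy_unwind(struct toy_priolist *p, struct list_head *rq_link)
{
	list_add(rq_link, &p->requests);
}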
|
|
|
|
2018-10-01 12:32:04 +00:00
|
|
|
priolist_for_each_request_consume(rq, rn, p, i) {
|
2019-12-20 10:12:29 +00:00
|
|
|
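/*
 * Each ELSP port carries requests from a single context: on a
 * context change, close the previous run into this port and
 * advance to the next, or stop dequeuing once every port is in use.
 */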
if (last && rq->context != last->context) {
|
2018-10-01 12:32:04 +00:00
|
|
|
if (port == last_port)
|
2017-05-17 12:10:03 +00:00
|
|
|
goto done;
|
|
|
|
|
2019-06-20 14:20:51 +00:00
|
|
|
*port = schedule_in(last,
|
|
|
|
port - execlists->inflight);
|
2017-05-17 12:10:03 +00:00
|
|
|
port++;
|
|
|
|
}
|
|
|
|
|
2018-10-01 12:32:04 +00:00
|
|
|
list_del_init(&rq->sched.link);
|
2018-02-21 09:56:36 +00:00
|
|
|
__i915_request_submit(rq);
|
2017-05-17 12:10:03 +00:00
|
|
|
submit = true;
|
2019-06-20 14:20:51 +00:00
|
|
|
last = rq;
|
2017-03-16 12:56:18 +00:00
|
|
|
}
|
|
|
|
|
2018-06-29 07:53:20 +00:00
|
|
|
rb_erase_cached(&p->node, &execlists->queue);
|
2019-02-28 10:20:33 +00:00
|
|
|
i915_priolist_free(p);
|
2018-02-22 14:22:29 +00:00
|
|
|
}
|
2017-05-17 12:10:03 +00:00
|
|
|
done:
|
2019-01-29 18:54:51 +00:00
|
|
|
execlists->queue_priority_hint =
|
|
|
|
rb ? to_priolist(rb)->priority : INT_MIN;
|
2019-06-20 14:20:51 +00:00
|
|
|
if (submit) {
|
|
|
|
*port = schedule_in(last, port - execlists->inflight);
|
|
|
|
*++port = NULL;
|
|
|
|
guc_submit(engine, first, port);
|
|
|
|
}
|
|
|
|
execlists->active = execlists->inflight;
|
2017-03-16 12:56:18 +00:00
|
|
|
}
|
|
|
|
|
2017-11-16 13:32:37 +00:00
|
|
|
static void guc_submission_tasklet(unsigned long data)
|
2017-03-16 12:56:18 +00:00
|
|
|
{
|
2017-09-22 12:43:03 +00:00
|
|
|
struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
|
2017-09-22 12:43:06 +00:00
|
|
|
struct intel_engine_execlists * const execlists = &engine->execlists;
|
2019-06-20 14:20:51 +00:00
|
|
|
struct i915_request **port, *rq;
|
2018-09-25 08:31:59 +00:00
|
|
|
unsigned long flags;
|
|
|
|
|
2019-06-14 16:46:06 +00:00
|
|
|
spin_lock_irqsave(&engine->active.lock, flags);
|
2017-03-16 12:56:18 +00:00
|
|
|
|
2019-06-20 14:20:51 +00:00
|
|
|
for (port = execlists->inflight; (rq = *port); port++) {
|
|
|
|
if (!i915_request_completed(rq))
|
|
|
|
break;
|
2017-05-17 12:10:00 +00:00
|
|
|
|
2019-06-20 14:20:51 +00:00
|
|
|
schedule_out(rq);
|
|
|
|
}
|
|
|
|
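/*
 * Completed requests have been scheduled out above; slide the
 * still-inflight entries down to the start of the array.
 */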
if (port != execlists->inflight) {
|
|
|
|
int idx = port - execlists->inflight;
|
|
|
|
int rem = ARRAY_SIZE(execlists->inflight) - idx;
|
|
|
|
memmove(execlists->inflight, port, rem * sizeof(*port));
|
2017-09-14 08:32:13 +00:00
|
|
|
}
|
2017-03-16 12:56:18 +00:00
|
|
|
|
2019-07-10 00:54:26 +00:00
|
|
|
__guc_dequeue(engine);
|
2018-09-25 08:31:59 +00:00
|
|
|
|
2019-06-14 16:46:06 +00:00
|
|
|
spin_unlock_irqrestore(&engine->active.lock, flags);
|
2017-03-16 12:56:18 +00:00
|
|
|
}
|
|
|
|
|
2019-01-25 13:22:28 +00:00
|
|
|
static void guc_reset_prepare(struct intel_engine_cs *engine)
|
2018-05-16 18:33:52 +00:00
|
|
|
{
|
|
|
|
struct intel_engine_execlists * const execlists = &engine->execlists;
|
|
|
|
|
2019-12-13 15:51:52 +00:00
|
|
|
ENGINE_TRACE(engine, "\n");
|
2018-05-16 18:33:52 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Prevent request submission to the hardware until we have
|
|
|
|
* completed the reset in i915_gem_reset_finish(). If a request
|
|
|
|
* is completed by one engine, it may then queue a request
|
|
|
|
* to a second via its execlists->tasklet *just* as we are
|
|
|
|
* calling engine->init_hw() and also writing the ELSP.
|
|
|
|
* Turning off the execlists->tasklet until the reset is over
|
|
|
|
* prevents the race.
|
|
|
|
*/
|
|
|
|
__tasklet_disable_sync_once(&execlists->tasklet);
|
|
|
|
}
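Taken together with guc_reset_finish() later in this file, the gating described
in the comment above reduces to the following pattern (condensed purely as an
illustration; the functions in this file are the authoritative code):

/* Illustrative condensation of the prepare/finish pairing in this file. */
__tasklet_disable_sync_once(&execlists->tasklet);	/* guc_reset_prepare() */
/* ... the reset runs while submission is quiesced ... */
if (__tasklet_enable(&execlists->tasklet))
	/* And kick in case we missed a new request submission. */
	tasklet_hi_schedule(&execlists->tasklet);	/* guc_reset_finish() */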
|
|
|
|
|
2020-12-19 02:03:42 +00:00
|
|
|
static void guc_reset_state(struct intel_context *ce,
|
|
|
|
struct intel_engine_cs *engine,
|
|
|
|
u32 head,
|
|
|
|
bool scrub)
|
|
|
|
{
|
|
|
|
GEM_BUG_ON(!intel_context_is_pinned(ce));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We want a simple context + ring to execute the breadcrumb update.
|
|
|
|
* We cannot rely on the context being intact across the GPU hang,
|
|
|
|
* so clear it and rebuild just what we need for the breadcrumb.
|
|
|
|
* All pending requests for this context will be zapped, and any
|
|
|
|
* future request will be after userspace has had the opportunity
|
|
|
|
* to recreate its own state.
|
|
|
|
*/
|
|
|
|
if (scrub)
|
|
|
|
lrc_init_regs(ce, engine, true);
|
|
|
|
|
|
|
|
/* Rerun the request; its payload has been neutered (if guilty). */
|
|
|
|
lrc_update_regs(ce, engine, head);
|
|
|
|
}
|
|
|
|
|
2019-12-22 12:07:52 +00:00
|
|
|
static void guc_reset_rewind(struct intel_engine_cs *engine, bool stalled)
|
2019-04-11 13:05:14 +00:00
|
|
|
{
|
|
|
|
struct intel_engine_execlists * const execlists = &engine->execlists;
|
|
|
|
struct i915_request *rq;
|
|
|
|
unsigned long flags;
|
|
|
|
|
2019-06-14 16:46:06 +00:00
|
|
|
spin_lock_irqsave(&engine->active.lock, flags);
|
2019-04-11 13:05:14 +00:00
|
|
|
|
|
|
|
/* Push back any incomplete requests for replay after the reset. */
|
|
|
|
rq = execlists_unwind_incomplete_requests(execlists);
|
|
|
|
if (!rq)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
if (!i915_request_started(rq))
|
|
|
|
stalled = false;
|
|
|
|
|
2019-07-12 19:29:53 +00:00
|
|
|
__i915_request_reset(rq, stalled);
|
2020-12-19 02:03:42 +00:00
|
|
|
guc_reset_state(rq->context, engine, rq->head, stalled);
|
2019-04-11 13:05:14 +00:00
|
|
|
|
|
|
|
out_unlock:
|
2019-06-14 16:46:06 +00:00
|
|
|
spin_unlock_irqrestore(&engine->active.lock, flags);
|
2019-04-11 13:05:14 +00:00
|
|
|
}
|
|
|
|
|
2019-12-22 12:07:52 +00:00
|
|
|
static void guc_reset_cancel(struct intel_engine_cs *engine)
|
2019-04-11 13:05:14 +00:00
|
|
|
{
|
|
|
|
struct intel_engine_execlists * const execlists = &engine->execlists;
|
|
|
|
struct i915_request *rq, *rn;
|
|
|
|
struct rb_node *rb;
|
|
|
|
unsigned long flags;
|
|
|
|
|
2019-12-13 15:51:52 +00:00
|
|
|
ENGINE_TRACE(engine, "\n");
|
2019-04-11 13:05:14 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Before we call engine->cancel_requests(), we should have exclusive
|
|
|
|
* access to the submission state. This is arranged for us by the
|
|
|
|
* caller disabling the interrupt generation, the tasklet and other
|
|
|
|
* threads that may then access the same state, giving us a free hand
|
|
|
|
* to reset state. However, we still need to let lockdep be aware that
|
|
|
|
* we know this state may be accessed in hardirq context, so we
|
|
|
|
* disable the irq around this manipulation and we want to keep
|
|
|
|
* the spinlock focused on its duties and not accidentally conflate
|
|
|
|
* coverage to the submission's irq state. (Similarly, although we
|
|
|
|
* shouldn't need to disable irq around the manipulation of the
|
|
|
|
* submission's irq state, we also wish to remind ourselves that
|
|
|
|
* it is irq state.)
|
|
|
|
*/
|
2019-06-14 16:46:06 +00:00
|
|
|
spin_lock_irqsave(&engine->active.lock, flags);
|
2019-04-11 13:05:14 +00:00
|
|
|
|
|
|
|
/* Mark all executing requests as skipped. */
|
2019-06-14 16:46:06 +00:00
|
|
|
list_for_each_entry(rq, &engine->active.requests, sched.link) {
|
2020-03-04 12:18:48 +00:00
|
|
|
i915_request_set_error_once(rq, -EIO);
|
2019-04-11 13:05:14 +00:00
|
|
|
i915_request_mark_complete(rq);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Flush the queued requests to the timeline list (for retiring). */
|
|
|
|
while ((rb = rb_first_cached(&execlists->queue))) {
|
|
|
|
struct i915_priolist *p = to_priolist(rb);
|
|
|
|
int i;
|
|
|
|
|
|
|
|
priolist_for_each_request_consume(rq, rn, p, i) {
|
|
|
|
list_del_init(&rq->sched.link);
|
|
|
|
__i915_request_submit(rq);
|
|
|
|
dma_fence_set_error(&rq->fence, -EIO);
|
|
|
|
i915_request_mark_complete(rq);
|
|
|
|
}
|
|
|
|
|
|
|
|
rb_erase_cached(&p->node, &execlists->queue);
|
|
|
|
i915_priolist_free(p);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Remaining _unready_ requests will be nop'ed when submitted */
|
|
|
|
|
|
|
|
execlists->queue_priority_hint = INT_MIN;
|
|
|
|
execlists->queue = RB_ROOT_CACHED;
|
|
|
|
|
2019-06-14 16:46:06 +00:00
|
|
|
spin_unlock_irqrestore(&engine->active.lock, flags);
|
2019-04-11 13:05:14 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void guc_reset_finish(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
struct intel_engine_execlists * const execlists = &engine->execlists;
|
|
|
|
|
|
|
|
if (__tasklet_enable(&execlists->tasklet))
|
|
|
|
/* And kick in case we missed a new request submission. */
|
|
|
|
tasklet_hi_schedule(&execlists->tasklet);
|
|
|
|
|
2019-12-13 15:51:52 +00:00
|
|
|
ENGINE_TRACE(engine, "depth->%d\n",
|
|
|
|
atomic_read(&execlists->tasklet.count));
|
2019-04-11 13:05:14 +00:00
|
|
|
}
|
|
|
|
|
2015-08-12 14:43:39 +00:00
|
|
|
/*
|
2017-03-22 17:39:52 +00:00
|
|
|
* Set up the memory resources to be shared with the GuC (via the GGTT)
|
|
|
|
* at firmware loading time.
|
2015-08-12 14:43:39 +00:00
|
|
|
*/
|
2017-11-16 13:32:39 +00:00
|
|
|
int intel_guc_submission_init(struct intel_guc *guc)
|
2015-08-12 14:43:39 +00:00
|
|
|
{
|
2017-03-22 17:39:46 +00:00
|
|
|
int ret;
|
2015-08-12 14:43:39 +00:00
|
|
|
|
2017-03-22 17:39:53 +00:00
|
|
|
if (guc->stage_desc_pool)
|
2017-03-22 17:39:46 +00:00
|
|
|
return 0;
|
2015-08-12 14:43:39 +00:00
|
|
|
|
2017-10-25 20:00:10 +00:00
|
|
|
ret = guc_stage_desc_pool_create(guc);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2017-11-06 11:48:33 +00:00
|
|
|
/*
|
|
|
|
* Keep static analysers happy, let them know that we allocated the
|
|
|
|
* vma after testing that it didn't exist earlier.
|
|
|
|
*/
|
|
|
|
GEM_BUG_ON(!guc->stage_desc_pool);
|
2017-03-22 17:39:45 +00:00
|
|
|
|
2015-08-12 14:43:39 +00:00
|
|
|
return 0;
|
2017-03-22 17:39:46 +00:00
|
|
|
}
|
|
|
|
|
2017-11-16 13:32:39 +00:00
|
|
|
void intel_guc_submission_fini(struct intel_guc *guc)
|
2017-03-22 17:39:46 +00:00
|
|
|
{
|
2019-12-05 22:02:42 +00:00
|
|
|
if (guc->stage_desc_pool) {
|
2018-07-13 17:26:58 +00:00
|
|
|
guc_stage_desc_pool_destroy(guc);
|
2019-12-05 22:02:42 +00:00
|
|
|
}
|
2016-11-29 12:10:23 +00:00
|
|
|
}
|
|
|
|
|
2019-07-13 10:00:14 +00:00
|
|
|
static void guc_interrupts_capture(struct intel_gt *gt)
|
2017-03-09 13:20:04 +00:00
|
|
|
{
|
2019-07-13 10:00:14 +00:00
|
|
|
struct intel_uncore *uncore = gt->uncore;
|
2019-11-05 22:53:21 +00:00
|
|
|
u32 irqs = GT_CONTEXT_SWITCH_INTERRUPT;
|
|
|
|
u32 dmask = irqs << 16 | irqs;
|
2017-03-09 13:20:04 +00:00
|
|
|
|
2019-11-05 22:53:21 +00:00
|
|
|
GEM_BUG_ON(INTEL_GEN(gt->i915) < 11);
|
2017-03-11 02:37:01 +00:00
|
|
|
|
2019-11-05 22:53:21 +00:00
|
|
|
/* Don't handle the ctx switch interrupt in GuC submission mode */
|
|
|
|
intel_uncore_rmw(uncore, GEN11_RENDER_COPY_INTR_ENABLE, dmask, 0);
|
|
|
|
intel_uncore_rmw(uncore, GEN11_VCS_VECS_INTR_ENABLE, dmask, 0);
|
2017-03-09 13:20:04 +00:00
|
|
|
}
|
|
|
|
|
2019-07-13 10:00:14 +00:00
|
|
|
static void guc_interrupts_release(struct intel_gt *gt)
|
2017-03-22 17:39:55 +00:00
|
|
|
{
|
2019-07-13 10:00:14 +00:00
|
|
|
struct intel_uncore *uncore = gt->uncore;
|
2019-11-05 22:53:21 +00:00
|
|
|
u32 irqs = GT_CONTEXT_SWITCH_INTERRUPT;
|
|
|
|
u32 dmask = irqs << 16 | irqs;
|
2017-03-22 17:39:55 +00:00
|
|
|
|
2019-11-05 22:53:21 +00:00
|
|
|
GEM_BUG_ON(INTEL_GEN(gt->i915) < 11);
|
|
|
|
|
|
|
|
/* Handle ctx switch interrupts again */
|
|
|
|
intel_uncore_rmw(uncore, GEN11_RENDER_COPY_INTR_ENABLE, 0, dmask);
|
|
|
|
intel_uncore_rmw(uncore, GEN11_VCS_VECS_INTR_ENABLE, 0, dmask);
|
2017-03-22 17:39:55 +00:00
|
|
|
}
|
|
|
|
|
2021-01-13 02:12:35 +00:00
|
|
|
static int guc_context_alloc(struct intel_context *ce)
|
|
|
|
{
|
|
|
|
return lrc_alloc(ce, ce->engine);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int guc_context_pre_pin(struct intel_context *ce,
|
|
|
|
struct i915_gem_ww_ctx *ww,
|
|
|
|
void **vaddr)
|
|
|
|
{
|
|
|
|
return lrc_pre_pin(ce, ce->engine, ww, vaddr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int guc_context_pin(struct intel_context *ce, void *vaddr)
|
|
|
|
{
|
|
|
|
return lrc_pin(ce, ce->engine, vaddr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct intel_context_ops guc_context_ops = {
|
|
|
|
.alloc = guc_context_alloc,
|
|
|
|
|
|
|
|
.pre_pin = guc_context_pre_pin,
|
|
|
|
.pin = guc_context_pin,
|
|
|
|
.unpin = lrc_unpin,
|
|
|
|
.post_unpin = lrc_post_unpin,
|
|
|
|
|
|
|
|
.enter = intel_context_enter_engine,
|
|
|
|
.exit = intel_context_exit_engine,
|
|
|
|
|
|
|
|
.reset = lrc_reset,
|
|
|
|
.destroy = lrc_destroy,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int guc_request_alloc(struct i915_request *request)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
GEM_BUG_ON(!intel_context_is_pinned(request->context));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush enough space to reduce the likelihood of waiting after
|
|
|
|
* we start building the request - in which case we will just
|
|
|
|
* have to repeat work.
|
|
|
|
*/
|
|
|
|
request->reserved_space += GUC_REQUEST_SIZE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note that after this point, we have committed to using
|
|
|
|
* this request as it is being used to both track the
|
|
|
|
* state of engine initialisation and liveness of the
|
|
|
|
* golden renderstate above. Think twice before you try
|
|
|
|
* to cancel/unwind this request now.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Unconditionally invalidate GPU caches and TLBs. */
|
|
|
|
ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
request->reserved_space -= GUC_REQUEST_SIZE;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-01-13 02:12:36 +00:00
|
|
|
static inline void queue_request(struct intel_engine_cs *engine,
|
|
|
|
struct i915_request *rq,
|
|
|
|
int prio)
|
|
|
|
{
|
|
|
|
GEM_BUG_ON(!list_empty(&rq->sched.link));
|
|
|
|
list_add_tail(&rq->sched.link,
|
|
|
|
i915_sched_lookup_priolist(engine, prio));
|
|
|
|
set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void guc_submit_request(struct i915_request *rq)
|
|
|
|
{
|
|
|
|
struct intel_engine_cs *engine = rq->engine;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
/* Will be called from irq-context when using foreign fences. */
|
|
|
|
spin_lock_irqsave(&engine->active.lock, flags);
|
|
|
|
|
|
|
|
queue_request(engine, rq, rq_prio(rq));
|
|
|
|
|
|
|
|
GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
|
|
|
|
GEM_BUG_ON(list_empty(&rq->sched.link));
|
|
|
|
|
|
|
|
tasklet_hi_schedule(&engine->execlists.tasklet);
|
|
|
|
|
|
|
|
spin_unlock_irqrestore(&engine->active.lock, flags);
|
|
|
|
}
|
|
|
|
|
2021-01-13 02:12:35 +00:00
|
|
|
static void sanitize_hwsp(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
struct intel_timeline *tl;
|
|
|
|
|
|
|
|
list_for_each_entry(tl, &engine->status_page.timelines, engine_link)
|
|
|
|
intel_timeline_reset_seqno(tl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void guc_sanitize(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Poison residual state on resume, in case the suspend didn't!
|
|
|
|
*
|
|
|
|
* We have to assume that across suspend/resume (or other loss
|
|
|
|
* of control) the contents of our pinned buffers have been
|
|
|
|
* lost, replaced by garbage. Since this doesn't always happen,
|
|
|
|
* let's poison such state so that we more quickly spot when
|
|
|
|
* we falsely assume it has been preserved.
|
|
|
|
*/
|
|
|
|
if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
|
|
|
|
memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The kernel_context HWSP is stored in the status_page. As above,
|
|
|
|
* that may be lost on resume/initialisation, and so we need to
|
|
|
|
* reset the value in the HWSP.
|
|
|
|
*/
|
|
|
|
sanitize_hwsp(engine);
|
|
|
|
|
|
|
|
/* And scrub the dirty cachelines for the HWSP */
|
|
|
|
clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void setup_hwsp(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
|
|
|
|
|
|
|
|
ENGINE_WRITE_FW(engine,
|
|
|
|
RING_HWS_PGA,
|
|
|
|
i915_ggtt_offset(engine->status_page.vma));
|
|
|
|
}
|
|
|
|
|
|
|
|
static void start_engine(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
ENGINE_WRITE_FW(engine,
|
|
|
|
RING_MODE_GEN7,
|
|
|
|
_MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE));
|
|
|
|
|
|
|
|
ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
|
|
|
|
ENGINE_POSTING_READ(engine, RING_MI_MODE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int guc_resume(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
|
|
|
|
|
|
|
|
intel_mocs_init_engine(engine);
|
|
|
|
|
|
|
|
intel_breadcrumbs_reset(engine->breadcrumbs);
|
|
|
|
|
|
|
|
setup_hwsp(engine);
|
|
|
|
start_engine(engine);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-07-17 20:29:32 +00:00
|
|
|
static void guc_set_default_submission(struct intel_engine_cs *engine)
|
|
|
|
{
|
2021-01-13 02:12:36 +00:00
|
|
|
engine->submit_request = guc_submit_request;
|
|
|
|
engine->schedule = i915_schedule;
|
2018-07-17 20:29:32 +00:00
|
|
|
engine->execlists.tasklet.func = guc_submission_tasklet;
|
|
|
|
|
|
|
|
engine->reset.prepare = guc_reset_prepare;
|
2019-12-22 12:07:52 +00:00
|
|
|
engine->reset.rewind = guc_reset_rewind;
|
|
|
|
engine->reset.cancel = guc_reset_cancel;
|
2019-04-11 13:05:14 +00:00
|
|
|
engine->reset.finish = guc_reset_finish;
|
|
|
|
|
2019-08-12 23:31:50 +00:00
|
|
|
engine->flags |= I915_ENGINE_NEEDS_BREADCRUMB_TASKLET;
|
2021-01-13 02:12:36 +00:00
|
|
|
engine->flags |= I915_ENGINE_HAS_PREEMPTION;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TODO: GuC supports timeslicing and semaphores as well, but they're
|
|
|
|
* handled by the firmware so some minor tweaks are required before
|
|
|
|
* enabling.
|
|
|
|
*
|
|
|
|
* engine->flags |= I915_ENGINE_HAS_TIMESLICES;
|
|
|
|
* engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
|
|
|
|
*/
|
|
|
|
|
|
|
|
engine->emit_bb_start = gen8_emit_bb_start;
|
2019-08-12 23:31:50 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For the breadcrumb irq to work we need the interrupts to stay
|
|
|
|
* enabled. However, on all platforms on which we'll have support for
|
|
|
|
* GuC submission we don't allow disabling the interrupts at runtime, so
|
|
|
|
* we're always safe with the current flow.
|
|
|
|
*/
|
|
|
|
GEM_BUG_ON(engine->irq_enable || engine->irq_disable);
|
2018-07-17 20:29:32 +00:00
|
|
|
}
|
|
|
|
|
2021-01-13 02:12:35 +00:00
|
|
|
static void guc_release(struct intel_engine_cs *engine)
|
2015-08-12 14:43:41 +00:00
|
|
|
{
|
2021-01-13 02:12:35 +00:00
|
|
|
engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
|
2019-08-02 18:40:54 +00:00
|
|
|
|
2021-01-13 02:12:35 +00:00
|
|
|
tasklet_kill(&engine->execlists.tasklet);
|
2015-08-18 21:34:47 +00:00
|
|
|
|
2021-01-13 02:12:35 +00:00
|
|
|
intel_engine_cleanup_common(engine);
|
|
|
|
lrc_fini_wa_ctx(engine);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void guc_default_vfuncs(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
/* Default vfuncs which can be overridden by each engine. */
|
|
|
|
|
|
|
|
engine->resume = guc_resume;
|
|
|
|
|
|
|
|
engine->cops = &guc_context_ops;
|
|
|
|
engine->request_alloc = guc_request_alloc;
|
|
|
|
|
|
|
|
engine->emit_flush = gen8_emit_flush_xcs;
|
|
|
|
engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
|
|
|
|
engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_xcs;
|
|
|
|
if (INTEL_GEN(engine->i915) >= 12) {
|
|
|
|
engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_xcs;
|
|
|
|
engine->emit_flush = gen12_emit_flush_xcs;
|
|
|
|
}
|
|
|
|
engine->set_default_submission = guc_set_default_submission;
|
|
|
|
}
|
2017-03-09 13:20:04 +00:00
|
|
|
|
2021-01-13 02:12:35 +00:00
|
|
|
static void rcs_submission_override(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
switch (INTEL_GEN(engine->i915)) {
|
|
|
|
case 12:
|
|
|
|
engine->emit_flush = gen12_emit_flush_rcs;
|
|
|
|
engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
|
|
|
|
break;
|
|
|
|
case 11:
|
|
|
|
engine->emit_flush = gen11_emit_flush_rcs;
|
|
|
|
engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
engine->emit_flush = gen8_emit_flush_rcs;
|
|
|
|
engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
|
|
|
|
break;
|
2016-09-09 13:11:53 +00:00
|
|
|
}
|
2015-08-12 14:43:41 +00:00
|
|
|
}
|
|
|
|
|
2021-01-13 02:12:35 +00:00
|
|
|
static inline void guc_default_irqs(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
engine->irq_keep_mask = GT_RENDER_USER_INTERRUPT;
|
|
|
|
}
|
|
|
|
|
|
|
|
int intel_guc_submission_setup(struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
struct drm_i915_private *i915 = engine->i915;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The setup relies on several assumptions (e.g. irqs always enabled)
|
|
|
|
* that are only valid on gen11+
|
|
|
|
*/
|
|
|
|
GEM_BUG_ON(INTEL_GEN(i915) < 11);
|
|
|
|
|
|
|
|
tasklet_init(&engine->execlists.tasklet,
|
|
|
|
guc_submission_tasklet, (unsigned long)engine);
|
|
|
|
|
|
|
|
guc_default_vfuncs(engine);
|
|
|
|
guc_default_irqs(engine);
|
|
|
|
|
|
|
|
if (engine->class == RENDER_CLASS)
|
|
|
|
rcs_submission_override(engine);
|
|
|
|
|
|
|
|
lrc_init_wa_ctx(engine);
|
|
|
|
|
|
|
|
/* Finally, take ownership and responsibility for cleanup! */
|
|
|
|
engine->sanitize = guc_sanitize;
|
|
|
|
engine->release = guc_release;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void intel_guc_submission_enable(struct intel_guc *guc)
|
|
|
|
{
|
|
|
|
guc_stage_desc_init(guc);
|
|
|
|
|
|
|
|
/* Take over from manual control of ELSP (execlists) */
|
|
|
|
guc_interrupts_capture(guc_to_gt(guc));
|
|
|
|
}
|
|
|
|
|
2017-11-16 13:32:39 +00:00
|
|
|
void intel_guc_submission_disable(struct intel_guc *guc)
|
2015-08-12 14:43:41 +00:00
|
|
|
{
|
2019-07-13 10:00:14 +00:00
|
|
|
struct intel_gt *gt = guc_to_gt(guc);
|
2015-08-12 14:43:41 +00:00
|
|
|
|
2019-07-13 10:00:14 +00:00
|
|
|
GEM_BUG_ON(gt->awake); /* GT should be parked first */
|
2017-10-25 14:39:42 +00:00
|
|
|
|
2019-12-05 22:02:42 +00:00
|
|
|
/* Note: By the time we're here, GuC may have already been reset */
|
|
|
|
|
2019-07-13 10:00:14 +00:00
|
|
|
guc_interrupts_release(gt);
|
2019-12-05 22:02:42 +00:00
|
|
|
|
|
|
|
guc_stage_desc_fini(guc);
|
2015-08-12 14:43:41 +00:00
|
|
|
}
|
2017-11-16 22:06:31 +00:00
|
|
|
|
2020-02-18 22:33:24 +00:00
|
|
|
static bool __guc_submission_selected(struct intel_guc *guc)
|
2019-07-31 22:33:20 +00:00
|
|
|
{
|
2020-06-18 15:04:02 +00:00
|
|
|
struct drm_i915_private *i915 = guc_to_gt(guc)->i915;
|
|
|
|
|
2020-02-18 22:33:24 +00:00
|
|
|
if (!intel_guc_submission_is_supported(guc))
|
2019-07-31 22:33:20 +00:00
|
|
|
return false;
|
|
|
|
|
2020-06-18 15:04:02 +00:00
|
|
|
return i915->params.enable_guc & ENABLE_GUC_SUBMISSION;
|
2019-07-31 22:33:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void intel_guc_submission_init_early(struct intel_guc *guc)
|
|
|
|
{
|
2020-02-18 22:33:24 +00:00
|
|
|
guc->submission_selected = __guc_submission_selected(guc);
|
2019-07-31 22:33:20 +00:00
|
|
|
}
|
2019-12-05 22:02:42 +00:00
|
|
|
|
|
|
|
bool intel_engine_in_guc_submission_mode(const struct intel_engine_cs *engine)
|
|
|
|
{
|
|
|
|
return engine->set_default_submission == guc_set_default_submission;
|
|
|
|
}
|