linux/drivers/gpu/drm/i915/gt/intel_gt_pm.c
Chris Wilson 4a0ca47a8e drm/i915/gt: Suspend tasklets before resume sanitization
It is possible for a residual tasklet to be pending execution as we
resume (whether that's some prior test kicking off the tasklet, or if we
are in a suspend/resume stress test). As such, we do not want that
tasklet to execute in the middle of our sanitization, such that it sees
the poisoned state. For example,

<4>[  449.386553] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
<4>[  449.386555] CPU: 1 PID: 5115 Comm: i915_selftest Tainted: G     U  W         5.7.0-rc4-CI-CI_DRM_8472+ #1
<4>[  449.386556] Hardware name: Intel Corporation Ice Lake Client Platform/IceLake U DDR4 SODIMM PD RVP TLC, BIOS ICLSFWR1.R00.3183.A00.1905020411 05/02/2019
<4>[  449.386585] RIP: 0010:process_csb+0x6bf/0x830 [i915]
<4>[  449.386588] Code: 00 48 c7 c2 10 bc 4c a0 48 c7 c7 d4 75 34 a0 e8 87 0e e6 e0 bf 01 00 00 00 e8 9d e0 e5 e0 31 f6 bf 09 00 00 00 e8 e1 ba d6 e0 <0f> 0b 8b 87 10 05 00 00 85 c0 0f 85 5f f9 ff ff 48 c7 c1 70 a5 4f
<4>[  449.386591] RSP: 0018:ffffc90000170ea0 EFLAGS: 00010297
<4>[  449.386594] RAX: 0000000080000101 RBX: 0000000000000000 RCX: 0000000000000000
<4>[  449.386596] RDX: ffff88849d5bc040 RSI: 0000000000000000 RDI: 0000000000000009
<4>[  449.386598] RBP: ffffc90000170f00 R08: 0000000000000000 R09: 0000000000000000
<4>[  449.386600] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88843ccea018
<4>[  449.386602] R13: ffff88843ccea658 R14: ffff88843ccea640 R15: ffff88843ccea000
<4>[  449.386605] FS:  00007f826a813300(0000) GS:ffff88849fe80000(0000) knlGS:0000000000000000
<4>[  449.386607] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
<4>[  449.386609] CR2: 0000560366b94280 CR3: 000000048ba02002 CR4: 0000000000760ee0
<4>[  449.386611] PKRU: 55555554
<4>[  449.386613] Call Trace:
<4>[  449.386616]  <IRQ>
<4>[  449.386646]  ? execlists_submission_tasklet+0xcf/0x140 [i915]
<4>[  449.386674]  execlists_submission_tasklet+0x2f/0x140 [i915]
<4>[  449.386679]  tasklet_action_common.isra.16+0x6c/0x1c0
<4>[  449.386684]  __do_softirq+0xdf/0x49e
<4>[  449.386687]  irq_exit+0xba/0xc0
<4>[  449.386690]  smp_apic_timer_interrupt+0xb7/0x280
<4>[  449.386693]  apic_timer_interrupt+0xf/0x20
<4>[  449.386695]  </IRQ>
<4>[  449.386698] RIP: 0010:_raw_spin_unlock_irqrestore+0x49/0x60
<4>[  449.386701] Code: c7 02 75 1f 53 9d e8 26 ab 75 ff bf 01 00 00 00 e8 7c a3 69 ff 65 8b 05 7d 9b 5c 7e 85 c0 74 0c 5b 5d c3 e8 09 aa 75 ff 53 9d <eb> df e8 ca 39 5b ff 5b 5d c3 0f 1f 00 66 2e 0f 1f 84 00 00 00 00
<4>[  449.386703] RSP: 0018:ffffc90000a6b950 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
<4>[  449.386706] RAX: 0000000080000001 RBX: 0000000000000202 RCX: 0000000000000000
<4>[  449.386708] RDX: ffff88849d5bc040 RSI: ffff88849d5bc900 RDI: ffffffff82386f12
<4>[  449.386710] RBP: ffff88847d400f00 R08: ffff88849d5bc900 R09: 0000000000000000
<4>[  449.386712] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000ffff0b0b
<4>[  449.386714] R13: 000000000000000c R14: ffff88847d40bf70 R15: ffff88847d40cef8
<4>[  449.386742]  reset_csb_pointers+0x59/0x140 [i915]
<4>[  449.386769]  execlists_sanitize+0x3e/0x60 [i915]
<4>[  449.386797]  gt_sanitize+0xd6/0x260 [i915]

As part of the reset preparation, engine->reset.prepare() prevents the
tasklet from running, so pull the sanitization inside the critical
section for reset.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1812
Fixes: 23122a4d99 ("drm/i915/gt: Scrub execlists state on resume")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200513122826.27484-1-chris@chris-wilson.co.uk
2020-05-13 17:13:16 +01:00

345 lines
7.8 KiB
C

/*
* SPDX-License-Identifier: MIT
*
* Copyright © 2019 Intel Corporation
*/
#include <linux/suspend.h>
#include "i915_drv.h"
#include "i915_globals.h"
#include "i915_params.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_clock_utils.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_llc.h"
#include "intel_pm.h"
#include "intel_rc6.h"
#include "intel_rps.h"
#include "intel_wakeref.h"
static void user_forcewake(struct intel_gt *gt, bool suspend)
{
int count = atomic_read(&gt->user_wakeref);
/* Inside suspend/resume so single threaded, no races to worry about. */
if (likely(!count))
return;
intel_gt_pm_get(gt);
if (suspend) {
GEM_BUG_ON(count > atomic_read(&gt->wakeref.count));
atomic_sub(count, &gt->wakeref.count);
} else {
atomic_add(count, &gt->wakeref.count);
}
intel_gt_pm_put(gt);
}
static int __gt_unpark(struct intel_wakeref *wf)
{
struct intel_gt *gt = container_of(wf, typeof(*gt), wakeref);
struct drm_i915_private *i915 = gt->i915;
GT_TRACE(gt, "\n");
i915_globals_unpark();
/*
* It seems that the DMC likes to transition between the DC states a lot
* when there are no connected displays (no active power domains) during
* command submission.
*
* This activity has negative impact on the performance of the chip with
* huge latencies observed in the interrupt handler and elsewhere.
*
* Work around it by grabbing a GT IRQ power domain whilst there is any
* GT activity, preventing any DC state transitions.
*/
gt->awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ);
GEM_BUG_ON(!gt->awake);
intel_rc6_unpark(&gt->rc6);
intel_rps_unpark(&gt->rps);
i915_pmu_gt_unparked(i915);
intel_gt_unpark_requests(gt);
return 0;
}
static int __gt_park(struct intel_wakeref *wf)
{
struct intel_gt *gt = container_of(wf, typeof(*gt), wakeref);
intel_wakeref_t wakeref = fetch_and_zero(&gt->awake);
struct drm_i915_private *i915 = gt->i915;
GT_TRACE(gt, "\n");
intel_gt_park_requests(gt);
i915_vma_parked(gt);
i915_pmu_gt_parked(i915);
intel_rps_park(&gt->rps);
intel_rc6_park(&gt->rc6);
/* Everything switched off, flush any residual interrupt just in case */
intel_synchronize_irq(i915);
/* Defer dropping the display power well for 100ms, it's slow! */
GEM_BUG_ON(!wakeref);
intel_display_power_put_async(i915, POWER_DOMAIN_GT_IRQ, wakeref);
i915_globals_park();
return 0;
}
static const struct intel_wakeref_ops wf_ops = {
.get = __gt_unpark,
.put = __gt_park,
};
void intel_gt_pm_init_early(struct intel_gt *gt)
{
intel_wakeref_init(&gt->wakeref, gt->uncore->rpm, &wf_ops);
}
void intel_gt_pm_init(struct intel_gt *gt)
{
/*
* Enabling power-management should be "self-healing". If we cannot
* enable a feature, simply leave it disabled with a notice to the
* user.
*/
intel_rc6_init(&gt->rc6);
intel_rps_init(&gt->rps);
}
static bool reset_engines(struct intel_gt *gt)
{
if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
return false;
return __intel_gt_reset(gt, ALL_ENGINES) == 0;
}
static void gt_sanitize(struct intel_gt *gt, bool force)
{
struct intel_engine_cs *engine;
enum intel_engine_id id;
intel_wakeref_t wakeref;
GT_TRACE(gt, "force:%s", yesno(force));
/* Use a raw wakeref to avoid calling intel_display_power_get early */
wakeref = intel_runtime_pm_get(gt->uncore->rpm);
intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
intel_gt_check_clock_frequency(gt);
/*
* As we have just resumed the machine and woken the device up from
* deep PCI sleep (presumably D3_cold), assume the HW has been reset
* back to defaults, recovering from whatever wedged state we left it
* in and so worth trying to use the device once more.
*/
if (intel_gt_is_wedged(gt))
intel_gt_unset_wedged(gt);
intel_uc_sanitize(&gt->uc);
for_each_engine(engine, gt, id)
if (engine->reset.prepare)
engine->reset.prepare(engine);
intel_uc_reset_prepare(&gt->uc);
for_each_engine(engine, gt, id)
if (engine->sanitize)
engine->sanitize(engine);
if (reset_engines(gt) || force) {
for_each_engine(engine, gt, id)
__intel_engine_reset(engine, false);
}
for_each_engine(engine, gt, id)
if (engine->reset.finish)
engine->reset.finish(engine);
intel_rps_sanitize(&gt->rps);
intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
intel_runtime_pm_put(gt->uncore->rpm, wakeref);
}
void intel_gt_pm_fini(struct intel_gt *gt)
{
intel_rc6_fini(&gt->rc6);
}
int intel_gt_resume(struct intel_gt *gt)
{
struct intel_engine_cs *engine;
enum intel_engine_id id;
int err;
err = intel_gt_has_init_error(gt);
if (err)
return err;
GT_TRACE(gt, "\n");
/*
* After resume, we may need to poke into the pinned kernel
* contexts to paper over any damage caused by the sudden suspend.
* Only the kernel contexts should remain pinned over suspend,
* allowing us to fixup the user contexts on their first pin.
*/
gt_sanitize(gt, true);
intel_gt_pm_get(gt);
intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
intel_rc6_sanitize(&gt->rc6);
if (intel_gt_is_wedged(gt)) {
err = -EIO;
goto out_fw;
}
/* Only when the HW is re-initialised, can we replay the requests */
err = intel_gt_init_hw(gt);
if (err) {
drm_err(&gt->i915->drm,
"Failed to initialize GPU, declaring it wedged!\n");
goto err_wedged;
}
intel_rps_enable(&gt->rps);
intel_llc_enable(&gt->llc);
for_each_engine(engine, gt, id) {
intel_engine_pm_get(engine);
engine->serial++; /* kernel context lost */
err = intel_engine_resume(engine);
intel_engine_pm_put(engine);
if (err) {
drm_err(&gt->i915->drm,
"Failed to restart %s (%d)\n",
engine->name, err);
goto err_wedged;
}
}
intel_rc6_enable(&gt->rc6);
intel_uc_resume(&gt->uc);
user_forcewake(gt, false);
out_fw:
intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
intel_gt_pm_put(gt);
return err;
err_wedged:
intel_gt_set_wedged(gt);
goto out_fw;
}
static void wait_for_suspend(struct intel_gt *gt)
{
if (!intel_gt_pm_is_awake(gt))
return;
if (intel_gt_wait_for_idle(gt, I915_GEM_IDLE_TIMEOUT) == -ETIME) {
/*
* Forcibly cancel outstanding work and leave
* the gpu quiet.
*/
intel_gt_set_wedged(gt);
intel_gt_retire_requests(gt);
}
intel_gt_pm_wait_for_idle(gt);
}
void intel_gt_suspend_prepare(struct intel_gt *gt)
{
user_forcewake(gt, true);
wait_for_suspend(gt);
intel_uc_suspend(&gt->uc);
}
static suspend_state_t pm_suspend_target(void)
{
#if IS_ENABLED(CONFIG_SUSPEND) && IS_ENABLED(CONFIG_PM_SLEEP)
return pm_suspend_target_state;
#else
return PM_SUSPEND_TO_IDLE;
#endif
}
void intel_gt_suspend_late(struct intel_gt *gt)
{
intel_wakeref_t wakeref;
/* We expect to be idle already; but also want to be independent */
wait_for_suspend(gt);
if (is_mock_gt(gt))
return;
GEM_BUG_ON(gt->awake);
/*
* On disabling the device, we want to turn off HW access to memory
* that we no longer own.
*
* However, not all suspend-states disable the device. S0 (s2idle)
* is effectively runtime-suspend, the device is left powered on
* but needs to be put into a low power state. We need to keep
* powermanagement enabled, but we also retain system state and so
* it remains safe to keep on using our allocated memory.
*/
if (pm_suspend_target() == PM_SUSPEND_TO_IDLE)
return;
with_intel_runtime_pm(gt->uncore->rpm, wakeref) {
intel_rps_disable(&gt->rps);
intel_rc6_disable(&gt->rc6);
intel_llc_disable(&gt->llc);
}
gt_sanitize(gt, false);
GT_TRACE(gt, "\n");
}
void intel_gt_runtime_suspend(struct intel_gt *gt)
{
intel_uc_runtime_suspend(&gt->uc);
GT_TRACE(gt, "\n");
}
int intel_gt_runtime_resume(struct intel_gt *gt)
{
GT_TRACE(gt, "\n");
intel_gt_init_swizzling(gt);
intel_ggtt_restore_fences(gt->ggtt);
return intel_uc_runtime_resume(&gt->uc);
}
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_gt_pm.c"
#endif