Files
linux/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h

96 lines
2.8 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: MIT */
/*
* Copyright © 2014-2019 Intel Corporation
*/
#ifndef _INTEL_GUC_LOG_H_
#define _INTEL_GUC_LOG_H_
#include <linux/mutex.h>
#include <linux/relay.h>
#include <linux/workqueue.h>
#include "intel_guc_fwif.h"
#include "i915_gem.h"
struct intel_guc;
#if defined(CONFIG_DRM_I915_DEBUG_GUC)
#define CRASH_BUFFER_SIZE SZ_2M
#define DEBUG_BUFFER_SIZE SZ_16M
drm/i915/guc: Update to GuC version 69.0.3 Update to the latest GuC release. The latest GuC firmware introduces a number of interface changes: GuC may return NO_RESPONSE_RETRY message for requests sent over CTB. Add support for this reply and try resending the request again as a new CTB message. A KLV (key-length-value) mechanism is now used for passing configuration data such as CTB management. With the new KLV scheme, the old CTB management actions are no longer used and are removed. Register capture on hang is now supported by GuC. Full i915 support for this will be added by a later patch. A minimum support of providing capture memory and register lists is required though, so add that in. The device id of the current platform needs to be provided at init time. The 'poll CS' w/a (Wa_22012773006) was blanket enabled by previous versions of GuC. It must now be explicitly requested by the KMD. So, add in the code to turn it on when relevant. The GuC log entry format has changed. This requires adding a new field to the log header structure to mark the wrap point at the end of the buffer (as the buffer size is no longer a multiple of the log entry size). New CTB notification messages are now sent for some things that were previously only sent via MMIO notifications. Of these, the crash dump notification was not really being handled by i915. It called the log flush code but that only flushed the regular debug log and then only if relay logging was enabled. So just report an error message instead. The 'exception' notification was just being ignored completely. So add an error message for that as well. Note that in either the crash dump or the exception case, the GuC is basically dead. The KMD will detect this via the heartbeat and trigger both an error log (which will include the crash dump as part of the GuC log) and a GT reset. So no other processing is really required. Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com> Reviewed-by: Matthew Brost <matthew.brost@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20220107000622.292081-3-John.C.Harrison@Intel.com
2022-01-06 16:06:21 -08:00
#define CAPTURE_BUFFER_SIZE SZ_4M
#elif defined(CONFIG_DRM_I915_DEBUG_GEM)
#define CRASH_BUFFER_SIZE SZ_1M
#define DEBUG_BUFFER_SIZE SZ_2M
drm/i915/guc: Update to GuC version 69.0.3 Update to the latest GuC release. The latest GuC firmware introduces a number of interface changes: GuC may return NO_RESPONSE_RETRY message for requests sent over CTB. Add support for this reply and try resending the request again as a new CTB message. A KLV (key-length-value) mechanism is now used for passing configuration data such as CTB management. With the new KLV scheme, the old CTB management actions are no longer used and are removed. Register capture on hang is now supported by GuC. Full i915 support for this will be added by a later patch. A minimum support of providing capture memory and register lists is required though, so add that in. The device id of the current platform needs to be provided at init time. The 'poll CS' w/a (Wa_22012773006) was blanket enabled by previous versions of GuC. It must now be explicitly requested by the KMD. So, add in the code to turn it on when relevant. The GuC log entry format has changed. This requires adding a new field to the log header structure to mark the wrap point at the end of the buffer (as the buffer size is no longer a multiple of the log entry size). New CTB notification messages are now sent for some things that were previously only sent via MMIO notifications. Of these, the crash dump notification was not really being handled by i915. It called the log flush code but that only flushed the regular debug log and then only if relay logging was enabled. So just report an error message instead. The 'exception' notification was just being ignored completely. So add an error message for that as well. Note that in either the crash dump or the exception case, the GuC is basically dead. The KMD will detect this via the heartbeat and trigger both an error log (which will include the crash dump as part of the GuC log) and a GT reset. So no other processing is really required. Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com> Reviewed-by: Matthew Brost <matthew.brost@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20220107000622.292081-3-John.C.Harrison@Intel.com
2022-01-06 16:06:21 -08:00
#define CAPTURE_BUFFER_SIZE SZ_1M
#else
#define CRASH_BUFFER_SIZE SZ_8K
#define DEBUG_BUFFER_SIZE SZ_64K
drm/i915/guc: Update to GuC version 69.0.3 Update to the latest GuC release. The latest GuC firmware introduces a number of interface changes: GuC may return NO_RESPONSE_RETRY message for requests sent over CTB. Add support for this reply and try resending the request again as a new CTB message. A KLV (key-length-value) mechanism is now used for passing configuration data such as CTB management. With the new KLV scheme, the old CTB management actions are no longer used and are removed. Register capture on hang is now supported by GuC. Full i915 support for this will be added by a later patch. A minimum support of providing capture memory and register lists is required though, so add that in. The device id of the current platform needs to be provided at init time. The 'poll CS' w/a (Wa_22012773006) was blanket enabled by previous versions of GuC. It must now be explicitly requested by the KMD. So, add in the code to turn it on when relevant. The GuC log entry format has changed. This requires adding a new field to the log header structure to mark the wrap point at the end of the buffer (as the buffer size is no longer a multiple of the log entry size). New CTB notification messages are now sent for some things that were previously only sent via MMIO notifications. Of these, the crash dump notification was not really being handled by i915. It called the log flush code but that only flushed the regular debug log and then only if relay logging was enabled. So just report an error message instead. The 'exception' notification was just being ignored completely. So add an error message for that as well. Note that in either the crash dump or the exception case, the GuC is basically dead. The KMD will detect this via the heartbeat and trigger both an error log (which will include the crash dump as part of the GuC log) and a GT reset. So no other processing is really required. Signed-off-by: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com> Reviewed-by: Matthew Brost <matthew.brost@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20220107000622.292081-3-John.C.Harrison@Intel.com
2022-01-06 16:06:21 -08:00
#define CAPTURE_BUFFER_SIZE SZ_16K
#endif
drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex This patch fixes lockdep issue due to circular locking dependency of struct_mutex, i_mutex_key, mmap_sem, relay_channels_mutex. For GuC log relay channel we create debugfs file that requires i_mutex_key lock and we are doing that under struct_mutex. So we introduced newer dependency as: &dev->struct_mutex --> &sb->s_type->i_mutex_key#3 --> &mm->mmap_sem However, there is dependency from mmap_sem to struct_mutex. Hence we separate the relay create/destroy operation from under struct_mutex. Also added runtime check of relay buffer status. Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> ====================================================== WARNING: possible circular locking dependency detected 4.15.0-rc6-CI-Patchwork_7614+ #1 Not tainted ------------------------------------------------------ debugfs_test/1388 is trying to acquire lock: (&dev->struct_mutex){+.+.}, at: [<00000000d5e1d915>] i915_mutex_lock_interruptible+0x47/0x130 [i915] but task is already holding lock: (&mm->mmap_sem){++++}, at: [<0000000029a9c131>] __do_page_fault+0x106/0x560 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_sem){++++}: _copy_to_user+0x1e/0x70 filldir+0x8c/0xf0 dcache_readdir+0xeb/0x160 iterate_dir+0xdc/0x140 SyS_getdents+0xa0/0x130 entry_SYSCALL_64_fastpath+0x1c/0x89 -> #2 (&sb->s_type->i_mutex_key#3){++++}: start_creating+0x59/0x110 __debugfs_create_file+0x2e/0xe0 relay_create_buf_file+0x62/0x80 relay_late_setup_files+0x84/0x250 guc_log_late_setup+0x4f/0x110 [i915] i915_guc_log_register+0x32/0x40 [i915] i915_driver_load+0x7b6/0x1720 [i915] i915_pci_probe+0x2e/0x90 [i915] pci_device_probe+0x9c/0x120 driver_probe_device+0x2a3/0x480 __driver_attach+0xd9/0xe0 bus_for_each_dev+0x57/0x90 bus_add_driver+0x168/0x260 driver_register+0x52/0xc0 do_one_initcall+0x39/0x150 do_init_module+0x56/0x1ef load_module+0x231c/0x2d70 SyS_finit_module+0xa5/0xe0 entry_SYSCALL_64_fastpath+0x1c/0x89 -> #1 (relay_channels_mutex){+.+.}: relay_open+0x12c/0x2b0 intel_guc_log_runtime_create+0xab/0x230 [i915] intel_guc_init+0x81/0x120 [i915] intel_uc_init+0x29/0xa0 [i915] i915_gem_init+0x182/0x530 [i915] i915_driver_load+0xaa9/0x1720 [i915] i915_pci_probe+0x2e/0x90 [i915] pci_device_probe+0x9c/0x120 driver_probe_device+0x2a3/0x480 __driver_attach+0xd9/0xe0 bus_for_each_dev+0x57/0x90 bus_add_driver+0x168/0x260 driver_register+0x52/0xc0 do_one_initcall+0x39/0x150 do_init_module+0x56/0x1ef load_module+0x231c/0x2d70 SyS_finit_module+0xa5/0xe0 entry_SYSCALL_64_fastpath+0x1c/0x89 -> #0 (&dev->struct_mutex){+.+.}: __mutex_lock+0x81/0x9b0 i915_mutex_lock_interruptible+0x47/0x130 [i915] i915_gem_fault+0x201/0x790 [i915] __do_fault+0x15/0x70 __handle_mm_fault+0x677/0xdc0 handle_mm_fault+0x14f/0x2f0 __do_page_fault+0x2d1/0x560 page_fault+0x4c/0x60 other info that might help us debug this: Chain exists of: &dev->struct_mutex --> &sb->s_type->i_mutex_key#3 --> &mm->mmap_sem Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mm->mmap_sem); lock(&sb->s_type->i_mutex_key#3); lock(&mm->mmap_sem); lock(&dev->struct_mutex); *** DEADLOCK *** 1 lock held by debugfs_test/1388: #0: (&mm->mmap_sem){++++}, at: [<0000000029a9c131>] __do_page_fault+0x106/0x560 stack backtrace: CPU: 2 PID: 1388 Comm: debugfs_test Not tainted 4.15.0-rc6-CI-Patchwork_7614+ #1 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./J4205-ITX, BIOS P1.10 09/29/2016 Call Trace: dump_stack+0x5f/0x86 print_circular_bug.isra.18+0x1d0/0x2c0 __lock_acquire+0x14ae/0x1b60 ? lock_acquire+0xaf/0x200 lock_acquire+0xaf/0x200 ? i915_mutex_lock_interruptible+0x47/0x130 [i915] __mutex_lock+0x81/0x9b0 ? i915_mutex_lock_interruptible+0x47/0x130 [i915] ? i915_mutex_lock_interruptible+0x47/0x130 [i915] ? i915_mutex_lock_interruptible+0x47/0x130 [i915] i915_mutex_lock_interruptible+0x47/0x130 [i915] ? __pm_runtime_resume+0x4f/0x80 i915_gem_fault+0x201/0x790 [i915] __do_fault+0x15/0x70 ? _raw_spin_unlock+0x29/0x40 __handle_mm_fault+0x677/0xdc0 handle_mm_fault+0x14f/0x2f0 __do_page_fault+0x2d1/0x560 ? page_fault+0x36/0x60 page_fault+0x4c/0x60 v2: Added lock protection to guc->log.runtime.relay_chan (Chris) Fixed locking inside guc_flush_logs uncovered by new lockdep. v3: Locking guc_read_update_log_buffer entirely with relay_lock. (Chris) Prepared intel_guc_init_early. Moved relay_lock inside relay_create relay_destroy, relay_file_create, guc_read_update_log_buffer. (Michal) Removed struct_mutex lock around guc_log_flush and removed usage of guc_log_has_relay() from runtime_create path as it needs struct_mutex lock. v4: Handle NULL relay sub buffer pointer earlier in read_update_log_buffer (Chris). Fixed comment suffix **/. (Michal) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104693 Testcase: igt/debugfs_test/read_all_entries # with enable_guc=1 and guc_log_level=1 Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Marta Lofstedt <marta.lofstedt@intel.com> Cc: Michal Winiarski <michal.winiarski@intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Link: https://patchwork.freedesktop.org/patch/msgid/1516808821-3638-3-git-send-email-sagar.a.kamble@intel.com
2018-01-24 21:16:58 +05:30
/*
* While we're using plain log level in i915, GuC controls are much more...
* "elaborate"? We have a couple of bits for verbosity, separate bit for actual
* log enabling, and separate bit for default logging - which "conveniently"
* ignores the enable bit.
*/
#define GUC_LOG_LEVEL_DISABLED 0
#define GUC_LOG_LEVEL_NON_VERBOSE 1
#define GUC_LOG_LEVEL_IS_ENABLED(x) ((x) > GUC_LOG_LEVEL_DISABLED)
#define GUC_LOG_LEVEL_IS_VERBOSE(x) ((x) > GUC_LOG_LEVEL_NON_VERBOSE)
#define GUC_LOG_LEVEL_TO_VERBOSITY(x) ({ \
typeof(x) _x = (x); \
GUC_LOG_LEVEL_IS_VERBOSE(_x) ? _x - 2 : 0; \
})
#define GUC_VERBOSITY_TO_LOG_LEVEL(x) ((x) + 2)
#define GUC_LOG_LEVEL_MAX GUC_VERBOSITY_TO_LOG_LEVEL(GUC_LOG_VERBOSITY_MAX)
struct intel_guc_log {
u32 level;
struct i915_vma *vma;
void *buf_addr;
struct {
bool buf_in_use;
bool started;
struct work_struct flush_work;
struct rchan *channel;
struct mutex lock;
u32 full_count;
} relay;
/* logging related stats */
struct {
u32 sampled_overflow;
u32 overflow;
u32 flush;
} stats[GUC_MAX_LOG_BUFFER];
};
void intel_guc_log_init_early(struct intel_guc_log *log);
drm/i915/guc: Extract GuC error capture lists on G2H notification. - Upon the G2H Notify-Err-Capture event, parse through the GuC Log Buffer (error-capture-subregion) and generate one or more capture-nodes. A single node represents a single "engine- instance-capture-dump" and contains at least 3 register lists: global, engine-class and engine-instance. An internal link list is maintained to store one or more nodes. - Because the link-list node generation happen before the call to i915_gpu_codedump, duplicate global and engine-class register lists for each engine-instance register dump if we find dependent-engine resets in a engine-capture-group. - When i915_gpu_coredump calls into capture_engine, (in a subsequent patch) we detach the matching node (guc-id, LRCA, etc) from the link list above and attach it to i915_gpu_coredump's intel_engine_coredump structure when have matching LRCA/guc-id/engine-instance. Additional notes to be aware of: - GuC generates the error capture dump into the GuC log buffer but this buffer is one big log buffer with 3 independent subregions within it. Each subregion is populated with different content and used in different ways and timings but all regions operate behave as independent ring buffers. Each guc-log subregion (general-logs, crash-dump and error- capture) has it's own guc_log_buffer_state that contain independent read and write pointers. Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com> Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com> Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20220321164527.2500062-11-alan.previn.teres.alexis@intel.com
2022-03-21 09:45:24 -07:00
bool intel_guc_check_log_buf_overflow(struct intel_guc_log *log, enum guc_log_buffer_type type,
unsigned int full_cnt);
unsigned int intel_guc_get_log_buffer_size(enum guc_log_buffer_type type);
size_t intel_guc_get_log_buffer_offset(enum guc_log_buffer_type type);
int intel_guc_log_create(struct intel_guc_log *log);
void intel_guc_log_destroy(struct intel_guc_log *log);
int intel_guc_log_set_level(struct intel_guc_log *log, u32 level);
bool intel_guc_log_relay_created(const struct intel_guc_log *log);
int intel_guc_log_relay_open(struct intel_guc_log *log);
int intel_guc_log_relay_start(struct intel_guc_log *log);
void intel_guc_log_relay_flush(struct intel_guc_log *log);
void intel_guc_log_relay_close(struct intel_guc_log *log);
void intel_guc_log_handle_flush_event(struct intel_guc_log *log);
static inline u32 intel_guc_log_get_level(struct intel_guc_log *log)
{
return log->level;
}
void intel_guc_log_info(struct intel_guc_log *log, struct drm_printer *p);
int intel_guc_log_dump(struct intel_guc_log *log, struct drm_printer *p,
bool dump_load_err);
#endif