mirror of
https://github.com/torvalds/linux.git
synced 2024-11-10 14:11:52 +00:00
6fe60465e1
If stack_depot_save_flags() allocates memory, it always drops the
__GFP_NOLOCKDEP flag. So when KASAN tries to track a __GFP_NOLOCKDEP
allocation, we may end up with a lockdep splat like the one below:
======================================================
WARNING: possible circular locking dependency detected
6.9.0-rc3+ #49 Not tainted
------------------------------------------------------
kswapd0/149 is trying to acquire lock:
ffff88811346a920
(&xfs_nondir_ilock_class){++++}-{4:4}, at: xfs_reclaim_inode+0x3ac/0x590
[xfs]
but task is already holding lock:
ffffffff8bb33100 (fs_reclaim){+.+.}-{0:0}, at:
balance_pgdat+0x5d9/0xad0
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (fs_reclaim){+.+.}-{0:0}:
__lock_acquire+0x7da/0x1030
lock_acquire+0x15d/0x400
fs_reclaim_acquire+0xb5/0x100
prepare_alloc_pages.constprop.0+0xc5/0x230
__alloc_pages+0x12a/0x3f0
alloc_pages_mpol+0x175/0x340
stack_depot_save_flags+0x4c5/0x510
kasan_save_stack+0x30/0x40
kasan_save_track+0x10/0x30
__kasan_slab_alloc+0x83/0x90
kmem_cache_alloc+0x15e/0x4a0
__alloc_object+0x35/0x370
__create_object+0x22/0x90
__kmalloc_node_track_caller+0x477/0x5b0
krealloc+0x5f/0x110
xfs_iext_insert_raw+0x4b2/0x6e0 [xfs]
xfs_iext_insert+0x2e/0x130 [xfs]
xfs_iread_bmbt_block+0x1a9/0x4d0 [xfs]
xfs_btree_visit_block+0xfb/0x290 [xfs]
xfs_btree_visit_blocks+0x215/0x2c0 [xfs]
xfs_iread_extents+0x1a2/0x2e0 [xfs]
xfs_buffered_write_iomap_begin+0x376/0x10a0 [xfs]
iomap_iter+0x1d1/0x2d0
iomap_file_buffered_write+0x120/0x1a0
xfs_file_buffered_write+0x128/0x4b0 [xfs]
vfs_write+0x675/0x890
ksys_write+0xc3/0x160
do_syscall_64+0x94/0x170
entry_SYSCALL_64_after_hwframe+0x71/0x79
Always preserve __GFP_NOLOCKDEP to fix this.
Link: https://lkml.kernel.org/r/20240418141133.22950-1-ryabinin.a.a@gmail.com
Fixes: cd11016e5f
("mm, kasan: stackdepot implementation. Enable stackdepot for SLAB")
Signed-off-by: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Reported-by: Xiubo Li <xiubli@redhat.com>
Closes: https://lore.kernel.org/all/a0caa289-ca02-48eb-9bf2-d86fd47b71f4@redhat.com/
Reported-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Closes: https://lore.kernel.org/all/f9ff999a-e170-b66b-7caf-293f2b147ac2@opensource.wdc.com/
Suggested-by: Dave Chinner <david@fromorbit.com>
Tested-by: Xiubo Li <xiubli@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alexander Potapenko <glider@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
817 lines
22 KiB
C
817 lines
22 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* Stack depot - a stack trace storage that avoids duplication.
|
|
*
|
|
* Internally, stack depot maintains a hash table of unique stacktraces. The
|
|
* stack traces themselves are stored contiguously one after another in a set
|
|
* of separate page allocations.
|
|
*
|
|
* Author: Alexander Potapenko <glider@google.com>
|
|
* Copyright (C) 2016 Google, Inc.
|
|
*
|
|
* Based on the code by Dmitry Chernenkov.
|
|
*/
|
|
|
|
#define pr_fmt(fmt) "stackdepot: " fmt
|
|
|
|
#include <linux/debugfs.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/jhash.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/kmsan.h>
|
|
#include <linux/list.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/poison.h>
|
|
#include <linux/printk.h>
|
|
#include <linux/rculist.h>
|
|
#include <linux/rcupdate.h>
|
|
#include <linux/refcount.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/stacktrace.h>
|
|
#include <linux/stackdepot.h>
|
|
#include <linux/string.h>
|
|
#include <linux/types.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/kasan-enabled.h>
|
|
|
|
/* Hard upper bound on the number of pools, independent of the handle layout. */
#define DEPOT_POOLS_CAP 8192
/*
 * Number of pools the depot may use: the smaller of DEPOT_POOLS_CAP and
 * 2^DEPOT_POOL_INDEX_BITS - 1. One index value is given up because the
 * pool_index is offset by 1 so the first record does not have a 0 handle.
 */
#define DEPOT_MAX_POOLS \
	((DEPOT_POOLS_CAP > (1LL << (DEPOT_POOL_INDEX_BITS)) - 1) ? \
	 (1LL << (DEPOT_POOL_INDEX_BITS)) - 1 : DEPOT_POOLS_CAP)
|
|
|
|
static bool stack_depot_disabled;
|
|
static bool __stack_depot_early_init_requested __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT);
|
|
static bool __stack_depot_early_init_passed __initdata;
|
|
|
|
/* Hash table sizing: one bucket per 2^14 bytes (16 KB) of memory. */
#define STACK_HASH_TABLE_SCALE 14
/* Bounds on the bucket count, as powers of two: 2^12 (4K) up to 2^20 (1M). */
#define STACK_BUCKET_NUMBER_ORDER_MIN 12
#define STACK_BUCKET_NUMBER_ORDER_MAX 20
/* Seed passed to jhash2() when hashing stack traces. */
#define STACK_HASH_SEED 0x9747b28c
|
|
|
|
/* Hash table of stored stack records: an array of bucket list heads. */
static struct list_head *stack_table;
/*
 * Fixed order (log2) of the number of table buckets. Set when KASAN is
 * enabled; when zero, the bucket count is derived automatically.
 */
static unsigned int stack_bucket_number_order;
/* Mask applied to a hash value to obtain the bucket index. */
static unsigned int stack_hash_mask;
|
|
|
|
/* Array of memory regions that store stack records. */
|
|
static void *stack_pools[DEPOT_MAX_POOLS];
|
|
/* Newly allocated pool that is not yet added to stack_pools. */
|
|
static void *new_pool;
|
|
/* Number of pools in stack_pools. */
|
|
static int pools_num;
|
|
/* Offset to the unused space in the currently used pool. */
|
|
static size_t pool_offset = DEPOT_POOL_SIZE;
|
|
/* Freelist of stack records within stack_pools. */
|
|
static LIST_HEAD(free_stacks);
|
|
/* The lock must be held when performing pool or freelist modifications. */
|
|
static DEFINE_RAW_SPINLOCK(pool_lock);
|
|
|
|
/* Statistics counters for debugfs. */
|
|
enum depot_counter_id {
|
|
DEPOT_COUNTER_REFD_ALLOCS,
|
|
DEPOT_COUNTER_REFD_FREES,
|
|
DEPOT_COUNTER_REFD_INUSE,
|
|
DEPOT_COUNTER_FREELIST_SIZE,
|
|
DEPOT_COUNTER_PERSIST_COUNT,
|
|
DEPOT_COUNTER_PERSIST_BYTES,
|
|
DEPOT_COUNTER_COUNT,
|
|
};
|
|
static long counters[DEPOT_COUNTER_COUNT];
|
|
static const char *const counter_names[] = {
|
|
[DEPOT_COUNTER_REFD_ALLOCS] = "refcounted_allocations",
|
|
[DEPOT_COUNTER_REFD_FREES] = "refcounted_frees",
|
|
[DEPOT_COUNTER_REFD_INUSE] = "refcounted_in_use",
|
|
[DEPOT_COUNTER_FREELIST_SIZE] = "freelist_size",
|
|
[DEPOT_COUNTER_PERSIST_COUNT] = "persistent_count",
|
|
[DEPOT_COUNTER_PERSIST_BYTES] = "persistent_bytes",
|
|
};
|
|
static_assert(ARRAY_SIZE(counter_names) == DEPOT_COUNTER_COUNT);
|
|
|
|
static int __init disable_stack_depot(char *str)
|
|
{
|
|
return kstrtobool(str, &stack_depot_disabled);
|
|
}
|
|
early_param("stack_depot_disable", disable_stack_depot);
|
|
|
|
/*
 * Asks for the hash table to be allocated by stack_depot_early_init().
 * Warns if stack_depot_early_init() has already run, since the request can
 * no longer be honoured then; the flag is set regardless.
 */
void __init stack_depot_request_early_init(void)
{
	/* Requests arriving after early init are too late. */
	WARN_ON(__stack_depot_early_init_passed);

	__stack_depot_early_init_requested = true;
}
|
|
|
|
/* Initialize list_head's within the hash table. */
|
|
static void init_stack_table(unsigned long entries)
|
|
{
|
|
unsigned long i;
|
|
|
|
for (i = 0; i < entries; i++)
|
|
INIT_LIST_HEAD(&stack_table[i]);
|
|
}
|
|
|
|
/* Allocates a hash table via memblock. Can only be used during early boot. */
|
|
int __init stack_depot_early_init(void)
|
|
{
|
|
unsigned long entries = 0;
|
|
|
|
/* This function must be called only once, from mm_init(). */
|
|
if (WARN_ON(__stack_depot_early_init_passed))
|
|
return 0;
|
|
__stack_depot_early_init_passed = true;
|
|
|
|
/*
|
|
* Print disabled message even if early init has not been requested:
|
|
* stack_depot_init() will not print one.
|
|
*/
|
|
if (stack_depot_disabled) {
|
|
pr_info("disabled\n");
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* If KASAN is enabled, use the maximum order: KASAN is frequently used
|
|
* in fuzzing scenarios, which leads to a large number of different
|
|
* stack traces being stored in stack depot.
|
|
*/
|
|
if (kasan_enabled() && !stack_bucket_number_order)
|
|
stack_bucket_number_order = STACK_BUCKET_NUMBER_ORDER_MAX;
|
|
|
|
/*
|
|
* Check if early init has been requested after setting
|
|
* stack_bucket_number_order: stack_depot_init() uses its value.
|
|
*/
|
|
if (!__stack_depot_early_init_requested)
|
|
return 0;
|
|
|
|
/*
|
|
* If stack_bucket_number_order is not set, leave entries as 0 to rely
|
|
* on the automatic calculations performed by alloc_large_system_hash().
|
|
*/
|
|
if (stack_bucket_number_order)
|
|
entries = 1UL << stack_bucket_number_order;
|
|
pr_info("allocating hash table via alloc_large_system_hash\n");
|
|
stack_table = alloc_large_system_hash("stackdepot",
|
|
sizeof(struct list_head),
|
|
entries,
|
|
STACK_HASH_TABLE_SCALE,
|
|
HASH_EARLY,
|
|
NULL,
|
|
&stack_hash_mask,
|
|
1UL << STACK_BUCKET_NUMBER_ORDER_MIN,
|
|
1UL << STACK_BUCKET_NUMBER_ORDER_MAX);
|
|
if (!stack_table) {
|
|
pr_err("hash table allocation failed, disabling\n");
|
|
stack_depot_disabled = true;
|
|
return -ENOMEM;
|
|
}
|
|
if (!entries) {
|
|
/*
|
|
* Obtain the number of entries that was calculated by
|
|
* alloc_large_system_hash().
|
|
*/
|
|
entries = stack_hash_mask + 1;
|
|
}
|
|
init_stack_table(entries);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Allocates a hash table via kvcalloc. Can be used after boot. */
|
|
int stack_depot_init(void)
|
|
{
|
|
static DEFINE_MUTEX(stack_depot_init_mutex);
|
|
unsigned long entries;
|
|
int ret = 0;
|
|
|
|
mutex_lock(&stack_depot_init_mutex);
|
|
|
|
if (stack_depot_disabled || stack_table)
|
|
goto out_unlock;
|
|
|
|
/*
|
|
* Similarly to stack_depot_early_init, use stack_bucket_number_order
|
|
* if assigned, and rely on automatic scaling otherwise.
|
|
*/
|
|
if (stack_bucket_number_order) {
|
|
entries = 1UL << stack_bucket_number_order;
|
|
} else {
|
|
int scale = STACK_HASH_TABLE_SCALE;
|
|
|
|
entries = nr_free_buffer_pages();
|
|
entries = roundup_pow_of_two(entries);
|
|
|
|
if (scale > PAGE_SHIFT)
|
|
entries >>= (scale - PAGE_SHIFT);
|
|
else
|
|
entries <<= (PAGE_SHIFT - scale);
|
|
}
|
|
|
|
if (entries < 1UL << STACK_BUCKET_NUMBER_ORDER_MIN)
|
|
entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MIN;
|
|
if (entries > 1UL << STACK_BUCKET_NUMBER_ORDER_MAX)
|
|
entries = 1UL << STACK_BUCKET_NUMBER_ORDER_MAX;
|
|
|
|
pr_info("allocating hash table of %lu entries via kvcalloc\n", entries);
|
|
stack_table = kvcalloc(entries, sizeof(struct list_head), GFP_KERNEL);
|
|
if (!stack_table) {
|
|
pr_err("hash table allocation failed, disabling\n");
|
|
stack_depot_disabled = true;
|
|
ret = -ENOMEM;
|
|
goto out_unlock;
|
|
}
|
|
stack_hash_mask = entries - 1;
|
|
init_stack_table(entries);
|
|
|
|
out_unlock:
|
|
mutex_unlock(&stack_depot_init_mutex);
|
|
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(stack_depot_init);
|
|
|
|
/*
|
|
* Initializes new stack pool, and updates the list of pools.
|
|
*/
|
|
static bool depot_init_pool(void **prealloc)
|
|
{
|
|
lockdep_assert_held(&pool_lock);
|
|
|
|
if (unlikely(pools_num >= DEPOT_MAX_POOLS)) {
|
|
/* Bail out if we reached the pool limit. */
|
|
WARN_ON_ONCE(pools_num > DEPOT_MAX_POOLS); /* should never happen */
|
|
WARN_ON_ONCE(!new_pool); /* to avoid unnecessary pre-allocation */
|
|
WARN_ONCE(1, "Stack depot reached limit capacity");
|
|
return false;
|
|
}
|
|
|
|
if (!new_pool && *prealloc) {
|
|
/* We have preallocated memory, use it. */
|
|
WRITE_ONCE(new_pool, *prealloc);
|
|
*prealloc = NULL;
|
|
}
|
|
|
|
if (!new_pool)
|
|
return false; /* new_pool and *prealloc are NULL */
|
|
|
|
/* Save reference to the pool to be used by depot_fetch_stack(). */
|
|
stack_pools[pools_num] = new_pool;
|
|
|
|
/*
|
|
* Stack depot tries to keep an extra pool allocated even before it runs
|
|
* out of space in the currently used pool.
|
|
*
|
|
* To indicate that a new preallocation is needed new_pool is reset to
|
|
* NULL; do not reset to NULL if we have reached the maximum number of
|
|
* pools.
|
|
*/
|
|
if (pools_num < DEPOT_MAX_POOLS)
|
|
WRITE_ONCE(new_pool, NULL);
|
|
else
|
|
WRITE_ONCE(new_pool, STACK_DEPOT_POISON);
|
|
|
|
/* Pairs with concurrent READ_ONCE() in depot_fetch_stack(). */
|
|
WRITE_ONCE(pools_num, pools_num + 1);
|
|
ASSERT_EXCLUSIVE_WRITER(pools_num);
|
|
|
|
pool_offset = 0;
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Keeps the preallocated memory to be used for a new stack depot pool. */
|
|
static void depot_keep_new_pool(void **prealloc)
|
|
{
|
|
lockdep_assert_held(&pool_lock);
|
|
|
|
/*
|
|
* If a new pool is already saved or the maximum number of
|
|
* pools is reached, do not use the preallocated memory.
|
|
*/
|
|
if (new_pool)
|
|
return;
|
|
|
|
WRITE_ONCE(new_pool, *prealloc);
|
|
*prealloc = NULL;
|
|
}
|
|
|
|
/*
|
|
* Try to initialize a new stack record from the current pool, a cached pool, or
|
|
* the current pre-allocation.
|
|
*/
|
|
static struct stack_record *depot_pop_free_pool(void **prealloc, size_t size)
|
|
{
|
|
struct stack_record *stack;
|
|
void *current_pool;
|
|
u32 pool_index;
|
|
|
|
lockdep_assert_held(&pool_lock);
|
|
|
|
if (pool_offset + size > DEPOT_POOL_SIZE) {
|
|
if (!depot_init_pool(prealloc))
|
|
return NULL;
|
|
}
|
|
|
|
if (WARN_ON_ONCE(pools_num < 1))
|
|
return NULL;
|
|
pool_index = pools_num - 1;
|
|
current_pool = stack_pools[pool_index];
|
|
if (WARN_ON_ONCE(!current_pool))
|
|
return NULL;
|
|
|
|
stack = current_pool + pool_offset;
|
|
|
|
/* Pre-initialize handle once. */
|
|
stack->handle.pool_index_plus_1 = pool_index + 1;
|
|
stack->handle.offset = pool_offset >> DEPOT_STACK_ALIGN;
|
|
stack->handle.extra = 0;
|
|
INIT_LIST_HEAD(&stack->hash_list);
|
|
|
|
pool_offset += size;
|
|
|
|
return stack;
|
|
}
|
|
|
|
/* Try to find next free usable entry from the freelist. */
|
|
static struct stack_record *depot_pop_free(void)
|
|
{
|
|
struct stack_record *stack;
|
|
|
|
lockdep_assert_held(&pool_lock);
|
|
|
|
if (list_empty(&free_stacks))
|
|
return NULL;
|
|
|
|
/*
|
|
* We maintain the invariant that the elements in front are least
|
|
* recently used, and are therefore more likely to be associated with an
|
|
* RCU grace period in the past. Consequently it is sufficient to only
|
|
* check the first entry.
|
|
*/
|
|
stack = list_first_entry(&free_stacks, struct stack_record, free_list);
|
|
if (!poll_state_synchronize_rcu(stack->rcu_state))
|
|
return NULL;
|
|
|
|
list_del(&stack->free_list);
|
|
counters[DEPOT_COUNTER_FREELIST_SIZE]--;
|
|
|
|
return stack;
|
|
}
|
|
|
|
static inline size_t depot_stack_record_size(struct stack_record *s, unsigned int nr_entries)
|
|
{
|
|
const size_t used = flex_array_size(s, entries, nr_entries);
|
|
const size_t unused = sizeof(s->entries) - used;
|
|
|
|
WARN_ON_ONCE(sizeof(s->entries) < used);
|
|
|
|
return ALIGN(sizeof(struct stack_record) - unused, 1 << DEPOT_STACK_ALIGN);
|
|
}
|
|
|
|
/* Allocates a new stack in a stack depot pool. */
|
|
static struct stack_record *
|
|
depot_alloc_stack(unsigned long *entries, unsigned int nr_entries, u32 hash, depot_flags_t flags, void **prealloc)
|
|
{
|
|
struct stack_record *stack = NULL;
|
|
size_t record_size;
|
|
|
|
lockdep_assert_held(&pool_lock);
|
|
|
|
/* This should already be checked by public API entry points. */
|
|
if (WARN_ON_ONCE(!nr_entries))
|
|
return NULL;
|
|
|
|
/* Limit number of saved frames to CONFIG_STACKDEPOT_MAX_FRAMES. */
|
|
if (nr_entries > CONFIG_STACKDEPOT_MAX_FRAMES)
|
|
nr_entries = CONFIG_STACKDEPOT_MAX_FRAMES;
|
|
|
|
if (flags & STACK_DEPOT_FLAG_GET) {
|
|
/*
|
|
* Evictable entries have to allocate the max. size so they may
|
|
* safely be re-used by differently sized allocations.
|
|
*/
|
|
record_size = depot_stack_record_size(stack, CONFIG_STACKDEPOT_MAX_FRAMES);
|
|
stack = depot_pop_free();
|
|
} else {
|
|
record_size = depot_stack_record_size(stack, nr_entries);
|
|
}
|
|
|
|
if (!stack) {
|
|
stack = depot_pop_free_pool(prealloc, record_size);
|
|
if (!stack)
|
|
return NULL;
|
|
}
|
|
|
|
/* Save the stack trace. */
|
|
stack->hash = hash;
|
|
stack->size = nr_entries;
|
|
/* stack->handle is already filled in by depot_pop_free_pool(). */
|
|
memcpy(stack->entries, entries, flex_array_size(stack, entries, nr_entries));
|
|
|
|
if (flags & STACK_DEPOT_FLAG_GET) {
|
|
refcount_set(&stack->count, 1);
|
|
counters[DEPOT_COUNTER_REFD_ALLOCS]++;
|
|
counters[DEPOT_COUNTER_REFD_INUSE]++;
|
|
} else {
|
|
/* Warn on attempts to switch to refcounting this entry. */
|
|
refcount_set(&stack->count, REFCOUNT_SATURATED);
|
|
counters[DEPOT_COUNTER_PERSIST_COUNT]++;
|
|
counters[DEPOT_COUNTER_PERSIST_BYTES] += record_size;
|
|
}
|
|
|
|
/*
|
|
* Let KMSAN know the stored stack record is initialized. This shall
|
|
* prevent false positive reports if instrumented code accesses it.
|
|
*/
|
|
kmsan_unpoison_memory(stack, record_size);
|
|
|
|
return stack;
|
|
}
|
|
|
|
/*
 * Translates @handle into a pointer to its stack record.
 *
 * Must be called without pool_lock held. Warns and returns NULL if the
 * handle's pool index is out of range, if the pool pointer is NULL, or if
 * the record's refcount reads as zero.
 */
static struct stack_record *depot_fetch_stack(depot_stack_handle_t handle)
{
	/* Snapshot of the pool count; pairs with WRITE_ONCE() of pools_num. */
	const int nr_pools = READ_ONCE(pools_num);
	union handle_parts parts = { .handle = handle };
	/* Handles store the pool index plus one. */
	u32 idx = parts.pool_index_plus_1 - 1;
	size_t byte_off = parts.offset << DEPOT_STACK_ALIGN;
	struct stack_record *rec;
	void *pool_base;

	lockdep_assert_not_held(&pool_lock);

	if (idx >= nr_pools) {
		WARN(1, "pool index %d out of bounds (%d) for stack id %08x\n",
		     idx, nr_pools, handle);
		return NULL;
	}

	pool_base = stack_pools[idx];
	if (WARN_ON(!pool_base))
		return NULL;

	rec = pool_base + byte_off;

	return WARN_ON(!refcount_read(&rec->count)) ? NULL : rec;
}
|
|
|
|
/* Links stack into the freelist. */
|
|
static void depot_free_stack(struct stack_record *stack)
|
|
{
|
|
unsigned long flags;
|
|
|
|
lockdep_assert_not_held(&pool_lock);
|
|
|
|
raw_spin_lock_irqsave(&pool_lock, flags);
|
|
printk_deferred_enter();
|
|
|
|
/*
|
|
* Remove the entry from the hash list. Concurrent list traversal may
|
|
* still observe the entry, but since the refcount is zero, this entry
|
|
* will no longer be considered as valid.
|
|
*/
|
|
list_del_rcu(&stack->hash_list);
|
|
|
|
/*
|
|
* Due to being used from constrained contexts such as the allocators,
|
|
* NMI, or even RCU itself, stack depot cannot rely on primitives that
|
|
* would sleep (such as synchronize_rcu()) or recursively call into
|
|
* stack depot again (such as call_rcu()).
|
|
*
|
|
* Instead, get an RCU cookie, so that we can ensure this entry isn't
|
|
* moved onto another list until the next grace period, and concurrent
|
|
* RCU list traversal remains safe.
|
|
*/
|
|
stack->rcu_state = get_state_synchronize_rcu();
|
|
|
|
/*
|
|
* Add the entry to the freelist tail, so that older entries are
|
|
* considered first - their RCU cookie is more likely to no longer be
|
|
* associated with the current grace period.
|
|
*/
|
|
list_add_tail(&stack->free_list, &free_stacks);
|
|
|
|
counters[DEPOT_COUNTER_FREELIST_SIZE]++;
|
|
counters[DEPOT_COUNTER_REFD_FREES]++;
|
|
counters[DEPOT_COUNTER_REFD_INUSE]--;
|
|
|
|
printk_deferred_exit();
|
|
raw_spin_unlock_irqrestore(&pool_lock, flags);
|
|
}
|
|
|
|
/* Calculates the hash for a stack. */
|
|
static inline u32 hash_stack(unsigned long *entries, unsigned int size)
|
|
{
|
|
return jhash2((u32 *)entries,
|
|
array_size(size, sizeof(*entries)) / sizeof(u32),
|
|
STACK_HASH_SEED);
|
|
}
|
|
|
|
/*
 * Hand-written, non-instrumented replacement for memcmp().
 *
 * Compares @n words of @u1 and @u2 for equality only; no lexicographical
 * ordering is reported. Returns 0 if all @n words match (including when
 * @n is 0), and 1 at the first mismatch.
 */
static inline
int stackdepot_memcmp(const unsigned long *u1, const unsigned long *u2,
		      unsigned int n)
{
	unsigned int idx;

	for (idx = 0; idx < n; idx++) {
		if (u1[idx] != u2[idx])
			return 1;
	}

	return 0;
}
|
|
|
|
/* Finds a stack in a bucket of the hash table. */
|
|
static inline struct stack_record *find_stack(struct list_head *bucket,
|
|
unsigned long *entries, int size,
|
|
u32 hash, depot_flags_t flags)
|
|
{
|
|
struct stack_record *stack, *ret = NULL;
|
|
|
|
/*
|
|
* Stack depot may be used from instrumentation that instruments RCU or
|
|
* tracing itself; use variant that does not call into RCU and cannot be
|
|
* traced.
|
|
*
|
|
* Note: Such use cases must take care when using refcounting to evict
|
|
* unused entries, because the stack record free-then-reuse code paths
|
|
* do call into RCU.
|
|
*/
|
|
rcu_read_lock_sched_notrace();
|
|
|
|
list_for_each_entry_rcu(stack, bucket, hash_list) {
|
|
if (stack->hash != hash || stack->size != size)
|
|
continue;
|
|
|
|
/*
|
|
* This may race with depot_free_stack() accessing the freelist
|
|
* management state unioned with @entries. The refcount is zero
|
|
* in that case and the below refcount_inc_not_zero() will fail.
|
|
*/
|
|
if (data_race(stackdepot_memcmp(entries, stack->entries, size)))
|
|
continue;
|
|
|
|
/*
|
|
* Try to increment refcount. If this succeeds, the stack record
|
|
* is valid and has not yet been freed.
|
|
*
|
|
* If STACK_DEPOT_FLAG_GET is not used, it is undefined behavior
|
|
* to then call stack_depot_put() later, and we can assume that
|
|
* a stack record is never placed back on the freelist.
|
|
*/
|
|
if ((flags & STACK_DEPOT_FLAG_GET) && !refcount_inc_not_zero(&stack->count))
|
|
continue;
|
|
|
|
ret = stack;
|
|
break;
|
|
}
|
|
|
|
rcu_read_unlock_sched_notrace();
|
|
|
|
return ret;
|
|
}
|
|
|
|
depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
|
|
unsigned int nr_entries,
|
|
gfp_t alloc_flags,
|
|
depot_flags_t depot_flags)
|
|
{
|
|
struct list_head *bucket;
|
|
struct stack_record *found = NULL;
|
|
depot_stack_handle_t handle = 0;
|
|
struct page *page = NULL;
|
|
void *prealloc = NULL;
|
|
bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
|
|
unsigned long flags;
|
|
u32 hash;
|
|
|
|
if (WARN_ON(depot_flags & ~STACK_DEPOT_FLAGS_MASK))
|
|
return 0;
|
|
|
|
/*
|
|
* If this stack trace is from an interrupt, including anything before
|
|
* interrupt entry usually leads to unbounded stack depot growth.
|
|
*
|
|
* Since use of filter_irq_stacks() is a requirement to ensure stack
|
|
* depot can efficiently deduplicate interrupt stacks, always
|
|
* filter_irq_stacks() to simplify all callers' use of stack depot.
|
|
*/
|
|
nr_entries = filter_irq_stacks(entries, nr_entries);
|
|
|
|
if (unlikely(nr_entries == 0) || stack_depot_disabled)
|
|
return 0;
|
|
|
|
hash = hash_stack(entries, nr_entries);
|
|
bucket = &stack_table[hash & stack_hash_mask];
|
|
|
|
/* Fast path: look the stack trace up without locking. */
|
|
found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
|
|
if (found)
|
|
goto exit;
|
|
|
|
/*
|
|
* Allocate memory for a new pool if required now:
|
|
* we won't be able to do that under the lock.
|
|
*/
|
|
if (unlikely(can_alloc && !READ_ONCE(new_pool))) {
|
|
/*
|
|
* Zero out zone modifiers, as we don't have specific zone
|
|
* requirements. Keep the flags related to allocation in atomic
|
|
* contexts, I/O, nolockdep.
|
|
*/
|
|
alloc_flags &= ~GFP_ZONEMASK;
|
|
alloc_flags &= (GFP_ATOMIC | GFP_KERNEL | __GFP_NOLOCKDEP);
|
|
alloc_flags |= __GFP_NOWARN;
|
|
page = alloc_pages(alloc_flags, DEPOT_POOL_ORDER);
|
|
if (page)
|
|
prealloc = page_address(page);
|
|
}
|
|
|
|
raw_spin_lock_irqsave(&pool_lock, flags);
|
|
printk_deferred_enter();
|
|
|
|
/* Try to find again, to avoid concurrently inserting duplicates. */
|
|
found = find_stack(bucket, entries, nr_entries, hash, depot_flags);
|
|
if (!found) {
|
|
struct stack_record *new =
|
|
depot_alloc_stack(entries, nr_entries, hash, depot_flags, &prealloc);
|
|
|
|
if (new) {
|
|
/*
|
|
* This releases the stack record into the bucket and
|
|
* makes it visible to readers in find_stack().
|
|
*/
|
|
list_add_rcu(&new->hash_list, bucket);
|
|
found = new;
|
|
}
|
|
}
|
|
|
|
if (prealloc) {
|
|
/*
|
|
* Either stack depot already contains this stack trace, or
|
|
* depot_alloc_stack() did not consume the preallocated memory.
|
|
* Try to keep the preallocated memory for future.
|
|
*/
|
|
depot_keep_new_pool(&prealloc);
|
|
}
|
|
|
|
printk_deferred_exit();
|
|
raw_spin_unlock_irqrestore(&pool_lock, flags);
|
|
exit:
|
|
if (prealloc) {
|
|
/* Stack depot didn't use this memory, free it. */
|
|
free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
|
|
}
|
|
if (found)
|
|
handle = found->handle.handle;
|
|
return handle;
|
|
}
|
|
EXPORT_SYMBOL_GPL(stack_depot_save_flags);
|
|
|
|
depot_stack_handle_t stack_depot_save(unsigned long *entries,
|
|
unsigned int nr_entries,
|
|
gfp_t alloc_flags)
|
|
{
|
|
return stack_depot_save_flags(entries, nr_entries, alloc_flags,
|
|
STACK_DEPOT_FLAG_CAN_ALLOC);
|
|
}
|
|
EXPORT_SYMBOL_GPL(stack_depot_save);
|
|
|
|
/*
 * Returns the stack record behind @handle, or NULL for a zero handle or
 * when depot_fetch_stack() rejects the handle.
 */
struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle)
{
	return handle ? depot_fetch_stack(handle) : NULL;
}
|
|
|
|
/*
 * Retrieves the stack trace stored under @handle.
 *
 * On success *@entries points at the stored frames and the number of frames
 * is returned. Otherwise *@entries is NULL and 0 is returned: for a zero
 * handle, a disabled depot, or (with a warning) a handle that cannot be
 * resolved.
 */
unsigned int stack_depot_fetch(depot_stack_handle_t handle,
			       unsigned long **entries)
{
	struct stack_record *rec;

	*entries = NULL;
	/*
	 * Tell KMSAN that *entries is initialized, to prevent false positive
	 * reports when instrumented code accesses it.
	 */
	kmsan_unpoison_memory(entries, sizeof(*entries));

	if (!handle || stack_depot_disabled)
		return 0;

	rec = depot_fetch_stack(handle);
	/*
	 * A NULL record here means use-after-put, or simply a corrupt
	 * handle; it should never happen.
	 */
	if (WARN(!rec, "corrupt handle or use after stack_depot_put()"))
		return 0;

	*entries = rec->entries;

	return rec->size;
}
EXPORT_SYMBOL_GPL(stack_depot_fetch);
|
|
|
|
/*
 * Drops one reference on the stack record behind @handle and moves the
 * record to the freelist when the last reference is gone.
 *
 * Does nothing for a zero handle or a disabled depot; warns and returns if
 * the handle cannot be resolved.
 */
void stack_depot_put(depot_stack_handle_t handle)
{
	struct stack_record *rec;

	if (!handle || stack_depot_disabled)
		return;

	rec = depot_fetch_stack(handle);
	/*
	 * The record must exist; if it does not, this is an unbalanced put
	 * (or a corrupt handle).
	 */
	if (WARN(!rec, "corrupt handle or unbalanced stack_depot_put()"))
		return;

	/* Other references remain: nothing more to do. */
	if (!refcount_dec_and_test(&rec->count))
		return;

	depot_free_stack(rec);
}
EXPORT_SYMBOL_GPL(stack_depot_put);
|
|
|
|
/*
 * Prints the stack trace stored under handle @stack via stack_trace_print().
 * Prints nothing if stack_depot_fetch() yields no frames.
 */
void stack_depot_print(depot_stack_handle_t stack)
{
	unsigned long *frames;
	unsigned int nr_frames = stack_depot_fetch(stack, &frames);

	if (!nr_frames)
		return;

	stack_trace_print(frames, nr_frames, 0);
}
EXPORT_SYMBOL_GPL(stack_depot_print);
|
|
|
|
/*
 * Formats the stack trace stored under @handle into @buf (of @size bytes)
 * via stack_trace_snprint(), passing @spaces through.
 *
 * Returns stack_trace_snprint()'s result, or 0 if stack_depot_fetch()
 * yields no frames.
 */
int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
			int spaces)
{
	unsigned long *frames;
	unsigned int nr_frames = stack_depot_fetch(handle, &frames);

	if (!nr_frames)
		return 0;

	return stack_trace_snprint(buf, size, frames, nr_frames, spaces);
}
EXPORT_SYMBOL_GPL(stack_depot_snprint);
|
|
|
|
/*
 * Returns a copy of @handle whose "extra" field is set to @extra_bits.
 * A zero (empty) handle is returned unchanged as 0.
 */
depot_stack_handle_t __must_check stack_depot_set_extra_bits(
			depot_stack_handle_t handle, unsigned int extra_bits)
{
	union handle_parts parts = { .handle = handle };

	/* Only non-empty handles carry extra bits. */
	if (handle) {
		parts.extra = extra_bits;
		return parts.handle;
	}

	return 0;
}
EXPORT_SYMBOL(stack_depot_set_extra_bits);
|
|
|
|
/* Returns the "extra" field stored in @handle. */
unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle)
{
	const union handle_parts parts = { .handle = handle };
	unsigned int extra = parts.extra;

	return extra;
}
EXPORT_SYMBOL(stack_depot_get_extra_bits);
|
|
|
|
static int stats_show(struct seq_file *seq, void *v)
|
|
{
|
|
/*
|
|
* data race ok: These are just statistics counters, and approximate
|
|
* statistics are ok for debugging.
|
|
*/
|
|
seq_printf(seq, "pools: %d\n", data_race(pools_num));
|
|
for (int i = 0; i < DEPOT_COUNTER_COUNT; i++)
|
|
seq_printf(seq, "%s: %ld\n", counter_names[i], data_race(counters[i]));
|
|
|
|
return 0;
|
|
}
|
|
DEFINE_SHOW_ATTRIBUTE(stats);
|
|
|
|
static int depot_debugfs_init(void)
|
|
{
|
|
struct dentry *dir;
|
|
|
|
if (stack_depot_disabled)
|
|
return 0;
|
|
|
|
dir = debugfs_create_dir("stackdepot", NULL);
|
|
debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
|
|
return 0;
|
|
}
|
|
late_initcall(depot_debugfs_init);
|