From 4c39529663b93165953ecf9b1a9ea817358dcd06 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 7 Aug 2024 10:07:46 +0100 Subject: [PATCH 01/34] slab: Warn on duplicate cache names when DEBUG_VM=y Duplicate slab cache names can create havoc for userspace tooling that expects slab cache names to be unique [1]. This is a reasonable expectation. Sadly, too many duplicate name problems are out there in the wild, so simply warn instead of pr_err() + failing the sanity check. [ vbabka@suse.cz: change to WARN_ON(), see the discussion at [2] ] Link: https://lore.kernel.org/linux-fsdevel/2d1d053da1cafb3e7940c4f25952da4f0af34e38.1722293276.git.osandov@fb.com/ [1] Link: https://lore.kernel.org/all/20240807090746.2146479-1-pedro.falcato@gmail.com/ [2] Signed-off-by: Pedro Falcato Acked-by: Christoph Lameter Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/slab_common.c b/mm/slab_common.c index 40b582a014b8..dea2b05c0e3f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -88,6 +88,19 @@ unsigned int kmem_cache_size(struct kmem_cache *s) EXPORT_SYMBOL(kmem_cache_size); #ifdef CONFIG_DEBUG_VM + +static bool kmem_cache_is_duplicate_name(const char *name) +{ + struct kmem_cache *s; + + list_for_each_entry(s, &slab_caches, list) { + if (!strcmp(s->name, name)) + return true; + } + + return false; +} + static int kmem_cache_sanity_check(const char *name, unsigned int size) { if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) { @@ -95,6 +108,10 @@ static int kmem_cache_sanity_check(const char *name, unsigned int size) return -EINVAL; } + /* Duplicate names will confuse slabtop, et al */ + WARN(kmem_cache_is_duplicate_name(name), + "kmem_cache of name '%s' already exists\n", name); + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ return 0; } From bf6b9e9ba0861c92b08c77edbd5d602063443c5f Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Wed, 14 Aug 2024 14:50:37 -0700 Subject: [PATCH 02/34] mm, slub: print CPU id (and its node) on slab OOM Depending on how remote_node_defrag_ratio is configured, allocations can end up in this path as a result of the local node being OOM, despite the allocation overall being unconstrained (node == -1). When we print a warning, printing the current CPU makes that situation more clear (i.e., you can immediately see which node's OOM status matters for the allocation at hand). 
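For illustration, an unconstrained request (node == -1) that fails because the local node is exhausted would now be reported along these lines (the values are hypothetical; only the format string comes from this patch):

  SLUB: Unable to allocate memory on CPU 17 (of node 1) on node -1, gfp=0xcc0(GFP_KERNEL)

making it immediately visible that node 1 is the node whose OOM state mattered, even though the request itself was not node-constrained.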
Acked-by: David Rientjes Signed-off-by: Axel Rasmussen Signed-off-by: Vlastimil Babka --- mm/slub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index c9d8a2497fd6..3088260bf75d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3416,14 +3416,15 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + int cpu = raw_smp_processor_id(); int node; struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", - nid, gfpflags, &gfpflags); + pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n", + cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); From 1941b31482a61a7bd75300d1905938e7e48c3d25 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 19 Aug 2024 11:54:05 -0700 Subject: [PATCH 03/34] Reenable NUMA policy support in the slab allocator Revert commit 8014c46ad991 ("slub: use alloc_pages_node() in alloc_slab_page()"). The patch disabled the numa policy support in the slab allocator. It did not consider that alloc_pages() uses memory policies but alloc_pages_node() does not. As a result of this patch slab memory allocations are no longer spread via interleave policy across all available NUMA nodes on bootup. Instead all slab memory is allocated close to the boot processor. This leads to an imbalance of memory accesses on NUMA systems. Also applications using MPOL_INTERLEAVE as a memory policy will no longer spread slab allocations over all nodes in the interleave set but allocate memory locally. This may also result in unbalanced allocations on a single numa node. SLUB does not apply memory policies to individual object allocations. However, it relies on the page allocators support of memory policies through alloc_pages() to do the NUMA memory allocations on a per folio or page level. SLUB also applies memory policies when retrieving partial allocated slab pages from the partial list. Fixes: 8014c46ad991 ("slub: use alloc_pages_node() in alloc_slab_page()") Signed-off-by: Christoph Lameter Reviewed-by: Yang Shi Signed-off-by: Vlastimil Babka --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 3088260bf75d..60004bfc2dc2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2318,7 +2318,11 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, struct slab *slab; unsigned int order = oo_order(oo); - folio = (struct folio *)alloc_pages_node(node, flags, order); + if (node == NUMA_NO_NODE) + folio = (struct folio *)alloc_pages(flags, order); + else + folio = (struct folio *)__alloc_pages_node(node, flags, order); + if (!folio) return NULL; From b5959789e96e01b4b27a6d0354b475398d67aa6f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 7 Aug 2024 12:31:14 +0200 Subject: [PATCH 04/34] mm, slab: dissolve shutdown_cache() into its caller There's only one caller of shutdown_cache() so move the code into it. Preparatory patch for further changes, no functional change. 
Reviewed-by: Jann Horn Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 40b582a014b8..b76d65d7fe33 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -540,27 +540,6 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) } } -static int shutdown_cache(struct kmem_cache *s) -{ - /* free asan quarantined objects */ - kasan_cache_shutdown(s); - - if (__kmem_cache_shutdown(s) != 0) - return -EBUSY; - - list_del(&s->list); - - if (s->flags & SLAB_TYPESAFE_BY_RCU) { - list_add_tail(&s->list, &slab_caches_to_rcu_destroy); - schedule_work(&slab_caches_to_rcu_destroy_work); - } else { - kfence_shutdown_cache(s); - debugfs_slab_release(s); - } - - return 0; -} - void slab_kmem_cache_release(struct kmem_cache *s) { __kmem_cache_release(s); @@ -585,9 +564,26 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - err = shutdown_cache(s); + /* free asan quarantined objects */ + kasan_cache_shutdown(s); + + err = __kmem_cache_shutdown(s); WARN(err, "%s %s: Slab cache still has objects when called from %pS", __func__, s->name, (void *)_RET_IP_); + + if (err) + goto out_unlock; + + list_del(&s->list); + + if (rcu_set) { + list_add_tail(&s->list, &slab_caches_to_rcu_destroy); + schedule_work(&slab_caches_to_rcu_destroy_work); + } else { + kfence_shutdown_cache(s); + debugfs_slab_release(s); + } + out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); From 4ec10268ed98a3d568a39861e7b7d0a0fa7cbe60 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 7 Aug 2024 12:31:15 +0200 Subject: [PATCH 05/34] mm, slab: unlink slabinfo, sysfs and debugfs immediately kmem_cache_destroy() includes removing the associated sysfs and debugfs directories, and the cache from the list of caches that appears in /proc/slabinfo. Currently this might not happen immediately when: - the cache is SLAB_TYPESAFE_BY_RCU and the cleanup is delayed, including the directores removal - __kmem_cache_shutdown() fails due to outstanding objects - the directories remain indefinitely When a cache is recreated with the same name, such as due to module unload followed by a load, the directories will fail to be recreated for the new instance of the cache due to the old directories being present. The cache will also appear twice in /proc/slabinfo. While we want to convert the SLAB_TYPESAFE_BY_RCU cleanup to be synchronous again, the second point remains. So let's fix this first and have the directories and slabinfo removed immediately in kmem_cache_destroy() and regardless of __kmem_cache_shutdown() success. This should not make debugging harder if __kmem_cache_shutdown() fails, because a detailed report of outstanding objects is printed into dmesg already due to the failure. Also simplify kmem_cache_release() sysfs handling by using __is_defined(SLAB_SUPPORTS_SYSFS). Note the resulting code in kmem_cache_destroy() is a bit ugly but will be further simplified - this is in order to make small bisectable steps. 
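As a minimal sketch of the reload scenario this addresses (the module, struct and cache name are hypothetical, not taken from the patch), consider a module that destroys its cache on exit and recreates it under the same name on the next load:

	#include <linux/module.h>
	#include <linux/slab.h>

	struct foo {
		int x;
	};

	static struct kmem_cache *foo_cachep;

	static int __init foo_init(void)
	{
		/*
		 * Without this patch, if the previous instance's sysfs/debugfs
		 * directories are still present (delayed SLAB_TYPESAFE_BY_RCU
		 * cleanup, or a failed __kmem_cache_shutdown()), the cache is
		 * created but its directories cannot be recreated, and
		 * "foo_cache" shows up twice in /proc/slabinfo.
		 */
		foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
					       0, SLAB_TYPESAFE_BY_RCU, NULL);
		return foo_cachep ? 0 : -ENOMEM;
	}

	static void __exit foo_exit(void)
	{
		kmem_cache_destroy(foo_cachep);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");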
Reviewed-by: Jann Horn Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 57 ++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index b76d65d7fe33..db61df3b4282 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -484,31 +484,19 @@ fail: } EXPORT_SYMBOL(kmem_buckets_create); -#ifdef SLAB_SUPPORTS_SYSFS /* * For a given kmem_cache, kmem_cache_destroy() should only be called * once or there will be a use-after-free problem. The actual deletion * and release of the kobject does not need slab_mutex or cpu_hotplug_lock * protection. So they are now done without holding those locks. - * - * Note that there will be a slight delay in the deletion of sysfs files - * if kmem_cache_release() is called indrectly from a work function. */ static void kmem_cache_release(struct kmem_cache *s) { - if (slab_state >= FULL) { - sysfs_slab_unlink(s); + if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL) sysfs_slab_release(s); - } else { + else slab_kmem_cache_release(s); - } } -#else -static void kmem_cache_release(struct kmem_cache *s) -{ - slab_kmem_cache_release(s); -} -#endif static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) { @@ -534,7 +522,6 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) rcu_barrier(); list_for_each_entry_safe(s, s2, &to_destroy, list) { - debugfs_slab_release(s); kfence_shutdown_cache(s); kmem_cache_release(s); } @@ -549,8 +536,8 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int err = -EBUSY; bool rcu_set; + int err; if (unlikely(!s) || !kasan_check_byte(s)) return; @@ -558,11 +545,14 @@ void kmem_cache_destroy(struct kmem_cache *s) cpus_read_lock(); mutex_lock(&slab_mutex); - rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; - s->refcount--; - if (s->refcount) - goto out_unlock; + if (s->refcount) { + mutex_unlock(&slab_mutex); + cpus_read_unlock(); + return; + } + + rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; /* free asan quarantined objects */ kasan_cache_shutdown(s); @@ -571,24 +561,29 @@ void kmem_cache_destroy(struct kmem_cache *s) WARN(err, "%s %s: Slab cache still has objects when called from %pS", __func__, s->name, (void *)_RET_IP_); - if (err) - goto out_unlock; - list_del(&s->list); - if (rcu_set) { - list_add_tail(&s->list, &slab_caches_to_rcu_destroy); - schedule_work(&slab_caches_to_rcu_destroy_work); - } else { + if (!err && !rcu_set) kfence_shutdown_cache(s); - debugfs_slab_release(s); - } -out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); - if (!err && !rcu_set) + + if (slab_state >= FULL) + sysfs_slab_unlink(s); + debugfs_slab_release(s); + + if (err) + return; + + if (rcu_set) { + mutex_lock(&slab_mutex); + list_add_tail(&s->list, &slab_caches_to_rcu_destroy); + schedule_work(&slab_caches_to_rcu_destroy_work); + mutex_unlock(&slab_mutex); + } else { kmem_cache_release(s); + } } EXPORT_SYMBOL(kmem_cache_destroy); From f77d0cda4a8ebd070bfa1ef9a153c470ea3601ce Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 7 Aug 2024 12:31:16 +0200 Subject: [PATCH 06/34] mm, slab: move kfence_shutdown_cache() outside slab_mutex kfence_shutdown_cache() is called under slab_mutex when the cache is destroyed synchronously, and outside slab_mutex during the delayed destruction of SLAB_TYPESAFE_BY_RCU caches. It seems it should always be safe to call it outside of slab_mutex so we can just move the call to kmem_cache_release(), which is called outside. 
Reviewed-by: Jann Horn Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index db61df3b4282..a079b8540334 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -492,6 +492,7 @@ EXPORT_SYMBOL(kmem_buckets_create); */ static void kmem_cache_release(struct kmem_cache *s) { + kfence_shutdown_cache(s); if (__is_defined(SLAB_SUPPORTS_SYSFS) && slab_state >= FULL) sysfs_slab_release(s); else @@ -521,10 +522,8 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) rcu_barrier(); - list_for_each_entry_safe(s, s2, &to_destroy, list) { - kfence_shutdown_cache(s); + list_for_each_entry_safe(s, s2, &to_destroy, list) kmem_cache_release(s); - } } void slab_kmem_cache_release(struct kmem_cache *s) @@ -563,9 +562,6 @@ void kmem_cache_destroy(struct kmem_cache *s) list_del(&s->list); - if (!err && !rcu_set) - kfence_shutdown_cache(s); - mutex_unlock(&slab_mutex); cpus_read_unlock(); From 2eb14c1c2717396f2fb1e4a4c5a1ec87cdd174f6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 7 Aug 2024 12:31:17 +0200 Subject: [PATCH 07/34] mm, slab: reintroduce rcu_barrier() into kmem_cache_destroy() There used to be a rcu_barrier() for SLAB_TYPESAFE_BY_RCU caches in kmem_cache_destroy() until commit 657dc2f97220 ("slab: remove synchronous rcu_barrier() call in memcg cache release path") moved it to an asynchronous work that finishes the destroying of such caches. The motivation for that commit was the MEMCG_KMEM integration that at the time created and removed clones of the global slab caches together with their cgroups, and blocking cgroups removal was unwelcome. The implementation later changed to per-object memcg tracking using a single cache, so there should be no more need for a fast non-blocking kmem_cache_destroy(), which is typically only done when a module is unloaded etc. Going back to synchronous barrier has the following advantages: - simpler implementation - it's easier to test the result of kmem_cache_destroy() in a kunit test Thus effectively revert commit 657dc2f97220. It is not a 1:1 revert as the code has changed since. The main part is that kmem_cache_release(s) is always called from kmem_cache_destroy(), but for SLAB_TYPESAFE_BY_RCU caches there's a rcu_barrier() first. Suggested-by: Mateusz Guzik Reviewed-by: Jann Horn Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 47 ++++------------------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index a079b8540334..c40227d5fa07 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -40,11 +40,6 @@ LIST_HEAD(slab_caches); DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; -static LIST_HEAD(slab_caches_to_rcu_destroy); -static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); -static DECLARE_WORK(slab_caches_to_rcu_destroy_work, - slab_caches_to_rcu_destroy_workfn); - /* * Set of flags that will prevent slab merging */ @@ -499,33 +494,6 @@ static void kmem_cache_release(struct kmem_cache *s) slab_kmem_cache_release(s); } -static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) -{ - LIST_HEAD(to_destroy); - struct kmem_cache *s, *s2; - - /* - * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the - * @slab_caches_to_rcu_destroy list. 
The slab pages are freed - * through RCU and the associated kmem_cache are dereferenced - * while freeing the pages, so the kmem_caches should be freed only - * after the pending RCU operations are finished. As rcu_barrier() - * is a pretty slow operation, we batch all pending destructions - * asynchronously. - */ - mutex_lock(&slab_mutex); - list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy); - mutex_unlock(&slab_mutex); - - if (list_empty(&to_destroy)) - return; - - rcu_barrier(); - - list_for_each_entry_safe(s, s2, &to_destroy, list) - kmem_cache_release(s); -} - void slab_kmem_cache_release(struct kmem_cache *s) { __kmem_cache_release(s); @@ -535,7 +503,6 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - bool rcu_set; int err; if (unlikely(!s) || !kasan_check_byte(s)) @@ -551,8 +518,6 @@ void kmem_cache_destroy(struct kmem_cache *s) return; } - rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; - /* free asan quarantined objects */ kasan_cache_shutdown(s); @@ -572,14 +537,10 @@ void kmem_cache_destroy(struct kmem_cache *s) if (err) return; - if (rcu_set) { - mutex_lock(&slab_mutex); - list_add_tail(&s->list, &slab_caches_to_rcu_destroy); - schedule_work(&slab_caches_to_rcu_destroy_work); - mutex_unlock(&slab_mutex); - } else { - kmem_cache_release(s); - } + if (s->flags & SLAB_TYPESAFE_BY_RCU) + rcu_barrier(); + + kmem_cache_release(s); } EXPORT_SYMBOL(kmem_cache_destroy); From 2b55d6a42d14c8675e38d6d9adca3014fdf01951 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 20 Aug 2024 17:59:35 +0200 Subject: [PATCH 08/34] rcu/kvfree: Add kvfree_rcu_barrier() API Add a kvfree_rcu_barrier() function. It waits until all in-flight pointers are freed over RCU machinery. It does not wait any GP completion and it is within its right to return immediately if there are no outstanding pointers. This function is useful when there is a need to guarantee that a memory is fully freed before destroying memory caches. For example, during unloading a kernel module. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Vlastimil Babka --- include/linux/rcutiny.h | 5 ++ include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 109 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 107 insertions(+), 8 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index d9ac7b136aea..522123050ff8 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -111,6 +111,11 @@ static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) kvfree(ptr); } +static inline void kvfree_rcu_barrier(void) +{ + rcu_barrier(); +} + #ifdef CONFIG_KASAN_GENERIC void kvfree_call_rcu(struct rcu_head *head, void *ptr); #else diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 254244202ea9..58e7db80f3a8 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,6 +35,7 @@ static inline void rcu_virt_note_context_switch(void) void synchronize_rcu_expedited(void); void kvfree_call_rcu(struct rcu_head *head, void *ptr); +void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_dyntick_idle(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..be00aac5f4e7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3584,18 +3584,15 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) } /* - * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * Return: %true if a work is queued, %false otherwise. 
*/ -static void kfree_rcu_monitor(struct work_struct *work) +static bool +kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) { - struct kfree_rcu_cpu *krcp = container_of(work, - struct kfree_rcu_cpu, monitor_work.work); unsigned long flags; + bool queued = false; int i, j; - // Drain ready for reclaim. - kvfree_rcu_drain_ready(krcp); - raw_spin_lock_irqsave(&krcp->lock, flags); // Attempt to start a new batch. @@ -3634,11 +3631,27 @@ static void kfree_rcu_monitor(struct work_struct *work) // be that the work is in the pending state when // channels have been detached following by each // other. - queue_rcu_work(system_wq, &krwp->rcu_work); + queued = queue_rcu_work(system_wq, &krwp->rcu_work); } } raw_spin_unlock_irqrestore(&krcp->lock, flags); + return queued; +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + struct kfree_rcu_cpu *krcp = container_of(work, + struct kfree_rcu_cpu, monitor_work.work); + + // Drain ready for reclaim. + kvfree_rcu_drain_ready(krcp); + + // Queue a batch for a rest. + kvfree_rcu_queue_batch(krcp); // If there is nothing to detach, it means that our job is // successfully done here. In case of having at least one @@ -3859,6 +3872,86 @@ unlock_return: } EXPORT_SYMBOL_GPL(kvfree_call_rcu); +/** + * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. + * + * Note that a single argument of kvfree_rcu() call has a slow path that + * triggers synchronize_rcu() following by freeing a pointer. It is done + * before the return from the function. Therefore for any single-argument + * call that will result in a kfree() to a cache that is to be destroyed + * during module exit, it is developer's responsibility to ensure that all + * such calls have returned before the call to kmem_cache_destroy(). + */ +void kvfree_rcu_barrier(void) +{ + struct kfree_rcu_cpu_work *krwp; + struct kfree_rcu_cpu *krcp; + bool queued; + int i, cpu; + + /* + * Firstly we detach objects and queue them over an RCU-batch + * for all CPUs. Finally queued works are flushed for each CPU. + * + * Please note. If there are outstanding batches for a particular + * CPU, those have to be finished first following by queuing a new. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * Check if this CPU has any objects which have been queued for a + * new GP completion. If not(means nothing to detach), we are done + * with it. If any batch is pending/running for this "krcp", below + * per-cpu flush_rcu_work() waits its completion(see last step). + */ + if (!need_offload_krc(krcp)) + continue; + + while (1) { + /* + * If we are not able to queue a new RCU work it means: + * - batches for this CPU are still in flight which should + * be flushed first and then repeat; + * - no objects to detach, because of concurrency. + */ + queued = kvfree_rcu_queue_batch(krcp); + + /* + * Bail out, if there is no need to offload this "krcp" + * anymore. As noted earlier it can run concurrently. + */ + if (queued || !need_offload_krc(krcp)) + break; + + /* There are ongoing batches. */ + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } + } + + /* + * Now we guarantee that all objects are flushed. + */ + for_each_possible_cpu(cpu) { + krcp = per_cpu_ptr(&krc, cpu); + + /* + * A monitor work can drain ready to reclaim objects + * directly. Wait its completion if running or pending. 
+ */ + cancel_delayed_work_sync(&krcp->monitor_work); + + for (i = 0; i < KFREE_N_BATCHES; i++) { + krwp = &(krcp->krw_arr[i]); + flush_rcu_work(&krwp->rcu_work); + } + } +} +EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); + static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { From 6c6c47b063b593785202be158e61fe5c827d6677 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 7 Aug 2024 12:31:19 +0200 Subject: [PATCH 09/34] mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy() We would like to replace call_rcu() users with kfree_rcu() where the existing callback is just a kmem_cache_free(). However this causes issues when the cache can be destroyed (such as due to module unload). Currently such modules should be issuing rcu_barrier() before kmem_cache_destroy() to have their call_rcu() callbacks processed first. This barrier is however not sufficient for kfree_rcu() in flight due to the batching introduced by a35d16905efc ("rcu: Add basic support for kfree_rcu() batching"). This is not a problem for kmalloc caches which are never destroyed, but since removing SLOB, kfree_rcu() is allowed also for any other cache, that might be destroyed. In order not to complicate the API, put the responsibility for handling outstanding kfree_rcu() in kmem_cache_destroy() itself. Use the newly introduced kvfree_rcu_barrier() to wait before destroying the cache. This is similar to how we issue rcu_barrier() for SLAB_TYPESAFE_BY_RCU caches, but has to be done earlier, as the latter only needs to wait for the empty slab pages to finish freeing, and not objects from the slab. Users of call_rcu() with arbitrary callbacks should still issue rcu_barrier() before destroying the cache and unloading the module, as kvfree_rcu_barrier() is not a superset of rcu_barrier() and the callbacks may be invoking module code or performing other actions that are necessary for a successful unload. Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/slab_common.c b/mm/slab_common.c index c40227d5fa07..1a2873293f5d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -508,6 +508,9 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s) || !kasan_check_byte(s)) return; + /* in-flight kfree_rcu()'s may include objects from our cache */ + kvfree_rcu_barrier(); + cpus_read_lock(); mutex_lock(&slab_mutex); From 4e1c44b3db79ba910adec32e2e1b920a0e34890a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 7 Aug 2024 12:31:20 +0200 Subject: [PATCH 10/34] kunit, slub: add test_kfree_rcu() and test_leak_destroy() Add a test that will create cache, allocate one object, kfree_rcu() it and attempt to destroy it. As long as the usage of kvfree_rcu_barrier() in kmem_cache_destroy() works correctly, there should be no warnings in dmesg and the test should pass. Additionally add a test_leak_destroy() test that leaks an object on purpose and verifies that kmem_cache_destroy() catches it. 
Signed-off-by: Vlastimil Babka --- lib/slub_kunit.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index e6667a28c014..6e3a1e5a7142 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "../mm/slab.h" static struct kunit_resource resource; @@ -157,6 +158,34 @@ static void test_kmalloc_redzone_access(struct kunit *test) kmem_cache_destroy(s); } +struct test_kfree_rcu_struct { + struct rcu_head rcu; +}; + +static void test_kfree_rcu(struct kunit *test) +{ + struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu", + sizeof(struct test_kfree_rcu_struct), + SLAB_NO_MERGE); + struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL); + + kfree_rcu(p, rcu); + kmem_cache_destroy(s); + + KUNIT_EXPECT_EQ(test, 0, slab_errors); +} + +static void test_leak_destroy(struct kunit *test) +{ + struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu", + 64, SLAB_NO_MERGE); + kmem_cache_alloc(s, GFP_KERNEL); + + kmem_cache_destroy(s); + + KUNIT_EXPECT_EQ(test, 1, slab_errors); +} + static int test_init(struct kunit *test) { slab_errors = 0; @@ -177,6 +206,8 @@ static struct kunit_case test_cases[] = { KUNIT_CASE(test_clobber_redzone_free), KUNIT_CASE(test_kmalloc_redzone_access), + KUNIT_CASE(test_kfree_rcu), + KUNIT_CASE(test_leak_destroy), {} }; From b3c34245756adada8a50bdaedbb3965b071c7b0a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 9 Aug 2024 17:36:55 +0200 Subject: [PATCH 11/34] kasan: catch invalid free before SLUB reinitializes the object Currently, when KASAN is combined with init-on-free behavior, the initialization happens before KASAN's "invalid free" checks. More importantly, a subsequent commit will want to RCU-delay the actual SLUB freeing of an object, and we'd like KASAN to still validate synchronously that freeing the object is permitted. (Otherwise this change will make the existing testcase kmem_cache_invalid_free fail.) So add a new KASAN hook that allows KASAN to pre-validate a kmem_cache_free() operation before SLUB actually starts modifying the object or its metadata. Inside KASAN, this: - moves checks from poison_slab_object() into check_slab_allocation() - moves kasan_arch_is_ready() up into callers of poison_slab_object() - removes "ip" argument of poison_slab_object() and __kasan_slab_free() (since those functions no longer do any reporting) Acked-by: Vlastimil Babka #slub Reviewed-by: Andrey Konovalov Signed-off-by: Jann Horn Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 54 ++++++++++++++++++++++++++++++++++--- mm/kasan/common.c | 63 +++++++++++++++++++++++++------------------ mm/slub.c | 7 +++++ 3 files changed, 95 insertions(+), 29 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 70d6a8f6e25d..1570c7191176 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -175,13 +175,55 @@ static __always_inline void * __must_check kasan_init_slab_obj( return (void *)object; } -bool __kasan_slab_free(struct kmem_cache *s, void *object, - unsigned long ip, bool init); +bool __kasan_slab_pre_free(struct kmem_cache *s, void *object, + unsigned long ip); +/** + * kasan_slab_pre_free - Check whether freeing a slab object is safe. + * @object: Object to be freed. + * + * This function checks whether freeing the given object is safe. It may + * check for double-free and invalid-free bugs and report them. + * + * This function is intended only for use by the slab allocator. 
+ * + * @Return true if freeing the object is unsafe; false otherwise. + */ +static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, + void *object) +{ + if (kasan_enabled()) + return __kasan_slab_pre_free(s, object, _RET_IP_); + return false; +} + +bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init); +/** + * kasan_slab_free - Poison, initialize, and quarantine a slab object. + * @object: Object to be freed. + * @init: Whether to initialize the object. + * + * This function informs that a slab object has been freed and is not + * supposed to be accessed anymore, except for objects in + * SLAB_TYPESAFE_BY_RCU caches. + * + * For KASAN modes that have integrated memory initialization + * (kasan_has_integrated_init() == true), this function also initializes + * the object's memory. For other modes, the @init argument is ignored. + * + * This function might also take ownership of the object to quarantine it. + * When this happens, KASAN will defer freeing the object to a later + * stage and handle it internally until then. The return value indicates + * whether KASAN took ownership of the object. + * + * This function is intended only for use by the slab allocator. + * + * @Return true if KASAN took ownership of the object; false otherwise. + */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { if (kasan_enabled()) - return __kasan_slab_free(s, object, _RET_IP_, init); + return __kasan_slab_free(s, object, init); return false; } @@ -371,6 +413,12 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } + +static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) +{ + return false; +} + static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { return false; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 85e7c6b4575c..f26bbc087b3b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -208,15 +208,12 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static inline bool poison_slab_object(struct kmem_cache *cache, void *object, - unsigned long ip, bool init) +/* Returns true when freeing the object is not safe. */ +static bool check_slab_allocation(struct kmem_cache *cache, void *object, + unsigned long ip) { - void *tagged_object; + void *tagged_object = object; - if (!kasan_arch_is_ready()) - return false; - - tagged_object = object; object = kasan_reset_tag(object); if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) { @@ -224,37 +221,46 @@ static inline bool poison_slab_object(struct kmem_cache *cache, void *object, return true; } - /* RCU slabs could be legally used after free within the RCU period. */ - if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) - return false; - if (!kasan_byte_accessible(tagged_object)) { kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE); return true; } + return false; +} + +static inline void poison_slab_object(struct kmem_cache *cache, void *object, + bool init) +{ + void *tagged_object = object; + + object = kasan_reset_tag(object); + + /* RCU slabs could be legally used after free within the RCU period. 
*/ + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) + return; + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), KASAN_SLAB_FREE, init); if (kasan_stack_collection_enabled()) kasan_save_free_info(cache, tagged_object); - - return false; } -bool __kasan_slab_free(struct kmem_cache *cache, void *object, - unsigned long ip, bool init) +bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, + unsigned long ip) { - if (is_kfence_address(object)) + if (!kasan_arch_is_ready() || is_kfence_address(object)) + return false; + return check_slab_allocation(cache, object, ip); +} + +bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init) +{ + if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; - /* - * If the object is buggy, do not let slab put the object onto the - * freelist. The object will thus never be allocated again and its - * metadata will never get released. - */ - if (poison_slab_object(cache, object, ip, init)) - return true; + poison_slab_object(cache, object, init); /* * If the object is put into quarantine, do not let slab put the object @@ -504,11 +510,16 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) return true; } - if (is_kfence_address(ptr)) - return false; + if (is_kfence_address(ptr) || !kasan_arch_is_ready()) + return true; slab = folio_slab(folio); - return !poison_slab_object(slab->slab_cache, ptr, ip, false); + + if (check_slab_allocation(slab->slab_cache, ptr, ip)) + return false; + + poison_slab_object(slab->slab_cache, ptr, false); + return true; } void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip) diff --git a/mm/slub.c b/mm/slub.c index c9d8a2497fd6..4946488cb5a7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2226,6 +2226,13 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) if (kfence_free(x)) return false; + /* + * Give KASAN a chance to notice an invalid free operation before we + * modify the object. + */ + if (kasan_slab_pre_free(s, x)) + return false; + /* * As memory initialization might be integrated into KASAN, * kasan_slab_free and initialization memset's must be From b8c8ba73c68bb3c3e9dad22f488b86c540c839f9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 9 Aug 2024 17:36:56 +0200 Subject: [PATCH 12/34] slub: Introduce CONFIG_SLUB_RCU_DEBUG Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU slabs because use-after-free is allowed within the RCU grace period by design. Add a SLUB debugging feature which RCU-delays every individual kmem_cache_free() before either actually freeing the object or handing it off to KASAN, and change KASAN to poison freed objects as normal when this option is enabled. For now I've configured Kconfig.debug to default-enable this feature in the KASAN GENERIC and SW_TAGS modes; I'm not enabling it by default in HW_TAGS mode because I'm not sure if it might have unwanted performance degradation effects there. Note that this is mostly useful with KASAN in the quarantine-based GENERIC mode; SLAB_TYPESAFE_BY_RCU slabs are basically always also slabs with a ->ctor, and KASAN's assign_tag() currently has to assign fixed tags for those, reducing the effectiveness of SW_TAGS/HW_TAGS mode. (A possible future extension of this work would be to also let SLUB call the ->ctor() on every allocation instead of only when the slab page is allocated; then tag-based modes would be able to assign new tags on every reallocation.) 
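As a sketch of the class of bug this makes detectable (the struct, cache and function are hypothetical; the kmem_cache_rcu_uaf KUnit test added below by this patch exercises the same pattern), assume conn_cachep was created with SLAB_TYPESAFE_BY_RCU:

	struct conn {
		int refcnt;
	};

	static struct kmem_cache *conn_cachep;

	static void example(struct conn *c)
	{
		rcu_read_lock();
		kmem_cache_free(conn_cachep, c);
		/*
		 * Allowed by SLAB_TYPESAFE_BY_RCU: the read-side critical
		 * section started before the free, so the memory stays
		 * type-stable (though the object may be recycled and the
		 * caller must revalidate it).
		 */
		READ_ONCE(c->refcnt);
		rcu_read_unlock();

		rcu_barrier();
		/*
		 * Use-after-free: the grace period (and, with this option, the
		 * delayed free) has completed, so the object may be gone.
		 * Plain KASAN usually misses this because the object is only
		 * poisoned once the whole slab page is freed; with
		 * CONFIG_SLUB_RCU_DEBUG the object itself is poisoned after
		 * the RCU delay and KASAN reports the access.
		 */
		READ_ONCE(c->refcnt);
	}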
Tested-by: syzbot+263726e59eab6b442723@syzkaller.appspotmail.com Reviewed-by: Andrey Konovalov Acked-by: Marco Elver Acked-by: Vlastimil Babka #slab Signed-off-by: Jann Horn Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 17 ++++++---- mm/Kconfig.debug | 32 ++++++++++++++++++ mm/kasan/common.c | 11 +++--- mm/kasan/kasan_test.c | 46 +++++++++++++++++++++++++ mm/slab_common.c | 16 +++++++++ mm/slub.c | 79 ++++++++++++++++++++++++++++++++++++++----- 6 files changed, 182 insertions(+), 19 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 1570c7191176..00a3bf7c0d8f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -196,15 +196,18 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, return false; } -bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init); +bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, + bool still_accessible); /** * kasan_slab_free - Poison, initialize, and quarantine a slab object. * @object: Object to be freed. * @init: Whether to initialize the object. + * @still_accessible: Whether the object contents are still accessible. * * This function informs that a slab object has been freed and is not - * supposed to be accessed anymore, except for objects in - * SLAB_TYPESAFE_BY_RCU caches. + * supposed to be accessed anymore, except when @still_accessible is set + * (indicating that the object is in a SLAB_TYPESAFE_BY_RCU cache and an RCU + * grace period might not have passed yet). * * For KASAN modes that have integrated memory initialization * (kasan_has_integrated_init() == true), this function also initializes @@ -220,10 +223,11 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init); * @Return true if KASAN took ownership of the object; false otherwise. */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, - void *object, bool init) + void *object, bool init, + bool still_accessible) { if (kasan_enabled()) - return __kasan_slab_free(s, object, init); + return __kasan_slab_free(s, object, init, still_accessible); return false; } @@ -419,7 +423,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) return false; } -static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object, + bool init, bool still_accessible) { return false; } diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index afc72fde0f03..41a58536531d 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -70,6 +70,38 @@ config SLUB_DEBUG_ON off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying "slab_debug=-". +config SLUB_RCU_DEBUG + bool "Enable UAF detection in TYPESAFE_BY_RCU caches (for KASAN)" + depends on SLUB_DEBUG + # SLUB_RCU_DEBUG should build fine without KASAN, but is currently useless + # without KASAN, so mark it as a dependency of KASAN for now. + depends on KASAN + default KASAN_GENERIC || KASAN_SW_TAGS + help + Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache + was not marked as SLAB_TYPESAFE_BY_RCU and every caller used + kfree_rcu() instead. + + This is intended for use in combination with KASAN, to enable KASAN to + detect use-after-free accesses in such caches. + (KFENCE is able to do that independent of this flag.) + + This might degrade performance. 
+ Unfortunately this also prevents a very specific bug pattern from + triggering (insufficient checks against an object being recycled + within the RCU grace period); so this option can be turned off even on + KASAN builds, in case you want to test for such a bug. + + If you're using this for testing bugs / fuzzing and care about + catching all the bugs WAY more than performance, you might want to + also turn on CONFIG_RCU_STRICT_GRACE_PERIOD. + + WARNING: + This is designed as a debugging feature, not a security feature. + Objects are sometimes recycled without RCU delay under memory pressure. + + If unsure, say N. + config PAGE_OWNER bool "Track page owner" depends on DEBUG_KERNEL && STACKTRACE_SUPPORT diff --git a/mm/kasan/common.c b/mm/kasan/common.c index f26bbc087b3b..ed4873e18c75 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -230,14 +230,14 @@ static bool check_slab_allocation(struct kmem_cache *cache, void *object, } static inline void poison_slab_object(struct kmem_cache *cache, void *object, - bool init) + bool init, bool still_accessible) { void *tagged_object = object; object = kasan_reset_tag(object); /* RCU slabs could be legally used after free within the RCU period. */ - if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) + if (unlikely(still_accessible)) return; kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), @@ -255,12 +255,13 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, return check_slab_allocation(cache, object, ip); } -bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init) +bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, + bool still_accessible) { if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; - poison_slab_object(cache, object, init); + poison_slab_object(cache, object, init, still_accessible); /* * If the object is put into quarantine, do not let slab put the object @@ -518,7 +519,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) if (check_slab_allocation(slab->slab_cache, ptr, ip)) return false; - poison_slab_object(slab->slab_cache, ptr, false); + poison_slab_object(slab->slab_cache, ptr, false, false); return true; } diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 7b32be2a3cf0..567d33b493e2 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -996,6 +996,51 @@ static void kmem_cache_invalid_free(struct kunit *test) kmem_cache_destroy(cache); } +static void kmem_cache_rcu_uaf(struct kunit *test) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB_RCU_DEBUG); + + cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU, + NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + *p = 1; + + rcu_read_lock(); + + /* Free the object - this will internally schedule an RCU callback. */ + kmem_cache_free(cache, p); + + /* + * We should still be allowed to access the object at this point because + * the cache is SLAB_TYPESAFE_BY_RCU and we've been in an RCU read-side + * critical section since before the kmem_cache_free(). + */ + READ_ONCE(*p); + + rcu_read_unlock(); + + /* + * Wait for the RCU callback to execute; after this, the object should + * have actually been freed from KASAN's perspective. 
+ */ + rcu_barrier(); + + KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p)); + + kmem_cache_destroy(cache); +} + static void empty_cache_ctor(void *object) { } static void kmem_cache_double_destroy(struct kunit *test) @@ -1937,6 +1982,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmem_cache_oob), KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), + KUNIT_CASE(kmem_cache_rcu_uaf), KUNIT_CASE(kmem_cache_double_destroy), KUNIT_CASE(kmem_cache_accounted), KUNIT_CASE(kmem_cache_bulk), diff --git a/mm/slab_common.c b/mm/slab_common.c index 1a2873293f5d..884e8e70a56d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -511,6 +511,22 @@ void kmem_cache_destroy(struct kmem_cache *s) /* in-flight kfree_rcu()'s may include objects from our cache */ kvfree_rcu_barrier(); + if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) && + (s->flags & SLAB_TYPESAFE_BY_RCU)) { + /* + * Under CONFIG_SLUB_RCU_DEBUG, when objects in a + * SLAB_TYPESAFE_BY_RCU slab are freed, SLUB will internally + * defer their freeing with call_rcu(). + * Wait for such call_rcu() invocations here before actually + * destroying the cache. + * + * It doesn't matter that we haven't looked at the slab refcount + * yet - slabs with SLAB_TYPESAFE_BY_RCU can't be merged, so + * the refcount should be 1 here. + */ + rcu_barrier(); + } + cpus_read_lock(); mutex_lock(&slab_mutex); diff --git a/mm/slub.c b/mm/slub.c index 4946488cb5a7..95977f25a760 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2200,16 +2200,30 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, } #endif /* CONFIG_MEMCG */ +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head); + +struct rcu_delayed_free { + struct rcu_head head; + void *object; +}; +#endif + /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. * * Returns true if freeing of the object can proceed, false if its reuse - * was delayed by KASAN quarantine, or it was returned to KFENCE. + * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned + * to KFENCE. */ static __always_inline -bool slab_free_hook(struct kmem_cache *s, void *x, bool init) +bool slab_free_hook(struct kmem_cache *s, void *x, bool init, + bool after_rcu_delay) { + /* Are the object contents still accessible? */ + bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay; + kmemleak_free_recursive(x, s->flags); kmsan_slab_free(s, x); @@ -2219,7 +2233,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) debug_check_no_obj_freed(x, s->object_size); /* Use KCSAN to help debug racy use-after-free. */ - if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) + if (!still_accessible) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); @@ -2233,6 +2247,28 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) if (kasan_slab_pre_free(s, x)) return false; +#ifdef CONFIG_SLUB_RCU_DEBUG + if (still_accessible) { + struct rcu_delayed_free *delayed_free; + + delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT); + if (delayed_free) { + /* + * Let KASAN track our call stack as a "related work + * creation", just like if the object had been freed + * normally via kfree_rcu(). + * We have to do this manually because the rcu_head is + * not located inside the object. 
+ */ + kasan_record_aux_stack_noalloc(x); + + delayed_free->object = x; + call_rcu(&delayed_free->head, slab_free_after_rcu_debug); + return false; + } + } +#endif /* CONFIG_SLUB_RCU_DEBUG */ + /* * As memory initialization might be integrated into KASAN, * kasan_slab_free and initialization memset's must be @@ -2256,7 +2292,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) s->size - inuse - rsize); } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init); + return !kasan_slab_free(s, x, init, still_accessible); } static __fastpath_inline @@ -2270,7 +2306,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, bool init; if (is_kfence_address(next)) { - slab_free_hook(s, next, false); + slab_free_hook(s, next, false, false); return false; } @@ -2285,7 +2321,7 @@ bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(slab_free_hook(s, object, init))) { + if (likely(slab_free_hook(s, object, init, false))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -4477,7 +4513,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, memcg_slab_free_hook(s, slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) do_slab_free(s, slab, object, object, 1, addr); } @@ -4486,7 +4522,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) { - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); } #endif @@ -4505,6 +4541,33 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, do_slab_free(s, slab, head, tail, cnt, addr); } +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) +{ + struct rcu_delayed_free *delayed_free = + container_of(rcu_head, struct rcu_delayed_free, head); + void *object = delayed_free->object; + struct slab *slab = virt_to_slab(object); + struct kmem_cache *s; + + kfree(delayed_free); + + if (WARN_ON(is_kfence_address(object))) + return; + + /* find the object and the cache again */ + if (WARN_ON(!slab)) + return; + s = slab->slab_cache; + if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU))) + return; + + /* resume freeing */ + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) + do_slab_free(s, slab, object, object, 1, _THIS_IP_); +} +#endif /* CONFIG_SLUB_RCU_DEBUG */ + #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { From 3beb2fb68184fda063cdd3cdd3bbe52c5cae56bb Mon Sep 17 00:00:00 2001 From: Yan Zhen Date: Thu, 22 Aug 2024 10:27:04 +0800 Subject: [PATCH 13/34] mm, slab: use kmem_cache_free() to free from kmem_buckets_cache In kmem_buckets_create(), the kmem_buckets object is allocated by kmem_cache_alloc() from kmem_buckets_cache, but in the failure case, it's freed by kfree(). This is not wrong, but using kmem_cache_free() is the more common pattern, so use it. 
Signed-off-by: Yan Zhen Reviewed-by: Christoph Lameter Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index dea2b05c0e3f..ca694f5553b4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -495,7 +495,7 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, fail: for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) kmem_cache_destroy((*b)[idx]); - kfree(b); + kmem_cache_free(kmem_buckets_cache, b); return NULL; } From 59090e479ac78ae18facd4c58eb332562a23020e Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Thu, 29 Aug 2024 11:29:11 +0800 Subject: [PATCH 14/34] mm, slub: avoid zeroing kmalloc redzone Since commit 946fa0dbf2d8 ("mm/slub: extend redzone check to extra allocated kmalloc space than requested"), setting orig_size treats the wasted space (object_size - orig_size) as a redzone. However with init_on_free=1 we clear the full object->size, including the redzone. Additionally we clear the object metadata, including the stored orig_size, making it zero, which makes check_object() treat the whole object as a redzone. These issues lead to the following BUG report with "slub_debug=FUZ init_on_free=1": [ 0.000000] ============================================================================= [ 0.000000] BUG kmalloc-8 (Not tainted): kmalloc Redzone overwritten [ 0.000000] ----------------------------------------------------------------------------- [ 0.000000] [ 0.000000] 0xffff000010032858-0xffff00001003285f @offset=2136. First byte 0x0 instead of 0xcc [ 0.000000] FIX kmalloc-8: Restoring kmalloc Redzone 0xffff000010032858-0xffff00001003285f=0xcc [ 0.000000] Slab 0xfffffdffc0400c80 objects=36 used=23 fp=0xffff000010032a18 flags=0x3fffe0000000200(workingset|node=0|zone=0|lastcpupid=0x1ffff) [ 0.000000] Object 0xffff000010032858 @offset=2136 fp=0xffff0000100328c8 [ 0.000000] [ 0.000000] Redzone ffff000010032850: cc cc cc cc cc cc cc cc ........ [ 0.000000] Object ffff000010032858: cc cc cc cc cc cc cc cc ........ [ 0.000000] Redzone ffff000010032860: cc cc cc cc cc cc cc cc ........ [ 0.000000] Padding ffff0000100328b4: 00 00 00 00 00 00 00 00 00 00 00 00 ............ [ 0.000000] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.11.0-rc3-next-20240814-00004-g61844c55c3f4 #144 [ 0.000000] Hardware name: NXP i.MX95 19X19 board (DT) [ 0.000000] Call trace: [ 0.000000] dump_backtrace+0x90/0xe8 [ 0.000000] show_stack+0x18/0x24 [ 0.000000] dump_stack_lvl+0x74/0x8c [ 0.000000] dump_stack+0x18/0x24 [ 0.000000] print_trailer+0x150/0x218 [ 0.000000] check_object+0xe4/0x454 [ 0.000000] free_to_partial_list+0x2f8/0x5ec To address the issue, use orig_size to clear the used area. And restore the value of orig_size after clear the remaining area. When CONFIG_SLUB_DEBUG not defined, (get_orig_size()' directly returns s->object_size. So when using memset to init the area, the size can simply be orig_size, as orig_size returns object_size when CONFIG_SLUB_DEBUG not enabled. And orig_size can never be bigger than object_size. 
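A minimal worked example of the interaction (the sizes are illustrative, assuming slub_debug=FUZ and init_on_free=1):

	p = kmalloc(3, GFP_KERNEL);	/* served from kmalloc-8, orig_size = 3 */

	/*
	 * Object layout: bytes [0, 3) hold the requested data, bytes [3, 8)
	 * act as the kmalloc redzone and are filled with 0xcc.
	 */

	kfree(p);
	/*
	 * Before this patch: init_on_free cleared all 8 bytes, wiping the 0xcc
	 * redzone, and the metadata memset zeroed the stored orig_size, so
	 * check_object() later reported "kmalloc Redzone overwritten".
	 * After this patch: only orig_size (3) bytes are cleared and orig_size
	 * is restored via set_orig_size(), leaving the redzone intact.
	 */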
Fixes: 946fa0dbf2d8 ("mm/slub: extend redzone check to extra allocated kmalloc space than requested") Cc: Reviewed-by: Feng Tang Acked-by: David Rientjes Signed-off-by: Peng Fan Signed-off-by: Vlastimil Babka --- mm/slub.c | 100 +++++++++++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 60004bfc2dc2..d52c88f29f69 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -756,6 +756,50 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, return false; } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + unsigned int kasan_meta_size; + + if (!slub_debug_orig_size(s)) + return; + + /* + * KASAN can save its free meta data inside of the object at offset 0. + * If this meta data size is larger than 'orig_size', it will overlap + * the data redzone in [orig_size+1, object_size]. Thus, we adjust + * 'orig_size' to be as at least as big as KASAN's meta data. + */ + kasan_meta_size = kasan_metadata_size(s, true); + if (kasan_meta_size > orig_size) + orig_size = kasan_meta_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -985,50 +1029,6 @@ static void print_slab_info(const struct slab *slab) &slab->__page_flags); } -/* - * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API - * family will round up the real request size to these fixed ones, so - * there could be an extra area than what is requested. Save the original - * request size in the meta data area, for better debug and sanity check. - */ -static inline void set_orig_size(struct kmem_cache *s, - void *object, unsigned int orig_size) -{ - void *p = kasan_reset_tag(object); - unsigned int kasan_meta_size; - - if (!slub_debug_orig_size(s)) - return; - - /* - * KASAN can save its free meta data inside of the object at offset 0. - * If this meta data size is larger than 'orig_size', it will overlap - * the data redzone in [orig_size+1, object_size]. Thus, we adjust - * 'orig_size' to be as at least as big as KASAN's meta data. 
- */ - kasan_meta_size = kasan_metadata_size(s, true); - if (kasan_meta_size > orig_size) - orig_size = kasan_meta_size; - - p += get_info_end(s); - p += sizeof(struct track) * 2; - - *(unsigned int *)p = orig_size; -} - -static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) -{ - void *p = kasan_reset_tag(object); - - if (!slub_debug_orig_size(s)) - return s->object_size; - - p += get_info_end(s); - p += sizeof(struct track) * 2; - - return *(unsigned int *)p; -} - void skip_orig_size_check(struct kmem_cache *s, const void *object) { set_orig_size(s, (void *)object, s->object_size); @@ -1894,7 +1894,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} - #ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) @@ -2239,14 +2238,21 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) */ if (unlikely(init)) { int rsize; - unsigned int inuse; + unsigned int inuse, orig_size; inuse = get_info_end(s); + orig_size = get_orig_size(s, x); if (!kasan_has_integrated_init()) - memset(kasan_reset_tag(x), 0, s->object_size); + memset(kasan_reset_tag(x), 0, orig_size); rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; memset((char *)kasan_reset_tag(x) + inuse, 0, s->size - inuse - rsize); + /* + * Restore orig_size, otherwize kmalloc redzone overwritten + * would be reported + */ + set_orig_size(s, x, orig_size); + } /* KASAN might put x into memory quarantine, delaying its reuse. */ return !kasan_slab_free(s, x, init); From e02147cb703412fa13dd31908c734d7fb2314f55 Mon Sep 17 00:00:00 2001 From: Xavier Date: Wed, 4 Sep 2024 15:40:37 +0800 Subject: [PATCH 15/34] mm/slab: Optimize the code logic in find_mergeable() We can first assess the flags, if it's unmergeable, there's no need to calculate the size and align. Signed-off-by: Xavier Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index ca694f5553b4..85afeb69b3c0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -186,14 +186,15 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, if (ctor) return NULL; - size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); - size = ALIGN(size, align); flags = kmem_cache_flags(flags, name); if (flags & SLAB_NEVER_MERGE) return NULL; + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + list_for_each_entry_reverse(s, &slab_caches, list) { if (slab_unmergeable(s)) continue; From 9028cdeb38e1f37d63cb3154799dd259b67e879e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 5 Sep 2024 10:34:22 -0700 Subject: [PATCH 16/34] memcg: add charging of already allocated slab objects At the moment, the slab objects are charged to the memcg at the allocation time. However there are cases where slab objects are allocated at the time where the right target memcg to charge it to is not known. One such case is the network sockets for the incoming connection which are allocated in the softirq context. Couple hundred thousand connections are very normal on large loaded server and almost all of those sockets underlying those connections get allocated in the softirq context and thus not charged to any memcg. 
However later at the accept() time we know the right target memcg to charge. Let's add new API to charge already allocated objects, so we can have better accounting of the memory usage. To measure the performance impact of this change, tcp_crr is used from the neper [1] performance suite. Basically it is a network ping pong test with new connection for each ping pong. The server and the client are run inside 3 level of cgroup hierarchy using the following commands: Server: $ tcp_crr -6 Client: $ tcp_crr -6 -c -H ${server_ip} If the client and server run on different machines with 50 GBPS NIC, there is no visible impact of the change. For the same machine experiment with v6.11-rc5 as base. base (throughput) with-patch tcp_crr 14545 (+- 80) 14463 (+- 56) It seems like the performance impact is within the noise. Link: https://github.com/google/neper [1] Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Reviewed-by: Yosry Ahmed Acked-by: Paolo Abeni # net Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 29 ++++++++++++++++++ mm/slab.h | 7 +++++ mm/slub.c | 53 +++++++++++++++++++++++++++++++++ net/ipv4/inet_connection_sock.c | 5 ++-- 4 files changed, 92 insertions(+), 2 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index eb2bf4629157..3be2a5ed4936 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -547,6 +547,35 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) __assume_slab_alignment __malloc; #define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) +/** + * kmem_cache_charge - memcg charge an already allocated slab memory + * @objp: address of the slab object to memcg charge + * @gfpflags: describe the allocation context + * + * kmem_cache_charge allows charging a slab object to the current memcg, + * primarily in cases where charging at allocation time might not be possible + * because the target memcg is not known (i.e. softirq context) + * + * The objp should be pointer returned by the slab allocator functions like + * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge + * behavior can be controlled through gfpflags parameter, which affects how the + * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes + * that overcharging is requested instead of failure, but is not applied for the + * internal metadata allocation. + * + * There are several cases where it will return true even if the charging was + * not done: + * More specifically: + * + * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems. + * 2. Already charged slab objects. + * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc() + * without __GFP_ACCOUNT + * 4. Allocating internal metadata has failed + * + * Return: true if charge was successful otherwise false. 
+ */ +bool kmem_cache_charge(void *objp, gfp_t gfpflags); void kmem_cache_free(struct kmem_cache *s, void *objp); kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, diff --git a/mm/slab.h b/mm/slab.h index dcdb56b8e7f5..9f907e930609 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -443,6 +443,13 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s) return (s->flags & SLAB_KMALLOC); } +static inline bool is_kmalloc_normal(struct kmem_cache *s) +{ + if (!is_kmalloc_cache(s)) + return false; + return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); +} + /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ diff --git a/mm/slub.c b/mm/slub.c index 95977f25a760..aa512de974e7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2185,6 +2185,45 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, __memcg_slab_free_hook(s, slab, p, objects, obj_exts); } + +static __fastpath_inline +bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + struct slabobj_ext *slab_exts; + struct kmem_cache *s; + struct folio *folio; + struct slab *slab; + unsigned long off; + + folio = virt_to_folio(p); + if (!folio_test_slab(folio)) { + return folio_memcg_kmem(folio) || + (__memcg_kmem_charge_page(folio_page(folio, 0), flags, + folio_order(folio)) == 0); + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + /* + * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency + * of slab_obj_exts being allocated from the same slab and thus the slab + * becoming effectively unfreeable. + */ + if (is_kmalloc_normal(s)) + return true; + + /* Ignore already charged objects. */ + slab_exts = slab_obj_exts(slab); + if (slab_exts) { + off = obj_to_index(s, slab, p); + if (unlikely(slab_exts[off].objcg)) + return true; + } + + return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); +} + #else /* CONFIG_MEMCG */ static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, @@ -2198,6 +2237,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } + +static inline bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + return true; +} #endif /* CONFIG_MEMCG */ #ifdef CONFIG_SLUB_RCU_DEBUG @@ -4105,6 +4149,15 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); +bool kmem_cache_charge(void *objp, gfp_t gfpflags) +{ + if (!memcg_kmem_online()) + return true; + + return memcg_slab_post_charge(objp, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_charge); + /** * kmem_cache_alloc_node - Allocate an object on the specified node * @s: The cache to allocate from. 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 64d07b842e73..e25381bf32d0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -714,6 +714,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) out: release_sock(sk); if (newsk && mem_cgroup_sockets_enabled) { + gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; int amt = 0; /* atomically get the memory usage, set and charge the @@ -731,8 +732,8 @@ out: } if (amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt, - GFP_KERNEL | __GFP_NOFAIL); + mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp); + kmem_cache_charge(newsk, gfp); release_sock(newsk); } From 53d3d210864e532ee5d8ae48f66168c12dd8f530 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:44 +0200 Subject: [PATCH 17/34] slab: s/__kmem_cache_create/do_kmem_cache_create/g Free up reusing the double-underscore variant for follow-up patches. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab.h | 2 +- mm/slab_common.c | 4 ++-- mm/slub.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index a6051385186e..684bb48c4f39 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -424,7 +424,7 @@ kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ -int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); +int do_kmem_cache_create(struct kmem_cache *, slab_flags_t flags); void __init kmem_cache_init(void); extern void create_boot_cache(struct kmem_cache *, const char *name, diff --git a/mm/slab_common.c b/mm/slab_common.c index 95db3702f8d6..91e0e36e4379 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -234,7 +234,7 @@ static struct kmem_cache *create_cache(const char *name, s->useroffset = useroffset; s->usersize = usersize; #endif - err = __kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, flags); if (err) goto out_free_cache; @@ -778,7 +778,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, s->usersize = usersize; #endif - err = __kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, flags); if (err) panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", diff --git a/mm/slub.c b/mm/slub.c index 9aa5da1e8e27..23d9d783ff26 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5902,7 +5902,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, return s; } -int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +int do_kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { int err; From 879fb3c274c12cb0892718e1e54f85fc406e3c7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:45 +0200 Subject: [PATCH 18/34] slab: add struct kmem_cache_args Currently we have multiple kmem_cache_create*() variants that take up to seven separate parameters with one of the functions having to grow an eigth parameter in the future to handle both usercopy and a custom freelist pointer. Add a struct kmem_cache_args structure and move less common parameters into it. Core parameters such as name, object size, and flags continue to be passed separately. 
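As a rough sketch of the intended calling convention (cache, struct and field names are made up; the args structure and the __kmem_cache_create_args() helper are introduced below):

    struct kmem_cache_args args = {
        .align      = __alignof__(struct foo),
        .useroffset = offsetof(struct foo, payload),
        .usersize   = sizeof_field(struct foo, payload),
    };

    foo_cachep = __kmem_cache_create_args("foo", sizeof(struct foo),
                                          &args, SLAB_ACCOUNT);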
Add a new function __kmem_cache_create_args() that takes a struct kmem_cache_args pointer and port do_kmem_cache_create_usercopy() over to it. In follow-up patches we will port the other kmem_cache_create*() variants over to it as well. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 22 +++++++++++++++ mm/slab_common.c | 65 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 5b2da2cf31a8..2b8eeca7fd2c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -240,6 +240,28 @@ struct mem_cgroup; */ bool slab_is_available(void); +/** + * struct kmem_cache_args - Less common arguments for kmem_cache_create() + * @align: The required alignment for the objects. + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @freeptr_offset: Custom offset for the free pointer in RCU caches + * @use_freeptr_offset: Whether a @freeptr_offset is used + * @ctor: A constructor for the objects. + */ +struct kmem_cache_args { + unsigned int align; + unsigned int useroffset; + unsigned int usersize; + unsigned int freeptr_offset; + bool use_freeptr_offset; + void (*ctor)(void *); +}; + +struct kmem_cache *__kmem_cache_create_args(const char *name, + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags); struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); diff --git a/mm/slab_common.c b/mm/slab_common.c index 91e0e36e4379..0f13c045b8d1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -248,14 +248,24 @@ out: return ERR_PTR(err); } -static struct kmem_cache * -do_kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int freeptr_offset, - unsigned int align, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) +/** + * __kmem_cache_create_args - Create a kmem cache + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @object_size: The size of objects to be created in this cache. + * @args: Arguments for the cache creation (see struct kmem_cache_args). + * @flags: See %SLAB_* flags for an explanation of individual @flags. + * + * Cannot be called within a interrupt, but can be interrupted. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache *__kmem_cache_create_args(const char *name, + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags) { struct kmem_cache *s = NULL; + unsigned int freeptr_offset = UINT_MAX; const char *cache_name; int err; @@ -275,7 +285,7 @@ do_kmem_cache_create_usercopy(const char *name, mutex_lock(&slab_mutex); - err = kmem_cache_sanity_check(name, size); + err = kmem_cache_sanity_check(name, object_size); if (err) { goto out_unlock; } @@ -296,12 +306,14 @@ do_kmem_cache_create_usercopy(const char *name, /* Fail closed on bad usersize of useroffset values. 
*/ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || - WARN_ON(!usersize && useroffset) || - WARN_ON(size < usersize || size - usersize < useroffset)) - usersize = useroffset = 0; + WARN_ON(!args->usersize && args->useroffset) || + WARN_ON(object_size < args->usersize || + object_size - args->usersize < args->useroffset)) + args->usersize = args->useroffset = 0; - if (!usersize) - s = __kmem_cache_alias(name, size, align, flags, ctor); + if (!args->usersize) + s = __kmem_cache_alias(name, object_size, args->align, flags, + args->ctor); if (s) goto out_unlock; @@ -311,9 +323,11 @@ do_kmem_cache_create_usercopy(const char *name, goto out_unlock; } - s = create_cache(cache_name, size, freeptr_offset, - calculate_alignment(flags, align, size), - flags, useroffset, usersize, ctor); + if (args->use_freeptr_offset) + freeptr_offset = args->freeptr_offset; + s = create_cache(cache_name, object_size, freeptr_offset, + calculate_alignment(flags, args->align, object_size), + flags, args->useroffset, args->usersize, args->ctor); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); @@ -335,6 +349,27 @@ out_unlock: } return s; } +EXPORT_SYMBOL(__kmem_cache_create_args); + +static struct kmem_cache * +do_kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int freeptr_offset, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .use_freeptr_offset = freeptr_offset != UINT_MAX, + .freeptr_offset = freeptr_offset, + .useroffset = useroffset, + .usersize = usersize, + .ctor = ctor, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} + /** * kmem_cache_create_usercopy - Create a cache with a region suitable From f6cd98c9407f8b8118464c633b27765e751e2750 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:46 +0200 Subject: [PATCH 19/34] slab: port kmem_cache_create() to struct kmem_cache_args Port kmem_cache_create() to struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 0f13c045b8d1..ac0832dac01e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -439,8 +439,12 @@ struct kmem_cache * kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { - return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, - 0, 0, ctor); + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); } EXPORT_SYMBOL(kmem_cache_create); From 9816c3c4e778b2477fda1966bf7047a2d9772d14 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:47 +0200 Subject: [PATCH 20/34] slab: port kmem_cache_create_rcu() to struct kmem_cache_args Port kmem_cache_create_rcu() to struct kmem_cache_args. 
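For illustration, the port boils down to the following equivalence (struct and field names hypothetical):

    /* old API */
    cachep = kmem_cache_create_rcu("foo", sizeof(struct foo),
                                   offsetof(struct foo, free_ptr), flags);

    /* what it now expands to internally */
    struct kmem_cache_args args = {
        .freeptr_offset     = offsetof(struct foo, free_ptr),
        .use_freeptr_offset = true,
    };

    cachep = __kmem_cache_create_args("foo", sizeof(struct foo), &args,
                                      flags | SLAB_TYPESAFE_BY_RCU);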
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index ac0832dac01e..da62ed30f95d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -481,9 +481,13 @@ struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, unsigned int freeptr_offset, slab_flags_t flags) { - return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0, - flags | SLAB_TYPESAFE_BY_RCU, 0, 0, - NULL); + struct kmem_cache_args kmem_args = { + .freeptr_offset = freeptr_offset, + .use_freeptr_offset = true, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, + flags | SLAB_TYPESAFE_BY_RCU); } EXPORT_SYMBOL(kmem_cache_create_rcu); From 1d3d7645d789c9ce8fec48b16c0040b4a3dc6604 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:48 +0200 Subject: [PATCH 21/34] slab: port kmem_cache_create_usercopy() to struct kmem_cache_args Port kmem_cache_create_usercopy() to struct kmem_cache_args and remove the now unused do_kmem_cache_create_usercopy() helper. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index da62ed30f95d..16c36a946135 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -351,26 +351,6 @@ out_unlock: } EXPORT_SYMBOL(__kmem_cache_create_args); -static struct kmem_cache * -do_kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int freeptr_offset, - unsigned int align, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) -{ - struct kmem_cache_args kmem_args = { - .align = align, - .use_freeptr_offset = freeptr_offset != UINT_MAX, - .freeptr_offset = freeptr_offset, - .useroffset = useroffset, - .usersize = usersize, - .ctor = ctor, - }; - - return __kmem_cache_create_args(name, size, &kmem_args, flags); -} - - /** * kmem_cache_create_usercopy - Create a cache with a region suitable * for copying to userspace @@ -405,8 +385,14 @@ kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { - return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, - useroffset, usersize, ctor); + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + .useroffset = useroffset, + .usersize = usersize, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); } EXPORT_SYMBOL(kmem_cache_create_usercopy); From 34410a906080e7f669330c45c8e38215fe09f481 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:49 +0200 Subject: [PATCH 22/34] slab: pass struct kmem_cache_args to create_cache() Pass struct kmem_cache_args to create_cache() so that we can later simplify further helpers. 
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 16c36a946135..9baa61c9c670 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -202,22 +202,22 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, } static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int freeptr_offset, - unsigned int align, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags) { struct kmem_cache *s; int err; - if (WARN_ON(useroffset + usersize > object_size)) - useroffset = usersize = 0; + if (WARN_ON(args->useroffset + args->usersize > object_size)) + args->useroffset = args->usersize = 0; /* If a custom freelist pointer is requested make sure it's sane. */ err = -EINVAL; - if (freeptr_offset != UINT_MAX && - (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || - !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t)))) + if (args->use_freeptr_offset && + (args->freeptr_offset >= object_size || + !(flags & SLAB_TYPESAFE_BY_RCU) || + !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)))) goto out; err = -ENOMEM; @@ -227,12 +227,15 @@ static struct kmem_cache *create_cache(const char *name, s->name = name; s->size = s->object_size = object_size; - s->rcu_freeptr_offset = freeptr_offset; - s->align = align; - s->ctor = ctor; + if (args->use_freeptr_offset) + s->rcu_freeptr_offset = args->freeptr_offset; + else + s->rcu_freeptr_offset = UINT_MAX; + s->align = args->align; + s->ctor = args->ctor; #ifdef CONFIG_HARDENED_USERCOPY - s->useroffset = useroffset; - s->usersize = usersize; + s->useroffset = args->useroffset; + s->usersize = args->usersize; #endif err = do_kmem_cache_create(s, flags); if (err) @@ -265,7 +268,6 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, slab_flags_t flags) { struct kmem_cache *s = NULL; - unsigned int freeptr_offset = UINT_MAX; const char *cache_name; int err; @@ -323,11 +325,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, goto out_unlock; } - if (args->use_freeptr_offset) - freeptr_offset = args->freeptr_offset; - s = create_cache(cache_name, object_size, freeptr_offset, - calculate_alignment(flags, args->align, object_size), - flags, args->useroffset, args->usersize, args->ctor); + args->align = calculate_alignment(flags, args->align, object_size); + s = create_cache(cache_name, object_size, args, flags); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); From fc0eac57d08cad87b478a3be944899c81c950d43 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:50 +0200 Subject: [PATCH 23/34] slab: pull kmem_cache_open() into do_kmem_cache_create() do_kmem_cache_create() is the only caller and we're going to pass down struct kmem_cache_args in a follow-up patch. 
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slub.c | 132 +++++++++++++++++++++++++----------------------------- 1 file changed, 62 insertions(+), 70 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 23d9d783ff26..30f4ca6335c7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5290,65 +5290,6 @@ static int calculate_sizes(struct kmem_cache *s) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) -{ - s->flags = kmem_cache_flags(flags, s->name); -#ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); -#endif - - if (!calculate_sizes(s)) - goto error; - if (disable_higher_order_debug) { - /* - * Disable debugging flags that store metadata if the min slab - * order increased. - */ - if (get_order(s->size) > get_order(s->object_size)) { - s->flags &= ~DEBUG_METADATA_FLAGS; - s->offset = 0; - if (!calculate_sizes(s)) - goto error; - } - } - -#ifdef system_has_freelist_aba - if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { - /* Enable fast mode */ - s->flags |= __CMPXCHG_DOUBLE; - } -#endif - - /* - * The larger the object size is, the more slabs we want on the partial - * list to avoid pounding the page allocator excessively. - */ - s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); - s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - - set_cpu_partial(s); - -#ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 1000; -#endif - - /* Initialize the pre-computed randomized freelist if slab is up */ - if (slab_state >= UP) { - if (init_cache_random_seq(s)) - goto error; - } - - if (!init_kmem_cache_nodes(s)) - goto error; - - if (alloc_kmem_cache_cpus(s)) - return 0; - -error: - __kmem_cache_release(s); - return -EINVAL; -} - static void list_slab_objects(struct kmem_cache *s, struct slab *slab, const char *text) { @@ -5904,26 +5845,77 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, int do_kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { - int err; + int err = -EINVAL; - err = kmem_cache_open(s, flags); - if (err) - return err; + s->flags = kmem_cache_flags(flags, s->name); +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif + + if (!calculate_sizes(s)) + goto out; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s)) + goto out; + } + } + +#ifdef system_has_freelist_aba + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; + } +#endif + + /* + * The larger the object size is, the more slabs we want on the partial + * list to avoid pounding the page allocator excessively. 
+ */ + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); + + set_cpu_partial(s); + +#ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 1000; +#endif + + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto out; + } + + if (!init_kmem_cache_nodes(s)) + goto out; + + if (!alloc_kmem_cache_cpus(s)) + goto out; /* Mutex is not taken during early boot */ - if (slab_state <= UP) - return 0; + if (slab_state <= UP) { + err = 0; + goto out; + } err = sysfs_slab_add(s); - if (err) { - __kmem_cache_release(s); - return err; - } + if (err) + goto out; if (s->flags & SLAB_STORE_USER) debugfs_slab_add(s); - return 0; +out: + if (err) + __kmem_cache_release(s); + return err; } #ifdef SLAB_SUPPORTS_SYSFS From 3dbe2bad5785e4a9f62d99186e7d31410a8f1e2d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:51 +0200 Subject: [PATCH 24/34] slab: pass struct kmem_cache_args to do_kmem_cache_create() and initialize most things in do_kmem_cache_create(). In a follow-up patch we'll remove rcu_freeptr_offset from struct kmem_cache. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab.h | 4 +++- mm/slab_common.c | 27 ++++++--------------------- mm/slub.c | 17 ++++++++++++++++- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 684bb48c4f39..c7a4e0fc3cf1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -424,7 +424,9 @@ kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ -int do_kmem_cache_create(struct kmem_cache *, slab_flags_t flags); +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags); void __init kmem_cache_init(void); extern void create_boot_cache(struct kmem_cache *, const char *name, diff --git a/mm/slab_common.c b/mm/slab_common.c index 9baa61c9c670..19ae3dd6e36f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -224,20 +224,7 @@ static struct kmem_cache *create_cache(const char *name, s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (!s) goto out; - - s->name = name; - s->size = s->object_size = object_size; - if (args->use_freeptr_offset) - s->rcu_freeptr_offset = args->freeptr_offset; - else - s->rcu_freeptr_offset = UINT_MAX; - s->align = args->align; - s->ctor = args->ctor; -#ifdef CONFIG_HARDENED_USERCOPY - s->useroffset = args->useroffset; - s->usersize = args->usersize; -#endif - err = do_kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, name, object_size, args, flags); if (err) goto out_free_cache; @@ -788,9 +775,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, { int err; unsigned int align = ARCH_KMALLOC_MINALIGN; - - s->name = name; - s->size = s->object_size = size; + struct kmem_cache_args kmem_args = {}; /* * kmalloc caches guarantee alignment of at least the largest @@ -799,14 +784,14 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, */ if (flags & SLAB_KMALLOC) align = max(align, 1U << (ffs(size) - 1)); - s->align = calculate_alignment(flags, align, size); + kmem_args.align = calculate_alignment(flags, align, size); #ifdef 
CONFIG_HARDENED_USERCOPY - s->useroffset = useroffset; - s->usersize = usersize; + kmem_args.useroffset = useroffset; + kmem_args.usersize = usersize; #endif - err = do_kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, name, size, &kmem_args, flags); if (err) panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", diff --git a/mm/slub.c b/mm/slub.c index 30f4ca6335c7..4719b60215b8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5843,14 +5843,29 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, return s; } -int do_kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags) { int err = -EINVAL; + s->name = name; + s->size = s->object_size = size; + s->flags = kmem_cache_flags(flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif + if (args->use_freeptr_offset) + s->rcu_freeptr_offset = args->freeptr_offset; + else + s->rcu_freeptr_offset = UINT_MAX; + s->align = args->align; + s->ctor = args->ctor; +#ifdef CONFIG_HARDENED_USERCOPY + s->useroffset = args->useroffset; + s->usersize = args->usersize; +#endif if (!calculate_sizes(s)) goto out; From dacf472bcdfa4f3b08cd2c1beef034e3e5ab4bd0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:52 +0200 Subject: [PATCH 25/34] slab: remove rcu_freeptr_offset from struct kmem_cache Pass down struct kmem_cache_args to calculate_sizes() so we can use args->{use}_freeptr_offset directly. This allows us to remove ->rcu_freeptr_offset from struct kmem_cache. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab.h | 2 -- mm/slub.c | 25 +++++++------------------ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index c7a4e0fc3cf1..36ac38e21fcb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -261,8 +261,6 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ - /* Specific free pointer requested (if not UINT_MAX) */ - unsigned int rcu_freeptr_offset; #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; diff --git a/mm/slub.c b/mm/slub.c index 4719b60215b8..a23c7036cd61 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3916,8 +3916,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. * - * Note that we also wipe custom freelist pointers specified via - * s->rcu_freeptr_offset. + * Note that we also wipe custom freelist pointers. */ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) @@ -5141,17 +5140,11 @@ static void set_cpu_partial(struct kmem_cache *s) #endif } -/* Was a valid freeptr offset requested? */ -static inline bool has_freeptr_offset(const struct kmem_cache *s) -{ - return s->rcu_freeptr_offset != UINT_MAX; -} - /* * calculate_sizes() determines the order and the distribution of data within * a slab object. 
*/ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; @@ -5192,7 +5185,7 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) || + if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || (flags & SLAB_POISON) || s->ctor || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { @@ -5214,8 +5207,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); - } else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) { - s->offset = s->rcu_freeptr_offset; + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + s->offset = args->freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep @@ -5856,10 +5849,6 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif - if (args->use_freeptr_offset) - s->rcu_freeptr_offset = args->freeptr_offset; - else - s->rcu_freeptr_offset = UINT_MAX; s->align = args->align; s->ctor = args->ctor; #ifdef CONFIG_HARDENED_USERCOPY @@ -5867,7 +5856,7 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, s->usersize = args->usersize; #endif - if (!calculate_sizes(s)) + if (!calculate_sizes(args, s)) goto out; if (disable_higher_order_debug) { /* @@ -5877,7 +5866,7 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (get_order(s->size) > get_order(s->object_size)) { s->flags &= ~DEBUG_METADATA_FLAGS; s->offset = 0; - if (!calculate_sizes(s)) + if (!calculate_sizes(args, s)) goto out; } } From 052d67b46bcd91b6785e8e6e047241814f6142a3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:53 +0200 Subject: [PATCH 26/34] slab: port KMEM_CACHE() to struct kmem_cache_args Make KMEM_CACHE() use struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 2b8eeca7fd2c..1f38d94387cc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -284,9 +284,11 @@ int kmem_cache_shrink(struct kmem_cache *s); * f.e. add ____cacheline_aligned_in_smp to the struct declaration * then the objects will be properly aligned in SMP configurations. */ -#define KMEM_CACHE(__struct, __flags) \ - kmem_cache_create(#__struct, sizeof(struct __struct), \ - __alignof__(struct __struct), (__flags), NULL) +#define KMEM_CACHE(__struct, __flags) \ + __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ + &(struct kmem_cache_args) { \ + .align = __alignof__(struct __struct), \ + }, (__flags)) /* * To whitelist a single field for copying to/from usercopy, use this From 199cd13a745eb44fb4828bca373155293cdcfa5c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:54 +0200 Subject: [PATCH 27/34] slab: port KMEM_CACHE_USERCOPY() to struct kmem_cache_args Make KMEM_CACHE_USERCOPY() use struct kmem_cache_args. 
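Callers of the macro are unaffected; a hypothetical user whitelisting a single field for usercopy still looks like this:

    struct msg {
        spinlock_t lock;
        char payload[64];   /* only this field is copied to/from userspace */
    };

    msg_cachep = KMEM_CACHE_USERCOPY(msg, SLAB_ACCOUNT, payload);

The plain KMEM_CACHE(msg, SLAB_ACCOUNT) variant converted in the previous patch behaves the same way, just without the usercopy region.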
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 1f38d94387cc..7e15a3a3edb1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -294,12 +294,13 @@ int kmem_cache_shrink(struct kmem_cache *s); * To whitelist a single field for copying to/from usercopy, use this * macro instead for KMEM_CACHE() above. */ -#define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ - kmem_cache_create_usercopy(#__struct, \ - sizeof(struct __struct), \ - __alignof__(struct __struct), (__flags), \ - offsetof(struct __struct, __field), \ - sizeof_field(struct __struct, __field), NULL) +#define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ + __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ + &(struct kmem_cache_args) { \ + .align = __alignof__(struct __struct), \ + .useroffset = offsetof(struct __struct, __field), \ + .usersize = sizeof_field(struct __struct, __field), \ + }, (__flags)) /* * Common kmalloc functions provided by all allocators From b2e7456b5c25c41eda7a8a15f7ccaa4e7579949f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:55 +0200 Subject: [PATCH 28/34] slab: create kmem_cache_create() compatibility layer Use _Generic() to create a compatibility layer that type switches on the third argument to either call __kmem_cache_create() or __kmem_cache_create_args(). If NULL is passed for the struct kmem_cache_args argument use default args making porting for callers that don't care about additional arguments easy. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 29 ++++++++++++++++++++++++++--- mm/slab_common.c | 10 +++++----- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 7e15a3a3edb1..15eb0a0defd6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,9 +262,10 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, unsigned int object_size, struct kmem_cache_args *args, slab_flags_t flags); -struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - void (*ctor)(void *)); + +struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + void (*ctor)(void *)); struct kmem_cache *kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, @@ -273,6 +274,28 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, unsigned int freeptr_offset, slab_flags_t flags); + +/* If NULL is passed for @args, use this variant with default arguments. */ +static inline struct kmem_cache * +__kmem_cache_default_args(const char *name, unsigned int size, + struct kmem_cache_args *args, + slab_flags_t flags) +{ + struct kmem_cache_args kmem_default_args = {}; + + /* Make sure we don't get passed garbage. 
*/ + if (WARN_ON_ONCE(args)) + return ERR_PTR(-EINVAL); + + return __kmem_cache_create_args(name, size, &kmem_default_args, flags); +} + +#define kmem_cache_create(__name, __object_size, __args, ...) \ + _Generic((__args), \ + struct kmem_cache_args *: __kmem_cache_create_args, \ + void *: __kmem_cache_default_args, \ + default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) + void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); diff --git a/mm/slab_common.c b/mm/slab_common.c index 19ae3dd6e36f..418459927670 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -383,7 +383,7 @@ kmem_cache_create_usercopy(const char *name, unsigned int size, EXPORT_SYMBOL(kmem_cache_create_usercopy); /** - * kmem_cache_create - Create a cache. + * __kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. @@ -407,9 +407,9 @@ EXPORT_SYMBOL(kmem_cache_create_usercopy); * * Return: a pointer to the cache on success, NULL on failure. */ -struct kmem_cache * -kmem_cache_create(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)) +struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + void (*ctor)(void *)) { struct kmem_cache_args kmem_args = { .align = align, @@ -418,7 +418,7 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, return __kmem_cache_create_args(name, size, &kmem_args, flags); } -EXPORT_SYMBOL(kmem_cache_create); +EXPORT_SYMBOL(__kmem_cache_create); /** * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. From 5f7d25668217bb7841bc5784d2185618a01e336b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:56 +0200 Subject: [PATCH 29/34] file: port to struct kmem_cache_args Port filp_cache to struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- fs/file_table.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 3ef558f27a1c..861c03608e83 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -511,9 +511,14 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { - filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file), - offsetof(struct file, f_freeptr), - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct file, f_freeptr), + }; + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } From 3d453e60f1a9c377d670d5fe306d3b9eed477bc0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:57 +0200 Subject: [PATCH 30/34] slab: remove kmem_cache_create_rcu() Now that we have ported all users of kmem_cache_create_rcu() to struct kmem_cache_args the function is unused and can be removed. 
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 3 --- mm/slab_common.c | 43 ------------------------------------------- 2 files changed, 46 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 15eb0a0defd6..cb82a6414a28 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -271,9 +271,6 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); -struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, - unsigned int freeptr_offset, - slab_flags_t flags); /* If NULL is passed for @args, use this variant with default arguments. */ static inline struct kmem_cache * diff --git a/mm/slab_common.c b/mm/slab_common.c index 418459927670..9133b9fafcb1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -420,49 +420,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, } EXPORT_SYMBOL(__kmem_cache_create); -/** - * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @freeptr_offset: The offset into the memory to the free pointer - * @flags: SLAB flags - * - * Cannot be called within an interrupt, but can be interrupted. - * - * See kmem_cache_create() for an explanation of possible @flags. - * - * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside - * of the object. This might cause the object to grow in size. Callers - * that have a reason to avoid this can specify a custom free pointer - * offset in their struct where the free pointer will be placed. - * - * Note that placing the free pointer inside the object requires the - * caller to ensure that no fields are invalidated that are required to - * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for - * details.). - * - * Using zero as a value for @freeptr_offset is valid. To request no - * offset UINT_MAX must be specified. - * - * Note that @ctor isn't supported with custom free pointers as a @ctor - * requires an external free pointer. - * - * Return: a pointer to the cache on success, NULL on failure. - */ -struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, - unsigned int freeptr_offset, - slab_flags_t flags) -{ - struct kmem_cache_args kmem_args = { - .freeptr_offset = freeptr_offset, - .use_freeptr_offset = true, - }; - - return __kmem_cache_create_args(name, size, &kmem_args, - flags | SLAB_TYPESAFE_BY_RCU); -} -EXPORT_SYMBOL(kmem_cache_create_rcu); - static struct kmem_cache *kmem_buckets_cache __ro_after_init; /** From 0c9050b09cfb1ddb3fe4b2d401dca1fb674c825a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:58 +0200 Subject: [PATCH 31/34] slab: make kmem_cache_create_usercopy() static inline Make kmem_cache_create_usercopy() a static inline function. 
Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 49 +++++++++++++++++++++++++++++++++++++++----- mm/slab_common.c | 45 ---------------------------------------- 2 files changed, 44 insertions(+), 50 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index cb82a6414a28..634717c9f73b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -266,11 +266,50 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -struct kmem_cache *kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int align, - slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)); + +/** + * kmem_cache_create_usercopy - Create a cache with a region suitable + * for copying to userspace + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @ctor: A constructor for the objects. + * + * Cannot be called within a interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +static inline struct kmem_cache * +kmem_cache_create_usercopy(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + .useroffset = useroffset, + .usersize = usersize, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} /* If NULL is passed for @args, use this variant with default arguments. */ static inline struct kmem_cache * diff --git a/mm/slab_common.c b/mm/slab_common.c index 9133b9fafcb1..3477a3918afd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -337,51 +337,6 @@ out_unlock: } EXPORT_SYMBOL(__kmem_cache_create_args); -/** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. 
- * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - * - * Return: a pointer to the cache on success, NULL on failure. - */ -struct kmem_cache * -kmem_cache_create_usercopy(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) -{ - struct kmem_cache_args kmem_args = { - .align = align, - .ctor = ctor, - .useroffset = useroffset, - .usersize = usersize, - }; - - return __kmem_cache_create_args(name, size, &kmem_args, flags); -} -EXPORT_SYMBOL(kmem_cache_create_usercopy); - /** * __kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. From 781aee755638dbb6dbfe022bdd2f732621ec9534 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:59 +0200 Subject: [PATCH 32/34] slab: make __kmem_cache_create() static inline Make __kmem_cache_create() a static inline function. Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 13 ++++++++++--- mm/slab_common.c | 38 -------------------------------------- 2 files changed, 10 insertions(+), 41 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 634717c9f73b..331412a9f4f2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,10 +262,17 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, unsigned int object_size, struct kmem_cache_args *args, slab_flags_t flags); +static inline struct kmem_cache * +__kmem_cache_create(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + }; -struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - void (*ctor)(void *)); + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} /** * kmem_cache_create_usercopy - Create a cache with a region suitable diff --git a/mm/slab_common.c b/mm/slab_common.c index 3477a3918afd..30000dcf0736 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -337,44 +337,6 @@ out_unlock: } EXPORT_SYMBOL(__kmem_cache_create_args); -/** - * __kmem_cache_create - Create a cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - * - * Return: a pointer to the cache on success, NULL on failure. 
- */ -struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - void (*ctor)(void *)) -{ - struct kmem_cache_args kmem_args = { - .align = align, - .ctor = ctor, - }; - - return __kmem_cache_create_args(name, size, &kmem_args, flags); -} -EXPORT_SYMBOL(__kmem_cache_create); - static struct kmem_cache *kmem_buckets_cache __ro_after_init; /** From a6711d1cd4e291f334ae900065e18f585732acfa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:57:00 +0200 Subject: [PATCH 33/34] io_uring: port to struct kmem_cache_args Port req_cachep to struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Signed-off-by: Vlastimil Babka --- io_uring/io_uring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3942db160f18..d9d721d1424e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3638,6 +3638,11 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, static int __init io_uring_init(void) { + struct kmem_cache_args kmem_args = { + .useroffset = offsetof(struct io_kiocb, cmd.data), + .usersize = sizeof_field(struct io_kiocb, cmd.data), + }; + #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ @@ -3722,12 +3727,9 @@ static int __init io_uring_init(void) * range, and HARDENED_USERCOPY will complain if we haven't * correctly annotated this range. */ - req_cachep = kmem_cache_create_usercopy("io_kiocb", - sizeof(struct io_kiocb), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, - offsetof(struct io_kiocb, cmd.data), - sizeof_field(struct io_kiocb, cmd.data), NULL); + req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU); io_buf_cachep = KMEM_CACHE(io_buffer, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); From 4b7ff9ab98af11a477d50f08382bcc4c2f899926 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 13 Sep 2024 10:15:56 +0200 Subject: [PATCH 34/34] mm, slab: restore kerneldoc for kmem_cache_create() As kmem_cache_create() became a _Generic() wrapper macro, it currently has no kerneldoc despite being the main API to use. Add it. Also adjust kmem_cache_create_usercopy() kerneldoc to indicate it is now a legacy wrapper. Also expand the kerneldoc for struct kmem_cache_args, especially for the freeptr_offset field, where important details were removed with the removal of kmem_cache_create_rcu(). Signed-off-by: Vlastimil Babka Reviewed-by: Christian Brauner --- include/linux/slab.h | 114 ++++++++++++++++++++++++++++++++++--------- mm/slab_common.c | 10 ++-- 2 files changed, 98 insertions(+), 26 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 331412a9f4f2..6a8ab7ef3af7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -242,19 +242,72 @@ bool slab_is_available(void); /** * struct kmem_cache_args - Less common arguments for kmem_cache_create() - * @align: The required alignment for the objects. - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @freeptr_offset: Custom offset for the free pointer in RCU caches - * @use_freeptr_offset: Whether a @freeptr_offset is used - * @ctor: A constructor for the objects. 
+ * + * Any uninitialized fields of the structure are interpreted as unused. The + * exception is @freeptr_offset where %0 is a valid value, so + * @use_freeptr_offset must be also set to %true in order to interpret the field + * as used. For @useroffset %0 is also valid, but only with non-%0 + * @usersize. + * + * When %NULL args is passed to kmem_cache_create(), it is equivalent to all + * fields unused. */ struct kmem_cache_args { + /** + * @align: The required alignment for the objects. + * + * %0 means no specific alignment is requested. + */ unsigned int align; + /** + * @useroffset: Usercopy region offset. + * + * %0 is a valid offset, when @usersize is non-%0 + */ unsigned int useroffset; + /** + * @usersize: Usercopy region size. + * + * %0 means no usercopy region is specified. + */ unsigned int usersize; + /** + * @freeptr_offset: Custom offset for the free pointer + * in &SLAB_TYPESAFE_BY_RCU caches + * + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer + * outside of the object. This might cause the object to grow in size. + * Cache creators that have a reason to avoid this can specify a custom + * free pointer offset in their struct where the free pointer will be + * placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for + * details). + * + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset + * is specified, %use_freeptr_offset must be set %true. + * + * Note that @ctor currently isn't supported with custom free pointers + * as a @ctor requires an external free pointer. + */ unsigned int freeptr_offset; + /** + * @use_freeptr_offset: Whether a @freeptr_offset is used. + */ bool use_freeptr_offset; + /** + * @ctor: A constructor for the objects. + * + * The constructor is invoked for each object in a newly allocated slab + * page. It is the cache user's responsibility to free object in the + * same state as after calling the constructor, or deal appropriately + * with any differences between a freshly constructed and a reallocated + * object. + * + * %NULL means no constructor. + */ void (*ctor)(void *); }; @@ -275,30 +328,20 @@ __kmem_cache_create(const char *name, unsigned int size, unsigned int align, } /** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace + * kmem_cache_create_usercopy - Create a kmem cache with a region suitable + * for copying to userspace. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. * @flags: SLAB flags * @useroffset: Usercopy region offset * @usersize: Usercopy region size - * @ctor: A constructor for the objects. + * @ctor: A constructor for the objects, or %NULL. * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. 
+ * This is a legacy wrapper, new code should use either KMEM_CACHE_USERCOPY() + * if whitelisting a single field is sufficient, or kmem_cache_create() with + * the necessary parameters passed via the args parameter (see + * &struct kmem_cache_args) * * Return: a pointer to the cache on success, NULL on failure. */ @@ -333,6 +376,31 @@ __kmem_cache_default_args(const char *name, unsigned int size, return __kmem_cache_create_args(name, size, &kmem_default_args, flags); } +/** + * kmem_cache_create - Create a kmem cache. + * @__name: A string which is used in /proc/slabinfo to identify this cache. + * @__object_size: The size of objects to be created in this cache. + * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL + * means defaults will be used for all the arguments. + * + * This is currently implemented as a macro using ``_Generic()`` to call + * either the new variant of the function, or a legacy one. + * + * The new variant has 4 parameters: + * ``kmem_cache_create(name, object_size, args, flags)`` + * + * See __kmem_cache_create_args() which implements this. + * + * The legacy variant has 5 parameters: + * ``kmem_cache_create(name, object_size, align, flags, ctor)`` + * + * The align and ctor parameters map to the respective fields of + * &struct kmem_cache_args + * + * Context: Cannot be called within a interrupt, but can be interrupted. + * + * Return: a pointer to the cache on success, NULL on failure. + */ #define kmem_cache_create(__name, __object_size, __args, ...) \ _Generic((__args), \ struct kmem_cache_args *: __kmem_cache_create_args, \ diff --git a/mm/slab_common.c b/mm/slab_common.c index 30000dcf0736..86c2e6f4a1ce 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -239,13 +239,17 @@ out: } /** - * __kmem_cache_create_args - Create a kmem cache + * __kmem_cache_create_args - Create a kmem cache. * @name: A string which is used in /proc/slabinfo to identify this cache. * @object_size: The size of objects to be created in this cache. - * @args: Arguments for the cache creation (see struct kmem_cache_args). + * @args: Additional arguments for the cache creation (see + * &struct kmem_cache_args). * @flags: See %SLAB_* flags for an explanation of individual @flags. * - * Cannot be called within a interrupt, but can be interrupted. + * Not to be called directly, use the kmem_cache_create() wrapper with the same + * parameters. + * + * Context: Cannot be called within a interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure. */
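To summarize the resulting interface, a rough usage sketch of the three calling conventions accepted by the kmem_cache_create() macro (all cache, struct and constructor names here are hypothetical):

    /* new form: optional arguments via struct kmem_cache_args */
    struct kmem_cache_args args = {
        .align = __alignof__(struct foo),
        .ctor  = foo_ctor,
    };
    foo_cachep = kmem_cache_create("foo", sizeof(struct foo), &args,
                                   SLAB_ACCOUNT);

    /* defaults for all optional arguments: pass NULL for args */
    bar_cachep = kmem_cache_create("bar", sizeof(struct bar), NULL, 0);

    /* legacy 5-argument form, still accepted via the _Generic() dispatch */
    baz_cachep = kmem_cache_create("baz", sizeof(struct baz),
                                   __alignof__(struct baz), 0, NULL);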