mm/vmalloc: hugepage vmalloc mappings

Support huge page vmalloc mappings.  Config option HAVE_ARCH_HUGE_VMALLOC
enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
supports PMD sized vmap mappings.

vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or
larger, and fall back to small pages if that was unsuccessful.

Architectures must ensure that any arch specific vmalloc allocations that
require PAGE_SIZE mappings (e.g., module allocations vs strict module rwx)
use the VM_NOHUGE flag to inhibit larger mappings.

This can result in more internal fragmentation and memory overhead for a
given allocation, an option nohugevmalloc is added to disable at boot.

[colin.king@canonical.com: fix read of uninitialized pointer area]
  Link: https://lkml.kernel.org/r/20210318155955.18220-1-colin.king@canonical.com

Link: https://lkml.kernel.org/r/20210317062402.533919-14-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Ding Tianhong <dingtianhong@huawei.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Nicholas Piggin 2021-04-29 22:58:49 -07:00 committed by Linus Torvalds
parent 5d87510de1
commit 121e6f3258
4 changed files with 210 additions and 49 deletions

View File

@ -829,6 +829,17 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
config HAVE_ARCH_HUGE_VMAP
bool
#
# Archs that select this would be capable of PMD-sized vmaps (i.e.,
# arch_vmap_pmd_supported() returns true), and they must make no assumptions
# that vmalloc memory is mapped with PAGE_SIZE ptes. The VM_NO_HUGE_VMAP flag
# can be used to prohibit arch-specific allocations from using hugepages to
# help with this (e.g., modules may require it).
#
config HAVE_ARCH_HUGE_VMALLOC
depends on HAVE_ARCH_HUGE_VMAP
bool
config ARCH_WANT_HUGE_PMD_SHARE
bool

View File

@ -26,6 +26,7 @@ struct notifier_block; /* in notifier.h */
#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */
#define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */
#define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */
#define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */
/*
* VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.
@ -54,6 +55,9 @@ struct vm_struct {
unsigned long size;
unsigned long flags;
struct page **pages;
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
unsigned int page_order;
#endif
unsigned int nr_pages;
phys_addr_t phys_addr;
const void *caller;
@ -188,6 +192,22 @@ void free_vm_area(struct vm_struct *area);
extern struct vm_struct *remove_vm_area(const void *addr);
extern struct vm_struct *find_vm_area(const void *addr);
static inline bool is_vm_area_hugepages(const void *addr)
{
/*
* This may not 100% tell if the area is mapped with > PAGE_SIZE
* page table entries, if for some reason the architecture indicates
* larger sizes are available but decides not to use them, nothing
* prevents that. This only indicates the size of the physical page
* allocated in the vmalloc layer.
*/
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
return find_vm_area(addr)->page_order > 0;
#else
return false;
#endif
}
#ifdef CONFIG_MMU
int vmap_range(unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
@ -205,6 +225,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
if (vm)
vm->flags |= VM_FLUSH_RESET_PERMS;
}
#else
static inline int
map_kernel_range_noflush(unsigned long start, unsigned long size,

View File

@ -72,6 +72,7 @@
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
#include <linux/vmalloc.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@ -8222,6 +8223,7 @@ void *__init alloc_large_system_hash(const char *tablename,
void *table = NULL;
gfp_t gfp_flags;
bool virt;
bool huge;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@ -8289,6 +8291,7 @@ void *__init alloc_large_system_hash(const char *tablename,
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
huge = is_vm_area_hugepages(table);
} else {
/*
* If bucketsize is not a power-of-two, we may free
@ -8305,7 +8308,7 @@ void *__init alloc_large_system_hash(const char *tablename,
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
virt ? "vmalloc" : "linear");
virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
*_hash_shift = log2qty;

View File

@ -42,6 +42,19 @@
#include "internal.h"
#include "pgalloc-track.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;
static int __init set_nohugevmalloc(char *str)
{
vmap_allow_huge = false;
return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
bool is_vmalloc_addr(const void *x)
{
unsigned long addr = (unsigned long)x;
@ -483,6 +496,69 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
return 0;
}
static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
pgd_t *pgd;
unsigned long next;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
return 0;
}
static int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
WARN_ON(page_shift < PAGE_SHIFT);
if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
page_shift == PAGE_SHIFT)
return vmap_small_pages_range_noflush(addr, end, prot, pages);
for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
int err;
err = vmap_range_noflush(addr, addr + (1UL << page_shift),
__pa(page_address(pages[i])), prot,
page_shift);
if (err)
return err;
addr += 1UL << page_shift;
}
return 0;
}
static int vmap_pages_range(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
int err;
err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
flush_cache_vmap(addr, end);
return err;
}
/**
* map_kernel_range_noflush - map kernel VM area with the specified pages
* @addr: start of the VM area to map
@ -504,29 +580,7 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
pgprot_t prot, struct page **pages)
{
unsigned long start = addr;
unsigned long end = addr + size;
unsigned long next;
pgd_t *pgd;
int err = 0;
int nr = 0;
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_bad(*pgd))
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
return 0;
return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT);
}
int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
@ -2112,6 +2166,24 @@ EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
return vm->page_order;
#else
return 0;
#endif
}
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
vm->page_order = order;
#else
BUG_ON(order != 0);
#endif
}
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@ -2422,6 +2494,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
{
int i;
/* HUGE_VMALLOC passes small pages to set_direct_map */
for (i = 0; i < area->nr_pages; i++)
if (page_address(area->pages[i]))
set_direct_map(area->pages[i]);
@ -2431,6 +2504,7 @@ static inline void set_area_direct_map(const struct vm_struct *area,
static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
{
unsigned long start = ULONG_MAX, end = 0;
unsigned int page_order = vm_area_page_order(area);
int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
int flush_dmap = 0;
int i;
@ -2455,11 +2529,14 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
* map. Find the start and end range of the direct mappings to make sure
* the vm_unmap_aliases() flush includes the direct map.
*/
for (i = 0; i < area->nr_pages; i++) {
for (i = 0; i < area->nr_pages; i += 1U << page_order) {
unsigned long addr = (unsigned long)page_address(area->pages[i]);
if (addr) {
unsigned long page_size;
page_size = PAGE_SIZE << page_order;
start = min(addr, start);
end = max(addr + PAGE_SIZE, end);
end = max(addr + page_size, end);
flush_dmap = 1;
}
}
@ -2500,13 +2577,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
unsigned int page_order = vm_area_page_order(area);
int i;
for (i = 0; i < area->nr_pages; i++) {
for (i = 0; i < area->nr_pages; i += 1U << page_order) {
struct page *page = area->pages[i];
BUG_ON(!page);
__free_pages(page, 0);
__free_pages(page, page_order);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@ -2697,15 +2775,19 @@ EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
pgprot_t prot, unsigned int page_shift,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
unsigned int i;
unsigned int nr_small_pages = size >> PAGE_SHIFT;
unsigned int page_order;
struct page **pages;
unsigned int i;
array_size = (unsigned long)nr_pages * sizeof(struct page *);
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
@ -2724,30 +2806,38 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
area->pages = pages;
area->nr_pages = nr_pages;
area->nr_pages = nr_small_pages;
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
for (i = 0; i < area->nr_pages; i++) {
page_order = vm_area_page_order(area);
/*
* Careful, we allocate and map page_order pages, but tracking is done
* per PAGE_SIZE page so as to keep the vm_struct APIs independent of
* the physical/mapped size.
*/
for (i = 0; i < area->nr_pages; i += 1U << page_order) {
struct page *page;
int p;
if (node == NUMA_NO_NODE)
page = alloc_page(gfp_mask);
else
page = alloc_pages_node(node, gfp_mask, 0);
/* Compound pages required for remap_vmalloc_page */
page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
}
area->pages[i] = page;
for (p = 0; p < (1U << page_order); p++)
area->pages[i + p] = page + p;
if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
prot, pages) < 0)
if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0)
goto fail;
return area->addr;
@ -2755,7 +2845,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
fail:
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
(area->nr_pages*PAGE_SIZE), size);
__vfree(area->addr);
return NULL;
}
@ -2786,19 +2876,45 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
struct vm_struct *area;
void *addr;
unsigned long real_size = size;
unsigned long real_align = align;
unsigned int shift = PAGE_SHIFT;
size = PAGE_ALIGN(size);
if (!size || (size >> PAGE_SHIFT) > totalram_pages())
if (!size || (size >> PAGE_SHIFT) > totalram_pages()) {
area = NULL;
goto fail;
}
area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED |
if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
arch_vmap_pmd_supported(prot)) {
unsigned long size_per_node;
/*
* Try huge pages. Only try for PAGE_KERNEL allocations,
* others like modules don't yet expect huge pages in
* their allocations due to apply_to_page_range not
* supporting them.
*/
size_per_node = size;
if (node == NUMA_NO_NODE)
size_per_node /= num_online_nodes();
if (size_per_node >= PMD_SIZE) {
shift = PMD_SHIFT;
align = max(real_align, 1UL << shift);
size = ALIGN(real_size, 1UL << shift);
}
}
again:
size = PAGE_ALIGN(size);
area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
vm_flags, start, end, node, gfp_mask, caller);
if (!area)
goto fail;
addr = __vmalloc_area_node(area, gfp_mask, prot, node);
addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
if (!addr)
return NULL;
goto fail;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@ -2812,8 +2928,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return addr;
fail:
warn_alloc(gfp_mask, NULL,
if (shift > PAGE_SHIFT) {
shift = PAGE_SHIFT;
align = real_align;
size = real_size;
goto again;
}
if (!area) {
/* Warn for area allocation, page allocations already warn */
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure: %lu bytes", real_size);
}
return NULL;
}