diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 85598f7da407..1602373e539c 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -64,6 +64,7 @@ SECTIONS __initramfs_end = .; #endif . = ALIGN(4096); + __per_cpu_load = .; __per_cpu_start = .; *(.data.percpu.page_aligned) *(.data.percpu) diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index f45e4e508eca..3765efc5f963 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -213,17 +213,9 @@ SECTIONS { *(.data.cacheline_aligned) } /* Per-cpu data: */ - percpu : { } :percpu . = ALIGN(PERCPU_PAGE_SIZE); - __phys_per_cpu_start = .; - .data.percpu PERCPU_ADDR : AT(__phys_per_cpu_start - LOAD_OFFSET) - { - __per_cpu_start = .; - *(.data.percpu.page_aligned) - *(.data.percpu) - *(.data.percpu.shared_aligned) - __per_cpu_end = .; - } + PERCPU_VADDR(PERCPU_ADDR, :percpu) + __phys_per_cpu_start = __per_cpu_load; . = __phys_per_cpu_start + PERCPU_PAGE_SIZE; /* ensure percpu data fits * into percpu page size */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 295ccc5e86b1..67f07f453385 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -181,14 +181,7 @@ SECTIONS __initramfs_end = .; } #endif - . = ALIGN(PAGE_SIZE); - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { - __per_cpu_start = .; - *(.data.percpu.page_aligned) - *(.data.percpu) - *(.data.percpu.shared_aligned) - __per_cpu_end = .; - } + PERCPU(PAGE_SIZE) . = ALIGN(8); .machine.desc : AT(ADDR(.machine.desc) - LOAD_OFFSET) { diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 8f1d2fbec1d4..aee103b26d01 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -43,14 +43,6 @@ #else /* ...!ASSEMBLY */ #include -#include - -#define __addr_to_pcpu_ptr(addr) \ - (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ - + (unsigned long)__per_cpu_start) -#define __pcpu_ptr_to_addr(ptr) \ - (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ - - (unsigned long)__per_cpu_start) #ifdef CONFIG_SMP #define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index efa615f2bf43..400331b50a53 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -233,8 +233,8 @@ proceed: "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, dyn_size, vm.addr, NULL); + PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + PMD_SIZE, vm.addr, NULL); goto out_free_ar; enomem: @@ -257,31 +257,13 @@ static ssize_t __init setup_pcpu_remap(size_t static_size) * Embedding allocator * * The first chunk is sized to just contain the static area plus - * module and dynamic reserves, and allocated as a contiguous area - * using bootmem allocator and used as-is without being mapped into - * vmalloc area. This enables the first chunk to piggy back on the - * linear physical PMD mapping and doesn't add any additional pressure - * to TLB. Note that if the needed size is smaller than the minimum - * unit size, the leftover is returned to the bootmem allocator. + * module and dynamic reserves and embedded into linear physical + * mapping so that it can use PMD mapping without additional TLB + * pressure. */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpue_size) - return NULL; - - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); -} - static ssize_t __init setup_pcpu_embed(size_t static_size) { - unsigned int cpu; - size_t dyn_size; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; /* * If large page isn't supported, there's no benefit in doing @@ -291,33 +273,8 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) if (!cpu_has_pse || pcpu_need_numa()) return -EINVAL; - /* allocate and copy */ - pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE); - if (!pcpue_ptr) - return -ENOMEM; - - for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); - } - - /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, - pcpue_unit_size, dyn_size, - pcpue_ptr, NULL); + return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); } /* @@ -375,8 +332,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pcpu4k_nr_static_pages, static_size); ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL, - pcpu4k_populate_pte); + PERCPU_FIRST_CHUNK_RESERVE, -1, + -1, NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 54a968b4b924..ee5615d65211 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -107,10 +107,14 @@ typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t unit_size, ssize_t dyn_size, + ssize_t dyn_size, ssize_t unit_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn); +extern ssize_t __init pcpu_embed_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/kernel/sched.c b/kernel/sched.c index 0a76d0b6f215..61e63562f273 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9599,10 +9599,11 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) cpu = task_cpu(tsk); ca = task_ca(tsk); - for (; ca; ca = ca->parent) { + do { u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; - } + ca = ca->parent; + } while (ca); } struct cgroup_subsys cpuacct_subsys = { diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 3653c570232b..1882923bc706 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -120,7 +120,7 @@ void *__alloc_percpu(size_t size, size_t align) * on it. Larger alignment should only be used for module * percpu sections on SMP for which this path isn't used. */ - WARN_ON_ONCE(align > __alignof__(unsigned long long)); + WARN_ON_ONCE(align > SMP_CACHE_BYTES); if (unlikely(!pdata)) return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index bfe6a3afaf45..1aa5d8fbca12 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -46,7 +46,8 @@ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate - * regular address to percpu pointer and back + * regular address to percpu pointer and back if they need to be + * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area @@ -67,11 +68,24 @@ #include #include +#include #include #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ +#ifndef __addr_to_pcpu_ptr +#define __addr_to_pcpu_ptr(addr) \ + (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ + + (unsigned long)__per_cpu_start) +#endif +#ifndef __pcpu_ptr_to_addr +#define __pcpu_ptr_to_addr(ptr) \ + (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ + - (unsigned long)__per_cpu_start) +#endif + struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ struct rb_node rb_node; /* key is chunk->vm->addr */ @@ -1013,8 +1027,8 @@ EXPORT_SYMBOL_GPL(free_percpu); * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @base_addr: mapped address, NULL for auto * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary * @@ -1039,14 +1053,14 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * + * @dyn_size, if non-negative, determines the number of bytes + * available for dynamic allocation in the first chunk. Specifying + * non-negative value makes percpu leave alone the area beyond + * @static_size + @reserved_size + @dyn_size. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + @dyn_size. - * - * @dyn_size, if non-negative, limits the number of bytes available - * for dynamic allocation in the first chunk. Specifying non-negative - * value make percpu leave alone the area beyond @static_size + - * @reserved_size + @dyn_size. + * @reserved_size + if non-negative, @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -1069,12 +1083,14 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t unit_size, ssize_t dyn_size, + ssize_t dyn_size, ssize_t unit_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; + size_t size_sum = static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; @@ -1085,20 +1101,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); + BUG_ON(unit_size < size_sum); BUG_ON(unit_size & ~PAGE_MASK); - } else { - BUG_ON(dyn_size >= 0); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + } else BUG_ON(base_addr); - } BUG_ON(base_addr && populate_pte_fn); if (unit_size >= 0) pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size + reserved_size)); + PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; @@ -1224,3 +1238,89 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); return pcpu_unit_size; } + +/* + * Embedding first chunk setup helper. + */ +static void *pcpue_ptr __initdata; +static size_t pcpue_size __initdata; +static size_t pcpue_unit_size __initdata; + +static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpue_size) + return NULL; + + return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); +} + +/** + * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * If this function is used to setup the first chunk, it is allocated + * as a contiguous area using bootmem allocator and used as-is without + * being mapped into vmalloc area. This enables the first chunk to + * piggy back on the linear physical mapping which often uses larger + * page size. + * + * When @dyn_size is positive, dynamic area might be larger than + * specified to fill page alignment. Also, when @dyn_size is auto, + * @dyn_size does not fill the whole first chunk but only what's + * necessary for page alignment after static and reserved areas. + * + * If the needed size is smaller than the minimum or specified unit + * size, the leftover is returned to the bootmem allocator. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, ssize_t unit_size) +{ + unsigned int cpu; + + /* determine parameters and allocate */ + pcpue_size = PFN_ALIGN(static_size + reserved_size + + (dyn_size >= 0 ? dyn_size : 0)); + if (dyn_size != 0) + dyn_size = pcpue_size - static_size - reserved_size; + + if (unit_size >= 0) { + BUG_ON(unit_size < pcpue_size); + pcpue_unit_size = unit_size; + } else + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + + pcpue_ptr = __alloc_bootmem_nopanic( + num_possible_cpus() * pcpue_unit_size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!pcpue_ptr) + return -ENOMEM; + + /* return the leftover and copy */ + for_each_possible_cpu(cpu) { + void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } + + /* we're ready, commit */ + pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", + pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + + return pcpu_setup_first_chunk(pcpue_get_page, static_size, + reserved_size, dyn_size, + pcpue_unit_size, pcpue_ptr, NULL); +}