diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index ce3e05e41724..d09471aa7443 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1356,8 +1356,8 @@ PAGE_SIZE multiple when read back.
 
 	  thp_fault_alloc
 		Number of transparent hugepages which were allocated to satisfy
-		a page fault, including COW faults. This counter is not present
-		when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+		a page fault. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE
+		is not set.
 
 	  thp_collapse_alloc
 		Number of transparent hugepages which were allocated to allow
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 6a233e42be08..b2acd0d395ca 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -305,8 +305,7 @@ monitor how successfully the system is providing huge pages for use.
 
 thp_fault_alloc
 	is incremented every time a huge page is successfully
-	allocated to handle a page fault. This applies to both the
-	first time a page is faulted and for COW faults.
+	allocated to handle a page fault.
 
 thp_collapse_alloc
 	is incremented by khugepaged when it has found
diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst
index 6068266dd303..7ca8c7bac650 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -33,7 +33,7 @@ all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the
 pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so
 that's a natural dividing line, and a good point to make separate wrapper calls.
 In other words, use pin_user_pages*() for DMA-pinned pages, and
-get_user_pages*() for other cases. There are four cases described later on in
+get_user_pages*() for other cases. There are five cases described later on in
 this document, to further clarify that concept.
 
 FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However,
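[Usage note, not part of the patch] The thp_fault_alloc counter documented above is exposed as a flat "name value" pair in a cgroup's memory.stat file. A minimal userspace sketch for reading it; the cgroup path is an assumption, adjust it to your mount point and group name:

    #include <stdio.h>
    #include <string.h>

    /* Assumed cgroup v2 path - adjust to your system. */
    #define STAT_PATH "/sys/fs/cgroup/mygroup/memory.stat"

    int main(void)
    {
            char line[256];
            FILE *f = fopen(STAT_PATH, "r");

            if (!f) {
                    perror(STAT_PATH);
                    return 1;
            }
            /* memory.stat is flat "name value" pairs; pick out the THP counter. */
            while (fgets(line, sizeof(line), f))
                    if (!strncmp(line, "thp_fault_alloc ", 16))
                            fputs(line, stdout);
            fclose(f);
            return 0;
    }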
diff --git a/MAINTAINERS b/MAINTAINERS
index 63c4cc4a04d6..28bd4715ebd5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16058,8 +16058,10 @@ SPARSE CHECKER
 M:	"Luc Van Oostenryck"
 L:	linux-sparse@vger.kernel.org
 S:	Maintained
-W:	https://sparse.wiki.kernel.org/
+W:	https://sparse.docs.kernel.org/
 T:	git git://git.kernel.org/pub/scm/devel/sparse/sparse.git
+Q:	https://patchwork.kernel.org/project/linux-sparse/list/
+B:	https://bugzilla.kernel.org/enter_bug.cgi?component=Sparse&product=Tools
 F:	include/linux/compiler.h
 
 SPEAR CLOCK FRAMEWORK SUPPORT
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index d1c95dcf1d78..cbe49cd117cf 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -120,15 +120,9 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
 
 void *alloc_insn_page(void)
 {
-	void *page;
-
-	page = vmalloc_exec(PAGE_SIZE);
-	if (page) {
-		set_memory_ro((unsigned long)page, 1);
-		set_vm_flush_reset_perms(page);
-	}
-
-	return page;
+	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __func__);
 }
 
 /* arm kprobe: install breakpoint in text */
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index c152a68811dd..345727638d52 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -74,8 +74,11 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size)
 	 * We need to iterate through the pages, clearing the dcache for
 	 * them and setting the cache-inhibit bit.
 	 */
+	mmap_read_lock(&init_mm);
 	error = walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops,
 			NULL);
+	mmap_read_unlock(&init_mm);
+
 	if (error)
 		return ERR_PTR(error);
 	return cpu_addr;
@@ -85,9 +88,11 @@
 void arch_dma_clear_uncached(void *cpu_addr, size_t size)
 {
 	unsigned long va = (unsigned long)cpu_addr;
 
+	mmap_read_lock(&init_mm);
 	/* walk_page_range shouldn't be able to fail here */
 	WARN_ON(walk_page_range(&init_mm, va, va + size,
 			&clear_nocache_walk_ops, NULL));
+	mmap_read_unlock(&init_mm);
 }
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index a54c6a401581..2bdc72e6890e 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -375,7 +375,9 @@ void __init hyperv_init(void)
 	guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0);
 	wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
 
-	hv_hypercall_pg = vmalloc_exec(PAGE_SIZE);
+	hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
+			VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
+			VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, __func__);
 	if (hv_hypercall_pg == NULL) {
 		wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
 		goto remove_cpuhp_state;
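[Sketch, not part of the patch] The arm64 kprobes and Hyper-V hunks above open-code the same replacement for the removed vmalloc_exec(): a single read-only, executable page whose permissions are reset when it is vfree()d. A hypothetical local helper showing the shared pattern; the call and flags are taken verbatim from the hunks above:

    #include <linux/vmalloc.h>

    /*
     * Hypothetical helper, for illustration only: allocate one ROX page with
     * VM_FLUSH_RESET_PERMS so the direct map and permissions are cleaned up
     * on vfree().
     */
    static void *alloc_rox_page(void)
    {
            return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
                            GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
                            NUMA_NO_NODE, __builtin_return_address(0));
    }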
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 2da1f95b88d7..816b31c68550 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -194,6 +194,7 @@ enum page_cache_mode {
 #define _PAGE_TABLE_NOENC	(__PP|__RW|_USR|___A| 0|___D| 0| 0)
 #define _PAGE_TABLE		(__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC)
 #define __PAGE_KERNEL_RO	(__PP| 0| 0|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_ROX	(__PP| 0| 0|___A| 0|___D| 0|___G)
 #define __PAGE_KERNEL_NOCACHE	(__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC)
 #define __PAGE_KERNEL_VVAR	(__PP| 0|_USR|___A|__NX|___D| 0|___G)
 #define __PAGE_KERNEL_LARGE	(__PP|__RW| 0|___A|__NX|___D|_PSE|___G)
@@ -219,6 +220,7 @@ enum page_cache_mode {
 #define PAGE_KERNEL_RO		__pgprot_mask(__PAGE_KERNEL_RO | _ENC)
 #define PAGE_KERNEL_EXEC	__pgprot_mask(__PAGE_KERNEL_EXEC | _ENC)
 #define PAGE_KERNEL_EXEC_NOENC	__pgprot_mask(__PAGE_KERNEL_EXEC | 0)
+#define PAGE_KERNEL_ROX		__pgprot_mask(__PAGE_KERNEL_ROX | _ENC)
 #define PAGE_KERNEL_NOCACHE	__pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC)
 #define PAGE_KERNEL_LARGE	__pgprot_mask(__PAGE_KERNEL_LARGE | _ENC)
 #define PAGE_KERNEL_LARGE_EXEC	__pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
diff --git a/drivers/media/platform/omap3isp/isp.c b/drivers/media/platform/omap3isp/isp.c
index a4ee6b86663e..b91e472ee764 100644
--- a/drivers/media/platform/omap3isp/isp.c
+++ b/drivers/media/platform/omap3isp/isp.c
@@ -39,8 +39,6 @@
  *	Troy Laramy
  */
 
-#include <asm/cacheflush.h>
-
 #include
 #include
 #include
diff --git a/drivers/media/platform/omap3isp/ispvideo.c b/drivers/media/platform/omap3isp/ispvideo.c
index 10c214bd0903..1ac9aef70dff 100644
--- a/drivers/media/platform/omap3isp/ispvideo.c
+++ b/drivers/media/platform/omap3isp/ispvideo.c
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-#include <asm/cacheflush.h>
 
 #include
 #include
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 152a0fc4e905..751bc4dc7466 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -689,6 +689,12 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
 				   &ocfs2_nfs_sync_lops, osb);
 }
 
+static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
+{
+	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+	init_rwsem(&osb->nfs_sync_rwlock);
+}
+
 void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
 {
 	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
@@ -2855,6 +2861,11 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
+	if (ex)
+		down_write(&osb->nfs_sync_rwlock);
+	else
+		down_read(&osb->nfs_sync_rwlock);
+
 	if (ocfs2_mount_local(osb))
 		return 0;
 
@@ -2873,6 +2884,10 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
 	if (!ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(osb, lockres,
				     ex ? LKM_EXMODE : LKM_PRMODE);
+	if (ex)
+		up_write(&osb->nfs_sync_rwlock);
+	else
+		up_read(&osb->nfs_sync_rwlock);
 }
 
 int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
@@ -3340,7 +3355,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 local:
 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
-	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+	ocfs2_nfs_sync_lock_init(osb);
 	ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
 
 	osb->cconn = conn;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ee5d98516212..2dd71d626196 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -395,6 +395,7 @@ struct ocfs2_super
 	struct ocfs2_lock_res osb_super_lockres;
 	struct ocfs2_lock_res osb_rename_lockres;
 	struct ocfs2_lock_res osb_nfs_sync_lockres;
+	struct rw_semaphore nfs_sync_rwlock;
 	struct ocfs2_lock_res osb_trim_fs_lockres;
 	struct mutex obs_trim_fs_mutex;
 	struct ocfs2_dlm_debug *osb_dlm_debug;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 0dd8c41bafd4..19137c6d087b 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -290,7 +290,7 @@
 #define OCFS2_MAX_SLOTS			255
 
 /* Slot map indicator for an empty slot */
-#define OCFS2_INVALID_SLOT		-1
+#define OCFS2_INVALID_SLOT		((u16)-1)
 
 #define OCFS2_VOL_UUID_LEN		16
 #define OCFS2_MAX_VOL_LABEL_LEN		64
@@ -326,8 +326,8 @@ struct ocfs2_system_inode_info {
 enum {
 	BAD_BLOCK_SYSTEM_INODE = 0,
 	GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE
 	SLOT_MAP_SYSTEM_INODE,
-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
 	HEARTBEAT_SYSTEM_INODE,
 	GLOBAL_BITMAP_SYSTEM_INODE,
 	USER_QUOTA_SYSTEM_INODE,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 4836becb7578..45745cc3408a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2825,9 +2825,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 		goto bail;
 	}
 
-	inode_alloc_inode =
-		ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
-			suballoc_slot);
+	if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
+		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+			GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
+	else
+		inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+			INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
 	if (!inode_alloc_inode) {
 		/* the error code could be inaccurate, but we are not able to
 		 * get the correct one. */
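[Illustration, not part of the patch] The ocfs2_fs.h change above matters because the on-disk slot is a u16: under integer promotion a u16 holding 0xffff never compares equal to a plain -1, which is why suballoc.c also casts before comparing. A standalone userspace demonstration:

    #include <stdio.h>
    #include <stdint.h>

    typedef uint16_t u16;

    #define OLD_INVALID_SLOT	-1		/* previous definition */
    #define NEW_INVALID_SLOT	((u16)-1)	/* definition after this patch */

    int main(void)
    {
            u16 suballoc_slot = (u16)-1;	/* "no slot", as stored on disk */

            /* Promotion: the u16 becomes 65535 while the old macro stays -1,
             * so this comparison can never be true (prints 0). */
            printf("old macro matches: %d\n", suballoc_slot == OLD_INVALID_SLOT);

            /* With the cast, both sides promote to 65535 (prints 1). */
            printf("new macro matches: %d\n", suballoc_slot == NEW_INVALID_SLOT);
            return 0;
    }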
diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h
index 907fa5d16494..4a674db4e1fa 100644
--- a/include/asm-generic/cacheflush.h
+++ b/include/asm-generic/cacheflush.h
@@ -2,6 +2,11 @@
 #ifndef _ASM_GENERIC_CACHEFLUSH_H
 #define _ASM_GENERIC_CACHEFLUSH_H
 
+struct mm_struct;
+struct vm_area_struct;
+struct page;
+struct address_space;
+
 /*
  * The cache doesn't need to be flushed when TLB entries change when
  * the cache is mapped to physical memory, not virtual memory
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c4c37fd12104..f6f884970511 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -257,8 +257,8 @@ struct lruvec {
 	 */
 	unsigned long			anon_cost;
 	unsigned long			file_cost;
-	/* Evictions & activations on the inactive file list */
-	atomic_long_t			inactive_age;
+	/* Non-resident age, driven by LRU movement */
+	atomic_long_t			nonresident_age;
 	/* Refaults at the time of last reclaim cycle */
 	unsigned long			refaults;
 	/* Various lruvec state flags (enum lruvec_flags) */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4c5974bb9ba9..5b3216ba39a9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -313,6 +313,7 @@ struct vma_swap_readahead {
 };
 
 /* linux/mm/workingset.c */
+void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
 void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
 void workingset_refault(struct page *page, void *shadow);
 void workingset_activation(struct page *page);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 48bb681e6c2a..0221f852a7e1 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
-extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
 extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bb05fd52de85..09cc78df53c6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image)
 static int
 kimage_validate_signature(struct kimage *image)
 {
-	const char *reason;
 	int ret;
 
 	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
 					   image->kernel_buf_len);
-	switch (ret) {
-	case 0:
-		break;
+	if (ret) {
 
-		/* Certain verification errors are non-fatal if we're not
-		 * checking errors, provided we aren't mandating that there
-		 * must be a valid signature.
-		 */
-	case -ENODATA:
-		reason = "kexec of unsigned image";
-		goto decide;
-	case -ENOPKG:
-		reason = "kexec of image with unsupported crypto";
-		goto decide;
-	case -ENOKEY:
-		reason = "kexec of image with unavailable key";
-	decide:
 		if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
-			pr_notice("%s rejected\n", reason);
+			pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
 			return ret;
 		}
 
-		/* If IMA is guaranteed to appraise a signature on the kexec
+		/*
+		 * If IMA is guaranteed to appraise a signature on the kexec
 		 * image, permit it even if the kernel is otherwise locked
 		 * down.
 		 */
@@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image)
 		    security_locked_down(LOCKDOWN_KEXEC))
 			return -EPERM;
 
-		return 0;
-
-		/* All other errors are fatal, including nomem, unparseable
-		 * signatures and signature check failures - even if signatures
-		 * aren't required.
-		 */
-	default:
-		pr_notice("kernel signature verification failed (%d).\n", ret);
+		pr_debug("kernel signature verification failed (%d).\n", ret);
 	}
 
-	return ret;
+	return 0;
 }
 #endif
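[Restatement, not part of the patch] The kexec_file.c hunks above collapse the old per-error switch into one rule. A standalone userspace restatement of that control flow; the three boolean helpers are stand-ins for the kernel-side checks, not real interfaces:

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool sig_force;		/* CONFIG_KEXEC_SIG_FORCE */
    static bool ima_appraises;	/* IMA will appraise the kexec image */
    static bool locked_down;	/* lockdown denies LOCKDOWN_KEXEC */

    /* Any verification failure is fatal only when enforcement is on;
     * otherwise the load is permitted unless the kernel is locked down
     * and IMA will not appraise the image either. */
    static int validate(int verify_ret)
    {
            if (verify_ret) {
                    if (sig_force)
                            return verify_ret;
                    if (!ima_appraises && locked_down)
                            return -EPERM;
                    /* otherwise: log the failure and permit the load */
            }
            return 0;
    }

    int main(void)
    {
            locked_down = true;
            printf("unsigned image under lockdown: %d\n", validate(-ENOKEY));
            ima_appraises = true;
            printf("same image with IMA appraisal: %d\n", validate(-ENOKEY));
            return 0;
    }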
diff --git a/kernel/module.c b/kernel/module.c
index e8a198588f26..0c6573b98c36 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2783,7 +2783,9 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
 
 void * __weak module_alloc(unsigned long size)
 {
-	return vmalloc_exec(size);
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __func__);
 }
 
 bool __weak module_init_section(const char *name)
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 28528285942c..a2a82262b97b 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -520,8 +520,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
 err_free:
 	kfree(devmem);
 err_release:
-	release_mem_region(devmem->pagemap.res.start,
-			   resource_size(&devmem->pagemap.res));
+	release_mem_region(res->start, resource_size(res));
 err:
 	mutex_unlock(&mdevice->devmem_lock);
 	return false;
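[Illustration, not part of the patch] The test_hmm hunk above fixes an error-path use-after-free: devmem is kfree()d on err_free, yet the old err_release code still read the region start and size out of devmem->pagemap.res. A minimal userspace sketch of the bug shape and the fix; the names are illustrative, not the driver's API:

    #include <stdio.h>
    #include <stdlib.h>

    struct resource { unsigned long start, size; };
    struct devmem_chunk { struct resource res; };

    static void release_mem_region(unsigned long start, unsigned long size)
    {
            printf("releasing [%#lx, +%#lx)\n", start, size);
    }

    int main(void)
    {
            struct resource *res = malloc(sizeof(*res));	/* outlives the chunk */
            struct devmem_chunk *devmem = malloc(sizeof(*devmem));

            res->start = 0x100000;
            res->size  = 0x200000;
            devmem->res = *res;

            /* Error path: the chunk is freed first ... */
            free(devmem);

            /* ... so the cleanup must take start/size from 'res', which is
             * still valid, not from devmem->res (the use-after-free that the
             * removed lines above performed). */
            release_mem_region(res->start, res->size);

            free(res);
            return 0;
    }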
diff --git a/mm/compaction.c b/mm/compaction.c
index fd988b7e5f2b..86375605faa9 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2316,15 +2316,26 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
 		.page = NULL,
 	};
 
-	current->capture_control = &capc;
+	/*
+	 * Make sure the structs are really initialized before we expose the
+	 * capture control, in case we are interrupted and the interrupt handler
+	 * frees a page.
+	 */
+	barrier();
+	WRITE_ONCE(current->capture_control, &capc);
 
 	ret = compact_zone(&cc, &capc);
 
 	VM_BUG_ON(!list_empty(&cc.freepages));
 	VM_BUG_ON(!list_empty(&cc.migratepages));
 
-	*capture = capc.page;
-	current->capture_control = NULL;
+	/*
+	 * Make sure we hide capture control first before we read the captured
+	 * page pointer, otherwise an interrupt could free and capture a page
+	 * and we would leak it.
+	 */
+	WRITE_ONCE(current->capture_control, NULL);
+	*capture = READ_ONCE(capc.page);
 
 	return ret;
 }
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index e45623016aea..61ab16fb2e36 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -246,13 +246,13 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
 
 static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
 				   unsigned long vaddr)
 {
-	pte_t pte = READ_ONCE(*ptep);
+	pte_t pte = ptep_get(ptep);
 
 	pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
 	set_pte_at(mm, vaddr, ptep, pte);
 	barrier();
 	pte_clear(mm, vaddr, ptep);
-	pte = READ_ONCE(*ptep);
+	pte = ptep_get(ptep);
 	WARN_ON(!pte_none(pte));
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b38b6ad547d..19622328e4b5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2772,8 +2772,10 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 		return;
 
 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
-	if (!cw)
+	if (!cw) {
+		css_put(&memcg->css);
 		return;
+	}
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
@@ -6360,11 +6362,16 @@ static unsigned long effective_protection(unsigned long usage,
 	 * We're using unprotected memory for the weight so that if
 	 * some cgroups DO claim explicit protection, we don't protect
 	 * the same bytes twice.
+	 *
+	 * Check both usage and parent_usage against the respective
+	 * protected values. One should imply the other, but they
+	 * aren't read atomically - make sure the division is sane.
 	 */
 	if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
 		return ep;
-
-	if (parent_effective > siblings_protected && usage > protected) {
+	if (parent_effective > siblings_protected &&
+	    parent_usage > siblings_protected &&
+	    usage > protected) {
 		unsigned long unclaimed;
 
 		unclaimed = parent_effective - siblings_protected;
@@ -6416,7 +6423,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 
 	if (parent == root) {
 		memcg->memory.emin = READ_ONCE(memcg->memory.min);
-		memcg->memory.elow = memcg->memory.low;
+		memcg->memory.elow = READ_ONCE(memcg->memory.low);
 		goto out;
 	}
 
@@ -6428,7 +6435,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
 			atomic_long_read(&parent->memory.children_min_usage)));
 
 	WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
-			memcg->memory.low, READ_ONCE(parent->memory.elow),
+			READ_ONCE(memcg->memory.low),
+			READ_ONCE(parent->memory.elow),
 			atomic_long_read(&parent->memory.children_low_usage)));
 
 out:
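[Worked example, not part of the patch] For the effective_protection() hunk above, here is a small example with made-up numbers showing what the extra parent_usage check guards. This is a simplified model of the proportional distribution, not the exact kernel code:

    #include <stdio.h>

    int main(void)
    {
            unsigned long usage = 300, protected = 100, ep = 100;
            unsigned long parent_effective = 1000, siblings_protected = 800;
            unsigned long parent_usage = 700;	/* briefly below siblings_protected */

            if (parent_effective > siblings_protected &&
                parent_usage > siblings_protected &&
                usage > protected) {
                    unsigned long unclaimed = parent_effective - siblings_protected;

                    unclaimed = unclaimed * (usage - protected) /
                                (parent_usage - siblings_protected);
                    ep += unclaimed;
            } else {
                    /* Without the parent_usage check, the divisor would be
                     * 700 - 800: a huge wrapped-around unsigned value, or a
                     * division by zero when the two happen to be equal. */
            }
            printf("effective protection: %lu\n", ep);
            return 0;
    }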
diff --git a/mm/memory.c b/mm/memory.c
index dc7f3543b1fd..87ec87cdc1ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ out:
 }
 
 #ifdef pte_index
-static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
+static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
 			unsigned long addr, struct page *page, pgprot_t prot)
 {
 	int err;
@@ -1506,8 +1506,9 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
 	if (!page_count(page))
 		return -EINVAL;
 	err = validate_page_before_insert(page);
-	return err ? err : insert_page_into_pte_locked(
-		mm, pte_offset_map(pmd, addr), addr, page, prot);
+	if (err)
+		return err;
+	return insert_page_into_pte_locked(mm, pte, addr, page, prot);
 }
 
 /* insert_pages() amortizes the cost of spinlock operations
@@ -1517,7 +1518,8 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
 			struct page **pages, unsigned long *num, pgprot_t prot)
 {
 	pmd_t *pmd = NULL;
-	spinlock_t *pte_lock = NULL;
+	pte_t *start_pte, *pte;
+	spinlock_t *pte_lock;
 	struct mm_struct *const mm = vma->vm_mm;
 	unsigned long curr_page_idx = 0;
 	unsigned long remaining_pages_total = *num;
@@ -1536,18 +1538,17 @@ more:
 	ret = -ENOMEM;
 	if (pte_alloc(mm, pmd))
 		goto out;
-	pte_lock = pte_lockptr(mm, pmd);
 
 	while (pages_to_write_in_pmd) {
 		int pte_idx = 0;
 		const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
 
-		spin_lock(pte_lock);
-		for (; pte_idx < batch_size; ++pte_idx) {
-			int err = insert_page_in_batch_locked(mm, pmd,
+		start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+		for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
+			int err = insert_page_in_batch_locked(mm, pte,
 				addr, pages[curr_page_idx], prot);
 			if (unlikely(err)) {
-				spin_unlock(pte_lock);
+				pte_unmap_unlock(start_pte, pte_lock);
 				ret = err;
 				remaining_pages_total -= pte_idx;
 				goto out;
@@ -1555,7 +1556,7 @@ more:
 			addr += PAGE_SIZE;
 			++curr_page_idx;
 		}
-		spin_unlock(pte_lock);
+		pte_unmap_unlock(start_pte, pte_lock);
 		pages_to_write_in_pmd -= batch_size;
 		remaining_pages_total -= batch_size;
 	}
@@ -3140,8 +3141,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			err = mem_cgroup_charge(page, vma->vm_mm,
 						GFP_KERNEL);
 			ClearPageSwapCache(page);
-			if (err)
+			if (err) {
+				ret = VM_FAULT_OOM;
 				goto out_page;
+			}
+
+			/*
+			 * XXX: Move to lru_cache_add() when it
+			 * supports new vs putback
+			 */
+			spin_lock_irq(&page_pgdat(page)->lru_lock);
+			lru_note_cost_page(page);
+			spin_unlock_irq(&page_pgdat(page)->lru_lock);
 
 			lru_cache_add(page);
 			swap_readpage(page, true);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9b34e03e730a..da374cd3d45b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -471,11 +471,20 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
 				      unsigned long start_pfn,
 				      unsigned long nr_pages)
 {
+	const unsigned long end_pfn = start_pfn + nr_pages;
 	struct pglist_data *pgdat = zone->zone_pgdat;
-	unsigned long flags;
+	unsigned long pfn, cur_nr_pages, flags;
 
 	/* Poison struct pages because they are now uninitialized again. */
-	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+	for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
+		cond_resched();
+
+		/* Select all remaining pages up to the next section boundary */
+		cur_nr_pages =
+			min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
+		page_init_poison(pfn_to_page(pfn),
+				 sizeof(struct page) * cur_nr_pages);
+	}
 
 #ifdef CONFIG_ZONE_DEVICE
 	/*
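[Sketch, not part of the patch] The remove_pfn_range_from_zone() hunk above walks the range one memory section at a time so that cond_resched() can run between chunks. A standalone sketch of the same chunking arithmetic; the section size is an assumption (128 MiB sections with 4 KiB pages):

    #include <stdio.h>

    #define PAGES_PER_SECTION	(1UL << 15)
    #define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & \
				 ~(PAGES_PER_SECTION - 1))

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            unsigned long start_pfn = 0x7f00, nr_pages = 0x18000;
            unsigned long end_pfn = start_pfn + nr_pages;
            unsigned long pfn, cur_nr_pages;

            /* Same walk as above: each iteration stops at the next section
             * boundary, giving a natural rescheduling point per chunk. */
            for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
                    cur_nr_pages = min_ul(end_pfn - pfn,
                                          SECTION_ALIGN_UP(pfn + 1) - pfn);
                    printf("poison pfns [%#lx, %#lx)\n", pfn, pfn + cur_nr_pages);
            }
            return 0;
    }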
- */ - -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); -} - /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size diff --git a/mm/slab.h b/mm/slab.h index 207c83ef6e06..74f7e09a7cfd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -348,7 +348,7 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; int ret; @@ -388,7 +388,7 @@ out: static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; diff --git a/mm/slab_common.c b/mm/slab_common.c index 9e72ba224175..37d48a56431d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1726,7 +1726,7 @@ void kzfree(const void *p) if (unlikely(ZERO_OR_NULL_PTR(mem))) return; ks = ksize(mem); - memset(mem, 0, ks); + memzero_explicit(mem, ks); kfree(mem); } EXPORT_SYMBOL(kzfree); diff --git a/mm/slub.c b/mm/slub.c index fe81773fd97e..ef303070d175 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3766,15 +3766,13 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text, unsigned long *map) + const char *text) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); + unsigned long *map; void *p; - if (!map) - return; - slab_err(s, page, text, s->name); slab_lock(page); @@ -3786,6 +3784,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } + put_map(map); slab_unlock(page); #endif } @@ -3799,11 +3798,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; - unsigned long *map = NULL; - -#ifdef CONFIG_SLUB_DEBUG - map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); -#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3813,16 +3807,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()", - map); + "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); -#ifdef CONFIG_SLUB_DEBUG - bitmap_free(map); -#endif - list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } diff --git a/mm/swap.c b/mm/swap.c index dbcab84c6fce..a82efc33411f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -443,8 +443,7 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); - if (page_is_file_lru(page)) - workingset_activation(page); + workingset_activation(page); } if (page_is_idle(page)) clear_page_idle(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index e98ff460e9e9..05889e8e3c97 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,7 +21,7 @@ #include #include #include - +#include "internal.h" /* * swapper_space is a fiction, retained to simplify the path through @@ -429,7 +429,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageSwapBacked(page); /* May fail (-ENOMEM) if XArray node allocation failed. 
diff --git a/mm/slub.c b/mm/slub.c
index fe81773fd97e..ef303070d175 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3766,15 +3766,13 @@ error:
 }
 
 static void list_slab_objects(struct kmem_cache *s, struct page *page,
-			      const char *text, unsigned long *map)
+			      const char *text)
 {
 #ifdef CONFIG_SLUB_DEBUG
 	void *addr = page_address(page);
+	unsigned long *map;
 	void *p;
 
-	if (!map)
-		return;
-
 	slab_err(s, page, text, s->name);
 	slab_lock(page);
 
@@ -3786,6 +3784,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 			print_tracking(s, p);
 		}
 	}
+	put_map(map);
 	slab_unlock(page);
 #endif
 }
@@ -3799,11 +3798,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 {
 	LIST_HEAD(discard);
 	struct page *page, *h;
-	unsigned long *map = NULL;
-
-#ifdef CONFIG_SLUB_DEBUG
-	map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
-#endif
 
 	BUG_ON(irqs_disabled());
 	spin_lock_irq(&n->list_lock);
@@ -3813,16 +3807,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 			list_add(&page->slab_list, &discard);
 		} else {
 			list_slab_objects(s, page,
-			  "Objects remaining in %s on __kmem_cache_shutdown()",
-			  map);
+			  "Objects remaining in %s on __kmem_cache_shutdown()");
 		}
 	}
 	spin_unlock_irq(&n->list_lock);
 
-#ifdef CONFIG_SLUB_DEBUG
-	bitmap_free(map);
-#endif
-
 	list_for_each_entry_safe(page, h, &discard, slab_list)
 		discard_slab(s, page);
 }
diff --git a/mm/swap.c b/mm/swap.c
index dbcab84c6fce..a82efc33411f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -443,8 +443,7 @@ void mark_page_accessed(struct page *page)
 		else
 			__lru_cache_activate_page(page);
 		ClearPageReferenced(page);
-		if (page_is_file_lru(page))
-			workingset_activation(page);
+		workingset_activation(page);
 	}
 	if (page_is_idle(page))
 		clear_page_idle(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e98ff460e9e9..05889e8e3c97 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-
+#include "internal.h"
 
 /*
  * swapper_space is a fiction, retained to simplify the path through
@@ -429,7 +429,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		__SetPageSwapBacked(page);
 
 		/* May fail (-ENOMEM) if XArray node allocation failed. */
-		if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) {
+		if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) {
 			put_swap_page(page, entry);
 			goto fail_unlock;
 		}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3091c2ca60df..5a2b55c8dd9a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1862,7 +1862,6 @@ EXPORT_SYMBOL(vm_unmap_ram);
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
-* @prot: memory protection to use. PAGE_KERNEL for regular RAM
 *
 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
 * faster than vmap so it's good. But if you mix long-life and short-life
@@ -2696,26 +2695,6 @@ void *vzalloc_node(unsigned long size, int node)
 }
 EXPORT_SYMBOL(vzalloc_node);
 
-/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
- *
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_exec(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
-			NUMA_NO_NODE, __builtin_return_address(0));
-}
-
 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b6d84326bdf2..749d239c62b2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		__delete_from_swap_cache(page, swap);
 		xa_unlock_irqrestore(&mapping->i_pages, flags);
 		put_swap_page(page, swap);
+		workingset_eviction(page, target_memcg);
 	} else {
 		void (*freepage)(struct page *);
 		void *shadow = NULL;
@@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
 			list_add(&page->lru, &pages_to_free);
 		} else {
 			nr_moved += nr_pages;
+			if (PageActive(page))
+				workingset_age_nonresident(lruvec, nr_pages);
 		}
 	}
 
diff --git a/mm/workingset.c b/mm/workingset.c
index d481ea452eeb..50b7937bab32 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -156,8 +156,8 @@
 *
 * Implementation
 *
-* For each node's file LRU lists, a counter for inactive evictions
-* and activations is maintained (node->inactive_age).
+* For each node's LRU lists, a counter for inactive evictions and
+* activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
@@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 	*workingsetp = workingset;
 }
 
-static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
+/**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @memcg: the lruvec that was aged
+ * @nr_pages: the number of pages to count
+ *
+ * As in-memory pages are aged, non-resident pages need to be aged as
+ * well, in order for the refault distances later on to be comparable
+ * to the in-memory dimensions. This function allows reclaim and LRU
+ * operations to drive the non-resident aging along in parallel.
+ */
+void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
 {
 	/*
 	 * Reclaiming a cgroup means reclaiming all its children in a
@@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
 	 * the root cgroup's, age as well.
 	 */
 	do {
-		struct lruvec *lruvec;
-
-		lruvec = mem_cgroup_lruvec(memcg, pgdat);
-		atomic_long_inc(&lruvec->inactive_age);
-	} while (memcg && (memcg = parent_mem_cgroup(memcg)));
+		atomic_long_add(nr_pages, &lruvec->nonresident_age);
+	} while ((lruvec = parent_lruvec(lruvec)));
 }
 
 /**
@@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
 	VM_BUG_ON_PAGE(page_count(page), page);
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
-	advance_inactive_age(page_memcg(page), pgdat);
-
 	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+	workingset_age_nonresident(lruvec, hpage_nr_pages(page));
 	/* XXX: target_memcg can be NULL, go through lruvec */
 	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
-	eviction = atomic_long_read(&lruvec->inactive_age);
+	eviction = atomic_long_read(&lruvec->nonresident_age);
 
 	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
 }
@@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow)
 	if (!mem_cgroup_disabled() && !eviction_memcg)
 		goto out;
 	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
-	refault = atomic_long_read(&eviction_lruvec->inactive_age);
+	refault = atomic_long_read(&eviction_lruvec->nonresident_age);
 
 	/*
 	 * Calculate the refault distance
 	 *
 	 * The unsigned subtraction here gives an accurate distance
-	 * across inactive_age overflows in most cases. There is a
+	 * across nonresident_age overflows in most cases. There is a
 	 * special case: usually, shadow entries have a short lifetime
 	 * and are either refaulted or reclaimed along with the inode
 	 * before they get too old. But it is not impossible for the
-	 * inactive_age to lap a shadow entry in the field, which can
-	 * then result in a false small refault distance, leading to a
-	 * false activation should this old entry actually refault
-	 * again. However, earlier kernels used to deactivate
+	 * nonresident_age to lap a shadow entry in the field, which
+	 * can then result in a false small refault distance, leading
+	 * to a false activation should this old entry actually
+	 * refault again. However, earlier kernels used to deactivate
 	 * unconditionally with *every* reclaim invocation for the
 	 * longest time, so the occasional inappropriate activation
 	 * leading to pressure on the active list is not a problem.
@@ -359,7 +365,7 @@
 		goto out;
 
 	SetPageActive(page);
-	advance_inactive_age(memcg, pgdat);
+	workingset_age_nonresident(lruvec, hpage_nr_pages(page));
 	inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
 
 	/* Page was active prior to eviction */
@@ -382,6 +388,7 @@ out:
 void workingset_activation(struct page *page)
 {
 	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
 
 	rcu_read_lock();
 	/*
@@ -394,7 +401,8 @@ void workingset_activation(struct page *page)
 	memcg = page_memcg_rcu(page);
 	if (!mem_cgroup_disabled() && !memcg)
 		goto out;
-	advance_inactive_age(memcg, page_pgdat(page));
+	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+	workingset_age_nonresident(lruvec, hpage_nr_pages(page));
 out:
 	rcu_read_unlock();
 }
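[Demonstration, not part of the patch] The comment kept in workingset_refault() above relies on unsigned subtraction staying accurate across counter overflow. A tiny standalone demonstration, using a full unsigned long for simplicity (the real shadow entries pack fewer bits, so the same argument applies at a smaller modulus):

    #include <stdio.h>

    int main(void)
    {
            /* Counter snapshot taken at eviction time, close to wrapping ... */
            unsigned long eviction = 0xfffffffffffffff0UL;
            /* ... and the value read back at refault time, after the wrap. */
            unsigned long refault = 0x20UL;

            /* Modular arithmetic: the difference is still the number of aging
             * events that happened in between (0x30 here). */
            printf("refault distance: %#lx\n", refault - eviction);
            return 0;
    }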