Merge branch 'akpm'

* akpm:
  mm: madvise(MADV_DODUMP): allow hugetlbfs pages
  ocfs2: fix locking for res->tracking and dlm->tracking_list
  mm/vmscan.c: fix int overflow in callers of do_shrink_slab()
  mm/vmstat.c: skip NR_TLB_REMOTE_FLUSH* properly
  mm/vmstat.c: fix outdated vmstat_text
  proc: restrict kernel stack dumps to root
  mm/hugetlb: add mmap() encodings for 32MB and 512MB page sizes
  mm/migrate.c: split only transparent huge pages when allocation fails
  ipc/shm.c: use ERR_CAST() for shm_lock() error return
  mm/gup_benchmark: fix unsigned comparison to zero in __gup_benchmark_ioctl
  mm, thp: fix mlocking THP page with migration enabled
  ocfs2: fix crash in ocfs2_duplicate_clusters_by_page()
  hugetlb: take PMD sharing into account when flushing tlb/caches
  mm: migration: fix migration of huge PMD shared pages
This commit is contained in:
Greg Kroah-Hartman 2018-10-05 16:33:03 -07:00
commit 091a1eaa0e
18 changed files with 189 additions and 30 deletions

View File

@ -584,9 +584,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
spin_lock(&dlm->spinlock);
spin_lock(&dlm->track_lock);
list_add_tail(&res->tracking, &dlm->tracking_list);
spin_unlock(&dlm->spinlock);
spin_unlock(&dlm->track_lock);
memset(res->lvb, 0, DLM_LVB_LEN);
memset(res->refmap, 0, sizeof(res->refmap));

View File

@ -2946,6 +2946,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (map_end & (PAGE_SIZE - 1))
to = map_end & (PAGE_SIZE - 1);
retry:
page = find_or_create_page(mapping, page_index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
@ -2954,11 +2955,18 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
/*
* In case PAGE_SIZE <= CLUSTER_SIZE, This page
* can't be dirtied before we CoW it out.
* In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty
* page, so write it back.
*/
if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize)
BUG_ON(PageDirty(page));
if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) {
if (PageDirty(page)) {
/*
* write_on_page will unlock the page on return
*/
ret = write_one_page(page);
goto retry;
}
}
if (!PageUptodate(page)) {
ret = block_read_full_page(page, ocfs2_get_block);

View File

@ -407,6 +407,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
unsigned long *entries;
int err;
/*
* The ability to racily run the kernel stack unwinder on a running task
* and then observe the unwinder output is scary; while it is useful for
* debugging kernel issues, it can also allow an attacker to leak kernel
* stack contents.
* Doing this in a manner that is at least safe from races would require
* some work to ensure that the remote task can not be scheduled; and
* even then, this would still expose the unwinder as local attack
* surface.
* Therefore, this interface is restricted to root.
*/
if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
return -EACCES;
entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
GFP_KERNEL);
if (!entries)

View File

@ -140,6 +140,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *huge_pte_offset(struct mm_struct *mm,
unsigned long addr, unsigned long sz);
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end);
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write);
struct page *follow_huge_pd(struct vm_area_struct *vma,
@ -170,6 +172,18 @@ static inline unsigned long hugetlb_total_pages(void)
return 0;
}
static inline int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr,
pte_t *ptep)
{
return 0;
}
static inline void adjust_range_if_pmd_sharing_possible(
struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
}
#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })

View File

@ -2455,6 +2455,12 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
return vma;
}
static inline bool range_in_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
return (vma && vma->vm_start <= start && end <= vma->vm_end);
}
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);

View File

@ -26,7 +26,9 @@
#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT)
#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT)

View File

@ -25,7 +25,9 @@
#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
#define MFD_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB
#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
#define MFD_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB
#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB

View File

@ -28,7 +28,9 @@
#define MAP_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
#define MAP_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
#define MAP_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
#define MAP_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB
#define MAP_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
#define MAP_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB
#define MAP_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
#define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
#define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB

View File

@ -65,7 +65,9 @@ struct shmid_ds {
#define SHM_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB
#define SHM_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB
#define SHM_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB
#define SHM_HUGE_32MB HUGETLB_FLAG_ENCODE_32MB
#define SHM_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB
#define SHM_HUGE_512MB HUGETLB_FLAG_ENCODE_512MB
#define SHM_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB
#define SHM_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB
#define SHM_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB

View File

@ -206,7 +206,7 @@ err:
* Callers of shm_lock() must validate the status of the returned ipc
* object pointer and error out as appropriate.
*/
return (void *)ipcp;
return ERR_CAST(ipcp);
}
static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp)

View File

@ -19,7 +19,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
struct gup_benchmark *gup)
{
ktime_t start_time, end_time;
unsigned long i, nr, nr_pages, addr, next;
unsigned long i, nr_pages, addr, next;
int nr;
struct page **pages;
nr_pages = gup->size / PAGE_SIZE;

View File

@ -2931,7 +2931,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
else
page_add_file_rmap(new, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
if (vma->vm_flags & VM_LOCKED)
if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
mlock_vma_page(new);
update_mmu_cache_pmd(vma, address, pvmw->pmd);
}

View File

@ -3326,8 +3326,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
const unsigned long mmun_start = start; /* For mmu_notifiers */
const unsigned long mmun_end = end; /* For mmu_notifiers */
unsigned long mmun_start = start; /* For mmu_notifiers */
unsigned long mmun_end = end; /* For mmu_notifiers */
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@ -3339,6 +3339,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
*/
tlb_remove_check_page_size_change(tlb, sz);
tlb_start_vma(tlb, vma);
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
address = start;
for (; address < end; address += sz) {
@ -3349,6 +3354,10 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, &address, ptep)) {
spin_unlock(ptl);
/*
* We just unmapped a page of PMDs by clearing a PUD.
* The caller's TLB flush range should cover this area.
*/
continue;
}
@ -3431,12 +3440,23 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
{
struct mm_struct *mm;
struct mmu_gather tlb;
unsigned long tlb_start = start;
unsigned long tlb_end = end;
/*
* If shared PMDs were possibly used within this vma range, adjust
* start/end for worst case tlb flushing.
* Note that we can not be sure if PMDs are shared until we try to
* unmap pages. However, we want to make sure TLB flushing covers
* the largest possible range.
*/
adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end);
mm = vma->vm_mm;
tlb_gather_mmu(&tlb, mm, start, end);
tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end);
__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
tlb_finish_mmu(&tlb, start, end);
tlb_finish_mmu(&tlb, tlb_start, tlb_end);
}
/*
@ -4298,11 +4318,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
unsigned long f_start = start;
unsigned long f_end = end;
bool shared_pmd = false;
/*
* In the case of shared PMDs, the area to flush could be beyond
* start/end. Set f_start/f_end to cover the maximum possible
* range if PMD sharing is possible.
*/
adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
flush_cache_range(vma, f_start, f_end);
mmu_notifier_invalidate_range_start(mm, start, end);
mmu_notifier_invalidate_range_start(mm, f_start, f_end);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@ -4313,6 +4343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (huge_pmd_unshare(mm, &address, ptep)) {
pages++;
spin_unlock(ptl);
shared_pmd = true;
continue;
}
pte = huge_ptep_get(ptep);
@ -4348,9 +4379,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
* once we release i_mmap_rwsem, another task can do the final put_page
* and that page table be reused and filled with junk.
* and that page table be reused and filled with junk. If we actually
* did unshare a page of pmds, flush the range corresponding to the pud.
*/
flush_hugetlb_tlb_range(vma, start, end);
if (shared_pmd)
flush_hugetlb_tlb_range(vma, f_start, f_end);
else
flush_hugetlb_tlb_range(vma, start, end);
/*
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
@ -4358,7 +4393,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(mm, start, end);
mmu_notifier_invalidate_range_end(mm, f_start, f_end);
return pages << h->order;
}
@ -4545,12 +4580,40 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
/*
* check on proper vm_flags and page table alignment
*/
if (vma->vm_flags & VM_MAYSHARE &&
vma->vm_start <= base && end <= vma->vm_end)
if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
return true;
return false;
}
/*
* Determine if start,end range within vma could be mapped by shared pmd.
* If yes, adjust start and end to cover range associated with possible
* shared pmd mappings.
*/
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
unsigned long check_addr = *start;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
unsigned long a_start = check_addr & PUD_MASK;
unsigned long a_end = a_start + PUD_SIZE;
/*
* If sharing is possible, adjust start/end if necessary.
*/
if (range_in_vma(vma, a_start, a_end)) {
if (a_start < *start)
*start = a_start;
if (a_end > *end)
*end = a_end;
}
}
}
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
@ -4648,6 +4711,11 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
return 0;
}
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
}
#define want_pmd_share() (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */

View File

@ -96,7 +96,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
if (new_flags & VM_SPECIAL) {
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
error = -EINVAL;
goto out;
}

View File

@ -275,6 +275,9 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
if (PageTransHuge(page) && PageMlocked(page))
clear_page_mlock(page);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
}
@ -1411,7 +1414,7 @@ retry:
* we encounter them after the rest of the list
* is processed.
*/
if (PageTransHuge(page)) {
if (PageTransHuge(page) && !PageHuge(page)) {
lock_page(page);
rc = split_huge_page_to_list(page, from);
unlock_page(page);

View File

@ -1362,11 +1362,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/*
* We have to assume the worse case ie pmd for invalidation. Note that
* the page can not be free in this function as call of try_to_unmap()
* must hold a reference on the page.
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
*
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
*/
adjust_range_if_pmd_sharing_possible(vma, &start, &end);
}
mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
while (page_vma_mapped_walk(&pvmw)) {
@ -1409,6 +1419,32 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
if (PageHuge(page)) {
if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
* page. There is no way of knowing exactly
* which PMDs may be cached for this mm, so
* we must flush them all. start/end were
* already adjusted above to cover this range.
*/
flush_cache_range(vma, start, end);
flush_tlb_range(vma, start, end);
mmu_notifier_invalidate_range(mm, start, end);
/*
* The ref count of the PMD page was dropped
* which is part of the way map counting
* is done for shared PMDs. Return 'true'
* here. When there is no other sharing,
* huge_pmd_unshare returns false and we will
* unmap the actual page and drop map count
* to zero.
*/
page_vma_mapped_walk_done(&pvmw);
break;
}
}
if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION) &&

View File

@ -580,8 +580,8 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
struct memcg_shrinker_map *map;
unsigned long freed = 0;
int ret, i;
unsigned long ret, freed = 0;
int i;
if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
return 0;
@ -677,9 +677,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg,
int priority)
{
unsigned long ret, freed = 0;
struct shrinker *shrinker;
unsigned long freed = 0;
int ret;
if (!mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);

View File

@ -1275,6 +1275,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SMP
"nr_tlb_remote_flush",
"nr_tlb_remote_flush_received",
#else
"", /* nr_tlb_remote_flush */
"", /* nr_tlb_remote_flush_received */
#endif /* CONFIG_SMP */
"nr_tlb_local_flush_all",
"nr_tlb_local_flush_one",
@ -1283,7 +1286,6 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_DEBUG_VM_VMACACHE
"vmacache_find_calls",
"vmacache_find_hits",
"vmacache_full_flushes",
#endif
#ifdef CONFIG_SWAP
"swap_ra",