diff --git a/mm/gup.c b/mm/gup.c index 70d65e4015a4..e95b0cb6ed81 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -143,6 +143,10 @@ retry: mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* Do not mlock pte-mapped THP */ + if (PageTransCompound(page)) + goto out; + /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -920,8 +924,6 @@ long populate_vma_page_range(struct vm_area_struct *vma, gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; if (vma->vm_flags & VM_LOCKONFAULT) gup_flags &= ~FOLL_POPULATE; - if (vma->vm_flags & VM_LOCKED) - gup_flags |= FOLL_SPLIT; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4acf55b31f7c..f283cb7c480e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -874,8 +874,6 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; - if (vma->vm_flags & VM_LOCKED) - return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) @@ -1344,7 +1342,20 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, update_mmu_cache_pmd(vma, addr, pmd); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - if (page->mapping && trylock_page(page)) { + /* + * We don't mlock() pte-mapped THPs. This way we can avoid + * leaking mlocked pages into non-VM_LOCKED VMAs. + * + * In most cases the pmd is the only mapping of the page as we + * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for + * writable private mappings in populate_vma_page_range(). + * + * The only scenario when we have the page shared here is if we + * mlocking read-only mapping shared over fork(). We skip + * mlocking such pages. + */ + if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && + page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) mlock_vma_page(page); @@ -2209,8 +2220,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) return false; - if (vma->vm_flags & VM_LOCKED) - return false; if (!vma->anon_vma || vma->vm_ops) return false; if (is_vma_temporary_stack(vma)) @@ -2851,14 +2860,28 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; + struct page *page = NULL; unsigned long haddr = address & HPAGE_PMD_MASK; mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) - __split_huge_pmd_locked(vma, pmd, haddr, false); + if (unlikely(!pmd_trans_huge(*pmd))) + goto out; + page = pmd_page(*pmd); + __split_huge_pmd_locked(vma, pmd, haddr, false); + if (PageMlocked(page)) + get_page(page); + else + page = NULL; +out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + if (page) { + lock_page(page); + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } } static void split_huge_pmd_address(struct vm_area_struct *vma, diff --git a/mm/memory.c b/mm/memory.c index 9d5b40892d4d..5a73c6ed8e5c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2160,15 +2160,15 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - /* THP pages are never mlocked */ - if (old_page && !PageTransCompound(old_page)) { + if (old_page) { /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ - munlock_vma_page(old_page); + if (PageMlocked(old_page)) + munlock_vma_page(old_page); unlock_page(old_page); } page_cache_release(old_page); diff --git a/mm/mlock.c b/mm/mlock.c index c6b139ad356a..9197b6721a1e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page) /* Serialize with page migration */ BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); + if (!TestSetPageMlocked(page)) { mod_zone_page_state(page_zone(page), NR_MLOCK, hpage_nr_pages(page)); @@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page) /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageTail(page), page); + /* * Serialize with any parallel __split_huge_page_refcount() which * might otherwise copy PageMlocked to part of the tail pages before @@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, if (!page || page_zone_id(page) != zoneid) break; + /* + * Do not use pagevec for PTE-mapped THP, + * munlock_vma_pages_range() will handle them. + */ + if (PageTransCompound(page)) + break; + get_page(page); /* * Increase the address that will be returned *before* the @@ -443,29 +455,43 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, &page_mask); - if (page && !IS_ERR(page) && !PageTransCompound(page)) { - /* - * Non-huge pages are handled in batches via - * pagevec. The pin from follow_page_mask() - * prevents them from collapsing by THP. - */ - pagevec_add(&pvec, page); - zone = page_zone(page); - zoneid = page_zone_id(page); + if (page && !IS_ERR(page)) { + if (PageTransTail(page)) { + VM_BUG_ON_PAGE(PageMlocked(page), page); + put_page(page); /* follow_page_mask() */ + } else if (PageTransHuge(page)) { + lock_page(page); + /* + * Any THP page found by follow_page_mask() may + * have gotten split before reaching + * munlock_vma_page(), so we need to recompute + * the page_mask here. + */ + page_mask = munlock_vma_page(page); + unlock_page(page); + put_page(page); /* follow_page_mask() */ + } else { + /* + * Non-huge pages are handled in batches via + * pagevec. The pin from follow_page_mask() + * prevents them from collapsing by THP. + */ + pagevec_add(&pvec, page); + zone = page_zone(page); + zoneid = page_zone_id(page); - /* - * Try to fill the rest of pagevec using fast - * pte walk. This will also update start to - * the next page to process. Then munlock the - * pagevec. - */ - start = __munlock_pagevec_fill(&pvec, vma, - zoneid, start, end); - __munlock_pagevec(&pvec, zone); - goto next; + /* + * Try to fill the rest of pagevec using fast + * pte walk. This will also update start to + * the next page to process. Then munlock the + * pagevec. + */ + start = __munlock_pagevec_fill(&pvec, vma, + zoneid, start, end); + __munlock_pagevec(&pvec, zone); + goto next; + } } - /* It's a bug to munlock in the middle of a THP page */ - VM_BUG_ON((start >> PAGE_SHIFT) & page_mask); page_increm = 1 + page_mask; start += page_increm * PAGE_SIZE; next: diff --git a/mm/rmap.c b/mm/rmap.c index 84271cc39d1e..31d8866fb562 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1282,6 +1282,9 @@ static void page_remove_anon_compound_rmap(struct page *page) nr = HPAGE_PMD_NR; } + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); + if (nr) { __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); deferred_split_huge_page(page); diff --git a/mm/swap.c b/mm/swap.c index 3d65480422e8..abffc33bb975 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -358,6 +358,7 @@ static void __lru_cache_activate_page(struct page *page) */ void mark_page_accessed(struct page *page) { + page = compound_head(page); if (!PageActive(page) && !PageUnevictable(page) && PageReferenced(page)) {