mm: Do early cow for pinned pages during fork() for ptes
This allows copy_pte_range() to do early cow if the pages were pinned on the source mm. Currently we don't have an accurate way to know whether a page is pinned or not. The only thing we have is page_maybe_dma_pinned(). However that's good enough for now. Especially, with the newly added mm->has_pinned flag to make sure we won't affect processes that never pinned any pages. It would be easier if we can do GFP_KERNEL allocation within copy_one_pte(). Unluckily, we can't because we're with the page table locks held for both the parent and child processes. So the page allocation needs to be done outside copy_one_pte(). Some trick is there in copy_present_pte(), majorly the wrprotect trick to block concurrent fast-gup. Comments in the function should explain better in place. Oleg Nesterov reported a (probably harmless) bug during review that we didn't reset entry.val properly in copy_pte_range() so that potentially there's chance to call add_swap_count_continuation() multiple times on the same swp entry. However that should be harmless since even if it happens, the same function (add_swap_count_continuation()) will return directly noticing that there're enough space for the swp counter. So instead of a standalone stable patch, it is touched up in this patch directly. Link: https://lore.kernel.org/lkml/20200914143829.GA1424636@nvidia.com/ Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Peter Xu <peterx@redhat.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
7a4830c380
commit
70e806e4e6
207
mm/memory.c
207
mm/memory.c
@ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void
|
||||
/*
|
||||
* Copy a present and normal page if necessary.
|
||||
*
|
||||
* NOTE! The usual case is that this doesn't need to do
|
||||
* anything, and can just return a positive value. That
|
||||
* will let the caller know that it can just increase
|
||||
* the page refcount and re-use the pte the traditional
|
||||
* way.
|
||||
*
|
||||
* But _if_ we need to copy it because it needs to be
|
||||
* pinned in the parent (and the child should get its own
|
||||
* copy rather than just a reference to the same page),
|
||||
* we'll do that here and return zero to let the caller
|
||||
* know we're done.
|
||||
*
|
||||
* And if we need a pre-allocated page but don't yet have
|
||||
* one, return a negative error to let the preallocation
|
||||
* code know so that it can do so outside the page table
|
||||
* lock.
|
||||
*/
|
||||
static inline int
|
||||
copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
pte_t *dst_pte, pte_t *src_pte,
|
||||
struct vm_area_struct *vma, struct vm_area_struct *new,
|
||||
unsigned long addr, int *rss, struct page **prealloc,
|
||||
pte_t pte, struct page *page)
|
||||
{
|
||||
struct page *new_page;
|
||||
|
||||
if (!is_cow_mapping(vma->vm_flags))
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* The trick starts.
|
||||
*
|
||||
* What we want to do is to check whether this page may
|
||||
* have been pinned by the parent process. If so,
|
||||
* instead of wrprotect the pte on both sides, we copy
|
||||
* the page immediately so that we'll always guarantee
|
||||
* the pinned page won't be randomly replaced in the
|
||||
* future.
|
||||
*
|
||||
* To achieve this, we do the following:
|
||||
*
|
||||
* 1. Write-protect the pte if it's writable. This is
|
||||
* to protect concurrent write fast-gup with
|
||||
* FOLL_PIN, so that we'll fail the fast-gup with
|
||||
* the write bit removed.
|
||||
*
|
||||
* 2. Check page_maybe_dma_pinned() to see whether this
|
||||
* page may have been pinned.
|
||||
*
|
||||
* The order of these steps is important to serialize
|
||||
* against the fast-gup code (gup_pte_range()) on the
|
||||
* pte check and try_grab_compound_head(), so that
|
||||
* we'll make sure either we'll capture that fast-gup
|
||||
* so we'll copy the pinned page here, or we'll fail
|
||||
* that fast-gup.
|
||||
*
|
||||
* NOTE! Even if we don't end up copying the page,
|
||||
* we won't undo this wrprotect(), because the normal
|
||||
* reference copy will need it anyway.
|
||||
*/
|
||||
if (pte_write(pte))
|
||||
ptep_set_wrprotect(src_mm, addr, src_pte);
|
||||
|
||||
/*
|
||||
* These are the "normally we can just copy by reference"
|
||||
* checks.
|
||||
*/
|
||||
if (likely(!atomic_read(&src_mm->has_pinned)))
|
||||
return 1;
|
||||
if (likely(!page_maybe_dma_pinned(page)))
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* Uhhuh. It looks like the page might be a pinned page,
|
||||
* and we actually need to copy it. Now we can set the
|
||||
* source pte back to being writable.
|
||||
*/
|
||||
if (pte_write(pte))
|
||||
set_pte_at(src_mm, addr, src_pte, pte);
|
||||
|
||||
new_page = *prealloc;
|
||||
if (!new_page)
|
||||
return -EAGAIN;
|
||||
|
||||
/*
|
||||
* We have a prealloc page, all good! Take it
|
||||
* over and copy the page & arm it.
|
||||
*/
|
||||
*prealloc = NULL;
|
||||
copy_user_highpage(new_page, page, addr, vma);
|
||||
__SetPageUptodate(new_page);
|
||||
page_add_new_anon_rmap(new_page, new, addr, false);
|
||||
lru_cache_add_inactive_or_unevictable(new_page, new);
|
||||
rss[mm_counter(new_page)]++;
|
||||
|
||||
/* All done, just insert the new page copy in the child */
|
||||
pte = mk_pte(new_page, new->vm_page_prot);
|
||||
pte = maybe_mkwrite(pte_mkdirty(pte), new);
|
||||
set_pte_at(dst_mm, addr, dst_pte, pte);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
|
||||
* is required to copy this pte.
|
||||
*/
|
||||
static inline int
|
||||
copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
|
||||
unsigned long addr, int *rss)
|
||||
struct vm_area_struct *new,
|
||||
unsigned long addr, int *rss, struct page **prealloc)
|
||||
{
|
||||
unsigned long vm_flags = vma->vm_flags;
|
||||
pte_t pte = *src_pte;
|
||||
struct page *page;
|
||||
|
||||
page = vm_normal_page(vma, addr, pte);
|
||||
if (page) {
|
||||
int retval;
|
||||
|
||||
retval = copy_present_page(dst_mm, src_mm,
|
||||
dst_pte, src_pte,
|
||||
vma, new,
|
||||
addr, rss, prealloc,
|
||||
pte, page);
|
||||
if (retval <= 0)
|
||||
return retval;
|
||||
|
||||
get_page(page);
|
||||
page_dup_rmap(page, false);
|
||||
rss[mm_counter(page)]++;
|
||||
}
|
||||
|
||||
/*
|
||||
* If it's a COW mapping, write protect it both
|
||||
* in the parent and the child
|
||||
@ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
if (!(vm_flags & VM_UFFD_WP))
|
||||
pte = pte_clear_uffd_wp(pte);
|
||||
|
||||
page = vm_normal_page(vma, addr, pte);
|
||||
if (page) {
|
||||
get_page(page);
|
||||
page_dup_rmap(page, false);
|
||||
rss[mm_counter(page)]++;
|
||||
}
|
||||
|
||||
set_pte_at(dst_mm, addr, dst_pte, pte);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline struct page *
|
||||
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
|
||||
unsigned long addr)
|
||||
{
|
||||
struct page *new_page;
|
||||
|
||||
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
|
||||
if (!new_page)
|
||||
return NULL;
|
||||
|
||||
if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
|
||||
put_page(new_page);
|
||||
return NULL;
|
||||
}
|
||||
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
|
||||
|
||||
return new_page;
|
||||
}
|
||||
|
||||
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
@ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
pte_t *orig_src_pte, *orig_dst_pte;
|
||||
pte_t *src_pte, *dst_pte;
|
||||
spinlock_t *src_ptl, *dst_ptl;
|
||||
int progress = 0;
|
||||
int progress, ret = 0;
|
||||
int rss[NR_MM_COUNTERS];
|
||||
swp_entry_t entry = (swp_entry_t){0};
|
||||
struct page *prealloc = NULL;
|
||||
|
||||
again:
|
||||
progress = 0;
|
||||
init_rss_vec(rss);
|
||||
|
||||
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
|
||||
if (!dst_pte)
|
||||
return -ENOMEM;
|
||||
if (!dst_pte) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
src_pte = pte_offset_map(src_pmd, addr);
|
||||
src_ptl = pte_lockptr(src_mm, src_pmd);
|
||||
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
|
||||
@ -866,8 +1010,25 @@ again:
|
||||
progress += 8;
|
||||
continue;
|
||||
}
|
||||
copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
|
||||
vma, addr, rss);
|
||||
/* copy_present_pte() will clear `*prealloc' if consumed */
|
||||
ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
|
||||
vma, new, addr, rss, &prealloc);
|
||||
/*
|
||||
* If we need a pre-allocated page for this pte, drop the
|
||||
* locks, allocate, and try again.
|
||||
*/
|
||||
if (unlikely(ret == -EAGAIN))
|
||||
break;
|
||||
if (unlikely(prealloc)) {
|
||||
/*
|
||||
* pre-alloc page cannot be reused by next time so as
|
||||
* to strictly follow mempolicy (e.g., alloc_page_vma()
|
||||
* will allocate page according to address). This
|
||||
* could only happen if one pinned pte changed.
|
||||
*/
|
||||
put_page(prealloc);
|
||||
prealloc = NULL;
|
||||
}
|
||||
progress += 8;
|
||||
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
|
||||
|
||||
@ -879,13 +1040,25 @@ again:
|
||||
cond_resched();
|
||||
|
||||
if (entry.val) {
|
||||
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
|
||||
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
entry.val = 0;
|
||||
} else if (ret) {
|
||||
WARN_ON_ONCE(ret != -EAGAIN);
|
||||
prealloc = page_copy_prealloc(src_mm, vma, addr);
|
||||
if (!prealloc)
|
||||
return -ENOMEM;
|
||||
progress = 0;
|
||||
/* We've captured and resolved the error. Reset, try again. */
|
||||
ret = 0;
|
||||
}
|
||||
if (addr != end)
|
||||
goto again;
|
||||
return 0;
|
||||
out:
|
||||
if (unlikely(prealloc))
|
||||
put_page(prealloc);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
|
Loading…
Reference in New Issue
Block a user