Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton: "More MM work: a memcg scalability improvement" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: mm/lru: revise the comments of lru_lock mm/lru: introduce relock_page_lruvec() mm/lru: replace pgdat lru_lock with lruvec lock mm/swap.c: serialize memcg changes in pagevec_lru_move_fn mm/compaction: do page isolation first in compaction mm/lru: introduce TestClearPageLRU() mm/mlock: remove __munlock_isolate_lru_page() mm/mlock: remove lru_lock on TestClearPageMlocked mm/vmscan: remove lruvec reget in move_pages_to_lru mm/lru: move lock into lru_note_cost mm/swap.c: fold vm event PGROTATED into pagevec_move_tail_fn mm/memcg: add debug checking in lock_page_memcg mm: page_idle_get_page() does not need lru_lock mm/rmap: stop store reordering issue on page->mapping mm/vmscan: remove unnecessary lruvec adding mm/thp: narrow lru locking mm/thp: simplify lru_add_page_tail() mm/thp: use head for head page in lru_add_page_tail() mm/thp: move lru_add_page_tail() to huge_memory.c
This commit is contained in:
commit
5b200f5789
@ -133,18 +133,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
|
||||
|
||||
8. LRU
|
||||
======
|
||||
Each memcg has its own private LRU. Now, its handling is under global
|
||||
VM's control (means that it's handled under global pgdat->lru_lock).
|
||||
Almost all routines around memcg's LRU is called by global LRU's
|
||||
list management functions under pgdat->lru_lock.
|
||||
|
||||
A special function is mem_cgroup_isolate_pages(). This scans
|
||||
memcg's private LRU and call __isolate_lru_page() to extract a page
|
||||
from LRU.
|
||||
|
||||
(By __isolate_lru_page(), the page is removed from both of global and
|
||||
private LRU.)
|
||||
|
||||
Each memcg has its own vector of LRUs (inactive anon, active anon,
|
||||
inactive file, active file, unevictable) of pages from each node,
|
||||
each LRU handled under a single lru_lock for that memcg and node.
|
||||
|
||||
9. Typical Tests.
|
||||
=================
|
||||
|
@ -287,20 +287,17 @@ When oom event notifier is registered, event will be delivered.
|
||||
2.6 Locking
|
||||
-----------
|
||||
|
||||
lock_page_cgroup()/unlock_page_cgroup() should not be called under
|
||||
the i_pages lock.
|
||||
Lock order is as follows:
|
||||
|
||||
Other lock order is following:
|
||||
Page lock (PG_locked bit of page->flags)
|
||||
mm->page_table_lock or split pte_lock
|
||||
lock_page_memcg (memcg->move_lock)
|
||||
mapping->i_pages lock
|
||||
lruvec->lru_lock.
|
||||
|
||||
PG_locked.
|
||||
mm->page_table_lock
|
||||
pgdat->lru_lock
|
||||
lock_page_cgroup.
|
||||
|
||||
In many cases, just lock_page_cgroup() is called.
|
||||
|
||||
per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
|
||||
pgdat->lru_lock, it has no lock of its own.
|
||||
Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by
|
||||
lruvec->lru_lock; PG_lru bit of page->flags is cleared before
|
||||
isolating a page from its LRU under lruvec->lru_lock.
|
||||
|
||||
2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
|
||||
-----------------------------------------------
|
||||
|
@ -69,7 +69,7 @@ When pages are freed in batch, the also mm_page_free_batched is triggered.
|
||||
Broadly speaking, pages are taken off the LRU lock in bulk and
|
||||
freed in batch with a page list. Significant amounts of activity here could
|
||||
indicate that the system is under memory pressure and can also indicate
|
||||
contention on the zone->lru_lock.
|
||||
contention on the lruvec->lru_lock.
|
||||
|
||||
4. Per-CPU Allocator Activity
|
||||
=============================
|
||||
|
@ -33,7 +33,7 @@ reclaim in Linux. The problems have been observed at customer sites on large
|
||||
memory x86_64 systems.
|
||||
|
||||
To illustrate this with an example, a non-NUMA x86_64 platform with 128GB of
|
||||
main memory will have over 32 million 4k pages in a single zone. When a large
|
||||
main memory will have over 32 million 4k pages in a single node. When a large
|
||||
fraction of these pages are not evictable for any reason [see below], vmscan
|
||||
will spend a lot of time scanning the LRU lists looking for the small fraction
|
||||
of pages that are evictable. This can result in a situation where all CPUs are
|
||||
@ -55,7 +55,7 @@ unevictable, either by definition or by circumstance, in the future.
|
||||
The Unevictable Page List
|
||||
-------------------------
|
||||
|
||||
The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list
|
||||
The Unevictable LRU infrastructure consists of an additional, per-node, LRU list
|
||||
called the "unevictable" list and an associated page flag, PG_unevictable, to
|
||||
indicate that the page is being managed on the unevictable list.
|
||||
|
||||
@ -84,15 +84,9 @@ The unevictable list does not differentiate between file-backed and anonymous,
|
||||
swap-backed pages. This differentiation is only important while the pages are,
|
||||
in fact, evictable.
|
||||
|
||||
The unevictable list benefits from the "arrayification" of the per-zone LRU
|
||||
The unevictable list benefits from the "arrayification" of the per-node LRU
|
||||
lists and statistics originally proposed and posted by Christoph Lameter.
|
||||
|
||||
The unevictable list does not use the LRU pagevec mechanism. Rather,
|
||||
unevictable pages are placed directly on the page's zone's unevictable list
|
||||
under the zone lru_lock. This allows us to prevent the stranding of pages on
|
||||
the unevictable list when one task has the page isolated from the LRU and other
|
||||
tasks are changing the "evictability" state of the page.
|
||||
|
||||
|
||||
Memory Control Group Interaction
|
||||
--------------------------------
|
||||
@ -101,8 +95,8 @@ The unevictable LRU facility interacts with the memory control group [aka
|
||||
memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the
|
||||
lru_list enum.
|
||||
|
||||
The memory controller data structure automatically gets a per-zone unevictable
|
||||
list as a result of the "arrayification" of the per-zone LRU lists (one per
|
||||
The memory controller data structure automatically gets a per-node unevictable
|
||||
list as a result of the "arrayification" of the per-node LRU lists (one per
|
||||
lru_list enum element). The memory controller tracks the movement of pages to
|
||||
and from the unevictable list.
|
||||
|
||||
@ -196,7 +190,7 @@ for the sake of expediency, to leave a unevictable page on one of the regular
|
||||
active/inactive LRU lists for vmscan to deal with. vmscan checks for such
|
||||
pages in all of the shrink_{active|inactive|page}_list() functions and will
|
||||
"cull" such pages that it encounters: that is, it diverts those pages to the
|
||||
unevictable list for the zone being scanned.
|
||||
unevictable list for the node being scanned.
|
||||
|
||||
There may be situations where a page is mapped into a VM_LOCKED VMA, but the
|
||||
page is not marked as PG_mlocked. Such pages will make it all the way to
|
||||
@ -328,7 +322,7 @@ If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the
|
||||
page from the LRU, as it is likely on the appropriate active or inactive list
|
||||
at that time. If the isolate_lru_page() succeeds, mlock_vma_page() will put
|
||||
back the page - by calling putback_lru_page() - which will notice that the page
|
||||
is now mlocked and divert the page to the zone's unevictable list. If
|
||||
is now mlocked and divert the page to the node's unevictable list. If
|
||||
mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
|
||||
it later if and when it attempts to reclaim the page.
|
||||
|
||||
@ -603,7 +597,7 @@ Some examples of these unevictable pages on the LRU lists are:
|
||||
unevictable list in mlock_vma_page().
|
||||
|
||||
shrink_inactive_list() also diverts any unevictable pages that it finds on the
|
||||
inactive lists to the appropriate zone's unevictable list.
|
||||
inactive lists to the appropriate node's unevictable list.
|
||||
|
||||
shrink_inactive_list() should only see SHM_LOCK'd pages that became SHM_LOCK'd
|
||||
after shrink_active_list() had moved them to the inactive list, or pages mapped
|
||||
|
@ -654,12 +654,41 @@ out:
|
||||
|
||||
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
|
||||
|
||||
static inline bool lruvec_holds_page_lru_lock(struct page *page,
|
||||
struct lruvec *lruvec)
|
||||
{
|
||||
pg_data_t *pgdat = page_pgdat(page);
|
||||
const struct mem_cgroup *memcg;
|
||||
struct mem_cgroup_per_node *mz;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return lruvec == &pgdat->__lruvec;
|
||||
|
||||
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
|
||||
memcg = page_memcg(page) ? : root_mem_cgroup;
|
||||
|
||||
return lruvec->pgdat == pgdat && mz->memcg == memcg;
|
||||
}
|
||||
|
||||
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
|
||||
|
||||
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
|
||||
|
||||
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page);
|
||||
|
||||
struct lruvec *lock_page_lruvec(struct page *page);
|
||||
struct lruvec *lock_page_lruvec_irq(struct page *page);
|
||||
struct lruvec *lock_page_lruvec_irqsave(struct page *page,
|
||||
unsigned long *flags);
|
||||
|
||||
/*
 * lruvec_memcg_debug() is an out-of-line function when CONFIG_DEBUG_VM is
 * set; without it the call collapses to an empty inline.
 */
#ifndef CONFIG_DEBUG_VM
static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
{
}
#else
void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page);
#endif
|
||||
|
||||
static inline
|
||||
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
|
||||
return css ? container_of(css, struct mem_cgroup, css) : NULL;
|
||||
@ -1167,6 +1196,14 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
|
||||
return &pgdat->__lruvec;
|
||||
}
|
||||
|
||||
static inline bool lruvec_holds_page_lru_lock(struct page *page,
|
||||
struct lruvec *lruvec)
|
||||
{
|
||||
pg_data_t *pgdat = page_pgdat(page);
|
||||
|
||||
return lruvec == &pgdat->__lruvec;
|
||||
}
|
||||
|
||||
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
|
||||
{
|
||||
return NULL;
|
||||
@ -1192,6 +1229,31 @@ static inline void mem_cgroup_put(struct mem_cgroup *memcg)
|
||||
{
|
||||
}
|
||||
|
||||
static inline struct lruvec *lock_page_lruvec(struct page *page)
|
||||
{
|
||||
struct pglist_data *pgdat = page_pgdat(page);
|
||||
|
||||
spin_lock(&pgdat->__lruvec.lru_lock);
|
||||
return &pgdat->__lruvec;
|
||||
}
|
||||
|
||||
static inline struct lruvec *lock_page_lruvec_irq(struct page *page)
|
||||
{
|
||||
struct pglist_data *pgdat = page_pgdat(page);
|
||||
|
||||
spin_lock_irq(&pgdat->__lruvec.lru_lock);
|
||||
return &pgdat->__lruvec;
|
||||
}
|
||||
|
||||
static inline struct lruvec *lock_page_lruvec_irqsave(struct page *page,
|
||||
unsigned long *flagsp)
|
||||
{
|
||||
struct pglist_data *pgdat = page_pgdat(page);
|
||||
|
||||
spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
|
||||
return &pgdat->__lruvec;
|
||||
}
|
||||
|
||||
static inline struct mem_cgroup *
|
||||
mem_cgroup_iter(struct mem_cgroup *root,
|
||||
struct mem_cgroup *prev,
|
||||
@ -1411,6 +1473,10 @@ static inline
|
||||
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
|
||||
{
|
||||
}
|
||||
|
||||
/* Empty variant of the lruvec/memcg consistency hook: nothing to verify. */
static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
{
}
|
||||
#endif /* CONFIG_MEMCG */
|
||||
|
||||
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
||||
@ -1492,6 +1558,50 @@ static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
|
||||
return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
|
||||
}
|
||||
|
||||
static inline void unlock_page_lruvec(struct lruvec *lruvec)
|
||||
{
|
||||
spin_unlock(&lruvec->lru_lock);
|
||||
}
|
||||
|
||||
static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
|
||||
{
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
}
|
||||
|
||||
static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
|
||||
unsigned long flags)
|
||||
{
|
||||
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
|
||||
}
|
||||
|
||||
/*
 * Make sure the lruvec lock covering @page is held (irq variant).
 *
 * If @locked_lruvec is already the lruvec for @page it is returned as is
 * and no locking is done. Otherwise any held @locked_lruvec is released
 * first and the page's lruvec is locked and returned.
 */
static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
		struct lruvec *locked_lruvec)
{
	/* Nothing held yet: just take the page's lruvec lock */
	if (!locked_lruvec)
		return lock_page_lruvec_irq(page);

	/* Already holding the right lock: keep it */
	if (lruvec_holds_page_lru_lock(page, locked_lruvec))
		return locked_lruvec;

	/* Holding a different lruvec's lock: swap it for the page's */
	unlock_page_lruvec_irq(locked_lruvec);

	return lock_page_lruvec_irq(page);
}
|
||||
|
||||
/*
 * Make sure the lruvec lock covering @page is held (irqsave variant).
 *
 * If @locked_lruvec is already the lruvec for @page it is returned as is
 * and no locking is done. Otherwise any held @locked_lruvec is released,
 * restoring the interrupt state in *@flags, and the page's lruvec is
 * locked with the new interrupt state saved back into *@flags.
 */
static inline struct lruvec *relock_page_lruvec_irqsave(struct page *page,
		struct lruvec *locked_lruvec, unsigned long *flags)
{
	/* Nothing held yet: just take the page's lruvec lock */
	if (!locked_lruvec)
		return lock_page_lruvec_irqsave(page, flags);

	/* Already holding the right lock: keep it */
	if (lruvec_holds_page_lru_lock(page, locked_lruvec))
		return locked_lruvec;

	/* Holding a different lruvec's lock: swap it for the page's */
	unlock_page_lruvec_irqrestore(locked_lruvec, *flags);

	return lock_page_lruvec_irqsave(page, flags);
}
|
||||
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
|
||||
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
|
||||
|
@ -79,7 +79,7 @@ struct page {
|
||||
struct { /* Page cache and anonymous pages */
|
||||
/**
|
||||
* @lru: Pageout list, eg. active_list protected by
|
||||
* pgdat->lru_lock. Sometimes used as a generic list
|
||||
* lruvec->lru_lock. Sometimes used as a generic list
|
||||
* by the page owner.
|
||||
*/
|
||||
struct list_head lru;
|
||||
|
@ -113,8 +113,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
|
||||
struct pglist_data;
|
||||
|
||||
/*
|
||||
* zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
|
||||
* So add a wild amount of padding here to ensure that they fall into separate
|
||||
* Add a wild amount of padding here to ensure data falls into separate
|
||||
* cachelines. There are very few zone structures in the machine, so space
|
||||
* consumption is not a concern here.
|
||||
*/
|
||||
@ -276,6 +275,8 @@ enum lruvec_flags {
|
||||
|
||||
struct lruvec {
|
||||
struct list_head lists[NR_LRU_LISTS];
|
||||
/* per lruvec lru_lock for memcg */
|
||||
spinlock_t lru_lock;
|
||||
/*
|
||||
* These track the cost of reclaiming one LRU - file or anon -
|
||||
* over the other. As the observed cost of reclaiming one LRU
|
||||
@ -782,7 +783,6 @@ typedef struct pglist_data {
|
||||
|
||||
/* Write-intensive fields used by page reclaim */
|
||||
ZONE_PADDING(_pad1_)
|
||||
spinlock_t lru_lock;
|
||||
|
||||
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
|
||||
/*
|
||||
|
@ -334,6 +334,7 @@ PAGEFLAG(Referenced, referenced, PF_HEAD)
|
||||
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
|
||||
__CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
|
||||
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
|
||||
TESTCLEARFLAG(LRU, lru, PF_HEAD)
|
||||
PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
|
||||
TESTCLEARFLAG(Active, active, PF_HEAD)
|
||||
PAGEFLAG(Workingset, workingset, PF_HEAD)
|
||||
|
@ -338,8 +338,6 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file,
|
||||
unsigned int nr_pages);
|
||||
extern void lru_note_cost_page(struct page *);
|
||||
extern void lru_cache_add(struct page *);
|
||||
extern void lru_add_page_tail(struct page *page, struct page *page_tail,
|
||||
struct lruvec *lruvec, struct list_head *head);
|
||||
extern void mark_page_accessed(struct page *);
|
||||
extern void lru_add_drain(void);
|
||||
extern void lru_add_drain_cpu(int cpu);
|
||||
@ -358,7 +356,7 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page,
|
||||
extern unsigned long zone_reclaimable_pages(struct zone *zone);
|
||||
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
|
||||
gfp_t gfp_mask, nodemask_t *mask);
|
||||
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
|
||||
extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode);
|
||||
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
|
||||
unsigned long nr_pages,
|
||||
gfp_t gfp_mask,
|
||||
|
@ -804,7 +804,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
unsigned long nr_scanned = 0, nr_isolated = 0;
|
||||
struct lruvec *lruvec;
|
||||
unsigned long flags = 0;
|
||||
bool locked = false;
|
||||
struct lruvec *locked = NULL;
|
||||
struct page *page = NULL, *valid_page = NULL;
|
||||
unsigned long start_pfn = low_pfn;
|
||||
bool skip_on_failure = false;
|
||||
@ -868,13 +868,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
* contention, to give chance to IRQs. Abort completely if
|
||||
* a fatal signal is pending.
|
||||
*/
|
||||
if (!(low_pfn % SWAP_CLUSTER_MAX)
|
||||
&& compact_unlock_should_abort(&pgdat->lru_lock,
|
||||
flags, &locked, cc)) {
|
||||
if (!(low_pfn % SWAP_CLUSTER_MAX)) {
|
||||
if (locked) {
|
||||
unlock_page_lruvec_irqrestore(locked, flags);
|
||||
locked = NULL;
|
||||
}
|
||||
|
||||
if (fatal_signal_pending(current)) {
|
||||
cc->contended = true;
|
||||
|
||||
low_pfn = 0;
|
||||
goto fatal_pending;
|
||||
}
|
||||
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
if (!pfn_valid_within(low_pfn))
|
||||
goto isolate_fail;
|
||||
nr_scanned++;
|
||||
@ -890,6 +899,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
|
||||
if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
|
||||
low_pfn = end_pfn;
|
||||
page = NULL;
|
||||
goto isolate_abort;
|
||||
}
|
||||
valid_page = page;
|
||||
@ -943,9 +953,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
if (unlikely(__PageMovable(page)) &&
|
||||
!PageIsolated(page)) {
|
||||
if (locked) {
|
||||
spin_unlock_irqrestore(&pgdat->lru_lock,
|
||||
flags);
|
||||
locked = false;
|
||||
unlock_page_lruvec_irqrestore(locked, flags);
|
||||
locked = NULL;
|
||||
}
|
||||
|
||||
if (!isolate_movable_page(page, isolate_mode))
|
||||
@ -971,10 +980,34 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
|
||||
goto isolate_fail;
|
||||
|
||||
/*
|
||||
* Be careful not to clear PageLRU until after we're
|
||||
* sure the page is not being freed elsewhere -- the
|
||||
* page release code relies on it.
|
||||
*/
|
||||
if (unlikely(!get_page_unless_zero(page)))
|
||||
goto isolate_fail;
|
||||
|
||||
if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
|
||||
goto isolate_fail_put;
|
||||
|
||||
/* Try to isolate the page */
|
||||
if (!TestClearPageLRU(page))
|
||||
goto isolate_fail_put;
|
||||
|
||||
rcu_read_lock();
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
|
||||
/* If we already hold the lock, we can skip some rechecking */
|
||||
if (!locked) {
|
||||
locked = compact_lock_irqsave(&pgdat->lru_lock,
|
||||
&flags, cc);
|
||||
if (lruvec != locked) {
|
||||
if (locked)
|
||||
unlock_page_lruvec_irqrestore(locked, flags);
|
||||
|
||||
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
|
||||
locked = lruvec;
|
||||
rcu_read_unlock();
|
||||
|
||||
lruvec_memcg_debug(lruvec, page);
|
||||
|
||||
/* Try to get exclusive access under lock */
|
||||
if (!skip_updated) {
|
||||
@ -983,10 +1016,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
goto isolate_abort;
|
||||
}
|
||||
|
||||
/* Recheck PageLRU and PageCompound under lock */
|
||||
if (!PageLRU(page))
|
||||
goto isolate_fail;
|
||||
|
||||
/*
|
||||
* Page become compound since the non-locked check,
|
||||
* and it's on LRU. It can only be a THP so the order
|
||||
@ -994,15 +1023,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
|
||||
*/
|
||||
if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
|
||||
low_pfn += compound_nr(page) - 1;
|
||||
goto isolate_fail;
|
||||
SetPageLRU(page);
|
||||
goto isolate_fail_put;
|
||||
}
|
||||
}
|
||||
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
|
||||
/* Try isolate the page */
|
||||
if (__isolate_lru_page(page, isolate_mode) != 0)
|
||||
goto isolate_fail;
|
||||
} else
|
||||
rcu_read_unlock();
|
||||
|
||||
/* The whole page is taken off the LRU; skip the tail pages. */
|
||||
if (PageCompound(page))
|
||||
@ -1032,6 +1057,15 @@ isolate_success:
|
||||
}
|
||||
|
||||
continue;
|
||||
|
||||
isolate_fail_put:
|
||||
/* Avoid potential deadlock in freeing page under lru_lock */
|
||||
if (locked) {
|
||||
unlock_page_lruvec_irqrestore(locked, flags);
|
||||
locked = NULL;
|
||||
}
|
||||
put_page(page);
|
||||
|
||||
isolate_fail:
|
||||
if (!skip_on_failure)
|
||||
continue;
|
||||
@ -1043,8 +1077,8 @@ isolate_fail:
|
||||
*/
|
||||
if (nr_isolated) {
|
||||
if (locked) {
|
||||
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
|
||||
locked = false;
|
||||
unlock_page_lruvec_irqrestore(locked, flags);
|
||||
locked = NULL;
|
||||
}
|
||||
putback_movable_pages(&cc->migratepages);
|
||||
cc->nr_migratepages = 0;
|
||||
@ -1068,9 +1102,15 @@ isolate_fail:
|
||||
if (unlikely(low_pfn > end_pfn))
|
||||
low_pfn = end_pfn;
|
||||
|
||||
page = NULL;
|
||||
|
||||
isolate_abort:
|
||||
if (locked)
|
||||
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
|
||||
unlock_page_lruvec_irqrestore(locked, flags);
|
||||
if (page) {
|
||||
SetPageLRU(page);
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Updated the cached scanner pfn once the pageblock has been scanned
|
||||
|
@ -102,8 +102,8 @@
|
||||
* ->swap_lock (try_to_unmap_one)
|
||||
* ->private_lock (try_to_unmap_one)
|
||||
* ->i_pages lock (try_to_unmap_one)
|
||||
* ->pgdat->lru_lock (follow_page->mark_page_accessed)
|
||||
* ->pgdat->lru_lock (check_pte_range->isolate_lru_page)
|
||||
* ->lruvec->lru_lock (follow_page->mark_page_accessed)
|
||||
* ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
|
||||
* ->private_lock (page_remove_rmap->set_page_dirty)
|
||||
* ->i_pages lock (page_remove_rmap->set_page_dirty)
|
||||
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
|
||||
|
@ -2359,6 +2359,27 @@ static void remap_page(struct page *page, unsigned int nr)
|
||||
}
|
||||
}
|
||||
|
||||
static void lru_add_page_tail(struct page *head, struct page *tail,
|
||||
struct lruvec *lruvec, struct list_head *list)
|
||||
{
|
||||
VM_BUG_ON_PAGE(!PageHead(head), head);
|
||||
VM_BUG_ON_PAGE(PageCompound(tail), head);
|
||||
VM_BUG_ON_PAGE(PageLRU(tail), head);
|
||||
lockdep_assert_held(&lruvec->lru_lock);
|
||||
|
||||
if (list) {
|
||||
/* page reclaim is reclaiming a huge page */
|
||||
VM_WARN_ON(PageLRU(head));
|
||||
get_page(tail);
|
||||
list_add_tail(&tail->lru, list);
|
||||
} else {
|
||||
/* head is still on lru (and we have it frozen) */
|
||||
VM_WARN_ON(!PageLRU(head));
|
||||
SetPageLRU(tail);
|
||||
list_add_tail(&tail->lru, &head->lru);
|
||||
}
|
||||
}
|
||||
|
||||
static void __split_huge_page_tail(struct page *head, int tail,
|
||||
struct lruvec *lruvec, struct list_head *list)
|
||||
{
|
||||
@ -2425,18 +2446,15 @@ static void __split_huge_page_tail(struct page *head, int tail,
|
||||
}
|
||||
|
||||
static void __split_huge_page(struct page *page, struct list_head *list,
|
||||
pgoff_t end, unsigned long flags)
|
||||
pgoff_t end)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
pg_data_t *pgdat = page_pgdat(head);
|
||||
struct lruvec *lruvec;
|
||||
struct address_space *swap_cache = NULL;
|
||||
unsigned long offset = 0;
|
||||
unsigned int nr = thp_nr_pages(head);
|
||||
int i;
|
||||
|
||||
lruvec = mem_cgroup_page_lruvec(head, pgdat);
|
||||
|
||||
/* complete memcg work before adding pages to LRU */
|
||||
mem_cgroup_split_huge_fixup(head);
|
||||
|
||||
@ -2448,6 +2466,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
|
||||
xa_lock(&swap_cache->i_pages);
|
||||
}
|
||||
|
||||
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
|
||||
lruvec = lock_page_lruvec(head);
|
||||
|
||||
for (i = nr - 1; i >= 1; i--) {
|
||||
__split_huge_page_tail(head, i, lruvec, list);
|
||||
/* Some pages can be beyond i_size: drop them from page cache */
|
||||
@ -2467,6 +2488,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
|
||||
}
|
||||
|
||||
ClearPageCompound(head);
|
||||
unlock_page_lruvec(lruvec);
|
||||
/* Caller disabled irqs, so they are still disabled here */
|
||||
|
||||
split_page_owner(head, nr);
|
||||
|
||||
@ -2484,8 +2507,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
|
||||
page_ref_add(head, 2);
|
||||
xa_unlock(&head->mapping->i_pages);
|
||||
}
|
||||
|
||||
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
|
||||
local_irq_enable();
|
||||
|
||||
remap_page(head, nr);
|
||||
|
||||
@ -2631,12 +2653,10 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
|
||||
int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
|
||||
struct deferred_split *ds_queue = get_deferred_split_queue(head);
|
||||
struct anon_vma *anon_vma = NULL;
|
||||
struct address_space *mapping = NULL;
|
||||
int count, mapcount, extra_pins, ret;
|
||||
unsigned long flags;
|
||||
pgoff_t end;
|
||||
|
||||
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
|
||||
@ -2697,9 +2717,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
unmap_page(head);
|
||||
VM_BUG_ON_PAGE(compound_mapcount(head), head);
|
||||
|
||||
/* prevent PageLRU to go away from under us, and freeze lru stats */
|
||||
spin_lock_irqsave(&pgdata->lru_lock, flags);
|
||||
|
||||
/* block interrupt reentry in xa_lock and spinlock */
|
||||
local_irq_disable();
|
||||
if (mapping) {
|
||||
XA_STATE(xas, &mapping->i_pages, page_index(head));
|
||||
|
||||
@ -2729,7 +2748,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
__dec_lruvec_page_state(head, NR_FILE_THPS);
|
||||
}
|
||||
|
||||
__split_huge_page(page, list, end, flags);
|
||||
__split_huge_page(page, list, end);
|
||||
ret = 0;
|
||||
} else {
|
||||
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
|
||||
@ -2743,7 +2762,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
|
||||
spin_unlock(&ds_queue->split_queue_lock);
|
||||
fail: if (mapping)
|
||||
xa_unlock(&mapping->i_pages);
|
||||
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
|
||||
local_irq_enable();
|
||||
remap_page(head, thp_nr_pages(head));
|
||||
ret = -EBUSY;
|
||||
}
|
||||
|
@ -20,6 +20,9 @@
|
||||
* Lockless page tracking & accounting
|
||||
* Unified hierarchy configuration model
|
||||
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
|
||||
*
|
||||
* Per memcg lru locking
|
||||
* Copyright (C) 2020 Alibaba, Inc, Alex Shi
|
||||
*/
|
||||
|
||||
#include <linux/page_counter.h>
|
||||
@ -1322,6 +1325,23 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_VM
|
||||
void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
if (mem_cgroup_disabled())
|
||||
return;
|
||||
|
||||
memcg = page_memcg(page);
|
||||
|
||||
if (!memcg)
|
||||
VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
|
||||
else
|
||||
VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
|
||||
* @page: the page
|
||||
@ -1362,6 +1382,60 @@ out:
|
||||
return lruvec;
|
||||
}
|
||||
|
||||
/**
|
||||
* lock_page_lruvec - lock and return lruvec for a given page.
|
||||
* @page: the page
|
||||
*
|
||||
* This series functions should be used in either conditions:
|
||||
* PageLRU is cleared or unset
|
||||
* or page->_refcount is zero
|
||||
* or page is locked.
|
||||
*/
|
||||
struct lruvec *lock_page_lruvec(struct page *page)
|
||||
{
|
||||
struct lruvec *lruvec;
|
||||
struct pglist_data *pgdat = page_pgdat(page);
|
||||
|
||||
rcu_read_lock();
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
spin_lock(&lruvec->lru_lock);
|
||||
rcu_read_unlock();
|
||||
|
||||
lruvec_memcg_debug(lruvec, page);
|
||||
|
||||
return lruvec;
|
||||
}
|
||||
|
||||
struct lruvec *lock_page_lruvec_irq(struct page *page)
|
||||
{
|
||||
struct lruvec *lruvec;
|
||||
struct pglist_data *pgdat = page_pgdat(page);
|
||||
|
||||
rcu_read_lock();
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
rcu_read_unlock();
|
||||
|
||||
lruvec_memcg_debug(lruvec, page);
|
||||
|
||||
return lruvec;
|
||||
}
|
||||
|
||||
struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
|
||||
{
|
||||
struct lruvec *lruvec;
|
||||
struct pglist_data *pgdat = page_pgdat(page);
|
||||
|
||||
rcu_read_lock();
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
spin_lock_irqsave(&lruvec->lru_lock, *flags);
|
||||
rcu_read_unlock();
|
||||
|
||||
lruvec_memcg_debug(lruvec, page);
|
||||
|
||||
return lruvec;
|
||||
}
|
||||
|
||||
/**
|
||||
* mem_cgroup_update_lru_size - account for adding or removing an lru page
|
||||
* @lruvec: mem_cgroup per zone lru vector
|
||||
@ -2142,6 +2216,12 @@ again:
|
||||
if (unlikely(!memcg))
|
||||
return NULL;
|
||||
|
||||
#ifdef CONFIG_PROVE_LOCKING
|
||||
local_irq_save(flags);
|
||||
might_lock(&memcg->move_lock);
|
||||
local_irq_restore(flags);
|
||||
#endif
|
||||
|
||||
if (atomic_read(&memcg->moving_account) <= 0)
|
||||
return memcg;
|
||||
|
||||
@ -3263,10 +3343,8 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
|
||||
/*
|
||||
* Because tail pages are not marked as "used", set it. We're under
|
||||
* pgdat->lru_lock and migration entries setup in all page mappings.
|
||||
* Because page_memcg(head) is not set on compound tails, set it now.
|
||||
*/
|
||||
void mem_cgroup_split_huge_fixup(struct page *head)
|
||||
{
|
||||
|
59
mm/mlock.c
59
mm/mlock.c
@ -105,26 +105,6 @@ void mlock_vma_page(struct page *page)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Isolate a page from LRU with optional get_page() pin.
|
||||
* Assumes lru_lock already held and page already pinned.
|
||||
*/
|
||||
static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
|
||||
{
|
||||
if (PageLRU(page)) {
|
||||
struct lruvec *lruvec;
|
||||
|
||||
lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
|
||||
if (getpage)
|
||||
get_page(page);
|
||||
ClearPageLRU(page);
|
||||
del_page_from_lru_list(page, lruvec, page_lru(page));
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Finish munlock after successful page isolation
|
||||
*
|
||||
@ -187,40 +167,24 @@ static void __munlock_isolation_failed(struct page *page)
|
||||
unsigned int munlock_vma_page(struct page *page)
|
||||
{
|
||||
int nr_pages;
|
||||
pg_data_t *pgdat = page_pgdat(page);
|
||||
|
||||
/* For try_to_munlock() and to serialize with page migration */
|
||||
BUG_ON(!PageLocked(page));
|
||||
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
|
||||
/*
|
||||
* Serialize with any parallel __split_huge_page_refcount() which
|
||||
* might otherwise copy PageMlocked to part of the tail pages before
|
||||
* we clear it in the head page. It also stabilizes thp_nr_pages().
|
||||
*/
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
|
||||
if (!TestClearPageMlocked(page)) {
|
||||
/* Potentially a PTE-mapped THP: do not skip the remaining PTEs */
|
||||
nr_pages = 1;
|
||||
goto unlock_out;
|
||||
return 0;
|
||||
}
|
||||
|
||||
nr_pages = thp_nr_pages(page);
|
||||
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
|
||||
mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
|
||||
|
||||
if (__munlock_isolate_lru_page(page, true)) {
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
if (!isolate_lru_page(page))
|
||||
__munlock_isolated_page(page);
|
||||
goto out;
|
||||
}
|
||||
else
|
||||
__munlock_isolation_failed(page);
|
||||
|
||||
unlock_out:
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
|
||||
out:
|
||||
return nr_pages - 1;
|
||||
}
|
||||
|
||||
@ -298,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
|
||||
int nr = pagevec_count(pvec);
|
||||
int delta_munlocked = -nr;
|
||||
struct pagevec pvec_putback;
|
||||
struct lruvec *lruvec = NULL;
|
||||
int pgrescued = 0;
|
||||
|
||||
pagevec_init(&pvec_putback);
|
||||
|
||||
/* Phase 1: page isolation */
|
||||
spin_lock_irq(&zone->zone_pgdat->lru_lock);
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
|
||||
@ -312,9 +276,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
|
||||
* We already have pin from follow_page_mask()
|
||||
* so we can spare the get_page() here.
|
||||
*/
|
||||
if (__munlock_isolate_lru_page(page, false))
|
||||
if (TestClearPageLRU(page)) {
|
||||
lruvec = relock_page_lruvec_irq(page, lruvec);
|
||||
del_page_from_lru_list(page, lruvec,
|
||||
page_lru(page));
|
||||
continue;
|
||||
else
|
||||
} else
|
||||
__munlock_isolation_failed(page);
|
||||
} else {
|
||||
delta_munlocked++;
|
||||
@ -329,8 +296,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
|
||||
pagevec_add(&pvec_putback, pvec->pages[i]);
|
||||
pvec->pages[i] = NULL;
|
||||
}
|
||||
if (lruvec) {
|
||||
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
|
||||
spin_unlock_irq(&zone->zone_pgdat->lru_lock);
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
} else if (delta_munlocked) {
|
||||
mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
|
||||
}
|
||||
|
||||
/* Now we can release pins of pages that we are not munlocking */
|
||||
pagevec_release(&pvec_putback);
|
||||
|
@ -77,6 +77,7 @@ void lruvec_init(struct lruvec *lruvec)
|
||||
enum lru_list lru;
|
||||
|
||||
memset(lruvec, 0, sizeof(struct lruvec));
|
||||
spin_lock_init(&lruvec->lru_lock);
|
||||
|
||||
for_each_lru(lru)
|
||||
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
||||
|
@ -6870,7 +6870,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
|
||||
init_waitqueue_head(&pgdat->pfmemalloc_wait);
|
||||
|
||||
pgdat_page_ext_init(pgdat);
|
||||
spin_lock_init(&pgdat->lru_lock);
|
||||
lruvec_init(&pgdat->__lruvec);
|
||||
}
|
||||
|
||||
|
@ -32,19 +32,15 @@
|
||||
static struct page *page_idle_get_page(unsigned long pfn)
|
||||
{
|
||||
struct page *page = pfn_to_online_page(pfn);
|
||||
pg_data_t *pgdat;
|
||||
|
||||
if (!page || !PageLRU(page) ||
|
||||
!get_page_unless_zero(page))
|
||||
return NULL;
|
||||
|
||||
pgdat = page_pgdat(page);
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
if (unlikely(!PageLRU(page))) {
|
||||
put_page(page);
|
||||
page = NULL;
|
||||
}
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
return page;
|
||||
}
|
||||
|
||||
|
12
mm/rmap.c
12
mm/rmap.c
@ -28,12 +28,12 @@
|
||||
* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
|
||||
* anon_vma->rwsem
|
||||
* mm->page_table_lock or pte_lock
|
||||
* pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
|
||||
* swap_lock (in swap_duplicate, swap_info_get)
|
||||
* mmlist_lock (in mmput, drain_mmlist and others)
|
||||
* mapping->private_lock (in __set_page_dirty_buffers)
|
||||
* mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
|
||||
* lock_page_memcg move_lock (in __set_page_dirty_buffers)
|
||||
* i_pages lock (widely used)
|
||||
* lruvec->lru_lock (in lock_page_lruvec_irq)
|
||||
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* sb_lock (within inode_lock in fs/fs-writeback.c)
|
||||
@ -1054,8 +1054,14 @@ static void __page_set_anon_rmap(struct page *page,
|
||||
if (!exclusive)
|
||||
anon_vma = anon_vma->root;
|
||||
|
||||
/*
|
||||
* page_idle does a lockless/optimistic rmap scan on page->mapping.
|
||||
* Make sure the compiler doesn't split the stores of anon_vma and
|
||||
* the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
|
||||
* could mistake the mapping for a struct address_space and crash.
|
||||
*/
|
||||
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
|
||||
page->mapping = (struct address_space *) anon_vma;
|
||||
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
|
||||
page->index = linear_page_index(vma, address);
|
||||
}
|
||||
|
||||
|
208
mm/swap.c
208
mm/swap.c
@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
|
||||
static void __page_cache_release(struct page *page)
|
||||
{
|
||||
if (PageLRU(page)) {
|
||||
pg_data_t *pgdat = page_pgdat(page);
|
||||
struct lruvec *lruvec;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&pgdat->lru_lock, flags);
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
lruvec = lock_page_lruvec_irqsave(page, &flags);
|
||||
VM_BUG_ON_PAGE(!PageLRU(page), page);
|
||||
__ClearPageLRU(page);
|
||||
del_page_from_lru_list(page, lruvec, page_off_lru(page));
|
||||
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
}
|
||||
__ClearPageWaiters(page);
|
||||
}
|
||||
@ -204,63 +202,46 @@ int get_kernel_page(unsigned long start, int write, struct page **pages)
|
||||
EXPORT_SYMBOL_GPL(get_kernel_page);
|
||||
|
||||
static void pagevec_lru_move_fn(struct pagevec *pvec,
|
||||
void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
|
||||
void *arg)
|
||||
void (*move_fn)(struct page *page, struct lruvec *lruvec))
|
||||
{
|
||||
int i;
|
||||
struct pglist_data *pgdat = NULL;
|
||||
struct lruvec *lruvec;
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long flags = 0;
|
||||
|
||||
for (i = 0; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
struct pglist_data *pagepgdat = page_pgdat(page);
|
||||
|
||||
if (pagepgdat != pgdat) {
|
||||
if (pgdat)
|
||||
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
|
||||
pgdat = pagepgdat;
|
||||
spin_lock_irqsave(&pgdat->lru_lock, flags);
|
||||
}
|
||||
/* block memcg migration during page moving between lru */
|
||||
if (!TestClearPageLRU(page))
|
||||
continue;
|
||||
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
(*move_fn)(page, lruvec, arg);
|
||||
lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
|
||||
(*move_fn)(page, lruvec);
|
||||
|
||||
SetPageLRU(page);
|
||||
}
|
||||
if (pgdat)
|
||||
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
release_pages(pvec->pages, pvec->nr);
|
||||
pagevec_reinit(pvec);
|
||||
}
|
||||
|
||||
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
|
||||
void *arg)
|
||||
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
int *pgmoved = arg;
|
||||
|
||||
if (PageLRU(page) && !PageUnevictable(page)) {
|
||||
if (!PageUnevictable(page)) {
|
||||
del_page_from_lru_list(page, lruvec, page_lru(page));
|
||||
ClearPageActive(page);
|
||||
add_page_to_lru_list_tail(page, lruvec, page_lru(page));
|
||||
(*pgmoved) += thp_nr_pages(page);
|
||||
__count_vm_events(PGROTATED, thp_nr_pages(page));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* pagevec_move_tail() must be called with IRQ disabled.
|
||||
* Otherwise this may cause nasty races.
|
||||
*/
|
||||
static void pagevec_move_tail(struct pagevec *pvec)
|
||||
{
|
||||
int pgmoved = 0;
|
||||
|
||||
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
|
||||
__count_vm_events(PGROTATED, pgmoved);
|
||||
}
|
||||
|
||||
/*
|
||||
* Writeback is about to end against a page which has been marked for immediate
|
||||
* reclaim. If it still appears to be reclaimable, move it to the tail of the
|
||||
* inactive list.
|
||||
*
|
||||
* rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
|
||||
*/
|
||||
void rotate_reclaimable_page(struct page *page)
|
||||
{
|
||||
@ -273,7 +254,7 @@ void rotate_reclaimable_page(struct page *page)
|
||||
local_lock_irqsave(&lru_rotate.lock, flags);
|
||||
pvec = this_cpu_ptr(&lru_rotate.pvec);
|
||||
if (!pagevec_add(pvec, page) || PageCompound(page))
|
||||
pagevec_move_tail(pvec);
|
||||
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
|
||||
local_unlock_irqrestore(&lru_rotate.lock, flags);
|
||||
}
|
||||
}
|
||||
@ -283,6 +264,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
|
||||
do {
|
||||
unsigned long lrusize;
|
||||
|
||||
/*
|
||||
* Holding lruvec->lru_lock is safe here, since
|
||||
* 1) The pinned lruvec in reclaim, or
|
||||
* 2) From a pre-LRU page during refault (which also holds the
|
||||
* rcu lock, so would be safe even if the page was on the LRU
|
||||
* and could move simultaneously to a new lruvec).
|
||||
*/
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
/* Record cost event */
|
||||
if (file)
|
||||
lruvec->file_cost += nr_pages;
|
||||
@ -306,6 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
|
||||
lruvec->file_cost /= 2;
|
||||
lruvec->anon_cost /= 2;
|
||||
}
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
} while ((lruvec = parent_lruvec(lruvec)));
|
||||
}
|
||||
|
||||
@ -315,10 +305,9 @@ void lru_note_cost_page(struct page *page)
|
||||
page_is_file_lru(page), thp_nr_pages(page));
|
||||
}
|
||||
|
||||
static void __activate_page(struct page *page, struct lruvec *lruvec,
|
||||
void *arg)
|
||||
static void __activate_page(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
|
||||
if (!PageActive(page) && !PageUnevictable(page)) {
|
||||
int lru = page_lru_base_type(page);
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
@ -340,7 +329,7 @@ static void activate_page_drain(int cpu)
|
||||
struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
|
||||
|
||||
if (pagevec_count(pvec))
|
||||
pagevec_lru_move_fn(pvec, __activate_page, NULL);
|
||||
pagevec_lru_move_fn(pvec, __activate_page);
|
||||
}
|
||||
|
||||
static bool need_activate_page_drain(int cpu)
|
||||
@ -358,7 +347,7 @@ static void activate_page(struct page *page)
|
||||
pvec = this_cpu_ptr(&lru_pvecs.activate_page);
|
||||
get_page(page);
|
||||
if (!pagevec_add(pvec, page) || PageCompound(page))
|
||||
pagevec_lru_move_fn(pvec, __activate_page, NULL);
|
||||
pagevec_lru_move_fn(pvec, __activate_page);
|
||||
local_unlock(&lru_pvecs.lock);
|
||||
}
|
||||
}
|
||||
@ -370,12 +359,15 @@ static inline void activate_page_drain(int cpu)
|
||||
|
||||
static void activate_page(struct page *page)
|
||||
{
|
||||
pg_data_t *pgdat = page_pgdat(page);
|
||||
struct lruvec *lruvec;
|
||||
|
||||
page = compound_head(page);
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
__activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
if (TestClearPageLRU(page)) {
|
||||
lruvec = lock_page_lruvec_irq(page);
|
||||
__activate_page(page, lruvec);
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
SetPageLRU(page);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -525,16 +517,12 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
|
||||
* be written out by flusher threads as this is much more effective
|
||||
* than the single-page writeout from reclaim.
|
||||
*/
|
||||
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
|
||||
void *arg)
|
||||
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
int lru;
|
||||
bool active;
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
if (!PageLRU(page))
|
||||
return;
|
||||
|
||||
if (PageUnevictable(page))
|
||||
return;
|
||||
|
||||
@ -573,10 +561,9 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
|
||||
}
|
||||
}
|
||||
|
||||
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
|
||||
void *arg)
|
||||
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
|
||||
if (PageActive(page) && !PageUnevictable(page)) {
|
||||
int lru = page_lru_base_type(page);
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
@ -591,10 +578,9 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
|
||||
}
|
||||
}
|
||||
|
||||
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
|
||||
void *arg)
|
||||
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
|
||||
if (PageAnon(page) && PageSwapBacked(page) &&
|
||||
!PageSwapCache(page) && !PageUnevictable(page)) {
|
||||
bool active = PageActive(page);
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
@ -636,21 +622,21 @@ void lru_add_drain_cpu(int cpu)
|
||||
|
||||
/* No harm done if a racing interrupt already did this */
|
||||
local_lock_irqsave(&lru_rotate.lock, flags);
|
||||
pagevec_move_tail(pvec);
|
||||
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
|
||||
local_unlock_irqrestore(&lru_rotate.lock, flags);
|
||||
}
|
||||
|
||||
pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
|
||||
if (pagevec_count(pvec))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
|
||||
|
||||
pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
|
||||
if (pagevec_count(pvec))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_fn);
|
||||
|
||||
pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
|
||||
if (pagevec_count(pvec))
|
||||
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
|
||||
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
|
||||
|
||||
activate_page_drain(cpu);
|
||||
}
|
||||
@ -679,7 +665,7 @@ void deactivate_file_page(struct page *page)
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
|
||||
|
||||
if (!pagevec_add(pvec, page) || PageCompound(page))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
|
||||
local_unlock(&lru_pvecs.lock);
|
||||
}
|
||||
}
|
||||
@ -701,7 +687,7 @@ void deactivate_page(struct page *page)
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
|
||||
get_page(page);
|
||||
if (!pagevec_add(pvec, page) || PageCompound(page))
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
|
||||
pagevec_lru_move_fn(pvec, lru_deactivate_fn);
|
||||
local_unlock(&lru_pvecs.lock);
|
||||
}
|
||||
}
|
||||
@ -723,7 +709,7 @@ void mark_page_lazyfree(struct page *page)
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
|
||||
get_page(page);
|
||||
if (!pagevec_add(pvec, page) || PageCompound(page))
|
||||
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
|
||||
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
|
||||
local_unlock(&lru_pvecs.lock);
|
||||
}
|
||||
}
|
||||
@ -871,8 +857,7 @@ void release_pages(struct page **pages, int nr)
|
||||
{
|
||||
int i;
|
||||
LIST_HEAD(pages_to_free);
|
||||
struct pglist_data *locked_pgdat = NULL;
|
||||
struct lruvec *lruvec;
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long flags;
|
||||
unsigned int lock_batch;
|
||||
|
||||
@ -882,11 +867,11 @@ void release_pages(struct page **pages, int nr)
|
||||
/*
|
||||
* Make sure the IRQ-safe lock-holding time does not get
|
||||
* excessive with a continuous string of pages from the
|
||||
* same pgdat. The lock is held only if pgdat != NULL.
|
||||
* same lruvec. The lock is held only if lruvec != NULL.
|
||||
*/
|
||||
if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
|
||||
spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
|
||||
locked_pgdat = NULL;
|
||||
if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
lruvec = NULL;
|
||||
}
|
||||
|
||||
page = compound_head(page);
|
||||
@ -894,10 +879,9 @@ void release_pages(struct page **pages, int nr)
|
||||
continue;
|
||||
|
||||
if (is_zone_device_page(page)) {
|
||||
if (locked_pgdat) {
|
||||
spin_unlock_irqrestore(&locked_pgdat->lru_lock,
|
||||
flags);
|
||||
locked_pgdat = NULL;
|
||||
if (lruvec) {
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
lruvec = NULL;
|
||||
}
|
||||
/*
|
||||
* ZONE_DEVICE pages that return 'false' from
|
||||
@ -918,27 +902,22 @@ void release_pages(struct page **pages, int nr)
|
||||
continue;
|
||||
|
||||
if (PageCompound(page)) {
|
||||
if (locked_pgdat) {
|
||||
spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
|
||||
locked_pgdat = NULL;
|
||||
if (lruvec) {
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
lruvec = NULL;
|
||||
}
|
||||
__put_compound_page(page);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (PageLRU(page)) {
|
||||
struct pglist_data *pgdat = page_pgdat(page);
|
||||
struct lruvec *prev_lruvec = lruvec;
|
||||
|
||||
if (pgdat != locked_pgdat) {
|
||||
if (locked_pgdat)
|
||||
spin_unlock_irqrestore(&locked_pgdat->lru_lock,
|
||||
flags);
|
||||
lruvec = relock_page_lruvec_irqsave(page, lruvec,
|
||||
&flags);
|
||||
if (prev_lruvec != lruvec)
|
||||
lock_batch = 0;
|
||||
locked_pgdat = pgdat;
|
||||
spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
|
||||
}
|
||||
|
||||
lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
|
||||
VM_BUG_ON_PAGE(!PageLRU(page), page);
|
||||
__ClearPageLRU(page);
|
||||
del_page_from_lru_list(page, lruvec, page_off_lru(page));
|
||||
@ -948,8 +927,8 @@ void release_pages(struct page **pages, int nr)
|
||||
|
||||
list_add(&page->lru, &pages_to_free);
|
||||
}
|
||||
if (locked_pgdat)
|
||||
spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
|
||||
mem_cgroup_uncharge_list(&pages_to_free);
|
||||
free_unref_page_list(&pages_to_free);
|
||||
@ -977,41 +956,7 @@ void __pagevec_release(struct pagevec *pvec)
|
||||
}
|
||||
EXPORT_SYMBOL(__pagevec_release);
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
/* used by __split_huge_page_refcount() */
|
||||
void lru_add_page_tail(struct page *page, struct page *page_tail,
|
||||
struct lruvec *lruvec, struct list_head *list)
|
||||
{
|
||||
VM_BUG_ON_PAGE(!PageHead(page), page);
|
||||
VM_BUG_ON_PAGE(PageCompound(page_tail), page);
|
||||
VM_BUG_ON_PAGE(PageLRU(page_tail), page);
|
||||
lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
|
||||
|
||||
if (!list)
|
||||
SetPageLRU(page_tail);
|
||||
|
||||
if (likely(PageLRU(page)))
|
||||
list_add_tail(&page_tail->lru, &page->lru);
|
||||
else if (list) {
|
||||
/* page reclaim is reclaiming a huge page */
|
||||
get_page(page_tail);
|
||||
list_add_tail(&page_tail->lru, list);
|
||||
} else {
|
||||
/*
|
||||
* Head page has not yet been counted, as an hpage,
|
||||
* so we must account for each subpage individually.
|
||||
*
|
||||
* Put page_tail on the list at the correct position
|
||||
* so they all end up in order.
|
||||
*/
|
||||
add_page_to_lru_list_tail(page_tail, lruvec,
|
||||
page_lru(page_tail));
|
||||
}
|
||||
}
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
|
||||
void *arg)
|
||||
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
enum lru_list lru;
|
||||
int was_unevictable = TestClearPageUnevictable(page);
|
||||
@ -1070,7 +1015,20 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
|
||||
*/
|
||||
void __pagevec_lru_add(struct pagevec *pvec)
|
||||
{
|
||||
pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
|
||||
int i;
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long flags = 0;
|
||||
|
||||
for (i = 0; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
|
||||
lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
|
||||
__pagevec_lru_add_fn(page, lruvec);
|
||||
}
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
release_pages(pvec->pages, pvec->nr);
|
||||
pagevec_reinit(pvec);
|
||||
}
|
||||
|
||||
/**
|
||||
|
199
mm/vmscan.c
199
mm/vmscan.c
@ -1539,9 +1539,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
|
||||
*
|
||||
* returns 0 on success, -ve errno on failure.
|
||||
*/
|
||||
int __isolate_lru_page(struct page *page, isolate_mode_t mode)
|
||||
int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
|
||||
{
|
||||
int ret = -EINVAL;
|
||||
int ret = -EBUSY;
|
||||
|
||||
/* Only take pages on the LRU. */
|
||||
if (!PageLRU(page))
|
||||
@ -1551,8 +1551,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
|
||||
if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
|
||||
return ret;
|
||||
|
||||
ret = -EBUSY;
|
||||
|
||||
/*
|
||||
* To minimise LRU disruption, the caller can indicate that it only
|
||||
* wants to isolate pages it will be able to operate on without
|
||||
@ -1593,20 +1591,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
|
||||
if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
|
||||
return ret;
|
||||
|
||||
if (likely(get_page_unless_zero(page))) {
|
||||
/*
|
||||
* Be careful not to clear PageLRU until after we're
|
||||
* sure the page is not being freed elsewhere -- the
|
||||
* page release code relies on it.
|
||||
*/
|
||||
ClearPageLRU(page);
|
||||
ret = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Update LRU sizes after isolating pages. The LRU size updates must
|
||||
* be complete before mem_cgroup_update_lru_size due to a sanity check.
|
||||
@ -1626,14 +1613,16 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
|
||||
}
|
||||
|
||||
/**
|
||||
* pgdat->lru_lock is heavily contended. Some of the functions that
|
||||
* Isolate pages from the lruvec into the @dst list, looking through up to @nr_to_scan pages.
|
||||
*
|
||||
* lruvec->lru_lock is heavily contended. Some of the functions that
|
||||
* shrink the lists perform better by taking out a batch of pages
|
||||
* and working on them outside the LRU lock.
|
||||
*
|
||||
* For pagecache intensive workloads, this function is the hottest
|
||||
* spot in the kernel (apart from copy_*_user functions).
|
||||
*
|
||||
* Appropriate locks must be held before calling this function.
|
||||
* Lru_lock must be held before calling this function.
|
||||
*
|
||||
* @nr_to_scan: The number of eligible pages to look through on the list.
|
||||
* @lruvec: The LRU vector to pull pages from.
|
||||
@ -1666,8 +1655,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
|
||||
page = lru_to_page(src);
|
||||
prefetchw_prev_lru_page(page, src, flags);
|
||||
|
||||
VM_BUG_ON_PAGE(!PageLRU(page), page);
|
||||
|
||||
nr_pages = compound_nr(page);
|
||||
total_scan += nr_pages;
|
||||
|
||||
@ -1688,20 +1675,34 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
|
||||
* only when the page is being freed somewhere else.
|
||||
*/
|
||||
scan += nr_pages;
|
||||
switch (__isolate_lru_page(page, mode)) {
|
||||
switch (__isolate_lru_page_prepare(page, mode)) {
|
||||
case 0:
|
||||
/*
|
||||
* Be careful not to clear PageLRU until after we're
|
||||
* sure the page is not being freed elsewhere -- the
|
||||
* page release code relies on it.
|
||||
*/
|
||||
if (unlikely(!get_page_unless_zero(page)))
|
||||
goto busy;
|
||||
|
||||
if (!TestClearPageLRU(page)) {
|
||||
/*
|
||||
* This page may be in another isolation path,
|
||||
* but we still hold lru_lock.
|
||||
*/
|
||||
put_page(page);
|
||||
goto busy;
|
||||
}
|
||||
|
||||
nr_taken += nr_pages;
|
||||
nr_zone_taken[page_zonenum(page)] += nr_pages;
|
||||
list_move(&page->lru, dst);
|
||||
break;
|
||||
|
||||
case -EBUSY:
|
||||
default:
|
||||
busy:
|
||||
/* else it is being freed elsewhere */
|
||||
list_move(&page->lru, src);
|
||||
continue;
|
||||
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
@ -1764,21 +1765,16 @@ int isolate_lru_page(struct page *page)
|
||||
VM_BUG_ON_PAGE(!page_count(page), page);
|
||||
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
|
||||
|
||||
if (PageLRU(page)) {
|
||||
pg_data_t *pgdat = page_pgdat(page);
|
||||
if (TestClearPageLRU(page)) {
|
||||
struct lruvec *lruvec;
|
||||
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
if (PageLRU(page)) {
|
||||
int lru = page_lru(page);
|
||||
get_page(page);
|
||||
ClearPageLRU(page);
|
||||
del_page_from_lru_list(page, lruvec, lru);
|
||||
lruvec = lock_page_lruvec_irq(page);
|
||||
del_page_from_lru_list(page, lruvec, page_lru(page));
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
ret = 0;
|
||||
}
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1820,29 +1816,14 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
|
||||
}
|
||||
|
||||
/*
|
||||
* This moves pages from @list to corresponding LRU list.
|
||||
*
|
||||
* We move them the other way if the page is referenced by one or more
|
||||
* processes, from rmap.
|
||||
*
|
||||
* If the pages are mostly unmapped, the processing is fast and it is
|
||||
* appropriate to hold zone_lru_lock across the whole operation. But if
|
||||
* the pages are mapped, the processing is slow (page_referenced()) so we
|
||||
* should drop zone_lru_lock around each page. It's impossible to balance
|
||||
* this, so instead we remove the pages from the LRU while processing them.
|
||||
* It is safe to rely on PG_active against the non-LRU pages in here because
|
||||
* nobody will play with that bit on a non-LRU page.
|
||||
*
|
||||
* The downside is that we have to touch page->_refcount against each page.
|
||||
* But we had to alter page->flags anyway.
|
||||
* move_pages_to_lru() moves pages from private @list to appropriate LRU list.
|
||||
* On return, @list is reused as a list of pages to be freed by the caller.
|
||||
*
|
||||
* Returns the number of pages moved to the given lruvec.
|
||||
*/
|
||||
|
||||
static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
||||
int nr_pages, nr_moved = 0;
|
||||
LIST_HEAD(pages_to_free);
|
||||
struct page *page;
|
||||
@ -1851,39 +1832,55 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
|
||||
while (!list_empty(list)) {
|
||||
page = lru_to_page(list);
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
if (unlikely(!page_evictable(page))) {
|
||||
list_del(&page->lru);
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
if (unlikely(!page_evictable(page))) {
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
putback_lru_page(page);
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
continue;
|
||||
}
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
|
||||
/*
|
||||
* The SetPageLRU needs to be kept here for list integrity.
|
||||
* Otherwise:
|
||||
* #0 move_pages_to_lru #1 release_pages
|
||||
* if !put_page_testzero
|
||||
* if (put_page_testzero())
|
||||
* !PageLRU //skip lru_lock
|
||||
* SetPageLRU()
|
||||
* list_add(&page->lru,)
|
||||
* list_add(&page->lru,)
|
||||
*/
|
||||
SetPageLRU(page);
|
||||
lru = page_lru(page);
|
||||
|
||||
nr_pages = thp_nr_pages(page);
|
||||
update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
|
||||
list_move(&page->lru, &lruvec->lists[lru]);
|
||||
|
||||
if (put_page_testzero(page)) {
|
||||
if (unlikely(put_page_testzero(page))) {
|
||||
__ClearPageLRU(page);
|
||||
__ClearPageActive(page);
|
||||
del_page_from_lru_list(page, lruvec, lru);
|
||||
|
||||
if (unlikely(PageCompound(page))) {
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
destroy_compound_page(page);
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
} else
|
||||
list_add(&page->lru, &pages_to_free);
|
||||
} else {
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* All pages were isolated from the same lruvec (and isolation
|
||||
* inhibits memcg migration).
|
||||
*/
|
||||
VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
|
||||
lru = page_lru(page);
|
||||
nr_pages = thp_nr_pages(page);
|
||||
|
||||
update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
|
||||
list_add(&page->lru, &lruvec->lists[lru]);
|
||||
nr_moved += nr_pages;
|
||||
if (PageActive(page))
|
||||
workingset_age_nonresident(lruvec, nr_pages);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* To save our caller's stack, now use input list for pages to free.
|
||||
@ -1939,7 +1936,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
|
||||
|
||||
lru_add_drain();
|
||||
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
|
||||
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
|
||||
&nr_scanned, sc, lru);
|
||||
@ -1951,27 +1948,25 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
|
||||
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
|
||||
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
|
||||
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
|
||||
if (nr_taken == 0)
|
||||
return 0;
|
||||
|
||||
nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
|
||||
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
move_pages_to_lru(lruvec, &page_list);
|
||||
|
||||
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
|
||||
lru_note_cost(lruvec, file, stat.nr_pageout);
|
||||
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
|
||||
if (!cgroup_reclaim(sc))
|
||||
__count_vm_events(item, nr_reclaimed);
|
||||
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
|
||||
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
|
||||
lru_note_cost(lruvec, file, stat.nr_pageout);
|
||||
mem_cgroup_uncharge_list(&page_list);
|
||||
free_unref_page_list(&page_list);
|
||||
|
||||
@ -2003,6 +1998,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
|
||||
return nr_reclaimed;
|
||||
}
|
||||
|
||||
/*
|
||||
* shrink_active_list() moves pages from the active LRU to the inactive LRU.
|
||||
*
|
||||
* We move them the other way if the page is referenced by one or more
|
||||
* processes.
|
||||
*
|
||||
* If the pages are mostly unmapped, the processing is fast and it is
|
||||
* appropriate to hold lru_lock across the whole operation. But if
|
||||
* the pages are mapped, the processing is slow (page_referenced()), so
|
||||
* we should drop lru_lock around each page. It's impossible to balance
|
||||
* this, so instead we remove the pages from the LRU while processing them.
|
||||
* It is safe to rely on PG_active against the non-LRU pages in here because
|
||||
* nobody will play with that bit on a non-LRU page.
|
||||
*
|
||||
* The downside is that we have to touch page->_refcount against each page.
|
||||
* But we had to alter page->flags anyway.
|
||||
*/
|
||||
static void shrink_active_list(unsigned long nr_to_scan,
|
||||
struct lruvec *lruvec,
|
||||
struct scan_control *sc,
|
||||
@ -2022,7 +2034,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
|
||||
|
||||
lru_add_drain();
|
||||
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
|
||||
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
|
||||
&nr_scanned, sc, lru);
|
||||
@ -2033,7 +2045,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
|
||||
__count_vm_events(PGREFILL, nr_scanned);
|
||||
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
|
||||
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
|
||||
while (!list_empty(&l_hold)) {
|
||||
cond_resched();
|
||||
@ -2079,7 +2091,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
|
||||
/*
|
||||
* Move pages back to the lru list.
|
||||
*/
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
spin_lock_irq(&lruvec->lru_lock);
|
||||
|
||||
nr_activate = move_pages_to_lru(lruvec, &l_active);
|
||||
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
|
||||
@ -2090,7 +2102,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
|
||||
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
|
||||
|
||||
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
spin_unlock_irq(&lruvec->lru_lock);
|
||||
|
||||
mem_cgroup_uncharge_list(&l_active);
|
||||
free_unref_page_list(&l_active);
|
||||
@ -2678,10 +2690,10 @@ again:
|
||||
/*
|
||||
* Determine the scan balance between anon and file LRUs.
|
||||
*/
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
spin_lock_irq(&target_lruvec->lru_lock);
|
||||
sc->anon_cost = target_lruvec->anon_cost;
|
||||
sc->file_cost = target_lruvec->file_cost;
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
spin_unlock_irq(&target_lruvec->lru_lock);
|
||||
|
||||
/*
|
||||
* Target desirable inactive:active list ratios for the anon
|
||||
@ -4257,15 +4269,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
|
||||
*/
|
||||
void check_move_unevictable_pages(struct pagevec *pvec)
|
||||
{
|
||||
struct lruvec *lruvec;
|
||||
struct pglist_data *pgdat = NULL;
|
||||
struct lruvec *lruvec = NULL;
|
||||
int pgscanned = 0;
|
||||
int pgrescued = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < pvec->nr; i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
struct pglist_data *pagepgdat = page_pgdat(page);
|
||||
int nr_pages;
|
||||
|
||||
if (PageTransTail(page))
|
||||
@ -4274,18 +4284,12 @@ void check_move_unevictable_pages(struct pagevec *pvec)
|
||||
nr_pages = thp_nr_pages(page);
|
||||
pgscanned += nr_pages;
|
||||
|
||||
if (pagepgdat != pgdat) {
|
||||
if (pgdat)
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
pgdat = pagepgdat;
|
||||
spin_lock_irq(&pgdat->lru_lock);
|
||||
}
|
||||
lruvec = mem_cgroup_page_lruvec(page, pgdat);
|
||||
|
||||
if (!PageLRU(page) || !PageUnevictable(page))
|
||||
/* block memcg migration during page moving between lru */
|
||||
if (!TestClearPageLRU(page))
|
||||
continue;
|
||||
|
||||
if (page_evictable(page)) {
|
||||
lruvec = relock_page_lruvec_irq(page, lruvec);
|
||||
if (page_evictable(page) && PageUnevictable(page)) {
|
||||
enum lru_list lru = page_lru_base_type(page);
|
||||
|
||||
VM_BUG_ON_PAGE(PageActive(page), page);
|
||||
@ -4294,12 +4298,15 @@ void check_move_unevictable_pages(struct pagevec *pvec)
|
||||
add_page_to_lru_list(page, lruvec, lru);
|
||||
pgrescued += nr_pages;
|
||||
}
|
||||
SetPageLRU(page);
|
||||
}
|
||||
|
||||
if (pgdat) {
|
||||
if (lruvec) {
|
||||
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
|
||||
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
|
||||
spin_unlock_irq(&pgdat->lru_lock);
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
} else if (pgscanned) {
|
||||
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
|
||||
|
@ -381,9 +381,7 @@ void workingset_refault(struct page *page, void *shadow)
|
||||
if (workingset) {
|
||||
SetPageWorkingset(page);
|
||||
/* XXX: Move to lru_cache_add() when it supports new vs putback */
|
||||
spin_lock_irq(&page_pgdat(page)->lru_lock);
|
||||
lru_note_cost_page(page);
|
||||
spin_unlock_irq(&page_pgdat(page)->lru_lock);
|
||||
inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
|
||||
}
|
||||
out:
|
||||
|
Loading…
Reference in New Issue
Block a user