mm: multi-gen LRU: section for memcg LRU

Move memcg LRU code into a dedicated section. Improve the design doc
to outline its architecture.

Link: https://lkml.kernel.org/r/20230118001827.1040870-5-talumbau@google.com
Signed-off-by: T.J. Alumbaugh <talumbau@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Parent: ccbbbb8594
Commit: 36c7b4db7c
Documentation/mm/multigen_lru.rst

@@ -186,9 +186,40 @@ is false positive, the cost is an additional scan of a range of PTEs,
 which may yield hot pages anyway. Parameters of the filter itself can
 control the false positive rate in the limit.
 
+Memcg LRU
+---------
+
+A memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
+since each node and memcg combination has an LRU of folios (see
+``mem_cgroup_lruvec()``). Its goal is to improve the scalability of
+global reclaim, which is critical to system-wide memory overcommit in
+data centers. Note that memcg LRU only applies to global reclaim.
+
+The basic structure of a memcg LRU can be understood by an analogy to
+the active/inactive LRU (of folios):
+
+1. It has the young and the old (generations), i.e., the counterparts
+   to the active and the inactive;
+2. The increment of ``max_seq`` triggers promotion, i.e., the
+   counterpart to activation;
+3. Other events trigger similar operations, e.g., offlining a memcg
+   triggers demotion, i.e., the counterpart to deactivation.
+
+In terms of global reclaim, it has two distinct features:
+
+1. Sharding, which allows each thread to start at a random memcg (in
+   the old generation) and improves parallelism;
+2. Eventual fairness, which allows direct reclaim to bail out at will
+   and reduces latency without affecting fairness over some time.
+
+In terms of traversing memcgs during global reclaim, it improves the
+best-case complexity from O(n) to O(1) and does not affect the
+worst-case complexity O(n). Therefore, on average, it has a sublinear
+complexity.
+
 Summary
 -------
-The multi-gen LRU can be disassembled into the following parts:
+The multi-gen LRU (of folios) can be disassembled into the following
+parts:
 
 * Generations
 * Rmap walks
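To make the "LRU of LRUs" concrete: the design above hangs off a per-node structure introduced by the parent commit in this series. A minimal sketch, with field names following struct lru_gen_memcg from that commit; treat the details as illustrative rather than authoritative:

    /* Sketch of the per-node memcg LRU (per struct lru_gen_memcg,
     * introduced by the parent commit); illustrative only. */
    struct lru_gen_memcg {
            /* the per-node memcg generation counter */
            unsigned long seq;
            /* each memcg has one lru_gen_folio per node */
            unsigned long nr_memcgs[MEMCG_NR_GENS];
            /* per-node lru_gen_folio lists, sharded into bins */
            struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
            /* protects the above */
            spinlock_t lock;
    };

The two dimensions of fifo[][] map directly onto the two features named in the doc: generations give eventual fairness, and bins give sharding.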
include/linux/mm_inline.h

@@ -122,18 +122,6 @@ static inline bool lru_gen_in_fault(void)
 	return current->in_lru_fault;
 }
 
-#ifdef CONFIG_MEMCG
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return READ_ONCE(lruvec->lrugen.seg);
-}
-#else
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-#endif
-
 static inline int lru_gen_from_seq(unsigned long seq)
 {
 	return seq % MAX_NR_GENS;
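(Note: lru_gen_memcg_seg() is not deleted outright; it reappears as a static helper in the new memcg LRU section of mm/vmscan.c below, now its only user.)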
@@ -309,11 +297,6 @@ static inline bool lru_gen_in_fault(void)
 	return false;
 }
 
-static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
-{
-	return 0;
-}
-
 static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
 	return false;
include/linux/mmzone.h

@@ -368,15 +368,6 @@ struct page_vma_mapped_walk;
 #define LRU_GEN_MASK	((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 #define LRU_REFS_MASK	((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
 
-/* see the comment on MEMCG_NR_GENS */
-enum {
-	MEMCG_LRU_NOP,
-	MEMCG_LRU_HEAD,
-	MEMCG_LRU_TAIL,
-	MEMCG_LRU_OLD,
-	MEMCG_LRU_YOUNG,
-};
-
 #ifdef CONFIG_LRU_GEN
 
 enum {
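(Likewise, the MEMCG_LRU_* enum is not dropped; it moves into the new memcg LRU section of mm/vmscan.c below, its only user.)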
@@ -557,7 +548,7 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg);
 void lru_gen_online_memcg(struct mem_cgroup *memcg);
 void lru_gen_offline_memcg(struct mem_cgroup *memcg);
 void lru_gen_release_memcg(struct mem_cgroup *memcg);
-void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+void lru_gen_soft_reclaim(struct lruvec *lruvec);
 
 #else /* !CONFIG_MEMCG */
 
@@ -608,7 +599,7 @@ static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
 {
 }
 
-static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+static inline void lru_gen_soft_reclaim(struct lruvec *lruvec)
 {
 }
 
mm/memcontrol.c

@@ -476,12 +476,8 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
 	struct mem_cgroup_tree_per_node *mctz;
 
 	if (lru_gen_enabled()) {
-		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
-
-		/* see the comment on MEMCG_NR_GENS */
-		if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
-			lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
-
+		if (soft_limit_excess(memcg))
+			lru_gen_soft_reclaim(&memcg->nodeinfo[nid]->lruvec);
 		return;
 	}
 
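For context on the condition above: soft_limit_excess() reports how far a memcg sits above its soft limit, and the seg check it used to pair with now lives inside lru_gen_soft_reclaim(). A simplified sketch of its logic, based on mm/memcontrol.c and shown here only for illustration:

    /* Sketch: pages by which a memcg exceeds its soft limit (0 if none).
     * Simplified from mm/memcontrol.c; illustrative only. */
    static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
    {
            unsigned long nr_pages = page_counter_read(&memcg->memory);
            unsigned long soft_limit = READ_ONCE(memcg->soft_limit);

            return nr_pages > soft_limit ? nr_pages - soft_limit : 0;
    }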
mm/vmscan.c (250 lines changed)

@@ -4705,6 +4705,148 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	mem_cgroup_unlock_pages();
 }
 
+/******************************************************************************
+ *                          memcg LRU
+ ******************************************************************************/
+
+/* see the comment on MEMCG_NR_GENS */
+enum {
+	MEMCG_LRU_NOP,
+	MEMCG_LRU_HEAD,
+	MEMCG_LRU_TAIL,
+	MEMCG_LRU_OLD,
+	MEMCG_LRU_YOUNG,
+};
+
+#ifdef CONFIG_MEMCG
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+	return READ_ONCE(lruvec->lrugen.seg);
+}
+
+static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+	int seg;
+	int old, new;
+	int bin = get_random_u32_below(MEMCG_NR_BINS);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	spin_lock(&pgdat->memcg_lru.lock);
+
+	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+	seg = 0;
+	new = old = lruvec->lrugen.gen;
+
+	/* see the comment on MEMCG_NR_GENS */
+	if (op == MEMCG_LRU_HEAD)
+		seg = MEMCG_LRU_HEAD;
+	else if (op == MEMCG_LRU_TAIL)
+		seg = MEMCG_LRU_TAIL;
+	else if (op == MEMCG_LRU_OLD)
+		new = get_memcg_gen(pgdat->memcg_lru.seq);
+	else if (op == MEMCG_LRU_YOUNG)
+		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+	else
+		VM_WARN_ON_ONCE(true);
+
+	hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+	else
+		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+	pgdat->memcg_lru.nr_memcgs[old]--;
+	pgdat->memcg_lru.nr_memcgs[new]++;
+
+	lruvec->lrugen.gen = new;
+	WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+	spin_unlock(&pgdat->memcg_lru.lock);
+}
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+	int gen;
+	int nid;
+	int bin = get_random_u32_below(MEMCG_NR_BINS);
+
+	for_each_node(nid) {
+		struct pglist_data *pgdat = NODE_DATA(nid);
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		spin_lock(&pgdat->memcg_lru.lock);
+
+		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+		gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+		pgdat->memcg_lru.nr_memcgs[gen]++;
+
+		lruvec->lrugen.gen = gen;
+
+		spin_unlock(&pgdat->memcg_lru.lock);
+	}
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+	}
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+	int gen;
+	int nid;
+
+	for_each_node(nid) {
+		struct pglist_data *pgdat = NODE_DATA(nid);
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		spin_lock(&pgdat->memcg_lru.lock);
+
+		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+		gen = lruvec->lrugen.gen;
+
+		hlist_nulls_del_rcu(&lruvec->lrugen.list);
+		pgdat->memcg_lru.nr_memcgs[gen]--;
+
+		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+		spin_unlock(&pgdat->memcg_lru.lock);
+	}
+}
+
+void lru_gen_soft_reclaim(struct lruvec *lruvec)
+{
+	/* see the comment on MEMCG_NR_GENS */
+	if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+	return 0;
+}
+
+#endif
+
 /******************************************************************************
  *                          the eviction
  ******************************************************************************/
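The new section leans on generation and bin helpers that were introduced earlier in this series. A minimal sketch, assuming the MEMCG_NR_GENS/MEMCG_NR_BINS definitions from the parent commit (the constants shown are illustrative):

    /* Illustrative sketch of the helpers used above; definitions
     * follow the parent commit in this series. */
    #define MEMCG_NR_GENS	2	/* the old and the young generations */
    #define MEMCG_NR_BINS	8	/* sharding to improve parallelism */

    static int get_memcg_gen(unsigned long seq)
    {
            return seq % MEMCG_NR_GENS;
    }

    static int get_memcg_bin(int bin)
    {
            return bin % MEMCG_NR_BINS;
    }

With two generations, the seq-bump at the end of lru_gen_rotate_memcg() flips which generation counts as old once the current old generation drains.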
@@ -5397,53 +5539,6 @@ done:
 	pgdat->kswapd_failures = 0;
 }
 
-#ifdef CONFIG_MEMCG
-void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
-{
-	int seg;
-	int old, new;
-	int bin = get_random_u32_below(MEMCG_NR_BINS);
-	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-
-	spin_lock(&pgdat->memcg_lru.lock);
-
-	VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-	seg = 0;
-	new = old = lruvec->lrugen.gen;
-
-	/* see the comment on MEMCG_NR_GENS */
-	if (op == MEMCG_LRU_HEAD)
-		seg = MEMCG_LRU_HEAD;
-	else if (op == MEMCG_LRU_TAIL)
-		seg = MEMCG_LRU_TAIL;
-	else if (op == MEMCG_LRU_OLD)
-		new = get_memcg_gen(pgdat->memcg_lru.seq);
-	else if (op == MEMCG_LRU_YOUNG)
-		new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
-	else
-		VM_WARN_ON_ONCE(true);
-
-	hlist_nulls_del_rcu(&lruvec->lrugen.list);
-
-	if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
-		hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
-	else
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
-
-	pgdat->memcg_lru.nr_memcgs[old]--;
-	pgdat->memcg_lru.nr_memcgs[new]++;
-
-	lruvec->lrugen.gen = new;
-	WRITE_ONCE(lruvec->lrugen.seg, seg);
-
-	if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
-		WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
-	spin_unlock(&pgdat->memcg_lru.lock);
-}
-#endif
-
 /******************************************************************************
  *                          state change
  ******************************************************************************/
@@ -6086,67 +6181,6 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 	}
 }
 
-void lru_gen_online_memcg(struct mem_cgroup *memcg)
-{
-	int gen;
-	int nid;
-	int bin = get_random_u32_below(MEMCG_NR_BINS);
-
-	for_each_node(nid) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		spin_lock(&pgdat->memcg_lru.lock);
-
-		VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-		gen = get_memcg_gen(pgdat->memcg_lru.seq);
-
-		hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
-		pgdat->memcg_lru.nr_memcgs[gen]++;
-
-		lruvec->lrugen.gen = gen;
-
-		spin_unlock(&pgdat->memcg_lru.lock);
-	}
-}
-
-void lru_gen_offline_memcg(struct mem_cgroup *memcg)
-{
-	int nid;
-
-	for_each_node(nid) {
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
-	}
-}
-
-void lru_gen_release_memcg(struct mem_cgroup *memcg)
-{
-	int gen;
-	int nid;
-
-	for_each_node(nid) {
-		struct pglist_data *pgdat = NODE_DATA(nid);
-		struct lruvec *lruvec = get_lruvec(memcg, nid);
-
-		spin_lock(&pgdat->memcg_lru.lock);
-
-		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
-
-		gen = lruvec->lrugen.gen;
-
-		hlist_nulls_del_rcu(&lruvec->lrugen.list);
-		pgdat->memcg_lru.nr_memcgs[gen]--;
-
-		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
-			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
-		spin_unlock(&pgdat->memcg_lru.lock);
-	}
-}
-
 #endif /* CONFIG_MEMCG */
 
 static int __init init_lru_gen(void)
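Finally, to tie the relocated code back to the sharding and eventual-fairness features in the design doc: global reclaim picks a random bin of the old generation and walks it under RCU. The following is a hypothetical, simplified sketch of that traversal, in the spirit of shrink_many() from the parent commit; the function name and omitted reclaim logic are this sketch's own:

    /* Hypothetical sketch: walking one bin of a node's memcg LRU
     * under RCU; illustrative only, not the kernel's actual walk. */
    static void walk_memcg_bin(struct pglist_data *pgdat, int gen, int bin)
    {
            struct lru_gen_folio *lrugen;
            struct hlist_nulls_node *pos;

            rcu_read_lock();

            hlist_nulls_for_each_entry_rcu(lrugen, pos,
                            &pgdat->memcg_lru.fifo[gen][bin], list) {
                    struct lruvec *lruvec =
                            container_of(lrugen, struct lruvec, lrugen);

                    /* reclaim from this memcg's lruvec, then bail out
                     * at will; rotations restore fairness over time */
            }

            rcu_read_unlock();
    }

Because each thread starts at a random bin, concurrent reclaimers rarely contend on the same memcgs, which is the parallelism win the design doc describes.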