forked from Minki/linux
ec1c86b25f
Evictable pages are divided into multiple generations for each lruvec. The youngest generation number is stored in lrugen->max_seq for both anon and file types as they are aged on an equal footing. The oldest generation numbers are stored in lrugen->min_seq[] separately for anon and file types as clean file pages can be evicted regardless of swap constraints. These three variables are monotonically increasing. Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into the gen counter in folio->flags. Each truncated generation number is an index to lrugen->lists[]. The sliding window technique is used to track at least MIN_NR_GENS and at most MAX_NR_GENS generations. The gen counter stores a value within [1, MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it stores 0. There are two conceptually independent procedures: "the aging", which produces young generations, and "the eviction", which consumes old generations. They form a closed-loop system, i.e., "the page reclaim". Both procedures can be invoked from userspace for the purposes of working set estimation and proactive reclaim. These techniques are commonly used to optimize job scheduling (bin packing) in data centers [1][2]. To avoid confusion, the terms "hot" and "cold" will be applied to the multi-gen LRU, as a new convention; the terms "active" and "inactive" will be applied to the active/inactive LRU, as usual. The protection of hot pages and the selection of cold pages are based on page access channels and patterns. There are two access channels: one through page tables and the other through file descriptors. The protection of the former channel is by design stronger because: 1. The uncertainty in determining the access patterns of the former channel is higher due to the approximation of the accessed bit. 2. The cost of evicting the former channel is higher due to the TLB flushes required and the likelihood of encountering the dirty bit. 3. The penalty of underprotecting the former channel is higher because applications usually do not prepare themselves for major page faults like they do for blocked I/O. E.g., GUI applications commonly use dedicated I/O threads to avoid blocking rendering threads. There are also two access patterns: one with temporal locality and the other without. For the reasons listed above, the former channel is assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is present; the latter channel is assumed to follow the latter pattern unless outlying refaults have been observed [3][4]. The next patch will address the "outlying refaults". Three macros, i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in this patch to make the entire patchset less diffy. A page is added to the youngest generation on faulting. The aging needs to check the accessed bit at least twice before handing this page over to the eviction. The first check takes care of the accessed bit set on the initial fault; the second check makes sure this page has not been used since then. This protocol, AKA second chance, requires a minimum of two generations, hence MIN_NR_GENS. [1] https://dl.acm.org/doi/10.1145/3297858.3304053 [2] https://dl.acm.org/doi/10.1145/3503222.3507731 [3] https://lwn.net/Articles/495543/ [4] https://lwn.net/Articles/815342/ Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com Signed-off-by: Yu Zhao <yuzhao@google.com> Acked-by: Brian Geffon <bgeffon@google.com> Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> Acked-by: Steven Barrett <steven@liquorix.net> Acked-by: Suleiman Souhlal <suleiman@google.com> Tested-by: Daniel Byrne <djbyrne@mtu.edu> Tested-by: Donald Carr <d@chaos-reins.com> Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> Tested-by: Sofia Trinh <sofia.trinh@edi.works> Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> Cc: Barry Song <baohua@kernel.org> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Matthew Wilcox <willy@infradead.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Miaohe Lin <linmiaohe@huawei.com> Cc: Michael Larabel <Michael@MichaelLarabel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Rapoport <rppt@kernel.org> Cc: Mike Rapoport <rppt@linux.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Qi Zheng <zhengqi.arch@bytedance.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Will Deacon <will@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
207 lines
5.3 KiB
C
207 lines
5.3 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/*
|
|
* mm_init.c - Memory initialisation verification and debugging
|
|
*
|
|
* Copyright 2008 IBM Corporation, 2008
|
|
* Author Mel Gorman <mel@csn.ul.ie>
|
|
*
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/init.h>
|
|
#include <linux/kobject.h>
|
|
#include <linux/export.h>
|
|
#include <linux/memory.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/mman.h>
|
|
#include "internal.h"
|
|
|
|
#ifdef CONFIG_DEBUG_MEMORY_INIT
|
|
int __meminitdata mminit_loglevel;
|
|
|
|
/* The zonelists are simply reported, validation is manual. */
|
|
void __init mminit_verify_zonelist(void)
|
|
{
|
|
int nid;
|
|
|
|
if (mminit_loglevel < MMINIT_VERIFY)
|
|
return;
|
|
|
|
for_each_online_node(nid) {
|
|
pg_data_t *pgdat = NODE_DATA(nid);
|
|
struct zone *zone;
|
|
struct zoneref *z;
|
|
struct zonelist *zonelist;
|
|
int i, listid, zoneid;
|
|
|
|
BUILD_BUG_ON(MAX_ZONELISTS > 2);
|
|
for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
|
|
|
|
/* Identify the zone and nodelist */
|
|
zoneid = i % MAX_NR_ZONES;
|
|
listid = i / MAX_NR_ZONES;
|
|
zonelist = &pgdat->node_zonelists[listid];
|
|
zone = &pgdat->node_zones[zoneid];
|
|
if (!populated_zone(zone))
|
|
continue;
|
|
|
|
/* Print information about the zonelist */
|
|
printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
|
|
listid > 0 ? "thisnode" : "general", nid,
|
|
zone->name);
|
|
|
|
/* Iterate the zonelist */
|
|
for_each_zone_zonelist(zone, z, zonelist, zoneid)
|
|
pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
|
|
pr_cont("\n");
|
|
}
|
|
}
|
|
}
|
|
|
|
void __init mminit_verify_pageflags_layout(void)
|
|
{
|
|
int shift, width;
|
|
unsigned long or_mask, add_mask;
|
|
|
|
shift = 8 * sizeof(unsigned long);
|
|
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
|
|
- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
|
|
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
|
|
SECTIONS_WIDTH,
|
|
NODES_WIDTH,
|
|
ZONES_WIDTH,
|
|
LAST_CPUPID_WIDTH,
|
|
KASAN_TAG_WIDTH,
|
|
LRU_GEN_WIDTH,
|
|
LRU_REFS_WIDTH,
|
|
NR_PAGEFLAGS);
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
|
|
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
|
|
SECTIONS_SHIFT,
|
|
NODES_SHIFT,
|
|
ZONES_SHIFT,
|
|
LAST_CPUPID_SHIFT,
|
|
KASAN_TAG_WIDTH);
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
|
|
"Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
|
|
(unsigned long)SECTIONS_PGSHIFT,
|
|
(unsigned long)NODES_PGSHIFT,
|
|
(unsigned long)ZONES_PGSHIFT,
|
|
(unsigned long)LAST_CPUPID_PGSHIFT,
|
|
(unsigned long)KASAN_TAG_PGSHIFT);
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
|
|
"Node/Zone ID: %lu -> %lu\n",
|
|
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
|
|
(unsigned long)ZONEID_PGOFF);
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
|
|
"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
|
|
shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
|
|
#ifdef NODE_NOT_IN_PAGE_FLAGS
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
|
|
"Node not in page flags");
|
|
#endif
|
|
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
|
|
"Last cpupid not in page flags");
|
|
#endif
|
|
|
|
if (SECTIONS_WIDTH) {
|
|
shift -= SECTIONS_WIDTH;
|
|
BUG_ON(shift != SECTIONS_PGSHIFT);
|
|
}
|
|
if (NODES_WIDTH) {
|
|
shift -= NODES_WIDTH;
|
|
BUG_ON(shift != NODES_PGSHIFT);
|
|
}
|
|
if (ZONES_WIDTH) {
|
|
shift -= ZONES_WIDTH;
|
|
BUG_ON(shift != ZONES_PGSHIFT);
|
|
}
|
|
|
|
/* Check for bitmask overlaps */
|
|
or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
|
|
(NODES_MASK << NODES_PGSHIFT) |
|
|
(SECTIONS_MASK << SECTIONS_PGSHIFT);
|
|
add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
|
|
(NODES_MASK << NODES_PGSHIFT) +
|
|
(SECTIONS_MASK << SECTIONS_PGSHIFT);
|
|
BUG_ON(or_mask != add_mask);
|
|
}
|
|
|
|
static __init int set_mminit_loglevel(char *str)
|
|
{
|
|
get_option(&str, &mminit_loglevel);
|
|
return 0;
|
|
}
|
|
early_param("mminit_loglevel", set_mminit_loglevel);
|
|
#endif /* CONFIG_DEBUG_MEMORY_INIT */
|
|
|
|
struct kobject *mm_kobj;
|
|
EXPORT_SYMBOL_GPL(mm_kobj);
|
|
|
|
#ifdef CONFIG_SMP
|
|
s32 vm_committed_as_batch = 32;
|
|
|
|
void mm_compute_batch(int overcommit_policy)
|
|
{
|
|
u64 memsized_batch;
|
|
s32 nr = num_present_cpus();
|
|
s32 batch = max_t(s32, nr*2, 32);
|
|
unsigned long ram_pages = totalram_pages();
|
|
|
|
/*
|
|
* For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
|
|
* (total memory/#cpus), and lift it to 25% for other policies
|
|
* to easy the possible lock contention for percpu_counter
|
|
* vm_committed_as, while the max limit is INT_MAX
|
|
*/
|
|
if (overcommit_policy == OVERCOMMIT_NEVER)
|
|
memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
|
|
else
|
|
memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
|
|
|
|
vm_committed_as_batch = max_t(s32, memsized_batch, batch);
|
|
}
|
|
|
|
static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
|
|
unsigned long action, void *arg)
|
|
{
|
|
switch (action) {
|
|
case MEM_ONLINE:
|
|
case MEM_OFFLINE:
|
|
mm_compute_batch(sysctl_overcommit_memory);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
return NOTIFY_OK;
|
|
}
|
|
|
|
static struct notifier_block compute_batch_nb __meminitdata = {
|
|
.notifier_call = mm_compute_batch_notifier,
|
|
.priority = IPC_CALLBACK_PRI, /* use lowest priority */
|
|
};
|
|
|
|
static int __init mm_compute_batch_init(void)
|
|
{
|
|
mm_compute_batch(sysctl_overcommit_memory);
|
|
register_hotmemory_notifier(&compute_batch_nb);
|
|
|
|
return 0;
|
|
}
|
|
|
|
__initcall(mm_compute_batch_init);
|
|
|
|
#endif
|
|
|
|
static int __init mm_sysfs_init(void)
|
|
{
|
|
mm_kobj = kobject_create_and_add("mm", kernel_kobj);
|
|
if (!mm_kobj)
|
|
return -ENOMEM;
|
|
|
|
return 0;
|
|
}
|
|
postcore_initcall(mm_sysfs_init);
|