From 721f4a6526daafca15634f30c9865e880da3e1d1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 5 Apr 2024 01:58:21 +0000 Subject: [PATCH 01/18] mm/memblock: remove empty dummy entry The dummy entry is introduced in the initial implementation of lmb in commit 7c8c6b9776fb ("powerpc: Merge lmb.c and make MM initialization use it."). As the comment says the empty dummy entry is to simplify the code. /* Create a dummy zero size LMB which will get coalesced away later. * This simplifies the lmb_add() code below... */ While current code is reimplemented by Tejun in commit 784656f9c680 ("memblock: Reimplement memblock_add_region()"). This empty dummy entry seems not benefit the code any more. Let's remove it. Signed-off-by: Wei Yang CC: Paul Mackerras CC: Tejun Heo CC: Mike Rapoport Link: https://lore.kernel.org/r/20240405015821.13411-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/memblock.c | 7 ++----- tools/testing/memblock/tests/basic_api.c | 8 ++++---- tools/testing/memblock/tests/common.c | 4 ++-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index d09136e040d3..98d25689cf10 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -114,12 +114,10 @@ static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, - .memory.cnt = 1, /* empty dummy entry */ .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS, .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, - .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS, .reserved.name = "reserved", @@ -130,7 +128,6 @@ struct memblock memblock __initdata_memblock = { #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP struct memblock_type physmem = { .regions = memblock_physmem_init_regions, - .cnt = 1, /* empty dummy entry */ .max = INIT_PHYSMEM_REGIONS, .name = "physmem", }; @@ -356,7 +353,6 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u /* Special case for empty arrays */ if (type->cnt == 0) { WARN_ON(type->total_size != 0); - type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; type->regions[0].flags = 0; @@ -600,12 +596,13 @@ static int __init_memblock memblock_add_range(struct memblock_type *type, /* special case for empty array */ if (type->regions[0].size == 0) { - WARN_ON(type->cnt != 1 || type->total_size); + WARN_ON(type->cnt != 0 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; type->regions[0].flags = flags; memblock_set_region_node(&type->regions[0], nid); type->total_size = size; + type->cnt = 1; return 0; } diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index 57bf2688edfd..f317fe691fc4 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -15,12 +15,12 @@ static int memblock_initialization_check(void) PREFIX_PUSH(); ASSERT_NE(memblock.memory.regions, NULL); - ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.cnt, 0); ASSERT_EQ(memblock.memory.max, EXPECTED_MEMBLOCK_REGIONS); ASSERT_EQ(strcmp(memblock.memory.name, "memory"), 0); ASSERT_NE(memblock.reserved.regions, NULL); - ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.cnt, 0); ASSERT_EQ(memblock.memory.max, EXPECTED_MEMBLOCK_REGIONS); ASSERT_EQ(strcmp(memblock.reserved.name, "reserved"), 0); @@ -1295,7 +1295,7 @@ static int 
memblock_remove_only_region_check(void) ASSERT_EQ(rgn->base, 0); ASSERT_EQ(rgn->size, 0); - ASSERT_EQ(memblock.memory.cnt, 1); + ASSERT_EQ(memblock.memory.cnt, 0); ASSERT_EQ(memblock.memory.total_size, 0); test_pass_pop(); @@ -1723,7 +1723,7 @@ static int memblock_free_only_region_check(void) ASSERT_EQ(rgn->base, 0); ASSERT_EQ(rgn->size, 0); - ASSERT_EQ(memblock.reserved.cnt, 1); + ASSERT_EQ(memblock.reserved.cnt, 0); ASSERT_EQ(memblock.reserved.total_size, 0); test_pass_pop(); diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index f43b6f414983..c2c569f12178 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -40,13 +40,13 @@ void reset_memblock_regions(void) { memset(memblock.memory.regions, 0, memblock.memory.cnt * sizeof(struct memblock_region)); - memblock.memory.cnt = 1; + memblock.memory.cnt = 0; memblock.memory.max = INIT_MEMBLOCK_REGIONS; memblock.memory.total_size = 0; memset(memblock.reserved.regions, 0, memblock.reserved.cnt * sizeof(struct memblock_region)); - memblock.reserved.cnt = 1; + memblock.reserved.cnt = 0; memblock.reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS; memblock.reserved.total_size = 0; } From 3d3165193776ddacf59f101f0fa05cfab9f1a9ba Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 7 May 2024 07:58:27 +0000 Subject: [PATCH 02/18] memblock tests: add memblock_reserve_all_locations_check() Instead of adding 129th memory block at the last position, let's try all possible position. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240507075833.6346-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/testing/memblock/tests/basic_api.c | 107 +++++++++++++++++++++++ 1 file changed, 107 insertions(+) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index f317fe691fc4..bd3ebbf6b697 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -982,6 +982,112 @@ static int memblock_reserve_many_check(void) return 0; } + +/* + * A test that trying to reserve the 129th memory block at all locations. + * Expect to trigger memblock_double_array() to double the + * memblock.memory.max, find a new valid memory as reserved.regions. + * + * 0 1 2 128 + * +-------+ +-------+ +-------+ +-------+ + * | 32K | | 32K | | 32K | ... | 32K | + * +-------+-------+-------+-------+-------+ +-------+ + * |<-32K->| |<-32K->| + * + */ +/* Keep the gap so these memory region will not be merged. */ +#define MEMORY_BASE(idx) (SZ_128K + (MEM_SIZE * 2) * (idx)) +static int memblock_reserve_all_locations_check(void) +{ + int i, skip; + void *orig_region; + struct region r = { + .base = SZ_16K, + .size = SZ_16K, + }; + phys_addr_t new_reserved_regions_size; + + PREFIX_PUSH(); + + /* Reserve the 129th memory block for all possible positions*/ + for (skip = 0; skip < INIT_MEMBLOCK_REGIONS + 1; skip++) { + reset_memblock_regions(); + memblock_allow_resize(); + + /* Add a valid memory region used by double_array(). */ + dummy_physical_memory_init(); + memblock_add(dummy_physical_memory_base(), MEM_SIZE); + + for (i = 0; i < INIT_MEMBLOCK_REGIONS + 1; i++) { + if (i == skip) + continue; + + /* Reserve some fakes memory region to fulfill the memblock. 
*/ + memblock_reserve(MEMORY_BASE(i), MEM_SIZE); + + if (i < skip) { + ASSERT_EQ(memblock.reserved.cnt, i + 1); + ASSERT_EQ(memblock.reserved.total_size, (i + 1) * MEM_SIZE); + } else { + ASSERT_EQ(memblock.reserved.cnt, i); + ASSERT_EQ(memblock.reserved.total_size, i * MEM_SIZE); + } + } + + orig_region = memblock.reserved.regions; + + /* This reserve the 129 memory_region, and makes it double array. */ + memblock_reserve(MEMORY_BASE(skip), MEM_SIZE); + + /* + * This is the memory region size used by the doubled reserved.regions, + * and it has been reserved due to it has been used. The size is used to + * calculate the total_size that the memblock.reserved have now. + */ + new_reserved_regions_size = PAGE_ALIGN((INIT_MEMBLOCK_REGIONS * 2) * + sizeof(struct memblock_region)); + /* + * The double_array() will find a free memory region as the new + * reserved.regions, and the used memory region will be reserved, so + * there will be one more region exist in the reserved memblock. And the + * one more reserved region's size is new_reserved_regions_size. + */ + ASSERT_EQ(memblock.reserved.cnt, INIT_MEMBLOCK_REGIONS + 2); + ASSERT_EQ(memblock.reserved.total_size, (INIT_MEMBLOCK_REGIONS + 1) * MEM_SIZE + + new_reserved_regions_size); + ASSERT_EQ(memblock.reserved.max, INIT_MEMBLOCK_REGIONS * 2); + + /* + * Now memblock_double_array() works fine. Let's check after the + * double_array(), the memblock_reserve() still works as normal. + */ + memblock_reserve(r.base, r.size); + ASSERT_EQ(memblock.reserved.regions[0].base, r.base); + ASSERT_EQ(memblock.reserved.regions[0].size, r.size); + + ASSERT_EQ(memblock.reserved.cnt, INIT_MEMBLOCK_REGIONS + 3); + ASSERT_EQ(memblock.reserved.total_size, (INIT_MEMBLOCK_REGIONS + 1) * MEM_SIZE + + new_reserved_regions_size + + r.size); + ASSERT_EQ(memblock.reserved.max, INIT_MEMBLOCK_REGIONS * 2); + + dummy_physical_memory_cleanup(); + + /* + * The current reserved.regions is occupying a range of memory that + * allocated from dummy_physical_memory_init(). After free the memory, + * we must not use it. So restore the origin memory region to make sure + * the tests can run as normal and not affected by the double array. + */ + memblock.reserved.regions = orig_region; + memblock.reserved.cnt = INIT_MEMBLOCK_RESERVED_REGIONS; + } + + test_pass_pop(); + + return 0; +} + static int memblock_reserve_checks(void) { prefix_reset(); @@ -997,6 +1103,7 @@ static int memblock_reserve_checks(void) memblock_reserve_between_check(); memblock_reserve_near_max_check(); memblock_reserve_many_check(); + memblock_reserve_all_locations_check(); prefix_pop(); From f6df89c3582a337090ae1f37c3648bdb35da29f7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 7 May 2024 07:58:28 +0000 Subject: [PATCH 03/18] memblock tests: add memblock_reserve_many_may_conflict_check() This may trigger the case fixed by commit 48c3b583bbdd ("mm/memblock: fix overlapping allocation when doubling reserved array"). This is done by adding the 129th reserve region into memblock.memory. If memblock_double_array() use this reserve region as new array, it fails. 
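
To make the failure mode concrete, the constraint that commit 48c3b583bbdd enforces can be stated as a minimal sketch (illustrative only; candidate_ok() is a made-up helper, not the mm/memblock.c code): when reserved.regions is full and memblock_reserve(base, size) has to double the array, the range chosen for the new array must stay disjoint from the half-open range [base, base + size) that is still being recorded.

/*
 * Illustrative sketch only. phys_addr_t is assumed from <linux/types.h>;
 * candidate_ok() is hypothetical and not a kernel function.
 */
static bool candidate_ok(phys_addr_t cand_base, phys_addr_t cand_size,
                         phys_addr_t pending_base, phys_addr_t pending_size)
{
        /* the two half-open ranges must not intersect */
        return cand_base + cand_size <= pending_base ||
               pending_base + pending_size <= cand_base;
}

The test passes when memblock_double_array() picks such a non-conflicting candidate, i.e. the doubled array is carved out of memory.regions[0] rather than out of the skipped range.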
Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240507075833.6346-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/testing/memblock/tests/basic_api.c | 151 +++++++++++++++++++++++ tools/testing/memblock/tests/common.c | 4 +- tools/testing/memblock/tests/common.h | 1 + 3 files changed, 154 insertions(+), 2 deletions(-) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index bd3ebbf6b697..fdac82656d15 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -1088,6 +1088,156 @@ static int memblock_reserve_all_locations_check(void) return 0; } +/* + * A test that trying to reserve the 129th memory block at all locations. + * Expect to trigger memblock_double_array() to double the + * memblock.memory.max, find a new valid memory as reserved.regions. And make + * sure it doesn't conflict with the range we want to reserve. + * + * For example, we have 128 regions in reserved and now want to reserve + * the skipped one. Since reserved is full, memblock_double_array() would find + * an available range in memory for the new array. We intended to put two + * ranges in memory with one is the exact range of the skipped one. Before + * commit 48c3b583bbdd ("mm/memblock: fix overlapping allocation when doubling + * reserved array"), the new array would sits in the skipped range which is a + * conflict. The expected new array should be allocated from memory.regions[0]. + * + * 0 1 + * memory +-------+ +-------+ + * | 32K | | 32K | + * +-------+ ------+-------+-------+-------+ + * |<-32K->|<-32K->|<-32K->| + * + * 0 skipped 127 + * reserved +-------+ ......... +-------+ + * | 32K | . 32K . ... | 32K | + * +-------+-------+-------+ +-------+ + * |<-32K->| + * ^ + * | + * | + * skipped one + */ +/* Keep the gap so these memory region will not be merged. */ +#define MEMORY_BASE_OFFSET(idx, offset) ((offset) + (MEM_SIZE * 2) * (idx)) +static int memblock_reserve_many_may_conflict_check(void) +{ + int i, skip; + void *orig_region; + struct region r = { + .base = SZ_16K, + .size = SZ_16K, + }; + phys_addr_t new_reserved_regions_size; + + /* + * 0 1 129 + * +---+ +---+ +---+ + * |32K| |32K| .. |32K| + * +---+ +---+ +---+ + * + * Pre-allocate the range for 129 memory block + one range for double + * memblock.reserved.regions at idx 0. + */ + dummy_physical_memory_init(); + phys_addr_t memory_base = dummy_physical_memory_base(); + phys_addr_t offset = PAGE_ALIGN(memory_base); + + PREFIX_PUSH(); + + /* Reserve the 129th memory block for all possible positions*/ + for (skip = 1; skip <= INIT_MEMBLOCK_REGIONS + 1; skip++) { + reset_memblock_regions(); + memblock_allow_resize(); + + reset_memblock_attributes(); + /* Add a valid memory region used by double_array(). */ + memblock_add(MEMORY_BASE_OFFSET(0, offset), MEM_SIZE); + /* + * Add a memory region which will be reserved as 129th memory + * region. This is not expected to be used by double_array(). + */ + memblock_add(MEMORY_BASE_OFFSET(skip, offset), MEM_SIZE); + + for (i = 1; i <= INIT_MEMBLOCK_REGIONS + 1; i++) { + if (i == skip) + continue; + + /* Reserve some fakes memory region to fulfill the memblock. 
*/ + memblock_reserve(MEMORY_BASE_OFFSET(i, offset), MEM_SIZE); + + if (i < skip) { + ASSERT_EQ(memblock.reserved.cnt, i); + ASSERT_EQ(memblock.reserved.total_size, i * MEM_SIZE); + } else { + ASSERT_EQ(memblock.reserved.cnt, i - 1); + ASSERT_EQ(memblock.reserved.total_size, (i - 1) * MEM_SIZE); + } + } + + orig_region = memblock.reserved.regions; + + /* This reserve the 129 memory_region, and makes it double array. */ + memblock_reserve(MEMORY_BASE_OFFSET(skip, offset), MEM_SIZE); + + /* + * This is the memory region size used by the doubled reserved.regions, + * and it has been reserved due to it has been used. The size is used to + * calculate the total_size that the memblock.reserved have now. + */ + new_reserved_regions_size = PAGE_ALIGN((INIT_MEMBLOCK_REGIONS * 2) * + sizeof(struct memblock_region)); + /* + * The double_array() will find a free memory region as the new + * reserved.regions, and the used memory region will be reserved, so + * there will be one more region exist in the reserved memblock. And the + * one more reserved region's size is new_reserved_regions_size. + */ + ASSERT_EQ(memblock.reserved.cnt, INIT_MEMBLOCK_REGIONS + 2); + ASSERT_EQ(memblock.reserved.total_size, (INIT_MEMBLOCK_REGIONS + 1) * MEM_SIZE + + new_reserved_regions_size); + ASSERT_EQ(memblock.reserved.max, INIT_MEMBLOCK_REGIONS * 2); + + /* + * The first reserved region is allocated for double array + * with the size of new_reserved_regions_size and the base to be + * MEMORY_BASE_OFFSET(0, offset) + SZ_32K - new_reserved_regions_size + */ + ASSERT_EQ(memblock.reserved.regions[0].base + memblock.reserved.regions[0].size, + MEMORY_BASE_OFFSET(0, offset) + SZ_32K); + ASSERT_EQ(memblock.reserved.regions[0].size, new_reserved_regions_size); + + /* + * Now memblock_double_array() works fine. Let's check after the + * double_array(), the memblock_reserve() still works as normal. + */ + memblock_reserve(r.base, r.size); + ASSERT_EQ(memblock.reserved.regions[0].base, r.base); + ASSERT_EQ(memblock.reserved.regions[0].size, r.size); + + ASSERT_EQ(memblock.reserved.cnt, INIT_MEMBLOCK_REGIONS + 3); + ASSERT_EQ(memblock.reserved.total_size, (INIT_MEMBLOCK_REGIONS + 1) * MEM_SIZE + + new_reserved_regions_size + + r.size); + ASSERT_EQ(memblock.reserved.max, INIT_MEMBLOCK_REGIONS * 2); + + /* + * The current reserved.regions is occupying a range of memory that + * allocated from dummy_physical_memory_init(). After free the memory, + * we must not use it. So restore the origin memory region to make sure + * the tests can run as normal and not affected by the double array. 
+ */ + memblock.reserved.regions = orig_region; + memblock.reserved.cnt = INIT_MEMBLOCK_RESERVED_REGIONS; + } + + dummy_physical_memory_cleanup(); + + test_pass_pop(); + + return 0; +} + static int memblock_reserve_checks(void) { prefix_reset(); @@ -1104,6 +1254,7 @@ static int memblock_reserve_checks(void) memblock_reserve_near_max_check(); memblock_reserve_many_check(); memblock_reserve_all_locations_check(); + memblock_reserve_many_may_conflict_check(); prefix_pop(); diff --git a/tools/testing/memblock/tests/common.c b/tools/testing/memblock/tests/common.c index c2c569f12178..3250c8e5124b 100644 --- a/tools/testing/memblock/tests/common.c +++ b/tools/testing/memblock/tests/common.c @@ -61,7 +61,7 @@ void reset_memblock_attributes(void) static inline void fill_memblock(void) { - memset(memory_block.base, 1, MEM_SIZE); + memset(memory_block.base, 1, PHYS_MEM_SIZE); } void setup_memblock(void) @@ -103,7 +103,7 @@ void setup_numa_memblock(const unsigned int node_fracs[]) void dummy_physical_memory_init(void) { - memory_block.base = malloc(MEM_SIZE); + memory_block.base = malloc(PHYS_MEM_SIZE); assert(memory_block.base); fill_memblock(); } diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index b5ec59aa62d7..2f26405562b0 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -12,6 +12,7 @@ #include <../selftests/kselftest.h> #define MEM_SIZE SZ_32K +#define PHYS_MEM_SIZE SZ_16M #define NUMA_NODES 8 #define INIT_MEMBLOCK_REGIONS 128 From 3aca2cea907c647ee7720b7ba22734f9e8e7cfa3 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 7 May 2024 07:58:29 +0000 Subject: [PATCH 04/18] mm/memblock: fix comment for memblock_isolate_range() The isolated range is [*@start_rgn, *@end_rgn - 1], while the comment says "the end region inside the range" is *@end_rgn. Let's correct it. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240507075833.6346-4-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/memblock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index 98d25689cf10..7f3cd96d6769 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -777,7 +777,8 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt * Walk @type and ensure that regions don't cross the boundaries defined by * [@base, @base + @size). Crossing regions are split at the boundaries, * which may create at most two more regions. The index of the first - * region inside the range is returned in *@start_rgn and end in *@end_rgn. + * region inside the range is returned in *@start_rgn and the index of the + * first region after the range is returned in *@end_rgn. * * Return: * 0 on success, -errno on failure. From 1a879671bdfd14698a839f30de8e6d76e1e858fd Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 7 May 2024 07:58:30 +0000 Subject: [PATCH 05/18] memblock tests: add memblock_overlaps_region_checks Add a test case for memblock_overlaps_region(). 
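
The assertions in the new test exercise the usual half-open interval rule: [base1, base1 + size1) and [base2, base2 + size2) overlap iff each range starts below the other's end. A minimal sketch of that predicate (assumed here to match what memblock_addrs_overlap() computes; the actual implementation lives in mm/memblock.c):

/*
 * Sketch of the overlap predicate the assertions rely on.
 * Half-open ranges, so regions that merely touch do not overlap,
 * which is why the "Neighbor" cases expect false.
 */
static bool ranges_overlap(phys_addr_t base1, phys_addr_t size1,
                           phys_addr_t base2, phys_addr_t size2)
{
        return base1 < (base2 + size2) && base2 < (base1 + size1);
}

For example, with the test's region [SZ_1G, SZ_1G + SZ_4M), ranges_overlap(SZ_1G - SZ_1M, SZ_1M, SZ_1G, SZ_4M) is false (the ranges only touch at SZ_1G), while ranges_overlap(SZ_1G - SZ_1M, SZ_2M, SZ_1G, SZ_4M) is true.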
Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240507075833.6346-5-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- tools/testing/memblock/tests/basic_api.c | 48 ++++++++++++++++++++++++ tools/testing/memblock/tests/common.h | 3 ++ 2 files changed, 51 insertions(+) diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c index fdac82656d15..67503089e6a0 100644 --- a/tools/testing/memblock/tests/basic_api.c +++ b/tools/testing/memblock/tests/basic_api.c @@ -2387,6 +2387,53 @@ static int memblock_trim_memory_checks(void) return 0; } +static int memblock_overlaps_region_check(void) +{ + struct region r = { + .base = SZ_1G, + .size = SZ_4M + }; + + PREFIX_PUSH(); + + reset_memblock_regions(); + memblock_add(r.base, r.size); + + /* Far Away */ + ASSERT_FALSE(memblock_overlaps_region(&memblock.memory, SZ_1M, SZ_1M)); + ASSERT_FALSE(memblock_overlaps_region(&memblock.memory, SZ_2G, SZ_1M)); + + /* Neighbor */ + ASSERT_FALSE(memblock_overlaps_region(&memblock.memory, SZ_1G - SZ_1M, SZ_1M)); + ASSERT_FALSE(memblock_overlaps_region(&memblock.memory, SZ_1G + SZ_4M, SZ_1M)); + + /* Partial Overlap */ + ASSERT_TRUE(memblock_overlaps_region(&memblock.memory, SZ_1G - SZ_1M, SZ_2M)); + ASSERT_TRUE(memblock_overlaps_region(&memblock.memory, SZ_1G + SZ_2M, SZ_2M)); + + /* Totally Overlap */ + ASSERT_TRUE(memblock_overlaps_region(&memblock.memory, SZ_1G, SZ_4M)); + ASSERT_TRUE(memblock_overlaps_region(&memblock.memory, SZ_1G - SZ_2M, SZ_8M)); + ASSERT_TRUE(memblock_overlaps_region(&memblock.memory, SZ_1G + SZ_1M, SZ_1M)); + + test_pass_pop(); + + return 0; +} + +static int memblock_overlaps_region_checks(void) +{ + prefix_reset(); + prefix_push("memblock_overlaps_region"); + test_print("Running memblock_overlaps_region tests...\n"); + + memblock_overlaps_region_check(); + + prefix_pop(); + + return 0; +} + int memblock_basic_checks(void) { memblock_initialization_check(); @@ -2396,6 +2443,7 @@ int memblock_basic_checks(void) memblock_free_checks(); memblock_bottom_up_checks(); memblock_trim_memory_checks(); + memblock_overlaps_region_checks(); return 0; } diff --git a/tools/testing/memblock/tests/common.h b/tools/testing/memblock/tests/common.h index 2f26405562b0..e1138e06c903 100644 --- a/tools/testing/memblock/tests/common.h +++ b/tools/testing/memblock/tests/common.h @@ -40,6 +40,9 @@ enum test_flags { assert((_expected) == (_seen)); \ } while (0) +#define ASSERT_TRUE(_seen) ASSERT_EQ(true, _seen) +#define ASSERT_FALSE(_seen) ASSERT_EQ(false, _seen) + /** * ASSERT_NE(): * Check the condition From 1eb0a28d039a479bb4adec0320592caf5bd5175b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 7 May 2024 07:58:31 +0000 Subject: [PATCH 06/18] mm/memblock: return true directly on finding overlap region Not necessary to break and check i against type->cnt again. 
Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240507075833.6346-6-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 7f3cd96d6769..da9a6c862a69 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -194,8 +194,8 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, for (i = 0; i < type->cnt; i++) if (memblock_addrs_overlap(base, size, type->regions[i].base, type->regions[i].size)) - break; - return i < type->cnt; + return true; + return false; } /** From b73f6b98bbd0b4c1fdcebc0c5b926349455035bf Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 7 May 2024 07:58:32 +0000 Subject: [PATCH 07/18] mm/memblock: use PAGE_ALIGN_DOWN to get pgend in free_memmap Leverage the macro PAGE_ALIGN_DOWN to get pgend. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240507075833.6346-7-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/memblock.c | 2 +- tools/include/linux/mm.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index da9a6c862a69..33a8b6f7b626 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2039,7 +2039,7 @@ static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn) * downwards. */ pg = PAGE_ALIGN(__pa(start_pg)); - pgend = __pa(end_pg) & PAGE_MASK; + pgend = PAGE_ALIGN_DOWN(__pa(end_pg)); /* * If there are free pages between these, free the section of the diff --git a/tools/include/linux/mm.h b/tools/include/linux/mm.h index dc0fc7125bc3..cad4f2927983 100644 --- a/tools/include/linux/mm.h +++ b/tools/include/linux/mm.h @@ -12,6 +12,7 @@ #define PHYS_ADDR_MAX (~(phys_addr_t)0) #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) +#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) #define __va(x) ((void *)((unsigned long)(x))) #define __pa(x) ((unsigned long)(x)) From 3be381d11f872de066774317031ba8edd2d8797e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 May 2024 02:30:38 +0000 Subject: [PATCH 08/18] mm/mm_init.c: use memblock_region_memory_base_pfn() to get startpfn Just like what it does in "if (mirrored_kernelcore)", we should use memblock_region_memory_base_pfn() to get the startpfn. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240525023040.13509-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/mm_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index f72b852bd5b8..2dfb87841fdb 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -363,7 +363,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) nid = memblock_get_region_node(r); - usable_startpfn = PFN_DOWN(r->base); + usable_startpfn = memblock_region_memory_base_pfn(r); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; From 93bbbcb1e762a49fc18ee1272545b77371595f1e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 May 2024 02:30:39 +0000 Subject: [PATCH 09/18] mm/memblock: fix a typo in description of for_each_mem_region() No functional change. 
Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240525023040.13509-2-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- include/linux/memblock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e2082240586d..6cf18dc2b4d0 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -565,7 +565,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo } /** - * for_each_mem_region - itereate over memory regions + * for_each_mem_region - iterate over memory regions * @region: loop variable */ #define for_each_mem_region(region) \ From 922306a253e20ad5d0c4b8479d2dc5df9f325a04 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 May 2024 02:30:40 +0000 Subject: [PATCH 10/18] mm/mm_init.c: move nr_initialised reset down a bit We don't need to count nr_initialised in two cases: * for low zones that are always populated * after first_deferred_pfn is detected Let's move the nr_initialised reset down a bit to reduce some comparison of prev_end_pfn and end_pfn. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240525023040.13509-3-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/mm_init.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 2dfb87841fdb..bdbd800b0436 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -676,6 +676,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) if (early_page_ext_enabled()) return false; + + /* Always populate low zones for address-constrained allocations */ + if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) + return false; + + if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) + return true; + /* * prev_end_pfn static that contains the end of previous zone * No need to protect because called very early in boot before smp_init. @@ -685,12 +693,6 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) nr_initialised = 0; } - /* Always populate low zones for address-constrained allocations */ - if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) - return false; - - if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) - return true; /* * We start only with one section of pages, more pages are added as * needed until the rest of deferred pages are initialized. From ce8ebb95439459f7e24b02c6943e278f46d2d328 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 5 Jun 2024 07:13:37 +0000 Subject: [PATCH 11/18] mm/mm_init.c: get the highest zone directly We have recorded nr_zones in pgdat, just get it directly. 
Signed-off-by: Wei Yang Reviewed-by: Mike Rapoport (IBM) Link: https://lore.kernel.org/all/20240605071339.15330-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/mm_init.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index bdbd800b0436..6eda9367730b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2140,7 +2140,7 @@ static int __init deferred_init_memmap(void *data) unsigned long first_init_pfn, flags; unsigned long start = jiffies; struct zone *zone; - int zid, max_threads; + int max_threads; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -2167,12 +2167,8 @@ static int __init deferred_init_memmap(void *data) */ pgdat_resize_unlock(pgdat, &flags); - /* Only the highest zone is deferred so find it */ - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - zone = pgdat->node_zones + zid; - if (first_init_pfn < zone_end_pfn(zone)) - break; - } + /* Only the highest zone is deferred */ + zone = pgdat->node_zones + pgdat->nr_zones - 1; /* If the zone is empty somebody else may have cleared out the zone */ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, @@ -2200,7 +2196,7 @@ static int __init deferred_init_memmap(void *data) } zone_empty: /* Sanity check that the next zone really is unpopulated */ - WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); + WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone)); pr_info("node %d deferred pages initialised in %ums\n", pgdat->node_id, jiffies_to_msecs(jiffies - start)); From 544b8e14c24b1f1927659e546d008d678e77e19c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 5 Jun 2024 07:13:38 +0000 Subject: [PATCH 12/18] mm/mm_init.c: use deferred_init_mem_pfn_range_in_zone() to decide loop condition If deferred_init_mem_pfn_range_in_zone() return true, we know it finds some range in (spfn, epfn). Then we can use it directly for the loop condition. 
Signed-off-by: Wei Yang Reviewed-by: Mike Rapoport (IBM) Link: https://lore.kernel.org/all/20240605071339.15330-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/mm_init.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 6eda9367730b..c9c8c7458f27 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2170,20 +2170,15 @@ static int __init deferred_init_memmap(void *data) /* Only the highest zone is deferred */ zone = pgdat->node_zones + pgdat->nr_zones - 1; - /* If the zone is empty somebody else may have cleared out the zone */ - if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, - first_init_pfn)) - goto zone_empty; - max_threads = deferred_page_init_max_threads(cpumask); - while (spfn < epfn) { - unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); + while (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, first_init_pfn)) { + first_init_pfn = ALIGN(epfn, PAGES_PER_SECTION); struct padata_mt_job job = { .thread_fn = deferred_init_memmap_chunk, .fn_arg = zone, .start = spfn, - .size = epfn_align - spfn, + .size = first_init_pfn - spfn, .align = PAGES_PER_SECTION, .min_chunk = PAGES_PER_SECTION, .max_threads = max_threads, @@ -2191,10 +2186,8 @@ static int __init deferred_init_memmap(void *data) }; padata_do_multithreaded(&job); - deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, - epfn_align); } -zone_empty: + /* Sanity check that the next zone really is unpopulated */ WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone)); From f1180fd2a7c039691b64ebf404c746a74e40b7b0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 5 Jun 2024 07:13:39 +0000 Subject: [PATCH 13/18] mm/mm_init.c: not always search next deferred_init_pfn from very beginning In function deferred_init_memmap(), we call deferred_init_mem_pfn_range_in_zone() to get the next deferred_init_pfn. But we always search it from the very beginning. Since we save the index in i, we can leverage this to search from i next time. [rppt refine the comment] Signed-off-by: Wei Yang Link: https://lore.kernel.org/all/20240605071339.15330-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- include/linux/memblock.h | 19 ------------------- mm/mm_init.c | 23 ++++++++++++++--------- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6cf18dc2b4d0..45cac33334c8 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -299,25 +299,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, unsigned long *out_spfn, unsigned long *out_epfn); -/** - * for_each_free_mem_pfn_range_in_zone - iterate through zone specific free - * memblock areas - * @i: u64 used as loop variable - * @zone: zone in which all of the memory blocks reside - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * - * Walks over free (memory && !reserved) areas of memblock in a specific - * zone. Available once memblock and an empty zone is initialized. The main - * assumption is that the zone start, end, and pgdat have been associated. - * This way we can use the zone to determine NUMA node, and if a given part - * of the memblock is valid for the zone. 
- */ -#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \ - for (i = 0, \ - __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ - i != U64_MAX; \ - __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) /** * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific diff --git a/mm/mm_init.c b/mm/mm_init.c index c9c8c7458f27..5a4c846393b7 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2021,24 +2021,29 @@ static unsigned long __init deferred_init_pages(struct zone *zone, } /* - * This function is meant to pre-load the iterator for the zone init. - * Specifically it walks through the ranges until we are caught up to the - * first_init_pfn value and exits there. If we never encounter the value we - * return false indicating there are no valid ranges left. + * This function is meant to pre-load the iterator for the zone init from + * a given point. + * Specifically it walks through the ranges starting with initial index + * passed to it until we are caught up to the first_init_pfn value and + * exits there. If we never encounter the value we return false indicating + * there are no valid ranges left. */ static bool __init deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, unsigned long *spfn, unsigned long *epfn, unsigned long first_init_pfn) { - u64 j; + u64 j = *i; + + if (j == 0) + __next_mem_pfn_range_in_zone(&j, zone, spfn, epfn); /* * Start out by walking through the ranges in this zone that have * already been initialized. We don't need to do anything with them * so we just need to flush them out of the system. */ - for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { + for_each_free_mem_pfn_range_in_zone_from(j, zone, spfn, epfn) { if (*epfn <= first_init_pfn) continue; if (*spfn < first_init_pfn) @@ -2110,7 +2115,7 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, { unsigned long spfn, epfn; struct zone *zone = arg; - u64 i; + u64 i = 0; deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); @@ -2141,7 +2146,7 @@ static int __init deferred_init_memmap(void *data) unsigned long start = jiffies; struct zone *zone; int max_threads; - u64 i; + u64 i = 0; /* Bind memory initialisation thread to a local node if possible */ if (!cpumask_empty(cpumask)) @@ -2216,7 +2221,7 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; unsigned long spfn, epfn, flags; unsigned long nr_pages = 0; - u64 i; + u64 i = 0; /* Only the last zone may have deferred pages */ if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) From 0e9899feed9cbb7d33c01ad849dc307b1560b0ab Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 10 Jun 2024 14:37:42 +0000 Subject: [PATCH 14/18] mm/mm_init.c: don't initialize page->lru again Current page initialization call flow looks like this with some simplification: setup_arch() paging_init() free_area_init() memmap_init() memmap_init_zone_range() memmap_init_range() defer_init() __init_single_page() mm_core_init() mem_init() memblock_free_all() free_low_memory_core_early() memmap_init_reserved_pages() reserve_bootmem_region() init_reserved_page() __init_single_page() There two cases depends on CONFIG_DEFERRED_STRUCT_PAGE_INIT. * If CONFIG_DEFERRED_STRUCT_PAGE_INIT, pages after first_init_pfn is skipped at defer_init(). Then init_reserved_page() is defined to call __init_single_page() for them. * If !CONFIG_DEFERRED_STRUCT_PAGE_INIT, pages are all initialized by memmap_init_range(). 
In both cases, after init_reserved_page(), we expect __init_single_page() has done its work to the page, which already initialize page->lru properly. We don't need to do it again. Signed-off-by: Wei Yang Link: https://lore.kernel.org/r/20240610143742.26401-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (IBM) --- mm/mm_init.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 5a4c846393b7..ba993305e8c2 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -760,9 +760,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, init_reserved_page(start_pfn, nid); - /* Avoid false-positive PageTail() */ - INIT_LIST_HEAD(&page->lru); - /* * no need for atomic set_bit because the struct * page is not visible yet so nobody should From 1e4c64b71c9bf230b25fde12cbcceacfdc8b3332 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 13 Jun 2024 11:55:07 -0400 Subject: [PATCH 15/18] mm/memblock: Add "reserve_mem" to reserved named memory at boot up In order to allow for requesting a memory region that can be used for things like pstore on multiple machines where the memory layout is not the same, add a new option to the kernel command line called "reserve_mem". The format is: reserve_mem=nn:align:name Where it will find nn amount of memory at the given alignment of align. The name field is to allow another subsystem to retrieve where the memory was found. For example: reserve_mem=12M:4096:oops ramoops.mem_name=oops Where ramoops.mem_name will tell ramoops that memory was reserved for it via the reserve_mem option and it can find it by calling: if (reserve_mem_find_by_name("oops", &start, &size)) { // start holds the start address and size holds the size given This is typically used for systems that do not wipe the RAM, and this command line will try to reserve the same physical memory on soft reboots. Note, it is not guaranteed to be the same location. For example, if KASLR places the kernel at the location of where the RAM reservation was from a previous boot, the new reservation will be at a different location. Any subsystem using this feature must add a way to verify that the contents of the physical memory is from a previous boot, as there may be cases where the memory will not be located at the same location. Not all systems may work either. There could be bit flips if the reboot goes through the BIOS. Using kexec to reboot the machine is likely to have better results in such cases. Link: https://lore.kernel.org/all/ZjJVnZUX3NZiGW6q@kernel.org/ Suggested-by: Mike Rapoport Tested-by: Guilherme G. Piccoli Signed-off-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20240613155527.437020271@goodmis.org Signed-off-by: Mike Rapoport (IBM) --- .../admin-guide/kernel-parameters.txt | 22 ++++ include/linux/mm.h | 2 + mm/memblock.c | 117 ++++++++++++++++++ 3 files changed, 141 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b600df82669d..56e18b1a520d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5710,6 +5710,28 @@ them. If is less than 0x10000, the region is assumed to be I/O ports; otherwise it is memory. + reserve_mem= [RAM] + Format: nn[KNG]::