diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 1921353259ae..f2cd6ef53ff3 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -151,6 +151,11 @@ NUMA numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. + numa=hotadd=percent + Only allow hotadd memory to preallocate page structures upto + percent of already available memory. + numa=hotadd=0 will disable hotadd memory. + ACPI acpi=off Don't enable ACPI diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 492161168402..dff870534199 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -530,8 +530,7 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) unsigned long pfn; unsigned long total = 0, mem = 0; for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - unsigned long addr = pfn << PAGE_SHIFT; - if (pfn_valid(pfn) && e820_mapped(addr, addr+1, E820_RAM)) { + if (pfn_valid(pfn)) { online_page(pfn_to_page(pfn)); err = 0; mem++; diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 4be82d6e2b48..779132af29a7 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -142,6 +142,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<> PAGE_SHIFT; + unsigned long e_pfn = end >> PAGE_SHIFT; + int changed = 0; + struct bootnode *nd = &nodes_add[node]; + + /* I had some trouble with strange memory hotadd regions breaking + the boot. Be very strict here and reject anything unexpected. + If you want working memory hotadd write correct SRATs. + + The node size check is a basic sanity check to guard against + mistakes */ + if ((signed long)(end - start) < NODE_MIN_SIZE) { + printk(KERN_ERR "SRAT: Hotplug area too small\n"); + return -1; + } + + /* This check might be a bit too strict, but I'm keeping it for now. */ + if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { + printk(KERN_ERR "SRAT: Hotplug area has existing memory\n"); + return -1; + } + + if (!hotadd_enough_memory(&nodes_add[node])) { + printk(KERN_ERR "SRAT: Hotplug area too large\n"); + return -1; + } + + /* Looks good */ + + found_add_area = 1; + if (nd->start == nd->end) { + nd->start = start; + nd->end = end; + changed = 1; + } else { + if (nd->start == end) { + nd->start = start; + changed = 1; + } + if (nd->end == start) { + nd->end = end; + changed = 1; + } + if (!changed) + printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); + } + + if ((nd->end >> PAGE_SHIFT) > end_pfn) + end_pfn = nd->end >> PAGE_SHIFT; + + if (changed) + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + return 0; +} +#endif + /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ void __init acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) { - struct bootnode *nd; + struct bootnode *nd, oldnode; unsigned long start, end; int node, pxm; int i; @@ -172,6 +292,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) } if (ma->flags.enabled == 0) return; + if (ma->flags.hot_pluggable && hotadd_percent == 0) + return; start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32); end = start + (ma->length_lo | ((u64)ma->length_hi << 32)); pxm = ma->proximity_domain; @@ -181,10 +303,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) bad_srat(); return; } - /* It is fine to add this area to the nodes data it will be used later*/ - if (ma->flags.hot_pluggable == 1) - printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n", - start, end); i = conflicting_nodes(start, end); if (i == node) { printk(KERN_WARNING @@ -199,6 +317,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) return; } nd = &nodes[node]; + oldnode = *nd; if (!node_test_and_set(node, nodes_parsed)) { nd->start = start; nd->end = end; @@ -208,8 +327,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) if (nd->end < end) nd->end = end; } + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, nd->start, nd->end); + +#ifdef RESERVE_HOTADD + if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) { + /* Ignore hotadd region. Undo damage */ + printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + *nd = oldnode; + if ((nd->start | nd->end) == 0) + node_clear(node, nodes_parsed); + } +#endif } /* Sanity check to catch more bad SRATs (they are amazingly common). @@ -225,6 +355,9 @@ static int nodes_cover_memory(void) unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; pxmram -= e820_hole_size(s, e); + pxmram -= nodes_add[i].end - nodes_add[i].start; + if ((long)pxmram < 0) + pxmram = 0; } e820ram = end_pfn - e820_hole_size(0, end_pfn); @@ -258,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { - cutoff_node(i, start, end); + cutoff_node(i, start, end); if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) unparse_node(i); } @@ -303,6 +436,25 @@ static int node_to_pxm(int n) return 0; } +void __init srat_reserve_add_area(int nodeid) +{ + if (found_add_area && nodes_add[nodeid].end) { + u64 total_mb; + + printk(KERN_INFO "SRAT: Reserving hot-add memory space " + "for node %d at %Lx-%Lx\n", + nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); + total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) + >> PAGE_SHIFT; + total_mb *= sizeof(struct page); + total_mb >>= 20; + printk(KERN_INFO "SRAT: This will cost you %Lu MB of " + "pre-allocated memory.\n", (unsigned long long)total_mb); + reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, + nodes_add[nodeid].end - nodes_add[nodeid].start); + } +} + int __node_distance(int a, int b) { int index; diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h index f6cbb4cbb5a3..f0ba4d984bdf 100644 --- a/include/asm-x86_64/numa.h +++ b/include/asm-x86_64/numa.h @@ -18,6 +18,8 @@ extern void numa_init_array(void); extern int numa_off; extern void numa_set_node(int cpu, int node); +extern void srat_reserve_add_area(int nodeid); +extern int hotadd_percent; extern unsigned char apicid_to_node[256]; #ifdef CONFIG_NUMA