From fac1bceeeb04886fc2ee952672e6e6c85ce41dca Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Sat, 3 Aug 2024 08:01:22 +0200 Subject: [PATCH 01/14] xen: use correct end address of kernel for conflict checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running as a Xen PV dom0 the kernel is loaded by the hypervisor using a different memory map than that of the host. In order to minimize the required changes in the kernel, the kernel adapts its memory map to that of the host. In order to do that it is checking for conflicts of its load address with the host memory map. Unfortunately the tested memory range does not include the .brk area, which might result in crashes or memory corruption when this area does conflict with the memory map of the host. Fix the test by using the _end label instead of __bss_stop. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 806ddb2391d9..4bcc70a71b7d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -825,7 +825,7 @@ char * __init xen_memory_setup(void) * to relocating (and even reusing) pages with kernel text or data. */ if (xen_is_e820_reserved(__pa_symbol(_text), - __pa_symbol(__bss_stop) - __pa_symbol(_text))) { + __pa_symbol(_end) - __pa_symbol(_text))) { xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); BUG(); } From ba88829706e2c5b7238638fc2b0713edf596495e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 2 Aug 2024 14:11:06 +0200 Subject: [PATCH 02/14] xen: introduce generic helper checking for memory map conflicts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When booting as a Xen PV dom0 the memory layout of the dom0 is modified to match that of the host, as this requires less changes in the kernel for supporting Xen. There are some cases, though, which are problematic, as it is the Xen hypervisor selecting the kernel's load address plus some other data, which might conflict with the host's memory map. These conflicts are detected at boot time and result in a boot error. In order to support handling at least some of these conflicts in future, introduce a generic helper function which will later gain the ability to adapt the memory layout when possible. Add the missing check for the xen_start_info area. Note that possible p2m map and initrd memory conflicts are handled already by copying the data to memory areas not conflicting with the memory map. The initial stack allocated by Xen doesn't need to be checked, as early boot code is switching to the statically allocated initial kernel stack. Initial page tables and the kernel itself will be handled later. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/xen/mmu_pv.c | 5 +---- arch/x86/xen/setup.c | 34 ++++++++++++++++++++++++++++------ arch/x86/xen/xen-ops.h | 3 ++- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index f1ce39d6d32c..839e6613753d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2018,10 +2018,7 @@ void __init xen_reserve_special_pages(void) void __init xen_pt_check_e820(void) { - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table"); } static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4bcc70a71b7d..96765180514b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -567,7 +567,7 @@ static void __init xen_ignore_unusable(void) } } -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820_entry *entry; unsigned mapcnt; @@ -624,6 +624,23 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Check for an area in physical memory to be usable for non-movable purposes. + * An area is considered to usable if the used E820 map lists it to be RAM. + * In case the area is not usable, crash the system with an error message. + */ +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component) +{ + if (!xen_is_e820_reserved(start, size)) + return; + + xen_raw_console_write("Xen hypervisor allocated "); + xen_raw_console_write(component); + xen_raw_console_write(" memory conflicts with E820 map\n"); + BUG(); +} + /* * Like memcpy, but with physical addresses for dest and src. */ @@ -824,11 +841,16 @@ char * __init xen_memory_setup(void) * Failing now is better than running into weird problems later due * to relocating (and even reusing) pages with kernel text or data. */ - if (xen_is_e820_reserved(__pa_symbol(_text), - __pa_symbol(_end) - __pa_symbol(_text))) { - xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* + * Check for a conflict of the xen_start_info memory with the target + * E820 map. + */ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); /* * Check for a conflict of the hypervisor supplied page tables with diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0cf16fc79e0b..9a27d1d653d3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -48,7 +48,8 @@ void xen_mm_unpin_all(void); void __init xen_relocate_p2m(void); #endif -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); From c4498ae316da5b5786ccd448fc555f3339b8e4ca Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 6 Aug 2024 09:56:48 +0200 Subject: [PATCH 03/14] xen: move checks for e820 conflicts further up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the checks for e820 memory map conflicts using the xen_chk_is_e820_usable() helper further up in order to prepare resolving some of the possible conflicts by doing some e820 map modifications, which must happen before evaluating the RAM layout. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/xen/setup.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 96765180514b..dba68951ed6b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -764,6 +764,28 @@ char * __init xen_memory_setup(void) /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); + /* + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. + */ + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* + * Check for a conflict of the xen_start_info memory with the target + * E820 map. + */ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); + + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ @@ -836,28 +858,6 @@ char * __init xen_memory_setup(void) e820__update_table(e820_table); - /* - * Check whether the kernel itself conflicts with the target E820 map. - * Failing now is better than running into weird problems later due - * to relocating (and even reusing) pages with kernel text or data. - */ - xen_chk_is_e820_usable(__pa_symbol(_text), - __pa_symbol(_end) - __pa_symbol(_text), - "kernel"); - - /* - * Check for a conflict of the xen_start_info memory with the target - * E820 map. - */ - xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), - "xen_start_info"); - - /* - * Check for a conflict of the hypervisor supplied page tables with - * the target E820 map. - */ - xen_pt_check_e820(); - xen_reserve_xen_mfnlist(); /* Check for a conflict of the initrd with the target E820 map. */ From 43dc2a0f479b9cd30f6674986d7a40517e999d31 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 6 Aug 2024 10:24:41 +0200 Subject: [PATCH 04/14] xen: move max_pfn in xen_memory_setup() out of function scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of having max_pfn as a local variable of xen_memory_setup(), make it a static variable in setup.c instead. This avoids having to pass it to subfunctions, which will be needed in more cases in future. Rename it to ini_nr_pages, as the value denotes the currently usable number of memory pages as passed from the hypervisor at boot time. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/xen/setup.c | 52 ++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dba68951ed6b..2c79bb5a9cd0 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -46,6 +46,9 @@ bool xen_pv_pci_possible; /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; +/* Number of initially usable memory pages. */ +static unsigned long ini_nr_pages __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -212,7 +215,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages) + unsigned long end_pfn) { unsigned long pfn, end; int ret; @@ -220,7 +223,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN_ON(start_pfn > end_pfn); /* Release pages first. */ - end = min(end_pfn, nr_pages); + end = min(end_pfn, ini_nr_pages); for (pfn = start_pfn; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -341,15 +344,14 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. */ static unsigned long __init xen_set_identity_and_remap_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long remap_pfn) + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; unsigned long n = end_pfn - start_pfn; if (remap_pfn == 0) - remap_pfn = nr_pages; + remap_pfn = ini_nr_pages; while (i < n) { unsigned long cur_pfn = start_pfn + i; @@ -358,19 +360,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long remap_range_size; /* Do not remap pages beyond the current allocation */ - if (cur_pfn >= nr_pages) { + if (cur_pfn >= ini_nr_pages) { /* Identity map remaining pages */ set_phys_range_identity(cur_pfn, cur_pfn + size); break; } - if (cur_pfn + size > nr_pages) - size = nr_pages - cur_pfn; + if (cur_pfn + size > ini_nr_pages) + size = ini_nr_pages - cur_pfn; remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages); + cur_pfn + left); break; } /* Adjust size to fit in current e820 RAM region */ @@ -397,18 +399,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } static unsigned long __init xen_count_remap_pages( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pages) { - if (start_pfn >= nr_pages) + if (start_pfn >= ini_nr_pages) return remap_pages; - return remap_pages + min(end_pfn, nr_pages) - start_pfn; + return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn; } -static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, +static unsigned long __init xen_foreach_remap_area( unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, - unsigned long nr_pages, unsigned long last_val)) + unsigned long last_val)) { phys_addr_t start = 0; unsigned long ret_val = 0; @@ -436,8 +438,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) - ret_val = func(start_pfn, end_pfn, nr_pages, - ret_val); + ret_val = func(start_pfn, end_pfn, ret_val); start = end; } } @@ -700,7 +701,7 @@ static void __init xen_reserve_xen_mfnlist(void) **/ char * __init xen_memory_setup(void) { - unsigned long max_pfn, pfn_s, n_pfns; + unsigned long pfn_s, n_pfns; phys_addr_t mem_end, addr, size, chunk_size; u32 type; int rc; @@ -712,9 +713,8 @@ char * __init xen_memory_setup(void) int op; xen_parse_512gb(); - max_pfn = xen_get_pages_limit(); - max_pfn = min(max_pfn, xen_start_info->nr_pages); - mem_end = PFN_PHYS(max_pfn); + ini_nr_pages = min(xen_get_pages_limit(), xen_start_info->nr_pages); + mem_end = PFN_PHYS(ini_nr_pages); memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); @@ -789,10 +789,10 @@ char * __init xen_memory_setup(void) max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ - max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages); + max_pages += xen_foreach_remap_area(xen_count_remap_pages); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; + if (max_pages > ini_nr_pages) + extra_pages += max_pages - ini_nr_pages; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -801,8 +801,8 @@ char * __init xen_memory_setup(void) * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. */ - maxmem_pages = EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)); - extra_pages = min3(maxmem_pages, extra_pages, max_pages - max_pfn); + maxmem_pages = EXTRA_MEM_RATIO * min(ini_nr_pages, PFN_DOWN(MAXMEM)); + extra_pages = min3(maxmem_pages, extra_pages, max_pages - ini_nr_pages); i = 0; addr = xen_e820_table.entries[0].addr; size = xen_e820_table.entries[0].size; @@ -885,7 +885,7 @@ char * __init xen_memory_setup(void) * Set identity map on non-RAM pages and prepare remapping the * underlying RAM. */ - xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk); + xen_foreach_remap_area(xen_set_identity_and_remap_chunk); pr_info("Released %ld page(s)\n", xen_released_pages); From d05208cf7f05420ad10cc7f9550f91d485523659 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 14 Aug 2024 16:47:25 +0200 Subject: [PATCH 05/14] xen: add capability to remap non-RAM pages to different PFNs When running as a Xen PV dom0 it can happen that the kernel is being loaded to a guest physical address conflicting with the host memory map. In order to be able to resolve this conflict, add the capability to remap non-RAM areas to different guest PFNs. A function to use this remapping information for other purposes than doing the remap will be added when needed. As the number of conflicts should be rather low (currently only machines with max. 1 conflict are known), save the remap data in a small statically allocated array. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/xen/p2m.c | 63 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 3 ++ 2 files changed, 66 insertions(+) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 7c735b730acd..2809bded30ea 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "xen-ops.h" @@ -792,6 +793,68 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } +/* Remapped non-RAM areas */ +#define NR_NONRAM_REMAP 4 +static struct nonram_remap { + phys_addr_t maddr; + phys_addr_t paddr; + size_t size; +} xen_nonram_remap[NR_NONRAM_REMAP] __ro_after_init; +static unsigned int nr_nonram_remap __ro_after_init; + +/* + * Do the real remapping of non-RAM regions as specified in the + * xen_nonram_remap[] array. + * In case of an error just crash the system. + */ +void __init xen_do_remap_nonram(void) +{ + unsigned int i; + unsigned int remapped = 0; + const struct nonram_remap *remap = xen_nonram_remap; + unsigned long pfn, mfn, end_pfn; + + for (i = 0; i < nr_nonram_remap; i++) { + end_pfn = PFN_UP(remap->paddr + remap->size); + pfn = PFN_DOWN(remap->paddr); + mfn = PFN_DOWN(remap->maddr); + while (pfn < end_pfn) { + if (!set_phys_to_machine(pfn, mfn)) + panic("Failed to set p2m mapping for pfn=%lx mfn=%lx\n", + pfn, mfn); + + pfn++; + mfn++; + remapped++; + } + + remap++; + } + + pr_info("Remapped %u non-RAM page(s)\n", remapped); +} + +/* + * Add a new non-RAM remap entry. + * In case of no free entry found, just crash the system. + */ +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, + unsigned long size) +{ + BUG_ON((maddr & ~PAGE_MASK) != (paddr & ~PAGE_MASK)); + + if (nr_nonram_remap == NR_NONRAM_REMAP) { + xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n"); + BUG(); + } + + xen_nonram_remap[nr_nonram_remap].maddr = maddr; + xen_nonram_remap[nr_nonram_remap].paddr = paddr; + xen_nonram_remap[nr_nonram_remap].size = size; + + nr_nonram_remap++; +} + #ifdef CONFIG_XEN_DEBUG_FS #include static int p2m_dump_show(struct seq_file *m, void *v) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a27d1d653d3..e1b782e823e6 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -47,6 +47,9 @@ void xen_mm_unpin_all(void); #ifdef CONFIG_X86_64 void __init xen_relocate_p2m(void); #endif +void __init xen_do_remap_nonram(void); +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, + unsigned long size); void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, const char *component); From 9221222c717dbddac1e3c49906525475d87a3a44 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 9 Aug 2024 17:52:55 +0200 Subject: [PATCH 06/14] xen: allow mapping ACPI data using a different physical address When running as a Xen PV dom0 the system needs to map ACPI data of the host using host physical addresses, while those addresses can conflict with the guest physical addresses of the loaded linux kernel. The same problem might apply in case a PV guest is configured to use the host memory map. This conflict can be solved by mapping the ACPI data to a different guest physical address, but mapping the data via acpi_os_ioremap() must still be possible using the host physical address, as this address might be generated by AML when referencing some of the ACPI data. When configured to support running as a Xen PV domain, have an implementation of acpi_os_ioremap() being aware of the possibility to need above mentioned translation of a host physical address to the guest physical address. This modification requires to #include linux/acpi.h in some sources which need to include asm/acpi.h directly. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/include/asm/acpi.h | 8 +++++++ arch/x86/kernel/acpi/boot.c | 11 ++++++++++ arch/x86/kernel/jailhouse.c | 1 + arch/x86/kernel/mmconf-fam10h_64.c | 1 + arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 1 + arch/x86/xen/p2m.c | 35 ++++++++++++++++++++++++++++++ arch/x86/xen/setup.c | 2 +- 8 files changed, 59 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 21bc53f5ed0c..5ab1a4598d00 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -174,6 +174,14 @@ void acpi_generic_reduced_hw_init(void); void x86_default_set_root_pointer(u64 addr); u64 x86_default_get_root_pointer(void); +#ifdef CONFIG_XEN_PV +/* A Xen PV domain needs a special acpi_os_ioremap() handling. */ +extern void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, + acpi_size size); +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size); +#define acpi_os_ioremap acpi_os_ioremap +#endif + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9f4618dcd704..4efecac49863 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1778,3 +1778,14 @@ u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; } + +#ifdef CONFIG_XEN_PV +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + return ioremap_cache(phys, size); +} + +void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) = + x86_acpi_os_ioremap; +EXPORT_SYMBOL_GPL(acpi_os_ioremap); +#endif diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index df337860612d..cd8ed1edbf9e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index c94dec6a1834..1f54eedc3015 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0c35207320cb..390e4fe7433e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 82b128d3f309..0a2bbd674a6d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2809bded30ea..b52d3e17e2c1 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -834,6 +835,34 @@ void __init xen_do_remap_nonram(void) pr_info("Remapped %u non-RAM page(s)\n", remapped); } +#ifdef CONFIG_ACPI +/* + * Xen variant of acpi_os_ioremap() taking potentially remapped non-RAM + * regions into account. + * Any attempt to map an area crossing a remap boundary will produce a + * WARN() splat. + * phys is related to remap->maddr on input and will be rebased to remap->paddr. + */ +static void __iomem *xen_acpi_os_ioremap(acpi_physical_address phys, + acpi_size size) +{ + unsigned int i; + const struct nonram_remap *remap = xen_nonram_remap; + + for (i = 0; i < nr_nonram_remap; i++) { + if (phys + size > remap->maddr && + phys < remap->maddr + remap->size) { + WARN_ON(phys < remap->maddr || + phys + size > remap->maddr + remap->size); + phys += remap->paddr - remap->maddr; + break; + } + } + + return x86_acpi_os_ioremap(phys, size); +} +#endif /* CONFIG_ACPI */ + /* * Add a new non-RAM remap entry. * In case of no free entry found, just crash the system. @@ -848,6 +877,12 @@ void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, BUG(); } +#ifdef CONFIG_ACPI + /* Switch to the Xen acpi_os_ioremap() variant. */ + if (nr_nonram_remap == 0) + acpi_os_ioremap = xen_acpi_os_ioremap; +#endif + xen_nonram_remap[nr_nonram_remap].maddr = maddr; xen_nonram_remap[nr_nonram_remap].paddr = paddr; xen_nonram_remap[nr_nonram_remap].size = size; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2c79bb5a9cd0..1114e49937da 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -15,12 +15,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include From be35d91c8880650404f3bf813573222dfb106935 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 2 Aug 2024 20:14:22 +0200 Subject: [PATCH 07/14] xen: tolerate ACPI NVS memory overlapping with Xen allocated memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to minimize required special handling for running as Xen PV dom0, the memory layout is modified to match that of the host. This requires to have only RAM at the locations where Xen allocated memory is living. Unfortunately there seem to be some machines, where ACPI NVS is located at 64 MB, resulting in a conflict with the loaded kernel or the initial page tables built by Xen. Avoid this conflict by swapping the ACPI NVS area in the memory map with unused RAM. This is possible via modification of the dom0 P2M map. Accesses to the ACPI NVS area are done either for saving and restoring it across suspend operations (this will work the same way as before), or by ACPI code when NVS memory is referenced from other ACPI tables. The latter case is handled by a Xen specific indirection of acpi_os_ioremap(). While the E820 map can (and should) be modified right away, the P2M map can be updated only after memory allocation is working, as the P2M map might need to be extended. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich Signed-off-by: Juergen Gross --- arch/x86/xen/setup.c | 92 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1114e49937da..c3db71d96c43 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -495,6 +495,8 @@ void __init xen_remap_memory(void) set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); + + xen_do_remap_nonram(); } static unsigned long __init xen_get_pages_limit(void) @@ -625,14 +627,102 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Swap a non-RAM E820 map entry with RAM above ini_nr_pages. + * Note that the E820 map is modified accordingly, but the P2M map isn't yet. + * The adaption of the P2M must be deferred until page allocation is possible. + */ +static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t mem_end = PFN_PHYS(ini_nr_pages); + phys_addr_t swap_addr, swap_size, entry_end; + + swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr); + swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size); + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + entry_end = entry->addr + entry->size; + if (entry->type == E820_TYPE_RAM && entry->size >= swap_size && + entry_end - swap_size >= mem_end) { + /* Reduce RAM entry by needed space (whole pages). */ + entry->size -= swap_size; + + /* Add new entry at the end of E820 map. */ + entry = xen_e820_table.entries + + xen_e820_table.nr_entries; + xen_e820_table.nr_entries++; + + /* Fill new entry (keep size and page offset). */ + entry->type = swap_entry->type; + entry->addr = entry_end - swap_size + + swap_addr - swap_entry->addr; + entry->size = swap_entry->size; + + /* Convert old entry to RAM, align to pages. */ + swap_entry->type = E820_TYPE_RAM; + swap_entry->addr = swap_addr; + swap_entry->size = swap_size; + + /* Remember PFN<->MFN relation for P2M update. */ + xen_add_remap_nonram(swap_addr, entry_end - swap_size, + swap_size); + + /* Order E820 table and merge entries. */ + e820__update_table(&xen_e820_table); + + return; + } + + entry++; + } + + xen_raw_console_write("No suitable area found for required E820 entry remapping action\n"); + BUG(); +} + +/* + * Look for non-RAM memory types in a specific guest physical area and move + * those away if possible (ACPI NVS only for now). + */ +static void __init xen_e820_resolve_conflicts(phys_addr_t start, + phys_addr_t size) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t end; + + if (!size) + return; + + end = start + size; + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + if (entry->addr >= end) + return; + + if (entry->addr + entry->size > start && + entry->type == E820_TYPE_NVS) + xen_e820_swap_entry_with_ram(entry); + + entry++; + } +} + /* * Check for an area in physical memory to be usable for non-movable purposes. - * An area is considered to usable if the used E820 map lists it to be RAM. + * An area is considered to usable if the used E820 map lists it to be RAM or + * some other type which can be moved to higher PFNs while keeping the MFNs. * In case the area is not usable, crash the system with an error message. */ void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, const char *component) { + xen_e820_resolve_conflicts(start, size); + if (!xen_is_e820_reserved(start, size)) return; From 661362e3dcab464d6b6976c019fd5b5433bda85f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 2 Aug 2024 18:42:51 +0300 Subject: [PATCH 08/14] xen, pvh: fix unbootable VMs (PVH + KASAN - AMD_MEM_ENCRYPT) Uninstrument arch/x86/platform/pvh/enlighten.c: KASAN has not been setup _this_ early in the boot process. Steps to reproduce: make allnoconfig make sure CONFIG_AMD_MEM_ENCRYPT is disabled AMD_MEM_ENCRYPT independently uninstruments lib/string.o so PVH boot code calls into uninstrumented memset() and memcmp() which can make the bug disappear depending on the compiler. enable CONFIG_PVH enable CONFIG_KASAN enable serial console this is fun exercise if you never done it from nothing :^) make qemu-system-x86_64 \ -enable-kvm \ -cpu host \ -smp cpus=1 \ -m 4096 \ -serial stdio \ -kernel vmlinux \ -append 'console=ttyS0 ignore_loglevel' Messages on serial console will easily tell OK kernel from unbootable kernel. In bad case qemu hangs in an infinite loop stroboscoping "SeaBIOS" message. Signed-off-by: Alexey Dobriyan Acked-by: Juergen Gross Message-ID: <20240802154253.482658-1-adobriyan@gmail.com> Signed-off-by: Juergen Gross --- arch/x86/platform/pvh/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/platform/pvh/Makefile b/arch/x86/platform/pvh/Makefile index 5dec5067c9fb..c43fb7964dc4 100644 --- a/arch/x86/platform/pvh/Makefile +++ b/arch/x86/platform/pvh/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 OBJECT_FILES_NON_STANDARD_head.o := y +KASAN_SANITIZE := n obj-$(CONFIG_PVH) += enlighten.o obj-$(CONFIG_PVH) += head.o From 416a33c9afcef24d8b48e414d08d4ae4472aa669 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 2 Aug 2024 18:42:52 +0300 Subject: [PATCH 09/14] x86/cpu: fix unbootable VMs by inlining memcmp() in hypervisor_cpuid_base() If this memcmp() is not inlined then PVH early boot code can call into KASAN-instrumented memcmp() which results in unbootable VMs: pvh_start_xen xen_prepare_pvh xen_cpuid_base hypervisor_cpuid_base memcmp Signed-off-by: Alexey Dobriyan Acked-by: Juergen Gross Message-ID: <20240802154253.482658-2-adobriyan@gmail.com> Signed-off-by: Juergen Gross --- arch/x86/include/asm/cpuid.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index 6b122a31da06..80cc6386d7b1 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -196,7 +196,12 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) for_each_possible_hypervisor_cpuid_base(base) { cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); - if (!memcmp(sig, signature, 12) && + /* + * This must not compile to "call memcmp" because it's called + * from PVH early boot code before instrumentation is set up + * and memcmp() itself may be instrumented. + */ + if (!__builtin_memcmp(sig, signature, 12) && (leaves == 0 || ((eax - base) >= leaves))) return base; } From fbe5a6dfe492eff8b0adef85fcabd84e246f1102 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 2 Aug 2024 18:42:53 +0300 Subject: [PATCH 10/14] xen, pvh: fix unbootable VMs by inlining memset() in xen_prepare_pvh() If this memset() is not inlined than PVH early boot code can call into KASAN-instrumented memset() which results in unbootable VMs. Signed-off-by: Alexey Dobriyan Acked-by: Juergen Gross Message-ID: <20240802154253.482658-3-adobriyan@gmail.com> Signed-off-by: Juergen Gross --- arch/x86/platform/pvh/enlighten.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/pvh/enlighten.c b/arch/x86/platform/pvh/enlighten.c index 944e0290f2c0..2263885d16ba 100644 --- a/arch/x86/platform/pvh/enlighten.c +++ b/arch/x86/platform/pvh/enlighten.c @@ -130,7 +130,11 @@ void __init xen_prepare_pvh(void) BUG(); } - memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); + /* + * This must not compile to "call memset" because memset() may be + * instrumented. + */ + __builtin_memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); hypervisor_specific_init(xen_guest); From 3adc73efad83534c3f6df6580009482e2c08187d Mon Sep 17 00:00:00 2001 From: Shen Lichuan Date: Thu, 29 Aug 2024 16:47:10 +0800 Subject: [PATCH 11/14] xen/xenbus: Convert to use ERR_CAST() Use ERR_CAST() as it is designed for casting an error pointer to another type. This macro utilizes the __force and __must_check modifiers, which instruct the compiler to verify for errors at the locations where it is employed. Signed-off-by: Shen Lichuan Reviewed-by: Juergen Gross Message-ID: <20240829084710.30312-1-shenlichuan@vivo.com> Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus_xs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 028a182bcc9e..d32c726f7a12 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -427,12 +427,12 @@ char **xenbus_directory(struct xenbus_transaction t, path = join(dir, node); if (IS_ERR(path)) - return (char **)path; + return ERR_CAST(path); strings = xs_single(t, XS_DIRECTORY, path, &len); kfree(path); if (IS_ERR(strings)) - return (char **)strings; + return ERR_CAST(strings); return split(strings, len, num); } @@ -465,7 +465,7 @@ void *xenbus_read(struct xenbus_transaction t, path = join(dir, node); if (IS_ERR(path)) - return (void *)path; + return ERR_CAST(path); ret = xs_single(t, XS_READ, path, len); kfree(path); From a8d0b5eb3f9f838d3781be7d3d24b22fe492a916 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 20 Aug 2024 18:50:56 -0600 Subject: [PATCH 12/14] xen/pci: Avoid -Wflex-array-member-not-at-end warning Use the `DEFINE_RAW_FLEX()` helper for an on-stack definition of a flexible structure where the size of the flexible-array member is known at compile-time, and refactor the rest of the code, accordingly. So, with this, fix the following warning: drivers/xen/pci.c:48:55: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Message-ID: Signed-off-by: Juergen Gross --- drivers/xen/pci.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index 72d4e3f193af..a2facd8f7e51 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -44,15 +44,11 @@ static int xen_add_device(struct device *dev) } #endif if (pci_seg_supported) { - struct { - struct physdev_pci_device_add add; - uint32_t pxm; - } add_ext = { - .add.seg = pci_domain_nr(pci_dev->bus), - .add.bus = pci_dev->bus->number, - .add.devfn = pci_dev->devfn - }; - struct physdev_pci_device_add *add = &add_ext.add; + DEFINE_RAW_FLEX(struct physdev_pci_device_add, add, optarr, 1); + + add->seg = pci_domain_nr(pci_dev->bus); + add->bus = pci_dev->bus->number; + add->devfn = pci_dev->devfn; #ifdef CONFIG_ACPI acpi_handle handle; From 9f40ec84a7976d95c34e7cc070939deb103652b0 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 13 Sep 2024 12:05:02 +0200 Subject: [PATCH 13/14] xen/swiotlb: add alignment check for dma buffers When checking a memory buffer to be consecutive in machine memory, the alignment needs to be checked, too. Failing to do so might result in DMA memory not being aligned according to its requested size, leading to error messages like: 4xxx 0000:2b:00.0: enabling device (0140 -> 0142) 4xxx 0000:2b:00.0: Ring address not aligned 4xxx 0000:2b:00.0: Failed to initialise service qat_crypto 4xxx 0000:2b:00.0: Resetting device qat_dev0 4xxx: probe of 0000:2b:00.0 failed with error -14 Fixes: 9435cce87950 ("xen/swiotlb: Add support for 64KB page granularity") Signed-off-by: Juergen Gross Reviewed-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 35155258a7e2..d7fbba8b4700 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -78,9 +78,15 @@ static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); next_bfn = pfn_to_bfn(xen_pfn); + /* If buffer is physically aligned, ensure DMA alignment. */ + if (IS_ALIGNED(p, algn) && + !IS_ALIGNED((phys_addr_t)next_bfn << XEN_PAGE_SHIFT, algn)) + return 1; + for (i = 1; i < nr_pages; i++) if (pfn_to_bfn(++xen_pfn) != ++next_bfn) return 1; From c3dea3d54f4d399f8044547f0f1abdccbdfb0fee Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Sun, 15 Sep 2024 13:06:44 +0200 Subject: [PATCH 14/14] xen/swiotlb: fix allocated size The allocated size in xen_swiotlb_alloc_coherent() and xen_swiotlb_free_coherent() is calculated wrong for the case of XEN_PAGE_SIZE not matching PAGE_SIZE. Fix that. Fixes: 7250f422da04 ("xen-swiotlb: use actually allocated size on check physical continuous") Reported-by: Jan Beulich Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Reviewed-by: Stefano Stabellini Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index d7fbba8b4700..a337edcf8faf 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -147,7 +147,7 @@ xen_swiotlb_alloc_coherent(struct device *dev, size_t size, void *ret; /* Align the allocation to the Xen page size */ - size = 1UL << (order + XEN_PAGE_SHIFT); + size = ALIGN(size, XEN_PAGE_SIZE); ret = (void *)__get_free_pages(flags, get_order(size)); if (!ret) @@ -179,7 +179,7 @@ xen_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, int order = get_order(size); /* Convert the size to actually allocated. */ - size = 1UL << (order + XEN_PAGE_SHIFT); + size = ALIGN(size, XEN_PAGE_SIZE); if (WARN_ON_ONCE(dma_handle + size - 1 > dev->coherent_dma_mask) || WARN_ON_ONCE(range_straddles_page_boundary(phys, size)))