From c603a309cc75f3dd018ddb20ee44c05047918cbf Mon Sep 17 00:00:00 2001 From: Thomas Lendacky Date: Wed, 19 Jun 2019 18:40:57 +0000 Subject: [PATCH 1/8] x86/mm: Identify the end of the kernel area to be reserved The memory occupied by the kernel is reserved using memblock_reserve() in setup_arch(). Currently, the area is from symbols _text to __bss_stop. Everything after __bss_stop must be specifically reserved otherwise it is discarded. This is not clearly documented. Add a new symbol, __end_of_kernel_reserve, that more readily identifies what is reserved, along with comments that indicate what is reserved, what is discarded and what needs to be done to prevent a section from being discarded. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Reviewed-by: Baoquan He Reviewed-by: Dave Hansen Tested-by: Lianbo Jiang Cc: Andy Lutomirski Cc: Brijesh Singh Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joerg Roedel Cc: Juergen Gross Cc: Kees Cook Cc: Nick Desaulniers Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Robert Richter Cc: Sami Tolvanen Cc: Sinan Kaya Cc: Thomas Gleixner Cc: "x86@kernel.org" Link: https://lkml.kernel.org/r/7db7da45b435f8477f25e66f292631ff766a844c.1560969363.git.thomas.lendacky@amd.com --- arch/x86/include/asm/sections.h | 2 ++ arch/x86/kernel/setup.c | 8 +++++++- arch/x86/kernel/vmlinux.lds.S | 9 ++++++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 8ea1cfdbeabc..71b32f2570ab 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -13,4 +13,6 @@ extern char __end_rodata_aligned[]; extern char __end_rodata_hpage_align[]; #endif +extern char __end_of_kernel_reserve[]; + #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 08a5f4a131f5..dac60ad37e5e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -827,8 +827,14 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) void __init setup_arch(char **cmdline_p) { + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ memblock_reserve(__pa_symbol(_text), - (unsigned long)__bss_stop - (unsigned long)_text); + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); /* * Make sure page 0 is always reserved because on systems with diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0850b5149345..ca2252ca6ad7 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -368,6 +368,14 @@ SECTIONS __bss_stop = .; } + /* + * The memory occupied from _text to here, __end_of_kernel_reserve, is + * automatically reserved in setup_arch(). Anything after here must be + * explicitly reserved using memblock_reserve() or it will be discarded + * and treated as available memory. + */ + __end_of_kernel_reserve = .; + . = ALIGN(PAGE_SIZE); .brk : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; @@ -382,7 +390,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG - /* Sections to be discarded */ DISCARDS /DISCARD/ : { *(.eh_frame) From e1bfa87399e372446454ecbaeba2800f0a385733 Mon Sep 17 00:00:00 2001 From: Thomas Lendacky Date: Wed, 19 Jun 2019 18:40:59 +0000 Subject: [PATCH 2/8] x86/mm: Create a workarea in the kernel for SME early encryption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order for the kernel to be encrypted "in place" during boot, a workarea outside of the kernel must be used. This SME workarea used during early encryption of the kernel is situated on a 2MB boundary after the end of the kernel text, data, etc. sections (_end). This works well during initial boot of a compressed kernel because of the relocation used for decompression of the kernel. But when performing a kexec boot, there's a chance that the SME workarea may not be mapped by the kexec pagetables or that some of the other data used by kexec could exist in this range. Create a section for SME in vmlinux.lds.S. Position it after "_end", which is after "__end_of_kernel_reserve", so that the memory will be reclaimed during boot and since this area is all zeroes, it compresses well. This new section will be part of the kernel image, so kexec will account for it in pagetable mappings and placement of data after the kernel. Here's an example of a kernel size without and with the SME section: without: vmlinux: 36,501,616 bzImage: 6,497,344 100000000-47f37ffff : System RAM 1e4000000-1e47677d4 : Kernel code (0x7677d4) 1e47677d5-1e4e2e0bf : Kernel data (0x6c68ea) 1e5074000-1e5372fff : Kernel bss (0x2fefff) with: vmlinux: 44,419,408 bzImage: 6,503,136 880000000-c7ff7ffff : System RAM 8cf000000-8cf7677d4 : Kernel code (0x7677d4) 8cf7677d5-8cfe2e0bf : Kernel data (0x6c68ea) 8d0074000-8d0372fff : Kernel bss (0x2fefff) Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Reviewed-by: Baoquan He Reviewed-by: Dave Hansen Tested-by: Lianbo Jiang Cc: Andy Lutomirski Cc: Brijesh Singh Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joerg Roedel Cc: Kees Cook Cc: Nick Desaulniers Cc: Peter Zijlstra Cc: "Rafael Ávila de Espíndola" Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: "x86@kernel.org" Link: https://lkml.kernel.org/r/3c483262eb4077b1654b2052bd14a8d011bffde3.1560969363.git.thomas.lendacky@amd.com --- arch/x86/kernel/vmlinux.lds.S | 25 +++++++++++++++++++++++++ arch/x86/mm/mem_encrypt_identity.c | 22 ++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ca2252ca6ad7..147cd020516a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -387,6 +387,31 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Early scratch/workarea section: Lives outside of the kernel proper + * (_text - _end). + * + * Resides after _end because even though the .brk section is after + * __end_of_kernel_reserve, the .brk section is later reserved as a + * part of the kernel. Since it is located after __end_of_kernel_reserve + * it will be discarded and become part of the available memory. As + * such, it can only be used by very early boot code and must not be + * needed afterwards. + * + * Currently used by SME for performing in-place encryption of the + * kernel during boot. Resides on a 2MB boundary to simplify the + * pagetable setup used for SME in-place encryption. + */ + . = ALIGN(HPAGE_SIZE); + .init.scratch : AT(ADDR(.init.scratch) - LOAD_OFFSET) { + __init_scratch_begin = .; + *(.init.scratch) + . = ALIGN(HPAGE_SIZE); + __init_scratch_end = .; + } +#endif + STABS_DEBUG DWARF_DEBUG diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 4aa9b1480866..6a8dd483f7d9 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -73,6 +73,19 @@ struct sme_populate_pgd_data { unsigned long vaddr_end; }; +/* + * This work area lives in the .init.scratch section, which lives outside of + * the kernel proper. It is sized to hold the intermediate copy buffer and + * more than enough pagetable pages. + * + * By using this section, the kernel can be encrypted in place and it + * avoids any possibility of boot parameters or initramfs images being + * placed such that the in-place encryption logic overwrites them. This + * section is 2MB aligned to allow for simple pagetable setup using only + * PMD entries (see vmlinux.lds.S). + */ +static char sme_workarea[2 * PMD_PAGE_SIZE] __section(.init.scratch); + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; static char sme_cmdline_off[] __initdata = "off"; @@ -314,8 +327,13 @@ void __init sme_encrypt_kernel(struct boot_params *bp) } #endif - /* Set the encryption workarea to be immediately after the kernel */ - workarea_start = kernel_end; + /* + * We're running identity mapped, so we must obtain the address to the + * SME encryption workarea using rip-relative addressing. + */ + asm ("lea sme_workarea(%%rip), %0" + : "=r" (workarea_start) + : "p" (sme_workarea)); /* * Calculate required number of workarea bytes needed: From ae9e13d621d6795ec1ad6bf10bd2549c6c3feca4 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 23 Apr 2019 09:30:05 +0800 Subject: [PATCH 3/8] x86/e820, ioport: Add a new I/O resource descriptor IORES_DESC_RESERVED When executing the kexec_file_load() syscall, the first kernel needs to pass the e820 reserved ranges to the second kernel because some devices (PCI, for example) need them present in the kdump kernel for proper initialization. But the kernel can not exactly match the e820 reserved ranges when walking through the iomem resources using the default IORES_DESC_NONE descriptor, because there are several types of e820 ranges which are marked IORES_DESC_NONE, see e820_type_to_iores_desc(). Therefore, add a new I/O resource descriptor called IORES_DESC_RESERVED to mark exactly those ranges. It will be used to match the reserved resource ranges when walking through iomem resources. [ bp: Massage commit message. ] Suggested-by: Borislav Petkov Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: bhe@redhat.com Cc: dave.hansen@linux.intel.com Cc: dyoung@redhat.com Cc: "H. Peter Anvin" Cc: Huang Zijiang Cc: Ingo Molnar Cc: Joe Perches Cc: Juergen Gross Cc: kexec@lists.infradead.org Cc: Masayoshi Mizuma Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: x86-ml Link: https://lkml.kernel.org/r/20190423013007.17838-2-lijiang@redhat.com --- arch/x86/kernel/e820.c | 2 +- include/linux/ioport.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 8f32e705a980..e69408bf664b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1063,10 +1063,10 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ - case E820_TYPE_RESERVED: /* Fall-through: */ default: return IORES_DESC_NONE; } } diff --git a/include/linux/ioport.h b/include/linux/ioport.h index da0ebaec25f0..6ed59de48bd5 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -133,6 +133,7 @@ enum { IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, IORES_DESC_DEVICE_PUBLIC_MEMORY = 7, + IORES_DESC_RESERVED = 8, }; /* helpers to define resources */ From 5da04cc86d1215fd9fe0e5c88ead6e8428a75e56 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 23 Apr 2019 09:30:06 +0800 Subject: [PATCH 4/8] x86/mm: Rework ioremap resource mapping determination On ioremap(), __ioremap_check_mem() does a couple of checks on the supplied memory range to determine how the range should be mapped and in particular what protection flags should be used. Generalize the procedure by introducing IORES_MAP_* flags which control different aspects of the ioremapping and use them in the respective helpers which determine which descriptor flags should be set per range. [ bp: - Rewrite commit message. - Add/improve comments. - Reflow __ioremap_caller()'s args. - s/__ioremap_check_desc/__ioremap_check_encrypted/g; - s/__ioremap_res_check/__ioremap_collect_map_flags/g; - clarify __ioremap_check_ram()'s purpose. ] Signed-off-by: Lianbo Jiang Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: bhe@redhat.com Cc: Dave Hansen Cc: dyoung@redhat.com Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: kexec@lists.infradead.org Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: x86-ml Link: https://lkml.kernel.org/r/20190423013007.17838-3-lijiang@redhat.com --- arch/x86/mm/ioremap.c | 71 ++++++++++++++++++++++++++---------------- include/linux/ioport.h | 9 ++++++ 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4b6423e7bd21..e500f1df1140 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -28,9 +28,11 @@ #include "physaddr.h" -struct ioremap_mem_flags { - bool system_ram; - bool desc_other; +/* + * Descriptor controlling ioremap() behavior. + */ +struct ioremap_desc { + unsigned int flags; }; /* @@ -62,13 +64,14 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } -static bool __ioremap_check_ram(struct resource *res) +/* Does the range (or a subset of) contain normal RAM? */ +static unsigned int __ioremap_check_ram(struct resource *res) { unsigned long start_pfn, stop_pfn; unsigned long i; if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) - return false; + return 0; start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; stop_pfn = (res->end + 1) >> PAGE_SHIFT; @@ -76,28 +79,44 @@ static bool __ioremap_check_ram(struct resource *res) for (i = 0; i < (stop_pfn - start_pfn); ++i) if (pfn_valid(start_pfn + i) && !PageReserved(pfn_to_page(start_pfn + i))) - return true; + return IORES_MAP_SYSTEM_RAM; } - return false; + return 0; } -static int __ioremap_check_desc_other(struct resource *res) +/* + * In a SEV guest, NONE and RESERVED should not be mapped encrypted because + * there the whole memory is already encrypted. + */ +static unsigned int __ioremap_check_encrypted(struct resource *res) { - return (res->desc != IORES_DESC_NONE); + if (!sev_active()) + return 0; + + switch (res->desc) { + case IORES_DESC_NONE: + case IORES_DESC_RESERVED: + break; + default: + return IORES_MAP_ENCRYPTED; + } + + return 0; } -static int __ioremap_res_check(struct resource *res, void *arg) +static int __ioremap_collect_map_flags(struct resource *res, void *arg) { - struct ioremap_mem_flags *flags = arg; + struct ioremap_desc *desc = arg; - if (!flags->system_ram) - flags->system_ram = __ioremap_check_ram(res); + if (!(desc->flags & IORES_MAP_SYSTEM_RAM)) + desc->flags |= __ioremap_check_ram(res); - if (!flags->desc_other) - flags->desc_other = __ioremap_check_desc_other(res); + if (!(desc->flags & IORES_MAP_ENCRYPTED)) + desc->flags |= __ioremap_check_encrypted(res); - return flags->system_ram && flags->desc_other; + return ((desc->flags & (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)) == + (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)); } /* @@ -106,15 +125,15 @@ static int __ioremap_res_check(struct resource *res, void *arg) * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). */ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, - struct ioremap_mem_flags *flags) + struct ioremap_desc *desc) { u64 start, end; start = (u64)addr; end = start + size - 1; - memset(flags, 0, sizeof(*flags)); + memset(desc, 0, sizeof(struct ioremap_desc)); - walk_mem_res(start, end, flags, __ioremap_res_check); + walk_mem_res(start, end, desc, __ioremap_collect_map_flags); } /* @@ -131,15 +150,15 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, enum page_cache_mode pcm, - void *caller, bool encrypted) +static void __iomem * +__ioremap_caller(resource_size_t phys_addr, unsigned long size, + enum page_cache_mode pcm, void *caller, bool encrypted) { unsigned long offset, vaddr; resource_size_t last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; - struct ioremap_mem_flags mem_flags; + struct ioremap_desc io_desc; struct vm_struct *area; enum page_cache_mode new_pcm; pgprot_t prot; @@ -158,12 +177,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - __ioremap_check_mem(phys_addr, size, &mem_flags); + __ioremap_check_mem(phys_addr, size, &io_desc); /* * Don't allow anybody to remap normal RAM that we're using.. */ - if (mem_flags.system_ram) { + if (io_desc.flags & IORES_MAP_SYSTEM_RAM) { WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", &phys_addr, &last_addr); return NULL; @@ -201,7 +220,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * resulting mapping. */ prot = PAGE_KERNEL_IO; - if ((sev_active() && mem_flags.desc_other) || encrypted) + if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted) prot = pgprot_encrypted(prot); switch (pcm) { diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 6ed59de48bd5..5db386cfc2d4 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -12,6 +12,7 @@ #ifndef __ASSEMBLY__ #include #include +#include /* * Resources are tree-like, allowing * nesting etc.. @@ -136,6 +137,14 @@ enum { IORES_DESC_RESERVED = 8, }; +/* + * Flags controlling ioremap() behavior. + */ +enum { + IORES_MAP_SYSTEM_RAM = BIT(0), + IORES_MAP_ENCRYPTED = BIT(1), +}; + /* helpers to define resources */ #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ { \ From 980621daf368f2b9aa69c7ea01baa654edb7577b Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 23 Apr 2019 09:30:07 +0800 Subject: [PATCH 5/8] x86/crash: Add e820 reserved ranges to kdump kernel's e820 table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At present, when using the kexec_file_load() syscall to load the kernel image and initramfs, for example: kexec -s -p xxx the kernel does not pass the e820 reserved ranges to the second kernel, which might cause two problems: 1. MMCONFIG: A device in PCI segment 1 cannot be discovered by the kernel PCI probing without all the e820 I/O reservations being present in the e820 table. Which is the case currently, because the kdump kernel does not have those reservations because the kexec command does not pass the I/O reservation via the "memmap=xxx" command line option. Further details courtesy of Bjorn Helgaas¹: I think you should regard correct MCFG/ECAM usage in the kdump kernel as a requirement. MMCONFIG (aka ECAM) space is described in the ACPI MCFG table. If you don't have ECAM: (a) PCI devices won't work at all on non-x86 systems that use only ECAM for config access, (b) you won't be able to access devices on non-0 segments (granted, there aren't very many of these yet, but there will be more in the future), and (c) you won't be able to access extended config space (addresses 0x100-0xfff), which means none of the Extended Capabilities will be available (AER, ACS, ATS, etc). 2. The second issue is that the SME kdump kernel doesn't work without the e820 reserved ranges. When SME is active in the kdump kernel, those reserved regions are still decrypted, but because those reserved ranges are not present at all in kdump kernel's e820 table, they are accessed as encrypted. Which is obviously wrong. [1]: https://lkml.kernel.org/r/CABhMZUUscS3jUZUSM5Y6EYJK6weo7Mjj5-EAKGvbw0qEe%2B38zw@mail.gmail.com [ bp: Heavily massage commit message. ] Suggested-by: Dave Young Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Bjorn Helgaas Cc: dave.hansen@linux.intel.com Cc: Dave Young Cc: "Gustavo A. R. Silva" Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: kexec@lists.infradead.org Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: x86-ml Cc: Yi Wang Link: https://lkml.kernel.org/r/20190423013007.17838-4-lijiang@redhat.com --- arch/x86/kernel/crash.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 576b2e1bfc12..32c956705b8e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -381,6 +381,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); + /* Add e820 reserved ranges */ + cmd.type = E820_TYPE_RESERVED; + flags = IORESOURCE_MEM; + walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, + memmap_entry_callback); + /* Add crashk_low_res region */ if (crashk_low_res.end) { ei.addr = crashk_low_res.start; From 1a79c1b8a04153c4c387518967ce851f89e22733 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 30 Apr 2019 15:44:19 +0800 Subject: [PATCH 6/8] x86/kexec: Do not map kexec area as decrypted when SEV is active When a virtual machine panics, its memory needs to be dumped for analysis. With memory encryption in the picture, special care must be taken when loading a kexec/kdump kernel in a SEV guest. A SEV guest starts and runs fully encrypted. In order to load a kexec kernel and initrd, arch_kexec_post_{alloc,free}_pages() need to not map areas as decrypted unconditionally but differentiate whether the kernel is running as a SEV guest and if so, leave kexec area encrypted. [ bp: Reduce commit message to the relevant information pertaining to this commit only. ] Co-developed-by: Brijesh Singh Signed-off-by: Brijesh Singh Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: bhe@redhat.com Cc: Brijesh Singh Cc: dyoung@redhat.com Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: kexec@lists.infradead.org Cc: "Kirill A. Shutemov" Cc: Thomas Gleixner Cc: Tom Lendacky Cc: x86-ml Link: https://lkml.kernel.org/r/20190430074421.7852-2-lijiang@redhat.com --- arch/x86/kernel/machine_kexec_64.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index ceba408ea982..3b38449028e0 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -559,8 +559,20 @@ void arch_kexec_unprotect_crashkres(void) kexec_mark_crashkres(false); } +/* + * During a traditional boot under SME, SME will encrypt the kernel, + * so the SME kexec kernel also needs to be un-encrypted in order to + * replicate a normal SME boot. + * + * During a traditional boot under SEV, the kernel has already been + * loaded encrypted, so the SEV kexec kernel needs to be encrypted in + * order to replicate a normal SEV boot. + */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { + if (sev_active()) + return 0; + /* * If SME is active we need to be sure that kexec pages are * not encrypted because when we boot to the new kernel the @@ -571,6 +583,9 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { + if (sev_active()) + return; + /* * If SME is active we need to reset the pages back to being * an encrypted mapping before freeing them. From 85784d16c2cf172cf1ebaf2390d6b7c4045d659c Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 30 Apr 2019 15:44:20 +0800 Subject: [PATCH 7/8] x86/kexec: Set the C-bit in the identity map page table when SEV is active When SEV is active, the second kernel image is loaded into encrypted memory. For that, make sure that when kexec builds the identity mapping page table, the memory is encrypted (i.e., _PAGE_ENC is set). [ bp: Sort local args and OR in _PAGE_ENC for more clarity. ] Co-developed-by: Brijesh Singh Signed-off-by: Brijesh Singh Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: bhe@redhat.com Cc: dyoung@redhat.com Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: kexec@lists.infradead.org Cc: "Kirill A. Shutemov" Cc: Thomas Gleixner Cc: Tom Lendacky Cc: x86-ml Link: https://lkml.kernel.org/r/20190430074421.7852-3-lijiang@redhat.com --- arch/x86/kernel/machine_kexec_64.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 3b38449028e0..16c37fe489bc 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -50,12 +50,13 @@ static void free_transition_pgtable(struct kimage *image) static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { + pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; + unsigned long vaddr, paddr; + int result = -ENOMEM; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long vaddr, paddr; - int result = -ENOMEM; vaddr = (unsigned long)relocate_kernel; paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); @@ -92,7 +93,11 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); + + if (sev_active()) + prot = PAGE_KERNEL_EXEC; + + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); return 0; err: return result; @@ -129,6 +134,11 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); + if (sev_active()) { + info.page_flag |= _PAGE_ENC; + info.kernpg_flag |= _PAGE_ENC; + } + if (direct_gbpages) info.direct_gbpages = true; From 4eb5fec31e613105668a1472d5876f3d0558e5d8 Mon Sep 17 00:00:00 2001 From: Lianbo Jiang Date: Tue, 30 Apr 2019 15:44:21 +0800 Subject: [PATCH 8/8] fs/proc/vmcore: Enable dumping of encrypted memory when SEV was active In the kdump kernel, the memory of the first kernel gets to be dumped into a vmcore file. Similarly to SME kdump, if SEV was enabled in the first kernel, the old memory has to be remapped encrypted in order to access it properly. Commit 992b649a3f01 ("kdump, proc/vmcore: Enable kdumping encrypted memory with SME enabled") took care of the SME case but it uses sme_active() which checks for SME only. Use mem_encrypt_active() instead, which returns true when either SME or SEV is active. Unlike SME, the second kernel images (kernel and initrd) are loaded into encrypted memory when SEV is active, hence the kernel elf header must be remapped as encrypted in order to access it properly. [ bp: Massage commit message. ] Co-developed-by: Brijesh Singh Signed-off-by: Brijesh Singh Signed-off-by: Lianbo Jiang Signed-off-by: Borislav Petkov Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Arnd Bergmann Cc: bhe@redhat.com Cc: dyoung@redhat.com Cc: Ganesh Goudar Cc: H. Peter Anvin Cc: kexec@lists.infradead.org Cc: linux-fsdevel@vger.kernel.org Cc: Matthew Wilcox Cc: Mike Rapoport Cc: mingo@redhat.com Cc: Rahul Lakkireddy Cc: Souptick Joarder Cc: Thomas Gleixner Cc: Tom Lendacky Cc: x86-ml Link: https://lkml.kernel.org/r/20190430074421.7852-4-lijiang@redhat.com --- fs/proc/vmcore.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7bb96fdd38ad..57957c91c6df 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -166,7 +166,7 @@ void __weak elfcorehdr_free(unsigned long long addr) */ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, false); + return read_from_oldmem(buf, count, ppos, 0, sev_active()); } /* @@ -174,7 +174,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, sme_active()); + return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active()); } /* @@ -374,7 +374,7 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, buflen); start = m->paddr + *fpos - m->offset; tmp = read_from_oldmem(buffer, tsz, &start, - userbuf, sme_active()); + userbuf, mem_encrypt_active()); if (tmp < 0) return tmp; buflen -= tsz;