If the hypervisor doesn't support hugepages, the kernel ends up allocating a large
number of page table pages. The early page table allocation was wrongly
setting the max memblock limit to ppc64_rma_size with radix translation
which resulted in boot failure as shown below.
Kernel panic - not syncing:
early_alloc_pgtable: Failed to allocate 16777216 bytes align=0x1000000 nid=-1 from=0x0000000000000000 max_addr=0xffffffffffffffff
CPU: 0 PID: 0 Comm: swapper Not tainted 5.8.0-24.9-default+ #2
Call Trace:
[c0000000016f3d00] [c0000000007c6470] dump_stack+0xc4/0x114 (unreliable)
[c0000000016f3d40] [c00000000014c78c] panic+0x164/0x418
[c0000000016f3dd0] [c000000000098890] early_alloc_pgtable+0xe0/0xec
[c0000000016f3e60] [c0000000010a5440] radix__early_init_mmu+0x360/0x4b4
[c0000000016f3ef0] [c000000001099bac] early_init_mmu+0x1c/0x3c
[c0000000016f3f10] [c00000000109a320] early_setup+0x134/0x170
This was because the kernel was checking for the radix feature before we enable the
feature via mmu_features. This resulted in the kernel using hash restrictions on
radix.
Rework the early init code such that the kernel boots with memblock restrictions
as imposed by hash. At that point, the kernel still hasn't finalized the
translation it will end up using.
We have three different ways of detecting radix.
1. dt_cpu_ftrs_scan -> used only in case of PowerNV
2. ibm,pa-features -> Used when we don't use dt_cpu_ftrs_scan
3. CAS -> Where we negotiate with hypervisor about the supported translation.
We look at 1 or 2 early in the boot and after that, we look at the CAS vector to
finalize the translation the kernel will use. We also support a kernel command
line option (disable_radix) to switch to hash.
Update the memblock limit after mmu_early_init_devtree() if the kernel is going
to use radix translation. This forces some of the memblock allocations we do before
mmu_early_init_devtree() to be within the RMA limit.
Fixes: 2bfd65e45e ("powerpc/mm/radix: Add radix callbacks for early init routines")
Reported-by: Shirisha Ganta <shiganta@in.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Reviewed-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200828100852.426575-1-aneesh.kumar@linux.ibm.com
282 lines
7.4 KiB
C
282 lines
7.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_POWERPC_BOOK3S_64_MMU_H_
|
|
#define _ASM_POWERPC_BOOK3S_64_MMU_H_
|
|
|
|
#include <asm/page.h>
|
|
|
|
#ifndef __ASSEMBLY__
|
|
/*
|
|
* Page size definition
|
|
*
|
|
* shift : is the "PAGE_SHIFT" value for that page size
|
|
* sllp : is a bit mask with the value of SLB L || LP to be or'ed
|
|
* directly to a slbmte "vsid" value
|
|
* penc : is the HPTE encoding mask for the "LP" field:
|
|
*
|
|
*/
|
|
struct mmu_psize_def {
|
|
unsigned int shift; /* number of bits */
|
|
int penc[MMU_PAGE_COUNT]; /* HPTE encoding */
|
|
unsigned int tlbiel; /* tlbiel supported for that page size */
|
|
unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */
|
|
union {
|
|
unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */
|
|
unsigned long ap; /* Ap encoding used by PowerISA 3.0 */
|
|
};
|
|
};
|
|
extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
|
|
#endif /* __ASSEMBLY__ */
|
|
|
|
/*
 * If we store section details in page->flags we can't increase
 * MAX_PHYSMEM_BITS: if we increase SECTIONS_WIDTH we will not store node
 * details in page->flags, and page_to_nid does a page->section->node lookup.
 * Hence only increase it for VMEMMAP. Further, depend on SPARSEMEM_EXTREME
 * to reduce memory requirements with a large number of sections.
 * 51 bits is the max physical real address on POWER9.
 */
|
|
#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \
|
|
defined(CONFIG_PPC_64K_PAGES)
|
|
#define MAX_PHYSMEM_BITS 51
|
|
#else
|
|
#define MAX_PHYSMEM_BITS 46
|
|
#endif
|
|
|
|
/* 64-bit classic hash table MMU */
|
|
#include <asm/book3s/64/mmu-hash.h>
|
|
|
|
#ifndef __ASSEMBLY__
|
|
/*
|
|
* ISA 3.0 partition and process table entry format
|
|
*/
|
|
struct prtb_entry {
|
|
__be64 prtb0;
|
|
__be64 prtb1;
|
|
};
|
|
extern struct prtb_entry *process_tb;
|
|
|
|
struct patb_entry {
|
|
__be64 patb0;
|
|
__be64 patb1;
|
|
};
|
|
extern struct patb_entry *partition_tb;
|
|
|
|
/* Bits in patb0 field */
|
|
#define PATB_HR (1UL << 63)
|
|
#define RPDB_MASK 0x0fffffffffffff00UL
|
|
/*
 * NOTE(review): despite the _SHIFT suffix this expands to the value
 * (1UL << 8), i.e. 256, not to the shift count 8 — confirm against the
 * users of this macro before relying on it as a shift amount.
 */
#define RPDB_SHIFT (1UL << 8)
|
|
#define RTS1_SHIFT 61 /* top 2 bits of radix tree size */
|
|
#define RTS1_MASK (3UL << RTS1_SHIFT)
|
|
#define RTS2_SHIFT 5 /* bottom 3 bits of radix tree size */
|
|
#define RTS2_MASK (7UL << RTS2_SHIFT)
|
|
#define RPDS_MASK 0x1f /* root page dir. size field */
|
|
|
|
/* Bits in patb1 field */
|
|
#define PATB_GR (1UL << 63) /* guest uses radix; must match HR */
|
|
#define PRTS_MASK 0x1f /* process table size field */
|
|
#define PRTB_MASK 0x0ffffffffffff000UL
|
|
|
|
/* Number of supported PID bits */
|
|
extern unsigned int mmu_pid_bits;
|
|
|
|
/* Base PID to allocate from */
|
|
extern unsigned int mmu_base_pid;
|
|
|
|
/*
|
|
* memory block size used with radix translation.
|
|
*/
|
|
extern unsigned int __ro_after_init radix_mem_block_size;
|
|
|
|
#define PRTB_SIZE_SHIFT (mmu_pid_bits + 4)
|
|
#define PRTB_ENTRIES (1ul << mmu_pid_bits)
|
|
|
|
/*
|
|
* Power9 currently only supports a 64K partition table size.
|
|
*/
|
|
#define PATB_SIZE_SHIFT 16
|
|
|
|
typedef unsigned long mm_context_id_t;
|
|
struct spinlock;
|
|
|
|
/* Maximum possible number of NPUs in a system. */
|
|
#define NV_MAX_NPUS 8
|
|
|
|
typedef struct {
|
|
union {
|
|
/*
|
|
* We use id as the PIDR content for radix. On hash we can use
|
|
* more than one id. The extended ids are used when we start
|
|
* having address above 512TB. We allocate one extended id
|
|
* for each 512TB. The new id is then used with the 49 bit
|
|
* EA to build a new VA. We always use ESID_BITS_1T_MASK bits
|
|
* from EA and new context ids to build the new VAs.
|
|
*/
|
|
mm_context_id_t id;
|
|
mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE];
|
|
};
|
|
|
|
/* Number of bits in the mm_cpumask */
|
|
atomic_t active_cpus;
|
|
|
|
/* Number of users of the external (Nest) MMU */
|
|
atomic_t copros;
|
|
|
|
/* Number of user space windows opened in process mm_context */
|
|
atomic_t vas_windows;
|
|
|
|
struct hash_mm_context *hash_context;
|
|
|
|
unsigned long vdso_base;
|
|
/*
|
|
* pagetable fragment support
|
|
*/
|
|
void *pte_frag;
|
|
void *pmd_frag;
|
|
#ifdef CONFIG_SPAPR_TCE_IOMMU
|
|
struct list_head iommu_group_mem_list;
|
|
#endif
|
|
|
|
#ifdef CONFIG_PPC_MEM_KEYS
|
|
/*
|
|
* Each bit represents one protection key.
|
|
* bit set -> key allocated
|
|
* bit unset -> key available for allocation
|
|
*/
|
|
u32 pkey_allocation_map;
|
|
s16 execute_only_pkey; /* key holding execute-only protection */
|
|
#endif
|
|
} mm_context_t;
|
|
|
|
/* Return the user_psize value stored in the hash context of @ctx. */
static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
{
	struct hash_mm_context *hctx = ctx->hash_context;

	return hctx->user_psize;
}
|
|
|
|
/* Store @user_psize into the hash context of @ctx. */
static inline void mm_ctx_set_user_psize(mm_context_t *ctx, u16 user_psize)
{
	struct hash_mm_context *hctx = ctx->hash_context;

	hctx->user_psize = user_psize;
}
|
|
|
|
/* Return the low_slices_psize member of the hash context of @ctx. */
static inline unsigned char *mm_ctx_low_slices(mm_context_t *ctx)
{
	struct hash_mm_context *hctx = ctx->hash_context;

	return hctx->low_slices_psize;
}
|
|
|
|
/* Return the high_slices_psize member of the hash context of @ctx. */
static inline unsigned char *mm_ctx_high_slices(mm_context_t *ctx)
{
	struct hash_mm_context *hctx = ctx->hash_context;

	return hctx->high_slices_psize;
}
|
|
|
|
/* Return the slb_addr_limit value stored in the hash context of @ctx. */
static inline unsigned long mm_ctx_slb_addr_limit(mm_context_t *ctx)
{
	struct hash_mm_context *hctx = ctx->hash_context;

	return hctx->slb_addr_limit;
}
|
|
|
|
/* Store @limit as the slb_addr_limit of the hash context of @ctx. */
static inline void mm_ctx_set_slb_addr_limit(mm_context_t *ctx, unsigned long limit)
{
	struct hash_mm_context *hctx = ctx->hash_context;

	hctx->slb_addr_limit = limit;
}
|
|
|
|
/*
 * Return the slice mask kept in the hash context of @ctx for page size
 * @psize.
 *
 * 64K is only handled with CONFIG_PPC_64K_PAGES, 16M and 16G only with
 * CONFIG_HUGETLB_PAGE. Any @psize that is not matched must be MMU_PAGE_4K;
 * anything else trips the BUG_ON().
 */
static inline struct slice_mask *slice_mask_for_size(mm_context_t *ctx, int psize)
{
	struct hash_mm_context *hctx = ctx->hash_context;

#ifdef CONFIG_PPC_64K_PAGES
	if (psize == MMU_PAGE_64K)
		return &hctx->mask_64k;
#endif
#ifdef CONFIG_HUGETLB_PAGE
	if (psize == MMU_PAGE_16M)
		return &hctx->mask_16m;
	if (psize == MMU_PAGE_16G)
		return &hctx->mask_16g;
#endif
	/* Only the 4K mask is left at this point. */
	BUG_ON(psize != MMU_PAGE_4K);

	return &hctx->mask_4k;
}
|
|
|
|
#ifdef CONFIG_PPC_SUBPAGE_PROT
|
|
/* Return the spt pointer stored in the hash context of @ctx. */
static inline struct subpage_prot_table *mm_ctx_subpage_prot(mm_context_t *ctx)
{
	struct hash_mm_context *hctx = ctx->hash_context;

	return hctx->spt;
}
|
|
#endif
|
|
|
|
/*
|
|
* The current system page and segment sizes
|
|
*/
|
|
extern int mmu_linear_psize;
|
|
extern int mmu_virtual_psize;
|
|
extern int mmu_vmalloc_psize;
|
|
extern int mmu_vmemmap_psize;
|
|
extern int mmu_io_psize;
|
|
|
|
/* MMU initialization */
|
|
void mmu_early_init_devtree(void);
|
|
void hash__early_init_devtree(void);
|
|
void radix__early_init_devtree(void);
|
|
#ifdef CONFIG_PPC_MEM_KEYS
|
|
void pkey_early_init_devtree(void);
|
|
#else
|
|
static inline void pkey_early_init_devtree(void) {}
|
|
#endif
|
|
|
|
extern void hash__early_init_mmu(void);
|
|
extern void radix__early_init_mmu(void);
|
|
/*
 * Early MMU initialization: dispatch to the radix or the hash variant
 * depending on radix_enabled().
 *
 * The callees are called as plain statements rather than via
 * "return f();": a return statement with an expression is not allowed by
 * ISO C in a function returning void.
 */
static inline void __init early_init_mmu(void)
{
	if (radix_enabled())
		radix__early_init_mmu();
	else
		hash__early_init_mmu();
}
|
|
extern void hash__early_init_mmu_secondary(void);
|
|
extern void radix__early_init_mmu_secondary(void);
|
|
/*
 * Secondary counterpart of the early MMU initialization: dispatch to the
 * radix or the hash variant depending on radix_enabled().
 *
 * The callees are called as plain statements rather than via
 * "return f();": a return statement with an expression is not allowed by
 * ISO C in a function returning void.
 */
static inline void early_init_mmu_secondary(void)
{
	if (radix_enabled())
		radix__early_init_mmu_secondary();
	else
		hash__early_init_mmu_secondary();
}
|
|
|
|
extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
|
|
phys_addr_t first_memblock_size);
|
|
/*
 * Set up the initial memory limit from the first memblock.
 *
 * @first_memblock_base: base address of the first memblock
 * @first_memblock_size: size of the first memblock
 *
 * Both arguments are forwarded unchanged to
 * hash__setup_initial_memory_limit().
 */
static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base,
					      phys_addr_t first_memblock_size)
{
	/*
	 * Hash has more strict restrictions. At this point we don't
	 * know which translations we will pick. Hence go with hash
	 * restrictions.
	 *
	 * Called as a plain statement: ISO C does not allow a return
	 * statement with an expression in a function returning void.
	 */
	hash__setup_initial_memory_limit(first_memblock_base,
					 first_memblock_size);
}
|
|
|
|
#ifdef CONFIG_PPC_PSERIES
|
|
extern void radix_init_pseries(void);
|
|
#else
|
|
/* No-op stub used when CONFIG_PPC_PSERIES is not set. */
static inline void radix_init_pseries(void) {}
|
|
#endif
|
|
|
|
/*
 * Look up the context id covering effective address @ea.
 *
 * @ea is shifted right by MAX_EA_BITS_PER_CONTEXT to select a slot of
 * ctx->extended_id[]. Returns the id found in that slot, or warns and
 * returns 0 when the slot lies outside the array.
 */
static inline int get_user_context(mm_context_t *ctx, unsigned long ea)
{
	int slot = ea >> MAX_EA_BITS_PER_CONTEXT;

	if (likely(slot < ARRAY_SIZE(ctx->extended_id)))
		return ctx->extended_id[slot];

	/* An out-of-range slot should never happen. */
	WARN_ON(1);
	return 0;
}
|
|
|
|
/*
 * Return get_vsid() for @ea and @ssize, using the context id that
 * get_user_context() finds for @ea in @ctx.
 */
static inline unsigned long get_user_vsid(mm_context_t *ctx,
					  unsigned long ea, int ssize)
{
	/* The int result is widened to unsigned long before being passed on. */
	return get_vsid((unsigned long)get_user_context(ctx, ea), ea, ssize);
}
|
|
|
|
#endif /* __ASSEMBLY__ */
|
|
#endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
|