linux/arch/x86/mm/pgtable.c
Ian Campbell 1431559200 x86, mm: Allow highmem user page tables to be disabled at boot time
Distros generally (I looked at Debian, RHEL5 and SLES11) seem to
enable CONFIG_HIGHPTE for any x86 configuration which has highmem
enabled. This means that the overhead applies even to machines which
have a fairly modest amount of high memory and which therefore do not
really benefit from allocating PTEs in high memory but still pay the
price of the additional mapping operations.

Running kernbench on a 4G box I found that with CONFIG_HIGHPTE=y but
no actual highptes being allocated there was a reduction in system
time used from 59.737s to 55.9s.

With CONFIG_HIGHPTE=y and highmem PTEs being allocated:
  Average Optimal load -j 4 Run (std deviation):
  Elapsed Time 175.396 (0.238914)
  User Time 515.983 (5.85019)
  System Time 59.737 (1.26727)
  Percent CPU 263.8 (71.6796)
  Context Switches 39989.7 (4672.64)
  Sleeps 42617.7 (246.307)

With CONFIG_HIGHPTE=y but with no highmem PTEs being allocated:
  Average Optimal load -j 4 Run (std deviation):
  Elapsed Time 174.278 (0.831968)
  User Time 515.659 (6.07012)
  System Time 55.9 (1.07799)
  Percent CPU 263.8 (71.266)
  Context Switches 39929.6 (4485.13)
  Sleeps 42583.7 (373.039)

This patch allows the user to control the allocation of PTEs in
highmem from the command line ("userpte=nohigh") but retains the
status-quo as the default.

It is possible that some simple heuristic could be developed which
allows auto-tuning of this option however I don't have a sufficiently
large machine available to me to perform any particularly meaningful
experiments. We could probably handwave up an argument for a threshold
at 16G of total RAM.

Assuming 768M of lowmem we have 196608 potential lowmem PTE
pages. Each page can map 2M of RAM in a PAE-enabled configuration,
meaning a maximum of 384G of RAM could potentially be mapped using
lowmem PTEs.

Even allowing generous factor of 10 to account for other required
lowmem allocations, generous slop to account for page sharing (which
reduces the total amount of RAM mappable by a given number of PT
pages) and other innacuracies in the estimations it would seem that
even a 32G machine would not have a particularly pressing need for
highmem PTEs. I think 32G could be considered to be at the upper bound
of what might be sensible on a 32 bit machine (although I think in
practice 64G is still supported).

It's seems questionable if HIGHPTE is even a win for any amount of RAM
you would sensibly run a 32 bit kernel on rather than going 64 bit.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
LKML-Reference: <1266403090-20162-1-git-send-email-ian.campbell@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
2010-02-25 10:28:19 +01:00

375 lines
8.5 KiB
C

#include <linux/mm.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
#else
#define PGALLOC_USER_GFP 0
#endif
gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
return (pte_t *)__get_free_page(PGALLOC_GFP);
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *pte;
pte = alloc_pages(__userpte_alloc_gfp, 0);
if (pte)
pgtable_page_ctor(pte);
return pte;
}
static int __init setup_userpte(char *arg)
{
if (!arg)
return -EINVAL;
/*
* "userpte=nohigh" disables allocation of user pagetables in
* high memory.
*/
if (strcmp(arg, "nohigh") == 0)
__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
else
return -EINVAL;
return 0;
}
early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
tlb_remove_page(tlb, pte);
}
#if PAGETABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pmd));
}
#if PAGETABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
list_add(&page->lru, &pgd_list);
}
static inline void pgd_list_del(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
list_del(&page->lru);
}
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
static void pgd_ctor(pgd_t *pgd)
{
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
if (PAGETABLE_LEVELS == 2 ||
(PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
PAGETABLE_LEVELS == 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
__pa(swapper_pg_dir) >> PAGE_SHIFT,
KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
}
/* list required to sync kernel mapping updates */
if (!SHARED_KERNEL_PMD)
pgd_list_add(pgd);
}
static void pgd_dtor(pgd_t *pgd)
{
unsigned long flags; /* can be called from interrupt context */
if (SHARED_KERNEL_PMD)
return;
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}
/*
* List of all pgd's needed for non-PAE so it can invalidate entries
* in both cached and uncached pgd's; not needed for PAE since the
* kernel pmd is shared. If PAE were not to share the pmd a similar
* tactic would be needed. This is essentially codepath-based locking
* against pageattr.c; it is the unique case in which a valid change
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
* vmalloc faults work because attached pagetables are never freed.
* -- wli
*/
#ifdef CONFIG_X86_PAE
/*
* In PAE mode, we need to do a cr3 reload (=tlb flush) when
* updating the top-level pagetable entries to guarantee the
* processor notices the update. Since this is expensive, and
* all 4 top-level entries are used almost immediately in a
* new process's life, we just pre-populate them here.
*
* Also, if we're in a paravirt environment where the kernel pmd is
* not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
* and initialize the kernel pmds here.
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
/* Note: almost everything apart from _PAGE_PRESENT is
reserved at the pmd (PDPT) level. */
set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
/*
* According to Intel App note "TLBs, Paging-Structure Caches,
* and Their Invalidation", April 2007, document 317080-001,
* section 8.1: in PAE mode we explicitly have to flush the
* TLB via cr3 if the top-level pgd is changed...
*/
if (mm == current->active_mm)
write_cr3(read_cr3());
}
#else /* !CONFIG_X86_PAE */
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
#endif /* CONFIG_X86_PAE */
static void free_pmds(pmd_t *pmds[])
{
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++)
if (pmds[i])
free_page((unsigned long)pmds[i]);
}
static int preallocate_pmds(pmd_t *pmds[])
{
int i;
bool failed = false;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
if (pmd == NULL)
failed = true;
pmds[i] = pmd;
}
if (failed) {
free_pmds(pmds);
return -ENOMEM;
}
return 0;
}
/*
* Mop up any pmd pages which may still be attached to the pgd.
* Normally they will be freed by munmap/exit_mmap, but any pmd we
* preallocate which never got a corresponding vma will need to be
* freed manually.
*/
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pgd_t pgd = pgdp[i];
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
pgdp[i] = native_make_pgd(0);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
}
}
}
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
pud_t *pud;
unsigned long addr;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
return;
pud = pud_offset(pgd, 0);
for (addr = i = 0; i < PREALLOCATED_PMDS;
i++, pud++, addr += PUD_SIZE) {
pmd_t *pmd = pmds[i];
if (i >= KERNEL_PGD_BOUNDARY)
memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
sizeof(pmd_t) * PTRS_PER_PMD);
pud_populate(mm, pud, pmd);
}
}
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *pmds[PREALLOCATED_PMDS];
unsigned long flags;
pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
if (pgd == NULL)
goto out;
mm->pgd = pgd;
if (preallocate_pmds(pmds) != 0)
goto out_free_pgd;
if (paravirt_pgd_alloc(mm) != 0)
goto out_free_pmds;
/*
* Make sure that pre-populating the pmds is atomic with
* respect to anything walking the pgd_list, so that they
* never see a partially populated pgd.
*/
spin_lock_irqsave(&pgd_lock, flags);
pgd_ctor(pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
spin_unlock_irqrestore(&pgd_lock, flags);
return pgd;
out_free_pmds:
free_pmds(pmds);
out_free_pgd:
free_page((unsigned long)pgd);
out:
return NULL;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
free_page((unsigned long)pgd);
}
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
int changed = !pte_same(*ptep, entry);
if (changed && dirty) {
*ptep = entry;
pte_update_defer(vma->vm_mm, address, ptep);
flush_tlb_page(vma, address);
}
return changed;
}
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
int ret = 0;
if (pte_young(*ptep))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &ptep->pte);
if (ret)
pte_update(vma->vm_mm, addr, ptep);
return ret;
}
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
int young;
young = ptep_test_and_clear_young(vma, address, ptep);
if (young)
flush_tlb_page(vma, address);
return young;
}
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
*
* Can be used to relocate the fixmap area and poke a hole in the top
* of kernel address space to make room for a hypervisor.
*/
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
BUG_ON(fixmaps_set > 0);
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
#endif
}
int fixmaps_set;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
unsigned long address = __fix_to_virt(idx);
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
}
set_pte_vaddr(address, pte);
fixmaps_set++;
}
void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
pgprot_t flags)
{
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}