mirror of
https://github.com/torvalds/linux.git
synced 2024-11-18 10:01:43 +00:00
c119ecce89
The VMI backend uses explicit page type notification to track shadow page tables. The allocation of page table roots is especially tricky. We need to clone the root for non-PAE mode while it is protected under the pgd lock to correctly copy the shadow. We don't need to allocate pgds in PAE mode, (PDPs in Intel terminology) as they only have 4 entries, and are cached entirely by the processor, which makes shadowing them rather simple. For base page table level allocation, pmd_populate provides the exact hook point we need. Also, we need to allocate pages when splitting a large page, and we must release pages before returning the page to any free pool. Despite being required with these slightly odd semantics for VMI, Xen also uses these hooks to determine the exact moment when page tables are created or released. AK: All nops for other architectures Signed-off-by: Zachary Amsden <zach@vmware.com> Signed-off-by: Andi Kleen <ak@suse.de> Cc: Andi Kleen <ak@suse.de> Cc: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Chris Wright <chrisw@sous-sol.org> Signed-off-by: Andrew Morton <akpm@osdl.org>
259 lines
6.4 KiB
C
259 lines
6.4 KiB
C
/*
|
|
* Copyright 2002 Andi Kleen, SuSE Labs.
|
|
* Thanks to Ben LaHaise for precious feedback.
|
|
*/
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/module.h>
|
|
#include <linux/slab.h>
|
|
#include <asm/uaccess.h>
|
|
#include <asm/processor.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/sections.h>
|
|
|
|
static DEFINE_SPINLOCK(cpa_lock);
|
|
static struct list_head df_list = LIST_HEAD_INIT(df_list);
|
|
|
|
|
|
pte_t *lookup_address(unsigned long address)
|
|
{
|
|
pgd_t *pgd = pgd_offset_k(address);
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
if (pgd_none(*pgd))
|
|
return NULL;
|
|
pud = pud_offset(pgd, address);
|
|
if (pud_none(*pud))
|
|
return NULL;
|
|
pmd = pmd_offset(pud, address);
|
|
if (pmd_none(*pmd))
|
|
return NULL;
|
|
if (pmd_large(*pmd))
|
|
return (pte_t *)pmd;
|
|
return pte_offset_kernel(pmd, address);
|
|
}
|
|
|
|
static struct page *split_large_page(unsigned long address, pgprot_t prot,
|
|
pgprot_t ref_prot)
|
|
{
|
|
int i;
|
|
unsigned long addr;
|
|
struct page *base;
|
|
pte_t *pbase;
|
|
|
|
spin_unlock_irq(&cpa_lock);
|
|
base = alloc_pages(GFP_KERNEL, 0);
|
|
spin_lock_irq(&cpa_lock);
|
|
if (!base)
|
|
return NULL;
|
|
|
|
/*
|
|
* page_private is used to track the number of entries in
|
|
* the page table page that have non standard attributes.
|
|
*/
|
|
SetPagePrivate(base);
|
|
page_private(base) = 0;
|
|
|
|
address = __pa(address);
|
|
addr = address & LARGE_PAGE_MASK;
|
|
pbase = (pte_t *)page_address(base);
|
|
paravirt_alloc_pt(page_to_pfn(base));
|
|
for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
|
|
set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT,
|
|
addr == address ? prot : ref_prot));
|
|
}
|
|
return base;
|
|
}
|
|
|
|
static void flush_kernel_map(void *arg)
|
|
{
|
|
unsigned long adr = (unsigned long)arg;
|
|
|
|
if (adr && cpu_has_clflush) {
|
|
int i;
|
|
for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
|
|
asm volatile("clflush (%0)" :: "r" (adr + i));
|
|
} else if (boot_cpu_data.x86_model >= 4)
|
|
wbinvd();
|
|
|
|
/* Flush all to work around Errata in early athlons regarding
|
|
* large page flushing.
|
|
*/
|
|
__flush_tlb_all();
|
|
}
|
|
|
|
static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
|
|
{
|
|
struct page *page;
|
|
unsigned long flags;
|
|
|
|
set_pte_atomic(kpte, pte); /* change init_mm */
|
|
if (PTRS_PER_PMD > 1)
|
|
return;
|
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
|
for (page = pgd_list; page; page = (struct page *)page->index) {
|
|
pgd_t *pgd;
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
pgd = (pgd_t *)page_address(page) + pgd_index(address);
|
|
pud = pud_offset(pgd, address);
|
|
pmd = pmd_offset(pud, address);
|
|
set_pte_atomic((pte_t *)pmd, pte);
|
|
}
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
}
|
|
|
|
/*
|
|
* No more special protections in this 2/4MB area - revert to a
|
|
* large page again.
|
|
*/
|
|
static inline void revert_page(struct page *kpte_page, unsigned long address)
|
|
{
|
|
pgprot_t ref_prot;
|
|
pte_t *linear;
|
|
|
|
ref_prot =
|
|
((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
|
|
? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE;
|
|
|
|
linear = (pte_t *)
|
|
pmd_offset(pud_offset(pgd_offset_k(address), address), address);
|
|
set_pmd_pte(linear, address,
|
|
pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
|
|
ref_prot));
|
|
}
|
|
|
|
static int
|
|
__change_page_attr(struct page *page, pgprot_t prot)
|
|
{
|
|
pte_t *kpte;
|
|
unsigned long address;
|
|
struct page *kpte_page;
|
|
|
|
BUG_ON(PageHighMem(page));
|
|
address = (unsigned long)page_address(page);
|
|
|
|
kpte = lookup_address(address);
|
|
if (!kpte)
|
|
return -EINVAL;
|
|
kpte_page = virt_to_page(kpte);
|
|
if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
|
|
if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
|
|
set_pte_atomic(kpte, mk_pte(page, prot));
|
|
} else {
|
|
pgprot_t ref_prot;
|
|
struct page *split;
|
|
|
|
ref_prot =
|
|
((address & LARGE_PAGE_MASK) < (unsigned long)&_etext)
|
|
? PAGE_KERNEL_EXEC : PAGE_KERNEL;
|
|
split = split_large_page(address, prot, ref_prot);
|
|
if (!split)
|
|
return -ENOMEM;
|
|
set_pmd_pte(kpte,address,mk_pte(split, ref_prot));
|
|
kpte_page = split;
|
|
}
|
|
page_private(kpte_page)++;
|
|
} else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
|
|
set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
|
|
BUG_ON(page_private(kpte_page) == 0);
|
|
page_private(kpte_page)--;
|
|
} else
|
|
BUG();
|
|
|
|
/*
|
|
* If the pte was reserved, it means it was created at boot
|
|
* time (not via split_large_page) and in turn we must not
|
|
* replace it with a largepage.
|
|
*/
|
|
if (!PageReserved(kpte_page)) {
|
|
if (cpu_has_pse && (page_private(kpte_page) == 0)) {
|
|
ClearPagePrivate(kpte_page);
|
|
paravirt_release_pt(page_to_pfn(kpte_page));
|
|
list_add(&kpte_page->lru, &df_list);
|
|
revert_page(kpte_page, address);
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static inline void flush_map(void *adr)
|
|
{
|
|
on_each_cpu(flush_kernel_map, adr, 1, 1);
|
|
}
|
|
|
|
/*
|
|
* Change the page attributes of an page in the linear mapping.
|
|
*
|
|
* This should be used when a page is mapped with a different caching policy
|
|
* than write-back somewhere - some CPUs do not like it when mappings with
|
|
* different caching policies exist. This changes the page attributes of the
|
|
* in kernel linear mapping too.
|
|
*
|
|
* The caller needs to ensure that there are no conflicting mappings elsewhere.
|
|
* This function only deals with the kernel linear map.
|
|
*
|
|
* Caller must call global_flush_tlb() after this.
|
|
*/
|
|
int change_page_attr(struct page *page, int numpages, pgprot_t prot)
|
|
{
|
|
int err = 0;
|
|
int i;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&cpa_lock, flags);
|
|
for (i = 0; i < numpages; i++, page++) {
|
|
err = __change_page_attr(page, prot);
|
|
if (err)
|
|
break;
|
|
}
|
|
spin_unlock_irqrestore(&cpa_lock, flags);
|
|
return err;
|
|
}
|
|
|
|
void global_flush_tlb(void)
|
|
{
|
|
struct list_head l;
|
|
struct page *pg, *next;
|
|
|
|
BUG_ON(irqs_disabled());
|
|
|
|
spin_lock_irq(&cpa_lock);
|
|
list_replace_init(&df_list, &l);
|
|
spin_unlock_irq(&cpa_lock);
|
|
if (!cpu_has_clflush)
|
|
flush_map(NULL);
|
|
list_for_each_entry_safe(pg, next, &l, lru) {
|
|
if (cpu_has_clflush)
|
|
flush_map(page_address(pg));
|
|
__free_page(pg);
|
|
}
|
|
}
|
|
|
|
#ifdef CONFIG_DEBUG_PAGEALLOC
|
|
void kernel_map_pages(struct page *page, int numpages, int enable)
|
|
{
|
|
if (PageHighMem(page))
|
|
return;
|
|
if (!enable)
|
|
debug_check_no_locks_freed(page_address(page),
|
|
numpages * PAGE_SIZE);
|
|
|
|
/* the return value is ignored - the calls cannot fail,
|
|
* large pages are disabled at boot time.
|
|
*/
|
|
change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
|
|
/* we should perform an IPI and flush all tlbs,
|
|
* but that can deadlock->flush only current cpu.
|
|
*/
|
|
__flush_tlb_all();
|
|
}
|
|
#endif
|
|
|
|
EXPORT_SYMBOL(change_page_attr);
|
|
EXPORT_SYMBOL(global_flush_tlb);
|