From c9b5ad546e7d486465a3dd8c89245ac3707a4384 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 14 Jun 2016 12:56:01 +0200 Subject: [PATCH 01/50] s390/mm: tag normal pages vs pages used in page tables The ESSA instruction has a new option that allows to tag pages that are not used as a page table. Without the tag the hypervisor has to assume that any guest page could be used in a page table inside the guest. This forces the hypervisor to flush all guest TLB entries whenever a host page table entry is invalidated. With the tag the host can skip the TLB flush if the page is tagged as normal page. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/page-states.h | 1 + arch/s390/include/asm/page.h | 3 + arch/s390/include/asm/setup.h | 3 +- arch/s390/kernel/suspend.c | 22 +++- arch/s390/kernel/vdso.c | 2 + arch/s390/mm/init.c | 2 + arch/s390/mm/page-states.c | 196 +++++++++++++++++++++++++--- arch/s390/mm/pgalloc.c | 2 + 8 files changed, 209 insertions(+), 22 deletions(-) diff --git a/arch/s390/include/asm/page-states.h b/arch/s390/include/asm/page-states.h index 42267a2fe29e..ca21b28a7b17 100644 --- a/arch/s390/include/asm/page-states.h +++ b/arch/s390/include/asm/page-states.h @@ -13,6 +13,7 @@ #define ESSA_SET_POT_VOLATILE 4 #define ESSA_SET_STABLE_RESIDENT 5 #define ESSA_SET_STABLE_IF_RESIDENT 6 +#define ESSA_SET_STABLE_NODAT 7 #define ESSA_MAX ESSA_SET_STABLE_IF_RESIDENT diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 624deaa44230..b463df89f344 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -133,6 +133,9 @@ static inline int page_reset_referenced(unsigned long addr) struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); +void arch_set_page_dat(struct page *page, int order); +void arch_set_page_nodat(struct page *page, int order); +int arch_test_page_nodat(struct page *page); void arch_set_page_states(int make_stable); static inline int devmem_is_allowed(unsigned long pfn) diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index cd78155b1829..11e59ba4b521 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -106,7 +106,8 @@ extern void pfault_fini(void); void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); -extern void cmma_init(void); +void cmma_init(void); +void cmma_init_nodat(void); extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c index 39e2f41b6cf0..c8ea715bfe10 100644 --- a/arch/s390/kernel/suspend.c +++ b/arch/s390/kernel/suspend.c @@ -98,10 +98,16 @@ int page_key_alloc(unsigned long pages) */ void page_key_read(unsigned long *pfn) { + struct page *page; unsigned long addr; + unsigned char key; - addr = (unsigned long) page_address(pfn_to_page(*pfn)); - *(unsigned char *) pfn = (unsigned char) page_get_storage_key(addr); + page = pfn_to_page(*pfn); + addr = (unsigned long) page_address(page); + key = (unsigned char) page_get_storage_key(addr) & 0x7f; + if (arch_test_page_nodat(page)) + key |= 0x80; + *(unsigned char *) pfn = key; } /* @@ -126,8 +132,16 @@ void page_key_memorize(unsigned long *pfn) */ void page_key_write(void *address) { - page_set_storage_key((unsigned long) address, - page_key_rp->data[page_key_rx], 0); + struct page *page; + unsigned char key; + + key = page_key_rp->data[page_key_rx]; + page_set_storage_key((unsigned long) address, key & 0x7f, 0); + page = virt_to_page(address); + if (key & 0x80) + arch_set_page_nodat(page, 0); + else + arch_set_page_dat(page, 0); if (++page_key_rx >= PAGE_KEY_DATA_SIZE) return; page_key_rp = page_key_rp->next; diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index b89d19f6f2ab..eacda05b45d7 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -157,6 +157,8 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) page_frame = get_zeroed_page(GFP_KERNEL); if (!segment_table || !page_table || !page_frame) goto out; + arch_set_page_dat(virt_to_page(segment_table), SEGMENT_ORDER); + arch_set_page_dat(virt_to_page(page_table), 0); /* Initialize per-cpu vdso data page */ vd = (struct vdso_per_cpu_data *) page_frame; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8111694ce55a..3aee54b2ba60 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -137,6 +137,8 @@ void __init mem_init(void) free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ + cmma_init_nodat(); + mem_init_print_info(NULL); } diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 69a7b01ae746..07fa7b8ae233 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -10,9 +10,10 @@ #include #include #include +#include #include #include - +#include #include static int cmma_flag = 1; @@ -36,14 +37,16 @@ __setup("cmma=", cmma); static inline int cmma_test_essa(void) { register unsigned long tmp asm("0") = 0; - register int rc asm("1") = -EOPNOTSUPP; + register int rc asm("1"); + /* test ESSA_GET_STATE */ asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,0,0\n" + " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" "0: la %0,0\n" "1:\n" EX_TABLE(0b,1b) - : "+&d" (rc), "+&d" (tmp)); + : "=&d" (rc), "+&d" (tmp) + : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); return rc; } @@ -51,11 +54,26 @@ void __init cmma_init(void) { if (!cmma_flag) return; - if (cmma_test_essa()) + if (cmma_test_essa()) { cmma_flag = 0; + return; + } + if (test_facility(147)) + cmma_flag = 2; } -static inline void set_page_unstable(struct page *page, int order) +static inline unsigned char get_page_state(struct page *page) +{ + unsigned char state; + + asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (state) + : "a" (page_to_phys(page)), + "i" (ESSA_GET_STATE)); + return state & 0x3f; +} + +static inline void set_page_unused(struct page *page, int order) { int i, rc; @@ -66,14 +84,7 @@ static inline void set_page_unstable(struct page *page, int order) "i" (ESSA_SET_UNUSED)); } -void arch_free_page(struct page *page, int order) -{ - if (!cmma_flag) - return; - set_page_unstable(page, order); -} - -static inline void set_page_stable(struct page *page, int order) +static inline void set_page_stable_dat(struct page *page, int order) { int i, rc; @@ -84,11 +95,162 @@ static inline void set_page_stable(struct page *page, int order) "i" (ESSA_SET_STABLE)); } +static inline void set_page_stable_nodat(struct page *page, int order) +{ + int i, rc; + + for (i = 0; i < (1 << order); i++) + asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" + : "=&d" (rc) + : "a" (page_to_phys(page + i)), + "i" (ESSA_SET_STABLE_NODAT)); +} + +static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pmd_t *pmd; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || pmd_large(*pmd)) + continue; + page = virt_to_page(pmd_val(*pmd)); + set_bit(PG_arch_1, &page->flags); + } while (pmd++, addr = next, addr != end); +} + +static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + pud_t *pud; + int i; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none(*pud) || pud_large(*pud)) + continue; + if (!pud_folded(*pud)) { + page = virt_to_page(pud_val(*pud)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pmd(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) +{ + unsigned long next; + struct page *page; + p4d_t *p4d; + int i; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d)) + continue; + if (!p4d_folded(*p4d)) { + page = virt_to_page(p4d_val(*p4d)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_pud(p4d, addr, next); + } while (p4d++, addr = next, addr != end); +} + +static void mark_kernel_pgd(void) +{ + unsigned long addr, next; + struct page *page; + pgd_t *pgd; + int i; + + addr = 0; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, MODULES_END); + if (pgd_none(*pgd)) + continue; + if (!pgd_folded(*pgd)) { + page = virt_to_page(pgd_val(*pgd)); + for (i = 0; i < 3; i++) + set_bit(PG_arch_1, &page[i].flags); + } + mark_kernel_p4d(pgd, addr, next); + } while (pgd++, addr = next, addr != MODULES_END); +} + +void __init cmma_init_nodat(void) +{ + struct memblock_region *reg; + struct page *page; + unsigned long start, end, ix; + + if (cmma_flag < 2) + return; + /* Mark pages used in kernel page tables */ + mark_kernel_pgd(); + + /* Set all kernel pages not used for page tables to stable/no-dat */ + for_each_memblock(memory, reg) { + start = memblock_region_memory_base_pfn(reg); + end = memblock_region_memory_end_pfn(reg); + page = pfn_to_page(start); + for (ix = start; ix < end; ix++, page++) { + if (__test_and_clear_bit(PG_arch_1, &page->flags)) + continue; /* skip page table pages */ + if (!list_empty(&page->lru)) + continue; /* skip free pages */ + set_page_stable_nodat(page, 0); + } + } +} + +void arch_free_page(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_unused(page, order); +} + void arch_alloc_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_stable(page, order); + if (cmma_flag < 2) + set_page_stable_dat(page, order); + else + set_page_stable_nodat(page, order); +} + +void arch_set_page_dat(struct page *page, int order) +{ + if (!cmma_flag) + return; + set_page_stable_dat(page, order); +} + +void arch_set_page_nodat(struct page *page, int order) +{ + if (cmma_flag < 2) + return; + set_page_stable_nodat(page, order); +} + +int arch_test_page_nodat(struct page *page) +{ + unsigned char state; + + if (cmma_flag < 2) + return 0; + state = get_page_state(page); + return !!(state & 0x20); } void arch_set_page_states(int make_stable) @@ -108,9 +270,9 @@ void arch_set_page_states(int make_stable) list_for_each(l, &zone->free_area[order].free_list[t]) { page = list_entry(l, struct page, lru); if (make_stable) - set_page_stable(page, order); + set_page_stable_dat(page, 0); else - set_page_unstable(page, order); + set_page_unused(page, order); } } spin_unlock_irqrestore(&zone->lock, flags); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 18918e394ce4..a4de34ce392c 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -57,6 +57,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!page) return NULL; + arch_set_page_dat(page, 2); return (unsigned long *) page_to_phys(page); } @@ -214,6 +215,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) __free_page(page); return NULL; } + arch_set_page_dat(page, 0); /* Initialize page table */ table = (unsigned long *) page_to_phys(page); if (mm_alloc_pgste(mm)) { From 118bd31bea2cdb7f1dbf22dd9a58e818b5313156 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 26 Jul 2016 16:53:09 +0200 Subject: [PATCH 02/50] s390/mm: add no-dat TLB flush optimization Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 41 ++++++++---- arch/s390/include/asm/setup.h | 6 +- arch/s390/include/asm/tlbflush.h | 5 +- arch/s390/mm/pageattr.c | 2 +- arch/s390/mm/pgtable.c | 111 ++++++++++++++++++++++++------- drivers/s390/char/sclp_early.c | 6 +- 6 files changed, 129 insertions(+), 42 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 57057fb1cc07..c92713893313 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -952,15 +952,27 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_GLOBAL 0 #define IPTE_LOCAL 1 -static inline void __ptep_ipte(unsigned long address, pte_t *ptep, int local) +#define IPTE_NODAT 0x400 + +static inline void __ptep_ipte(unsigned long address, pte_t *ptep, + unsigned long opt, int local) { unsigned long pto = (unsigned long) ptep; - /* Invalidation + TLB flush for the pte */ + if (__builtin_constant_p(opt) && opt == 0) { + /* Invalidation + TLB flush for the pte */ + asm volatile( + " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" + : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), + [m4] "i" (local)); + return; + } + + /* Invalidate ptes with options + TLB flush of the ptes */ asm volatile( - " .insn rrf,0xb2210000,%[r1],%[r2],0,%[m4]" - : "+m" (*ptep) : [r1] "a" (pto), [r2] "a" (address), - [m4] "i" (local)); + " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" + : [r2] "+a" (address), [r3] "+a" (opt) + : [r1] "a" (pto), [m4] "i" (local) : "memory"); } static inline void __ptep_ipte_range(unsigned long address, int nr, @@ -1341,31 +1353,36 @@ static inline void __pmdp_csp(pmd_t *pmdp) #define IDTE_GLOBAL 0 #define IDTE_LOCAL 1 -static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp, int local) +#define IDTE_PTOA 0x0800 +#define IDTE_NODAT 0x1000 + +static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, + unsigned long opt, int local) { unsigned long sto; - sto = (unsigned long) pmdp - pmd_index(address) * sizeof(pmd_t); + sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); asm volatile( " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" : "+m" (*pmdp) - : [r1] "a" (sto), [r2] "a" ((address & HPAGE_MASK)), + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), [m4] "i" (local) : "cc" ); } -static inline void __pudp_idte(unsigned long address, pud_t *pudp, int local) +static inline void __pudp_idte(unsigned long addr, pud_t *pudp, + unsigned long opt, int local) { unsigned long r3o; - r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t); + r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; asm volatile( " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" : "+m" (*pudp) - : [r1] "a" (r3o), [r2] "a" ((address & PUD_MASK)), + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), [m4] "i" (local) - : "cc"); + : "cc" ); } pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 11e59ba4b521..49c425903894 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -29,8 +29,9 @@ #define MACHINE_FLAG_TE _BITUL(11) #define MACHINE_FLAG_TLB_LC _BITUL(12) #define MACHINE_FLAG_VX _BITUL(13) -#define MACHINE_FLAG_NX _BITUL(14) -#define MACHINE_FLAG_GS _BITUL(15) +#define MACHINE_FLAG_TLB_GUEST _BITUL(14) +#define MACHINE_FLAG_NX _BITUL(15) +#define MACHINE_FLAG_GS _BITUL(16) #define LPP_MAGIC _BITUL(31) #define LPP_PFAULT_PID_MASK _AC(0xffffffff, UL) @@ -68,6 +69,7 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #define MACHINE_HAS_TLB_LC (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_LC) #define MACHINE_HAS_VX (S390_lowcore.machine_flags & MACHINE_FLAG_VX) +#define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 39846100682a..38d82ed60345 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -20,10 +20,13 @@ static inline void __tlb_flush_local(void) */ static inline void __tlb_flush_idte(unsigned long asce) { + unsigned long opt; + + opt = IDTE_PTOA; /* Global TLB flush for the mm */ asm volatile( " .insn rrf,0xb98e0000,0,%0,%1,0" - : : "a" (2048), "a" (asce) : "cc"); + : : "a" (opt), "a" (asce) : "cc"); } #ifdef CONFIG_SMP diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 180481589246..5734b01ca765 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -328,7 +328,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) return; } for (i = 0; i < nr; i++) { - __ptep_ipte(address, pte, IPTE_GLOBAL); + __ptep_ipte(address, pte, 0, IPTE_GLOBAL); address += PAGE_SIZE; pte++; } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index d4d409ba206b..9696bf89f03a 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -25,6 +25,38 @@ #include #include +static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL) + opt |= IPTE_NODAT; + __ptep_ipte(addr, ptep, opt, IPTE_LOCAL); + } else { + __ptep_ipte(addr, ptep, 0, IPTE_LOCAL); + } +} + +static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + unsigned long opt, asce; + + if (MACHINE_HAS_TLB_GUEST) { + opt = 0; + asce = READ_ONCE(mm->context.gmap_asce); + if (asce == 0UL) + opt |= IPTE_NODAT; + __ptep_ipte(addr, ptep, opt, IPTE_GLOBAL); + } else { + __ptep_ipte(addr, ptep, 0, IPTE_GLOBAL); + } +} + static inline pte_t ptep_flush_direct(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -36,9 +68,9 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __ptep_ipte(addr, ptep, IPTE_LOCAL); + ptep_ipte_local(mm, addr, ptep); else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep); atomic_dec(&mm->context.flush_count); return old; } @@ -57,7 +89,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, pte_val(*ptep) |= _PAGE_INVALID; mm->context.flush_mm = 1; } else - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep); atomic_dec(&mm->context.flush_count); return old; } @@ -290,6 +322,26 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_modify_prot_commit); +static inline void pmdp_idte_local(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT, IDTE_LOCAL); + else + __pmdp_idte(addr, pmdp, 0, IDTE_LOCAL); +} + +static inline void pmdp_idte_global(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pmdp_idte(addr, pmdp, IDTE_NODAT, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pmdp_idte(addr, pmdp, 0, IDTE_GLOBAL); + else + __pmdp_csp(pmdp); +} + static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { @@ -298,16 +350,12 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm, old = *pmdp; if (pmd_val(old) & _SEGMENT_ENTRY_INVALID) return old; - if (!MACHINE_HAS_IDTE) { - __pmdp_csp(pmdp); - return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pmdp_idte(addr, pmdp, IDTE_LOCAL); + pmdp_idte_local(mm, addr, pmdp); else - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); + pmdp_idte_global(mm, addr, pmdp); atomic_dec(&mm->context.flush_count); return old; } @@ -325,10 +373,9 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, cpumask_of(smp_processor_id()))) { pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; mm->context.flush_mm = 1; - } else if (MACHINE_HAS_IDTE) - __pmdp_idte(addr, pmdp, IDTE_GLOBAL); - else - __pmdp_csp(pmdp); + } else { + pmdp_idte_global(mm, addr, pmdp); + } atomic_dec(&mm->context.flush_count); return old; } @@ -359,6 +406,30 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(pmdp_xchg_lazy); +static inline void pudp_idte_local(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT, IDTE_LOCAL); + else + __pudp_idte(addr, pudp, 0, IDTE_LOCAL); +} + +static inline void pudp_idte_global(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (MACHINE_HAS_TLB_GUEST) + __pudp_idte(addr, pudp, IDTE_NODAT, IDTE_GLOBAL); + else if (MACHINE_HAS_IDTE) + __pudp_idte(addr, pudp, 0, IDTE_GLOBAL); + else + /* + * Invalid bit position is the same for pmd and pud, so we can + * re-use _pmd_csp() here + */ + __pmdp_csp((pmd_t *) pudp); +} + static inline pud_t pudp_flush_direct(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { @@ -367,20 +438,12 @@ static inline pud_t pudp_flush_direct(struct mm_struct *mm, old = *pudp; if (pud_val(old) & _REGION_ENTRY_INVALID) return old; - if (!MACHINE_HAS_IDTE) { - /* - * Invalid bit position is the same for pmd and pud, so we can - * re-use _pmd_csp() here - */ - __pmdp_csp((pmd_t *) pudp); - return old; - } atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - __pudp_idte(addr, pudp, IDTE_LOCAL); + pudp_idte_local(mm, addr, pudp); else - __pudp_idte(addr, pudp, IDTE_GLOBAL); + pudp_idte_global(mm, addr, pudp); atomic_dec(&mm->context.flush_count); return old; } @@ -645,7 +708,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); - __ptep_ipte(addr, ptep, IPTE_GLOBAL); + ptep_ipte_global(mm, addr, ptep); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; else diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index efd84d1d178b..bc1fc00910b0 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -39,7 +39,7 @@ struct read_info_sccb { u8 fac84; /* 84 */ u8 fac85; /* 85 */ u8 _pad_86[91 - 86]; /* 86-90 */ - u8 flags; /* 91 */ + u8 fac91; /* 91 */ u8 _pad_92[98 - 92]; /* 92-97 */ u8 fac98; /* 98 */ u8 hamaxpow; /* 99 */ @@ -103,6 +103,8 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) sclp.has_kss = !!(sccb->fac98 & 0x01); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; + if (sccb->fac91 & 0x40) + S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_GUEST; sclp.rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; sclp.rzm = sccb->rnsize ? sccb->rnsize : sccb->rnsize2; sclp.rzm <<= 20; @@ -139,7 +141,7 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) /* Save IPL information */ sclp_ipl_info.is_valid = 1; - if (sccb->flags & 0x2) + if (sccb->fac91 & 0x2) sclp_ipl_info.has_dump = 1; memcpy(&sclp_ipl_info.loadparm, &sccb->loadparm, LOADPARM_LEN); From 28c807e5132ecc9f1607461eabfa1fc67b21e163 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 26 Jul 2016 16:00:22 +0200 Subject: [PATCH 03/50] s390/mm: add guest ASCE TLB flush optimization Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 58 +++++++++++++++++++++++--------- arch/s390/include/asm/tlbflush.h | 2 ++ arch/s390/mm/pageattr.c | 2 +- arch/s390/mm/pgtable.c | 36 +++++++++++++------- 4 files changed, 70 insertions(+), 28 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c92713893313..b20b0f7170e8 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -953,9 +953,11 @@ static inline pte_t pte_mkhuge(pte_t pte) #define IPTE_LOCAL 1 #define IPTE_NODAT 0x400 +#define IPTE_GUEST_ASCE 0x800 static inline void __ptep_ipte(unsigned long address, pte_t *ptep, - unsigned long opt, int local) + unsigned long opt, unsigned long asce, + int local) { unsigned long pto = (unsigned long) ptep; @@ -969,6 +971,7 @@ static inline void __ptep_ipte(unsigned long address, pte_t *ptep, } /* Invalidate ptes with options + TLB flush of the ptes */ + opt = opt | (asce & _ASCE_ORIGIN); asm volatile( " .insn rrf,0xb2210000,%[r1],%[r2],%[r3],%[m4]" : [r2] "+a" (address), [r3] "+a" (opt) @@ -1355,34 +1358,59 @@ static inline void __pmdp_csp(pmd_t *pmdp) #define IDTE_PTOA 0x0800 #define IDTE_NODAT 0x1000 +#define IDTE_GUEST_ASCE 0x2000 static inline void __pmdp_idte(unsigned long addr, pmd_t *pmdp, - unsigned long opt, int local) + unsigned long opt, unsigned long asce, + int local) { unsigned long sto; sto = (unsigned long) pmdp - pmd_index(addr) * sizeof(pmd_t); - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pmdp) - : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), - [m4] "i" (local) - : "cc" ); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK)), + [m4] "i" (local) + : "cc" ); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pmdp) + : [r1] "a" (sto), [r2] "a" ((addr & HPAGE_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } static inline void __pudp_idte(unsigned long addr, pud_t *pudp, - unsigned long opt, int local) + unsigned long opt, unsigned long asce, + int local) { unsigned long r3o; r3o = (unsigned long) pudp - pud_index(addr) * sizeof(pud_t); r3o |= _ASCE_TYPE_REGION3; - asm volatile( - " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" - : "+m" (*pudp) - : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), - [m4] "i" (local) - : "cc" ); + if (__builtin_constant_p(opt) && opt == 0) { + /* flush without guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],0,%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK)), + [m4] "i" (local) + : "cc"); + } else { + /* flush with guest asce */ + asm volatile( + " .insn rrf,0xb98e0000,%[r1],%[r2],%[r3],%[m4]" + : "+m" (*pudp) + : [r1] "a" (r3o), [r2] "a" ((addr & PUD_MASK) | opt), + [r3] "a" (asce), [m4] "i" (local) + : "cc" ); + } } pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t); diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 38d82ed60345..4d759f8f4bc7 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -23,6 +23,8 @@ static inline void __tlb_flush_idte(unsigned long asce) unsigned long opt; opt = IDTE_PTOA; + if (MACHINE_HAS_TLB_GUEST) + opt |= IDTE_GUEST_ASCE; /* Global TLB flush for the mm */ asm volatile( " .insn rrf,0xb98e0000,0,%0,%1,0" diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 5734b01ca765..567fe92e2bb8 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -328,7 +328,7 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) return; } for (i = 0; i < nr; i++) { - __ptep_ipte(address, pte, 0, IPTE_GLOBAL); + __ptep_ipte(address, pte, 0, 0, IPTE_GLOBAL); address += PAGE_SIZE; pte++; } diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9696bf89f03a..3f1abc7b5fd2 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -35,9 +35,13 @@ static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL) opt |= IPTE_NODAT; - __ptep_ipte(addr, ptep, opt, IPTE_LOCAL); + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_LOCAL); } else { - __ptep_ipte(addr, ptep, 0, IPTE_LOCAL); + __ptep_ipte(addr, ptep, 0, 0, IPTE_LOCAL); } } @@ -51,9 +55,13 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, asce = READ_ONCE(mm->context.gmap_asce); if (asce == 0UL) opt |= IPTE_NODAT; - __ptep_ipte(addr, ptep, opt, IPTE_GLOBAL); + if (asce != -1UL) { + asce = asce ? : mm->context.asce; + opt |= IPTE_GUEST_ASCE; + } + __ptep_ipte(addr, ptep, opt, asce, IPTE_GLOBAL); } else { - __ptep_ipte(addr, ptep, 0, IPTE_GLOBAL); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); } } @@ -326,18 +334,20 @@ static inline void pmdp_idte_local(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { if (MACHINE_HAS_TLB_GUEST) - __pmdp_idte(addr, pmdp, IDTE_NODAT, IDTE_LOCAL); + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); else - __pmdp_idte(addr, pmdp, 0, IDTE_LOCAL); + __pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL); } static inline void pmdp_idte_global(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { if (MACHINE_HAS_TLB_GUEST) - __pmdp_idte(addr, pmdp, IDTE_NODAT, IDTE_GLOBAL); + __pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); else if (MACHINE_HAS_IDTE) - __pmdp_idte(addr, pmdp, 0, IDTE_GLOBAL); + __pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); } @@ -410,18 +420,20 @@ static inline void pudp_idte_local(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { if (MACHINE_HAS_TLB_GUEST) - __pudp_idte(addr, pudp, IDTE_NODAT, IDTE_LOCAL); + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_LOCAL); else - __pudp_idte(addr, pudp, 0, IDTE_LOCAL); + __pudp_idte(addr, pudp, 0, 0, IDTE_LOCAL); } static inline void pudp_idte_global(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { if (MACHINE_HAS_TLB_GUEST) - __pudp_idte(addr, pudp, IDTE_NODAT, IDTE_GLOBAL); + __pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE, + mm->context.asce, IDTE_GLOBAL); else if (MACHINE_HAS_IDTE) - __pudp_idte(addr, pudp, 0, IDTE_GLOBAL); + __pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL); else /* * Invalid bit position is the same for pmd and pud, so we can From cd774b9076845312caf76ad52523bd9219e89ae1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 26 Jul 2016 17:02:31 +0200 Subject: [PATCH 04/50] s390/mm,kvm: use nodat PGSTE tag to optimize TLB flushing Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 1 + arch/s390/mm/pgtable.c | 47 +++++++++++++++++++++------------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index b20b0f7170e8..bb59a0aa3249 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -376,6 +376,7 @@ static inline int is_module_addr(void *addr) /* Guest Page State used for virtualization */ #define _PGSTE_GPS_ZERO 0x0000000080000000UL +#define _PGSTE_GPS_NODAT 0x0000000040000000UL #define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL #define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL #define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 3f1abc7b5fd2..8d018c76ee85 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -26,14 +26,14 @@ #include static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, int nodat) { unsigned long opt, asce; if (MACHINE_HAS_TLB_GUEST) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); - if (asce == 0UL) + if (asce == 0UL || nodat) opt |= IPTE_NODAT; if (asce != -1UL) { asce = asce ? : mm->context.asce; @@ -46,14 +46,14 @@ static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, } static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, int nodat) { unsigned long opt, asce; if (MACHINE_HAS_TLB_GUEST) { opt = 0; asce = READ_ONCE(mm->context.gmap_asce); - if (asce == 0UL) + if (asce == 0UL || nodat) opt |= IPTE_NODAT; if (asce != -1UL) { asce = asce ? : mm->context.asce; @@ -66,7 +66,8 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr, } static inline pte_t ptep_flush_direct(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -76,15 +77,16 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (MACHINE_HAS_TLB_LC && cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) - ptep_ipte_local(mm, addr, ptep); + ptep_ipte_local(mm, addr, ptep, nodat); else - ptep_ipte_global(mm, addr, ptep); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } static inline pte_t ptep_flush_lazy(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + int nodat) { pte_t old; @@ -97,7 +99,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, pte_val(*ptep) |= _PAGE_INVALID; mm->context.flush_mm = 1; } else - ptep_ipte_global(mm, addr, ptep); + ptep_ipte_global(mm, addr, ptep, nodat); atomic_dec(&mm->context.flush_count); return old; } @@ -269,10 +271,12 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_direct(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_direct(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -284,10 +288,12 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new); preempt_enable(); return old; @@ -299,10 +305,12 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, { pgste_t pgste; pte_t old; + int nodat; preempt_disable(); pgste = ptep_xchg_start(mm, addr, ptep); - old = ptep_flush_lazy(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + old = ptep_flush_lazy(mm, addr, ptep, nodat); if (mm_has_pgste(mm)) { pgste = pgste_update_all(old, pgste, mm); pgste_set(ptep, pgste); @@ -557,7 +565,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, { pte_t entry; pgste_t pgste; - int pte_i, pte_p; + int pte_i, pte_p, nodat; pgste = pgste_get_lock(ptep); entry = *ptep; @@ -570,13 +578,14 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, return -EAGAIN; } /* Change access rights and set pgste bit */ + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); if (prot == PROT_NONE && !pte_i) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); pte_val(entry) |= _PAGE_INVALID; } if (prot == PROT_READ && !pte_p) { - ptep_flush_direct(mm, addr, ptep); + ptep_flush_direct(mm, addr, ptep, nodat); pte_val(entry) &= ~_PAGE_INVALID; pte_val(entry) |= _PAGE_PROTECT; } @@ -616,10 +625,12 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep) { pgste_t pgste; + int nodat; pgste = pgste_get_lock(ptep); /* notifier is called by the caller */ - ptep_flush_direct(mm, saddr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_flush_direct(mm, saddr, ptep, nodat); /* don't touch the storage key - it belongs to parent pgste */ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID)); pgste_set_unlock(ptep, pgste); @@ -692,6 +703,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte_t *ptep; pte_t pte; bool dirty; + int nodat; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); @@ -720,7 +732,8 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr) pte = *ptep; if (dirty && (pte_val(pte) & _PAGE_PRESENT)) { pgste = pgste_pte_notify(mm, addr, ptep, pgste); - ptep_ipte_global(mm, addr, ptep); + nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); + ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) pte_val(pte) |= _PAGE_PROTECT; else From 8457d7754ba72a8b41900ccac46ff0561fc6e962 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jun 2017 08:57:24 +0200 Subject: [PATCH 05/50] s390/mm: remove and correct comments within pgtable.h Reviewed-by: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index bb59a0aa3249..f8d9369947e0 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -11,18 +11,6 @@ #ifndef _ASM_S390_PGTABLE_H #define _ASM_S390_PGTABLE_H -/* - * The Linux memory management assumes a three-level page table setup. - * For s390 64 bit we use up to four of the five levels the hardware - * provides (region first tables are not used). - * - * The "pgd_xxx()" functions are trivial for a folded two-level - * setup: the pgd is never bad, and a pmd always exists (as it's folded - * into the pgd entry) - * - * This file contains the functions and defines necessary to modify and use - * the S390 page table tree. - */ #ifndef __ASSEMBLY__ #include #include @@ -79,11 +67,6 @@ extern unsigned long zero_page_mask; /* TODO: s390 cannot support io_remap_pfn_range... */ #endif /* !__ASSEMBLY__ */ -/* - * PMD_SHIFT determines the size of the area a second-level page - * table can map - * PGDIR_SHIFT determines what a third-level page table entry can map - */ #define PMD_SHIFT 20 #define PUD_SHIFT 31 #define P4D_SHIFT 42 @@ -98,12 +81,6 @@ extern unsigned long zero_page_mask; #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -/* - * entries per page directory level: the S390 is two-level, so - * we don't really have any PMD directory physically. - * for S390 segment-table entries are combined to one PGD - * that leads to 1024 pte per pgd - */ #define PTRS_PER_PTE 256 #define PTRS_PER_PMD 2048 #define PTRS_PER_PUD 2048 @@ -269,7 +246,7 @@ static inline int is_module_addr(void *addr) */ /* Bits in the segment/region table address-space-control-element */ -#define _ASCE_ORIGIN ~0xfffUL/* segment table origin */ +#define _ASCE_ORIGIN ~0xfffUL/* region/segment table origin */ #define _ASCE_PRIVATE_SPACE 0x100 /* private space control */ #define _ASCE_ALT_EVENT 0x80 /* storage alteration event control */ #define _ASCE_SPACE_SWITCH 0x40 /* space switch event */ @@ -320,9 +297,9 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL #define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL #define _SEGMENT_ENTRY_ORIGIN_LARGE ~0xfffffUL /* large page address */ -#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* segment table origin */ -#define _SEGMENT_ENTRY_PROTECT 0x200 /* page protection bit */ -#define _SEGMENT_ENTRY_NOEXEC 0x100 /* region no-execute bit */ +#define _SEGMENT_ENTRY_ORIGIN ~0x7ffUL/* page table origin */ +#define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ +#define _SEGMENT_ENTRY_NOEXEC 0x100 /* segment no-execute bit */ #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ #define _SEGMENT_ENTRY (0) From a31a00ba7c161ac3741c4c8a4872f88891b24554 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jun 2017 09:07:11 +0200 Subject: [PATCH 06/50] s390/mm: get rid of __ASSEMBLY__ guards within pgtable.h We have C code also outside of #ifndef __ASSEMBLY__. So these guards seem to be quite pointless and can be removed. Reviewed-by: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index f8d9369947e0..888420348ad1 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -11,7 +11,6 @@ #ifndef _ASM_S390_PGTABLE_H #define _ASM_S390_PGTABLE_H -#ifndef __ASSEMBLY__ #include #include #include @@ -65,7 +64,6 @@ extern unsigned long zero_page_mask; #define __HAVE_COLOR_ZERO_PAGE /* TODO: s390 cannot support io_remap_pfn_range... */ -#endif /* !__ASSEMBLY__ */ #define PMD_SHIFT 20 #define PUD_SHIFT 31 @@ -100,7 +98,6 @@ extern unsigned long zero_page_mask; #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %p.\n", __FILE__, __LINE__, (void *) pgd_val(e)) -#ifndef __ASSEMBLY__ /* * The vmalloc and module area will always be on the topmost area of the * kernel mapping. We reserve 128GB (64bit) for vmalloc and modules. @@ -1571,8 +1568,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#endif /* !__ASSEMBLY__ */ - #define kern_addr_valid(addr) (1) extern int vmem_add_mapping(unsigned long start, unsigned long size); From c67da7c7c5d4c0a45b079b21f6991cb7e753856e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 16 Jun 2017 17:24:39 +0200 Subject: [PATCH 07/50] s390/mm: introduce defines to reflect the hardware mmu Add various defines like e.g. _REGION1_SHIFT to reflect the hardware mmu. We have quite a bit code that does not make use of the Linux memory management primitives but directly modifies page, segment and region values. Most of this is open-coded like e.g. "1UL << 53". In order to clean this up introduce a couple of new defines. The existing Linux memory management defines are changed, so the mapping to the hardware implementation is reflected. Reviewed-by: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/page.h | 10 +++-- arch/s390/include/asm/pgalloc.h | 2 + arch/s390/include/asm/pgtable.h | 68 +++++++++++++++++++++++---------- 3 files changed, 57 insertions(+), 23 deletions(-) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index b463df89f344..c68b21b12182 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -10,10 +10,14 @@ #include #include +#define _PAGE_SHIFT 12 +#define _PAGE_SIZE (_AC(1, UL) << _PAGE_SHIFT) +#define _PAGE_MASK (~(_PAGE_SIZE - 1)) + /* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) +#define PAGE_SHIFT _PAGE_SHIFT +#define PAGE_SIZE _PAGE_SIZE +#define PAGE_MASK _PAGE_MASK #define PAGE_DEFAULT_ACC 0 #define PAGE_DEFAULT_KEY (PAGE_DEFAULT_ACC << 4) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index bb0ff1bb0c4a..eccfa0642712 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -15,6 +15,8 @@ #include #include +#define CRST_ALLOC_ORDER 2 + unsigned long *crst_table_alloc(struct mm_struct *); void crst_table_free(struct mm_struct *, unsigned long *); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 888420348ad1..90f422fddd2b 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -65,26 +65,6 @@ extern unsigned long zero_page_mask; /* TODO: s390 cannot support io_remap_pfn_range... */ -#define PMD_SHIFT 20 -#define PUD_SHIFT 31 -#define P4D_SHIFT 42 -#define PGDIR_SHIFT 53 - -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) -#define P4D_SIZE (1UL << P4D_SHIFT) -#define P4D_MASK (~(P4D_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define PTRS_PER_PTE 256 -#define PTRS_PER_PMD 2048 -#define PTRS_PER_PUD 2048 -#define PTRS_PER_P4D 2048 -#define PTRS_PER_PGD 2048 - #define FIRST_USER_ADDRESS 0UL #define pte_ERROR(e) \ @@ -314,6 +294,54 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif +#define _CRST_ENTRIES 2048 /* number of region/segment table entries */ +#define _PAGE_ENTRIES 256 /* number of page table entries */ + +#define _CRST_TABLE_SIZE (_CRST_ENTRIES * 8) +#define _PAGE_TABLE_SIZE (_PAGE_ENTRIES * 8) + +#define _REGION1_SHIFT 53 +#define _REGION2_SHIFT 42 +#define _REGION3_SHIFT 31 +#define _SEGMENT_SHIFT 20 + +#define _REGION1_INDEX (0x7ffUL << _REGION1_SHIFT) +#define _REGION2_INDEX (0x7ffUL << _REGION2_SHIFT) +#define _REGION3_INDEX (0x7ffUL << _REGION3_SHIFT) +#define _SEGMENT_INDEX (0x7ffUL << _SEGMENT_SHIFT) +#define _PAGE_INDEX (0xffUL << _PAGE_SHIFT) + +#define _REGION1_SIZE (1UL << _REGION1_SHIFT) +#define _REGION2_SIZE (1UL << _REGION2_SHIFT) +#define _REGION3_SIZE (1UL << _REGION3_SHIFT) +#define _SEGMENT_SIZE (1UL << _SEGMENT_SHIFT) + +#define _REGION1_MASK (~(_REGION1_SIZE - 1)) +#define _REGION2_MASK (~(_REGION2_SIZE - 1)) +#define _REGION3_MASK (~(_REGION3_SIZE - 1)) +#define _SEGMENT_MASK (~(_SEGMENT_SIZE - 1)) + +#define PMD_SHIFT _SEGMENT_SHIFT +#define PUD_SHIFT _REGION3_SHIFT +#define P4D_SHIFT _REGION2_SHIFT +#define PGDIR_SHIFT _REGION1_SHIFT + +#define PMD_SIZE _SEGMENT_SIZE +#define PUD_SIZE _REGION3_SIZE +#define P4D_SIZE _REGION2_SIZE +#define PGDIR_SIZE _REGION1_SIZE + +#define PMD_MASK _SEGMENT_MASK +#define PUD_MASK _REGION3_MASK +#define P4D_MASK _REGION2_MASK +#define PGDIR_MASK _REGION1_MASK + +#define PTRS_PER_PTE _PAGE_ENTRIES +#define PTRS_PER_PMD _CRST_ENTRIES +#define PTRS_PER_PUD _CRST_ENTRIES +#define PTRS_PER_P4D _CRST_ENTRIES +#define PTRS_PER_PGD _CRST_ENTRIES + /* * Segment table and region3 table entry encoding * (R = read-only, I = invalid, y = young bit): From f1c1174fa099566f02c809193e9720593b231ae2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 5 Jul 2017 07:37:27 +0200 Subject: [PATCH 08/50] s390/mm: use new mm defines instead of magic values Reviewed-by: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/elf.h | 2 +- arch/s390/include/asm/ipl.h | 2 +- arch/s390/include/asm/mmu_context.h | 6 +- arch/s390/include/asm/pgalloc.h | 12 +-- arch/s390/include/asm/qdio.h | 2 +- arch/s390/include/asm/tlb.h | 6 +- arch/s390/kernel/dumpstack.c | 2 +- arch/s390/kernel/relocate_kernel.S | 5 +- arch/s390/kernel/setup.c | 8 +- arch/s390/kernel/vdso32/vdso32.lds.S | 4 +- arch/s390/kernel/vdso64/vdso64.lds.S | 4 +- arch/s390/mm/fault.c | 10 +-- arch/s390/mm/gmap.c | 124 +++++++++++++-------------- arch/s390/mm/init.c | 5 +- arch/s390/mm/pgalloc.c | 10 +-- 15 files changed, 103 insertions(+), 99 deletions(-) diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index c92ed0170be2..65998a1f5d43 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -191,7 +191,7 @@ struct arch_elf_state { } while (0) #define CORE_DUMP_USE_REGSET -#define ELF_EXEC_PAGESIZE 4096 +#define ELF_EXEC_PAGESIZE PAGE_SIZE /* * This is the base location for PIE (ET_DYN with INTERP) loads. On diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index edb5161df7e2..6810bd757312 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -81,7 +81,7 @@ struct ipl_parameter_block { struct ipl_block_fcp fcp; struct ipl_block_ccw ccw; } ipl_info; -} __attribute__((packed,aligned(4096))); +} __packed __aligned(PAGE_SIZE); /* * IPL validity flags diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 4541ac44b35f..92c1eb79ada4 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -33,7 +33,7 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.use_cmma = 0; #endif switch (mm->context.asce_limit) { - case 1UL << 42: + case _REGION2_SIZE: /* * forked 3-level task, fall through to set new asce with new * mm->pgd @@ -44,12 +44,12 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; break; - case 1UL << 53: + case _REGION1_SIZE: /* forked 4-level task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; break; - case 1UL << 31: + case _REGION3_SIZE: /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index eccfa0642712..ead67a34781f 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -44,16 +44,16 @@ static inline void clear_table(unsigned long *s, unsigned long val, size_t n) static inline void crst_table_init(unsigned long *crst, unsigned long entry) { - clear_table(crst, entry, sizeof(unsigned long)*2048); + clear_table(crst, entry, _CRST_TABLE_SIZE); } static inline unsigned long pgd_entry_type(struct mm_struct *mm) { - if (mm->context.asce_limit <= (1UL << 31)) + if (mm->context.asce_limit <= _REGION3_SIZE) return _SEGMENT_ENTRY_EMPTY; - if (mm->context.asce_limit <= (1UL << 42)) + if (mm->context.asce_limit <= _REGION2_SIZE) return _REGION3_ENTRY_EMPTY; - if (mm->context.asce_limit <= (1UL << 53)) + if (mm->context.asce_limit <= _REGION1_SIZE) return _REGION2_ENTRY_EMPTY; return _REGION1_ENTRY_EMPTY; } @@ -121,7 +121,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) if (!table) return NULL; - if (mm->context.asce_limit == (1UL << 31)) { + if (mm->context.asce_limit == _REGION3_SIZE) { /* Forking a compat process with 2 page table levels */ if (!pgtable_pmd_page_ctor(virt_to_page(table))) { crst_table_free(mm, table); @@ -133,7 +133,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - if (mm->context.asce_limit == (1UL << 31)) + if (mm->context.asce_limit == _REGION3_SIZE) pgtable_pmd_page_dtor(virt_to_page(pgd)); crst_table_free(mm, (unsigned long *) pgd); } diff --git a/arch/s390/include/asm/qdio.h b/arch/s390/include/asm/qdio.h index 998b61cd0e56..eaee69e7c42a 100644 --- a/arch/s390/include/asm/qdio.h +++ b/arch/s390/include/asm/qdio.h @@ -80,7 +80,7 @@ struct qdr { u32 qkey : 4; u32 : 28; struct qdesfmt0 qdf0[126]; -} __attribute__ ((packed, aligned(4096))); +} __packed __aligned(PAGE_SIZE); #define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40 #define QIB_RFLAGS_ENABLE_QEBSM 0x80 diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 7317b3108a88..950af48e88be 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -130,7 +130,7 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 31)) + if (tlb->mm->context.asce_limit <= _REGION3_SIZE) return; pgtable_pmd_page_dtor(virt_to_page(pmd)); tlb_remove_table(tlb, pmd); @@ -146,7 +146,7 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 53)) + if (tlb->mm->context.asce_limit <= _REGION1_SIZE) return; tlb_remove_table(tlb, p4d); } @@ -161,7 +161,7 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { - if (tlb->mm->context.asce_limit <= (1UL << 42)) + if (tlb->mm->context.asce_limit <= _REGION2_SIZE) return; tlb_remove_table(tlb, pud); } diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index dab78babfab6..2aa545dca4d5 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -76,7 +76,7 @@ void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); #ifdef CONFIG_CHECK_STACK sp = __dump_trace(func, data, sp, - S390_lowcore.panic_stack + frame_size - 4096, + S390_lowcore.panic_stack + frame_size - PAGE_SIZE, S390_lowcore.panic_stack + frame_size); #endif sp = __dump_trace(func, data, sp, diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index cfac28330b03..4bdc65636603 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -7,6 +7,7 @@ */ #include +#include #include /* @@ -55,8 +56,8 @@ ENTRY(relocate_kernel) .back_pgm: lmg %r0,%r15,gprregs-.base(%r13) .top: - lghi %r7,4096 # load PAGE_SIZE in r7 - lghi %r9,4096 # load PAGE_SIZE in r9 + lghi %r7,PAGE_SIZE # load PAGE_SIZE in r7 + lghi %r9,PAGE_SIZE # load PAGE_SIZE in r9 lg %r5,0(%r2) # read another word for indirection page aghi %r2,8 # increment pointer tml %r5,0x1 # is it a destination page? diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 3d1d808ea8a9..bc1c95b7a4bd 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -305,7 +305,7 @@ static void __init setup_lowcore(void) /* * Setup lowcore for boot cpu */ - BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * 4096); + BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * PAGE_SIZE); lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc)); lc->restart_psw.mask = PSW_KERNEL_BITS; lc->restart_psw.addr = (unsigned long) restart_int_handler; @@ -469,10 +469,10 @@ static void __init setup_memory_end(void) vmalloc_size = VMALLOC_END ?: (128UL << 30) - MODULES_LEN; tmp = (memory_end ?: max_physmem_end) / PAGE_SIZE; tmp = tmp * (sizeof(struct page) + PAGE_SIZE); - if (tmp + vmalloc_size + MODULES_LEN <= (1UL << 42)) - vmax = 1UL << 42; /* 3-level kernel page table */ + if (tmp + vmalloc_size + MODULES_LEN <= _REGION2_SIZE) + vmax = _REGION2_SIZE; /* 3-level kernel page table */ else - vmax = 1UL << 53; /* 4-level kernel page table */ + vmax = _REGION1_SIZE; /* 4-level kernel page table */ /* module area is at the end of the kernel address space. */ MODULES_END = vmax; MODULES_VADDR = MODULES_END - MODULES_LEN; diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S index 8f048c2d6d13..263a7f9eee1e 100644 --- a/arch/s390/kernel/vdso32/vdso32.lds.S +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -2,6 +2,8 @@ * This is the infamous ld script for the 32 bits vdso * library */ + +#include #include OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") @@ -91,7 +93,7 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); PROVIDE(_vdso_data = .); /DISCARD/ : { diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index f35455d497fe..9e3dbbcc1cfc 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -2,6 +2,8 @@ * This is the infamous ld script for the 64 bits vdso * library */ + +#include #include OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") @@ -91,7 +93,7 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(4096); + . = ALIGN(PAGE_SIZE); PROVIDE(_vdso_data = .); /DISCARD/ : { diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 14f25798b001..bdabb013537b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -135,7 +135,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: - table = table + ((address >> 53) & 0x7ff); + table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; if (bad_address(table)) goto bad; pr_cont("R1:%016lx ", *table); @@ -144,7 +144,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_REGION2: - table = table + ((address >> 42) & 0x7ff); + table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; if (bad_address(table)) goto bad; pr_cont("R2:%016lx ", *table); @@ -153,7 +153,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_REGION3: - table = table + ((address >> 31) & 0x7ff); + table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; if (bad_address(table)) goto bad; pr_cont("R3:%016lx ", *table); @@ -162,7 +162,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* fallthrough */ case _ASCE_TYPE_SEGMENT: - table = table + ((address >> 20) & 0x7ff); + table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (bad_address(table)) goto bad; pr_cont("S:%016lx ", *table); @@ -170,7 +170,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) goto out; table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); } - table = table + ((address >> 12) & 0xff); + table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; if (bad_address(table)) goto bad; pr_cont("P:%016lx ", *table); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 4fb3d3cdb370..53292c03e312 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -36,16 +36,16 @@ static struct gmap *gmap_alloc(unsigned long limit) unsigned long *table; unsigned long etype, atype; - if (limit < (1UL << 31)) { - limit = (1UL << 31) - 1; + if (limit < _REGION3_SIZE) { + limit = _REGION3_SIZE - 1; atype = _ASCE_TYPE_SEGMENT; etype = _SEGMENT_ENTRY_EMPTY; - } else if (limit < (1UL << 42)) { - limit = (1UL << 42) - 1; + } else if (limit < _REGION2_SIZE) { + limit = _REGION2_SIZE - 1; atype = _ASCE_TYPE_REGION3; etype = _REGION3_ENTRY_EMPTY; - } else if (limit < (1UL << 53)) { - limit = (1UL << 53) - 1; + } else if (limit < _REGION1_SIZE) { + limit = _REGION1_SIZE - 1; atype = _ASCE_TYPE_REGION2; etype = _REGION2_ENTRY_EMPTY; } else { @@ -65,7 +65,7 @@ static struct gmap *gmap_alloc(unsigned long limit) spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); atomic_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) goto out_free; page->index = 0; @@ -186,7 +186,7 @@ static void gmap_free(struct gmap *gmap) gmap_flush_tlb(gmap); /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); @@ -306,7 +306,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; new = (unsigned long *) page_to_phys(page); @@ -321,7 +321,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, } spin_unlock(&gmap->guest_table_lock); if (page) - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return 0; } @@ -546,30 +546,30 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) /* Create higher level tables in the gmap page table */ table = gmap->table; if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) { - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, - gaddr & 0xffe0000000000000UL)) + gaddr & _REGION1_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, - gaddr & 0xfffffc0000000000UL)) + gaddr & _REGION2_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if ((*table & _REGION_ENTRY_INVALID) && gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, - gaddr & 0xffffffff80000000UL)) + gaddr & _REGION3_MASK)) return -ENOMEM; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); } - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ mm = gmap->mm; pgd = pgd_offset(mm, vmaddr); @@ -771,7 +771,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = gmap->table; switch (gmap->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: - table += (gaddr >> 53) & 0x7ff; + table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if (level == 4) break; if (*table & _REGION_ENTRY_INVALID) @@ -779,7 +779,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION2: - table += (gaddr >> 42) & 0x7ff; + table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) break; if (*table & _REGION_ENTRY_INVALID) @@ -787,7 +787,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_REGION3: - table += (gaddr >> 31) & 0x7ff; + table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) break; if (*table & _REGION_ENTRY_INVALID) @@ -795,13 +795,13 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); /* Fallthrough */ case _ASCE_TYPE_SEGMENT: - table += (gaddr >> 20) & 0x7ff; + table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) break; if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); - table += (gaddr >> 12) & 0xff; + table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; } @@ -1126,7 +1126,7 @@ static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr) table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */ if (!table || *table & _PAGE_INVALID) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1); + gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1); ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table); } @@ -1144,7 +1144,7 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, int i; BUG_ON(!gmap_is_shadow(sg)); - for (i = 0; i < 256; i++, raddr += 1UL << 12) + for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE) pgt[i] = _PAGE_INVALID; } @@ -1164,8 +1164,8 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1); - sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); + sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); *ste = _SEGMENT_ENTRY_EMPTY; @@ -1193,7 +1193,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; - for (i = 0; i < 2048; i++, raddr += 1UL << 20) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); @@ -1222,8 +1222,8 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1); - r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); + r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); *r3e = _REGION3_ENTRY_EMPTY; @@ -1231,7 +1231,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) /* Free segment table */ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1251,7 +1251,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; - for (i = 0; i < 2048; i++, raddr += 1UL << 31) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); @@ -1260,7 +1260,7 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, /* Free segment table */ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1280,8 +1280,8 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1); - r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); + r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); *r2e = _REGION2_ENTRY_EMPTY; @@ -1289,7 +1289,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) /* Free region 3 table */ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1309,7 +1309,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; - for (i = 0; i < 2048; i++, raddr += 1UL << 42) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); @@ -1318,7 +1318,7 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, /* Free region 3 table */ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1338,8 +1338,8 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN)) return; - gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1); - r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff)); + gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); + r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); *r1e = _REGION1_ENTRY_EMPTY; @@ -1347,7 +1347,7 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) /* Free region 2 table */ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } /** @@ -1367,7 +1367,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, BUG_ON(!gmap_is_shadow(sg)); asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; - for (i = 0; i < 2048; i++, raddr += 1UL << 53) { + for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); @@ -1378,7 +1378,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, /* Free region 2 table */ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); list_del(&page->lru); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); } } @@ -1535,7 +1535,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, /* protect after insertion, so it will get properly invalidated */ down_read(&parent->mm->mmap_sem); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, - ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096, + ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, PROT_READ, PGSTE_VSIE_BIT); up_read(&parent->mm->mmap_sem); spin_lock(&parent->shadow_lock); @@ -1578,7 +1578,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; @@ -1614,10 +1614,10 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, } spin_unlock(&sg->guest_table_lock); /* Make r2t read-only in parent gmap page table */ - raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1; + raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1; origin = r2t & _REGION_ENTRY_ORIGIN; - offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1634,7 +1634,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r2t); @@ -1662,7 +1662,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; @@ -1697,10 +1697,10 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, } spin_unlock(&sg->guest_table_lock); /* Make r3t read-only in parent gmap page table */ - raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2; + raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2; origin = r3t & _REGION_ENTRY_ORIGIN; - offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1717,7 +1717,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_r3t); @@ -1745,7 +1745,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL, 2); + page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; @@ -1781,10 +1781,10 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, } spin_unlock(&sg->guest_table_lock); /* Make sgt read-only in parent gmap page table */ - raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3; + raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3; origin = sgt & _REGION_ENTRY_ORIGIN; - offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096; - len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset; + offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE; + len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset; rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ); spin_lock(&sg->guest_table_lock); if (!rc) { @@ -1801,7 +1801,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, return rc; out_free: spin_unlock(&sg->guest_table_lock); - __free_pages(page, 2); + __free_pages(page, CRST_ALLOC_ORDER); return rc; } EXPORT_SYMBOL_GPL(gmap_shadow_sgt); @@ -1902,7 +1902,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, } spin_unlock(&sg->guest_table_lock); /* Make pgt read-only in parent gmap page table (not the pgste) */ - raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT; + raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT; origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK; rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ); spin_lock(&sg->guest_table_lock); @@ -2021,7 +2021,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, } /* Check for top level table */ start = sg->orig_asce & _ASCE_ORIGIN; - end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096; + end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE; if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start && gaddr < end) { /* The complete shadow table has to go */ @@ -2032,7 +2032,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, return; } /* Remove the page table tree from on specific entry */ - head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12); + head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT); gmap_for_each_rmap_safe(rmap, rnext, head) { bits = rmap->raddr & _SHADOW_RMAP_MASK; raddr = rmap->raddr ^ bits; @@ -2076,7 +2076,7 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, struct gmap *gmap, *sg, *next; offset = ((unsigned long) pte) & (255 * sizeof(pte_t)); - offset = offset * (4096 / sizeof(pte_t)); + offset = offset * (PAGE_SIZE / sizeof(pte_t)); rcu_read_lock(); list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) { spin_lock(&gmap->guest_table_lock); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3aee54b2ba60..c52a6b834f08 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -84,7 +84,7 @@ void __init paging_init(void) psw_t psw; init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > (1UL << 42)) { + if (VMALLOC_END > _REGION2_SIZE) { asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; pgd_type = _REGION2_ENTRY_EMPTY; } else { @@ -93,8 +93,7 @@ void __init paging_init(void) } init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; S390_lowcore.kernel_asce = init_mm.context.asce; - clear_table((unsigned long *) init_mm.pgd, pgd_type, - sizeof(unsigned long)*2048); + crst_table_init((unsigned long *) init_mm.pgd, pgd_type); vmem_map_init(); /* enable virtual mapping in kernel mode */ diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index a4de34ce392c..c5b74dd61197 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -83,7 +83,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) int rc, notify; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - BUG_ON(mm->context.asce_limit < (1UL << 42)); + BUG_ON(mm->context.asce_limit < _REGION2_SIZE); if (end >= TASK_SIZE_MAX) return -ENOMEM; rc = 0; @@ -96,11 +96,11 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) } spin_lock_bh(&mm->page_table_lock); pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == (1UL << 42)) { + if (mm->context.asce_limit == _REGION2_SIZE) { crst_table_init(table, _REGION2_ENTRY_EMPTY); p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); mm->pgd = (pgd_t *) table; - mm->context.asce_limit = 1UL << 53; + mm->context.asce_limit = _REGION1_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION2; } else { @@ -124,7 +124,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd_t *pgd; /* downgrade should only happen from 3 to 2 levels (compat only) */ - BUG_ON(mm->context.asce_limit != (1UL << 42)); + BUG_ON(mm->context.asce_limit != _REGION2_SIZE); if (current->active_mm == mm) { clear_user_asce(); @@ -133,7 +133,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd = mm->pgd; mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = 1UL << 31; + mm->context.asce_limit = _REGION3_SIZE; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; crst_table_free(mm, (unsigned long *) pgd); From 58cdf5eb13d83e7ec7444a3f88115b21d7512369 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 5 Jul 2017 07:37:14 +0200 Subject: [PATCH 09/50] KVM: s390: use new mm defines instead of magic values Acked-by: Christian Borntraeger Reviewed-by: Janosch Frank Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kvm/diag.c | 8 ++++---- arch/s390/kvm/gaccess.c | 35 +++++++++++++++++------------------ arch/s390/kvm/priv.c | 8 ++++---- arch/s390/kvm/vsie.c | 2 +- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index ce865bd4f81d..e4d36094aceb 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -27,7 +27,7 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) unsigned long prefix = kvm_s390_get_prefix(vcpu); start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; - end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; + end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + PAGE_SIZE; vcpu->stat.diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end @@ -51,9 +51,9 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) */ gmap_discard(vcpu->arch.gmap, start, prefix); if (start <= prefix) - gmap_discard(vcpu->arch.gmap, 0, 4096); - if (end > prefix + 4096) - gmap_discard(vcpu->arch.gmap, 4096, 8192); + gmap_discard(vcpu->arch.gmap, 0, PAGE_SIZE); + if (end > prefix + PAGE_SIZE) + gmap_discard(vcpu->arch.gmap, PAGE_SIZE, 2 * PAGE_SIZE); gmap_discard(vcpu->arch.gmap, prefix + 2 * PAGE_SIZE, end); } return 0; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 653cae5e1ee1..3cc77391a102 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -629,7 +629,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, iep = ctlreg0.iep && test_kvm_facility(vcpu->kvm, 130); if (asce.r) goto real_address; - ptr = asce.origin * 4096; + ptr = asce.origin * PAGE_SIZE; switch (asce.dt) { case ASCE_TYPE_REGION1: if (vaddr.rfx01 > asce.tl) @@ -674,7 +674,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_REGION_SECOND_TRANS; if (edat1) dat_protection |= rfte.p; - ptr = rfte.rto * 4096 + vaddr.rsx * 8; + ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } /* fallthrough */ case ASCE_TYPE_REGION2: { @@ -692,7 +692,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_REGION_THIRD_TRANS; if (edat1) dat_protection |= rste.p; - ptr = rste.rto * 4096 + vaddr.rtx * 8; + ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } /* fallthrough */ case ASCE_TYPE_REGION3: { @@ -720,7 +720,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, return PGM_SEGMENT_TRANSLATION; if (edat1) dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto * 4096 + vaddr.sx * 8; + ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } /* fallthrough */ case ASCE_TYPE_SEGMENT: { @@ -743,7 +743,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, goto absolute_address; } dat_protection |= ste.fc0.p; - ptr = ste.fc0.pto * 2048 + vaddr.px * 8; + ptr = ste.fc0.pto * (PAGE_SIZE / 2) + vaddr.px * 8; } } if (kvm_is_error_gpa(vcpu->kvm, ptr)) @@ -993,7 +993,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, parent = sg->parent; vaddr.addr = saddr; asce.val = sg->orig_asce; - ptr = asce.origin * 4096; + ptr = asce.origin * PAGE_SIZE; if (asce.r) { *fake = 1; ptr = 0; @@ -1029,7 +1029,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, union region1_table_entry rfte; if (*fake) { - ptr += (unsigned long) vaddr.rfx << 53; + ptr += vaddr.rfx * _REGION1_SIZE; rfte.val = ptr; goto shadow_r2t; } @@ -1044,7 +1044,7 @@ static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, return PGM_REGION_SECOND_TRANS; if (sg->edat_level >= 1) *dat_protection |= rfte.p; - ptr = rfte.rto << 12UL; + ptr = rfte.rto * PAGE_SIZE; shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) @@ -1055,7 +1055,7 @@ shadow_r2t: union region2_table_entry rste; if (*fake) { - ptr += (unsigned long) vaddr.rsx << 42; + ptr += vaddr.rsx * _REGION2_SIZE; rste.val = ptr; goto shadow_r3t; } @@ -1070,7 +1070,7 @@ shadow_r2t: return PGM_REGION_THIRD_TRANS; if (sg->edat_level >= 1) *dat_protection |= rste.p; - ptr = rste.rto << 12UL; + ptr = rste.rto * PAGE_SIZE; shadow_r3t: rste.p |= *dat_protection; rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); @@ -1082,7 +1082,7 @@ shadow_r3t: union region3_table_entry rtte; if (*fake) { - ptr += (unsigned long) vaddr.rtx << 31; + ptr += vaddr.rtx * _REGION3_SIZE; rtte.val = ptr; goto shadow_sgt; } @@ -1098,7 +1098,7 @@ shadow_r3t: if (rtte.fc && sg->edat_level >= 2) { *dat_protection |= rtte.fc0.p; *fake = 1; - ptr = rtte.fc1.rfaa << 31UL; + ptr = rtte.fc1.rfaa * _REGION3_SIZE; rtte.val = ptr; goto shadow_sgt; } @@ -1106,7 +1106,7 @@ shadow_r3t: return PGM_SEGMENT_TRANSLATION; if (sg->edat_level >= 1) *dat_protection |= rtte.fc0.p; - ptr = rtte.fc0.sto << 12UL; + ptr = rtte.fc0.sto * PAGE_SIZE; shadow_sgt: rtte.fc0.p |= *dat_protection; rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); @@ -1118,7 +1118,7 @@ shadow_sgt: union segment_table_entry ste; if (*fake) { - ptr += (unsigned long) vaddr.sx << 20; + ptr += vaddr.sx * _SEGMENT_SIZE; ste.val = ptr; goto shadow_pgt; } @@ -1134,11 +1134,11 @@ shadow_sgt: *dat_protection |= ste.fc0.p; if (ste.fc && sg->edat_level >= 1) { *fake = 1; - ptr = ste.fc1.sfaa << 20UL; + ptr = ste.fc1.sfaa * _SEGMENT_SIZE; ste.val = ptr; goto shadow_pgt; } - ptr = ste.fc0.pto << 11UL; + ptr = ste.fc0.pto * (PAGE_SIZE / 2); shadow_pgt: ste.fc0.p |= *dat_protection; rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); @@ -1187,8 +1187,7 @@ int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, vaddr.addr = saddr; if (fake) { - /* offset in 1MB guest memory block */ - pte.val = pgt + ((unsigned long) vaddr.px << 12UL); + pte.val = pgt + vaddr.px * PAGE_SIZE; goto shadow_page; } if (!rc) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 8a1dac793d6b..785ad028bde6 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -329,7 +329,7 @@ static int handle_sske(struct kvm_vcpu *vcpu) start = kvm_s390_logical_to_effective(vcpu, start); if (m3 & SSKE_MB) { /* start already designates an absolute address */ - end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); } else { start = kvm_s390_real_to_abs(vcpu, start); end = start + PAGE_SIZE; @@ -893,10 +893,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) case 0x00000000: /* only 4k frames specify a real address */ start = kvm_s390_real_to_abs(vcpu, start); - end = (start + (1UL << 12)) & ~((1UL << 12) - 1); + end = (start + PAGE_SIZE) & ~(PAGE_SIZE - 1); break; case 0x00001000: - end = (start + (1UL << 20)) & ~((1UL << 20) - 1); + end = (start + _SEGMENT_SIZE) & ~(_SEGMENT_SIZE - 1); break; case 0x00002000: /* only support 2G frame size if EDAT2 is available and we are @@ -904,7 +904,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (!test_kvm_facility(vcpu->kvm, 78) || psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_BITS_AMODE_24BIT) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - end = (start + (1UL << 31)) & ~((1UL << 31) - 1); + end = (start + _REGION3_SIZE) & ~(_REGION3_SIZE - 1); break; default: return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 715c19c45d9a..ba8203e4d516 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -1069,7 +1069,7 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu) if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); - BUILD_BUG_ON(sizeof(struct vsie_page) != 4096); + BUILD_BUG_ON(sizeof(struct vsie_page) != PAGE_SIZE); scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL); /* 512 byte alignment */ From a01ef3082d9c216f1187ea308c25e5dd486afe3d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 16 Jun 2017 17:51:15 +0200 Subject: [PATCH 10/50] s390/mm,vmem: simplify region and segment table allocation code Reviewed-by: Martin Schwidefsky Reviewed-by: Janosch Frank Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgalloc.h | 4 +++ arch/s390/include/asm/pgtable.h | 3 --- arch/s390/mm/pageattr.c | 3 ++- arch/s390/mm/vmem.c | 47 +++++++++------------------------ 4 files changed, 18 insertions(+), 39 deletions(-) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index ead67a34781f..a0d9167519b1 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -160,4 +160,8 @@ static inline void pmd_populate(struct mm_struct *mm, extern void rcu_table_freelist_finish(void); +void vmem_map_init(void); +void *vmem_crst_alloc(unsigned long val); +pte_t *vmem_pte_alloc(void); + #endif /* _S390_PGALLOC_H */ diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 90f422fddd2b..c9560e035207 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -21,9 +21,6 @@ extern pgd_t swapper_pg_dir[]; extern void paging_init(void); -extern void vmem_map_init(void); -pmd_t *vmem_pmd_alloc(void); -pte_t *vmem_pte_alloc(void); enum { PG_DIRECT_MAP_4K = 0, diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 567fe92e2bb8..552f898dfa74 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -191,7 +192,7 @@ static int split_pud_page(pud_t *pudp, unsigned long addr) pud_t new; int i, ro, nx; - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) return -ENOMEM; pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index d8398962a723..c0af0d7b6e5f 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -38,37 +38,14 @@ static void __ref *vmem_alloc_pages(unsigned int order) return (void *) memblock_alloc(size, size); } -static inline p4d_t *vmem_p4d_alloc(void) +void *vmem_crst_alloc(unsigned long val) { - p4d_t *p4d = NULL; + unsigned long *table; - p4d = vmem_alloc_pages(2); - if (!p4d) - return NULL; - clear_table((unsigned long *) p4d, _REGION2_ENTRY_EMPTY, PAGE_SIZE * 4); - return p4d; -} - -static inline pud_t *vmem_pud_alloc(void) -{ - pud_t *pud = NULL; - - pud = vmem_alloc_pages(2); - if (!pud) - return NULL; - clear_table((unsigned long *) pud, _REGION3_ENTRY_EMPTY, PAGE_SIZE * 4); - return pud; -} - -pmd_t *vmem_pmd_alloc(void) -{ - pmd_t *pmd = NULL; - - pmd = vmem_alloc_pages(2); - if (!pmd) - return NULL; - clear_table((unsigned long *) pmd, _SEGMENT_ENTRY_EMPTY, PAGE_SIZE * 4); - return pmd; + table = vmem_alloc_pages(CRST_ALLOC_ORDER); + if (table) + crst_table_init(table, val); + return table; } pte_t __ref *vmem_pte_alloc(void) @@ -114,14 +91,14 @@ static int vmem_add_mem(unsigned long start, unsigned long size) while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { - p4_dir = vmem_p4d_alloc(); + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); if (!p4_dir) goto out; pgd_populate(&init_mm, pg_dir, p4_dir); } p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { - pu_dir = vmem_pud_alloc(); + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); if (!pu_dir) goto out; p4d_populate(&init_mm, p4_dir, pu_dir); @@ -136,7 +113,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size) continue; } if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) goto out; pud_populate(&init_mm, pu_dir, pm_dir); @@ -253,7 +230,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) for (address = start; address < end;) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { - p4_dir = vmem_p4d_alloc(); + p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); if (!p4_dir) goto out; pgd_populate(&init_mm, pg_dir, p4_dir); @@ -261,7 +238,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { - pu_dir = vmem_pud_alloc(); + pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); if (!pu_dir) goto out; p4d_populate(&init_mm, p4_dir, pu_dir); @@ -269,7 +246,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) pu_dir = pud_offset(p4_dir, address); if (pud_none(*pu_dir)) { - pm_dir = vmem_pmd_alloc(); + pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!pm_dir) goto out; pud_populate(&init_mm, pu_dir, pm_dir); From 45be0a02f8110a13502fac6167a893c7c0eff432 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 6 Feb 2017 15:56:14 +0100 Subject: [PATCH 11/50] s390/sclp: single increment assignment control Set a new option bit of the attach command to speed up memory hotplug. Signed-off-by: Martin Schwidefsky --- drivers/s390/char/sclp_cmd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c index b9c5522b8a68..dff8b94871f0 100644 --- a/drivers/s390/char/sclp_cmd.c +++ b/drivers/s390/char/sclp_cmd.c @@ -252,6 +252,7 @@ static int sclp_attach_storage(u8 id) if (!sccb) return -ENOMEM; sccb->header.length = PAGE_SIZE; + sccb->header.function_code = 0x40; rc = sclp_sync_request_timeout(0x00080001 | id << 8, sccb, SCLP_QUEUE_INTERVAL); if (rc) From 6e2ef5e4f6cc57344762932d70d38ba4ec65fa8b Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 27 Oct 2016 12:41:39 +0200 Subject: [PATCH 12/50] s390/time: add support for the TOD clock epoch extension The TOD epoch extension adds 8 epoch bits to the TOD clock to provide a continuous clock after 2042/09/17. The store-clock-extended (STCKE) instruction will store the epoch index in the first byte of the 16 bytes stored by the instruction. The read_boot_clock64 and the read_presistent_clock64 functions need to take the additional bits into account to give the correct result after 2042/09/17. The clock-comparator register will stay 64 bit wide. The comparison of the clock-comparator with the TOD clock is limited to bytes 1 to 8 of the extended TOD format. To deal with the overflow problem due to an epoch change the clock-comparator sign control in CR0 can be used to switch the comparison of the 64-bit TOD clock with the clock-comparator to a signed comparison. The decision between the signed vs. unsigned clock-comparator comparisons is done at boot time. Only if the TOD clock is in the second half of a 142 year epoch the signed comparison is used. This solves the epoch overflow issue as long as the machine is booted at least once in an epoch. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/lowcore.h | 48 ++++++++++++------------ arch/s390/include/asm/setup.h | 2 + arch/s390/include/asm/timex.h | 38 +++++++++++++++++-- arch/s390/kernel/asm-offsets.c | 1 + arch/s390/kernel/debug.c | 9 +++-- arch/s390/kernel/early.c | 15 ++++++-- arch/s390/kernel/head.S | 3 +- arch/s390/kernel/head64.S | 4 +- arch/s390/kernel/irq.c | 3 +- arch/s390/kernel/setup.c | 2 +- arch/s390/kernel/time.c | 65 +++++++++++++++++++++++---------- arch/s390/lib/delay.c | 2 +- 12 files changed, 130 insertions(+), 62 deletions(-) diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 8a5b082797f8..a6870ea6ea8b 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -95,46 +95,46 @@ struct lowcore { __u64 int_clock; /* 0x0310 */ __u64 mcck_clock; /* 0x0318 */ __u64 clock_comparator; /* 0x0320 */ + __u64 boot_clock[2]; /* 0x0328 */ /* Current process. */ - __u64 current_task; /* 0x0328 */ - __u8 pad_0x318[0x320-0x318]; /* 0x0330 */ - __u64 kernel_stack; /* 0x0338 */ + __u64 current_task; /* 0x0338 */ + __u64 kernel_stack; /* 0x0340 */ /* Interrupt, panic and restart stack. */ - __u64 async_stack; /* 0x0340 */ - __u64 panic_stack; /* 0x0348 */ - __u64 restart_stack; /* 0x0350 */ + __u64 async_stack; /* 0x0348 */ + __u64 panic_stack; /* 0x0350 */ + __u64 restart_stack; /* 0x0358 */ /* Restart function and parameter. */ - __u64 restart_fn; /* 0x0358 */ - __u64 restart_data; /* 0x0360 */ - __u64 restart_source; /* 0x0368 */ + __u64 restart_fn; /* 0x0360 */ + __u64 restart_data; /* 0x0368 */ + __u64 restart_source; /* 0x0370 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0370 */ - __u64 user_asce; /* 0x0378 */ + __u64 kernel_asce; /* 0x0378 */ + __u64 user_asce; /* 0x0380 */ /* * The lpp and current_pid fields form a * 64-bit value that is set as program * parameter with the LPP instruction. */ - __u32 lpp; /* 0x0380 */ - __u32 current_pid; /* 0x0384 */ + __u32 lpp; /* 0x0388 */ + __u32 current_pid; /* 0x038c */ /* SMP info area */ - __u32 cpu_nr; /* 0x0388 */ - __u32 softirq_pending; /* 0x038c */ - __u64 percpu_offset; /* 0x0390 */ - __u64 vdso_per_cpu_data; /* 0x0398 */ - __u64 machine_flags; /* 0x03a0 */ - __u32 preempt_count; /* 0x03a8 */ - __u8 pad_0x03ac[0x03b0-0x03ac]; /* 0x03ac */ - __u64 gmap; /* 0x03b0 */ - __u32 spinlock_lockval; /* 0x03b8 */ - __u32 fpu_flags; /* 0x03bc */ - __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ + __u32 cpu_nr; /* 0x0390 */ + __u32 softirq_pending; /* 0x0394 */ + __u64 percpu_offset; /* 0x0398 */ + __u64 vdso_per_cpu_data; /* 0x03a0 */ + __u64 machine_flags; /* 0x03a8 */ + __u32 preempt_count; /* 0x03b0 */ + __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ + __u64 gmap; /* 0x03b8 */ + __u32 spinlock_lockval; /* 0x03c0 */ + __u32 fpu_flags; /* 0x03c4 */ + __u8 pad_0x03c8[0x0400-0x03c8]; /* 0x03c8 */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 49c425903894..61da4bd6edad 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -32,6 +32,7 @@ #define MACHINE_FLAG_TLB_GUEST _BITUL(14) #define MACHINE_FLAG_NX _BITUL(15) #define MACHINE_FLAG_GS _BITUL(16) +#define MACHINE_FLAG_SCC _BITUL(17) #define LPP_MAGIC _BITUL(31) #define LPP_PFAULT_PID_MASK _AC(0xffffffff, UL) @@ -72,6 +73,7 @@ extern void detect_memory_memblock(void); #define MACHINE_HAS_TLB_GUEST (S390_lowcore.machine_flags & MACHINE_FLAG_TLB_GUEST) #define MACHINE_HAS_NX (S390_lowcore.machine_flags & MACHINE_FLAG_NX) #define MACHINE_HAS_GS (S390_lowcore.machine_flags & MACHINE_FLAG_GS) +#define MACHINE_HAS_SCC (S390_lowcore.machine_flags & MACHINE_FLAG_SCC) /* * Console mode. Override with conmode= diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 118535123f34..0ea03c11458d 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -15,6 +15,8 @@ /* The value of the TOD clock for 1.1.1970. */ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL +extern u64 clock_comparator_max; + /* Inline functions for clock register access. */ static inline int set_tod_clock(__u64 time) { @@ -126,7 +128,7 @@ static inline unsigned long long local_tick_disable(void) unsigned long long old; old = S390_lowcore.clock_comparator; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; set_clock_comparator(S390_lowcore.clock_comparator); return old; } @@ -178,20 +180,20 @@ int get_phys_clock(unsigned long long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); -extern u64 sched_clock_base_cc; +extern unsigned char tod_clock_base[16] __aligned(8); /** * get_clock_monotonic - returns current time in clock rate units * * The caller must ensure that preemption is disabled. - * The clock and sched_clock_base get changed via stop_machine. + * The clock and tod_clock_base get changed via stop_machine. * Therefore preemption must be disabled when calling this * function, otherwise the returned value is not guaranteed to * be monotonic. */ static inline unsigned long long get_tod_clock_monotonic(void) { - return get_tod_clock() - sched_clock_base_cc; + return get_tod_clock() - *(unsigned long long *) &tod_clock_base[1]; } /** @@ -218,4 +220,32 @@ static inline unsigned long long tod_to_ns(unsigned long long todval) return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9); } +/** + * tod_after - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after(unsigned long long a, unsigned long long b) +{ + if (MACHINE_HAS_SCC) + return (long long) a > (long long) b; + return a > b; +} + +/** + * tod_after_eq - compare two 64 bit TOD values + * @a: first 64 bit TOD timestamp + * @b: second 64 bit TOD timestamp + * + * Returns: true if a is later than b + */ +static inline int tod_after_eq(unsigned long long a, unsigned long long b) +{ + if (MACHINE_HAS_SCC) + return (long long) a >= (long long) b; + return a >= b; +} + #endif diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index b65c414b6c0e..3d42f91c95fd 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -158,6 +158,7 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); + OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 86b3e74f569e..1d9e83c401fc 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -866,7 +866,8 @@ static inline void debug_finish_entry(debug_info_t * id, debug_entry_t* active, int level, int exception) { - active->id.stck = get_tod_clock_fast() - sched_clock_base_cc; + active->id.stck = get_tod_clock_fast() - + *(unsigned long long *) &tod_clock_base[1]; active->id.fields.cpuid = smp_processor_id(); active->caller = __builtin_return_address(0); active->id.fields.exception = exception; @@ -1455,15 +1456,15 @@ int debug_dflt_header_fn(debug_info_t * id, struct debug_view *view, int area, debug_entry_t * entry, char *out_buf) { - unsigned long sec, usec; + unsigned long base, sec, usec; char *except_str; unsigned long caller; int rc = 0; unsigned int level; level = entry->id.fields.level; - sec = (entry->id.stck >> 12) + (sched_clock_base_cc >> 12); - sec = sec - (TOD_UNIX_EPOCH >> 12); + base = (*(unsigned long *) &tod_clock_base[0]) >> 4; + sec = (entry->id.stck >> 12) + base - (TOD_UNIX_EPOCH >> 12); usec = do_div(sec, USEC_PER_SEC); if (entry->id.fields.exception) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 5d20182ee8ae..added6790460 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -53,8 +53,9 @@ static void __init reset_tod_clock(void) if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) disabled_wait(0); - sched_clock_base_cc = TOD_UNIX_EPOCH; - S390_lowcore.last_update_clock = sched_clock_base_cc; + memset(tod_clock_base, 0, 16); + *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; + S390_lowcore.last_update_clock = TOD_UNIX_EPOCH; } #ifdef CONFIG_SHARED_KERNEL @@ -165,8 +166,8 @@ static noinline __init void create_kernel_nss(void) } /* re-initialize cputime accounting. */ - sched_clock_base_cc = get_tod_clock(); - S390_lowcore.last_update_clock = sched_clock_base_cc; + get_tod_clock_ext(tod_clock_base); + S390_lowcore.last_update_clock = *(__u64 *) &tod_clock_base[1]; S390_lowcore.last_update_timer = 0x7fffffffffffffffULL; S390_lowcore.user_timer = 0; S390_lowcore.system_timer = 0; @@ -387,6 +388,12 @@ static __init void detect_machine_facilities(void) } if (test_facility(133)) S390_lowcore.machine_flags |= MACHINE_FLAG_GS; + if (test_facility(139) && (tod_clock_base[1] & 0x80)) { + /* Enabled signed clock comparator comparisons */ + S390_lowcore.machine_flags |= MACHINE_FLAG_SCC; + clock_comparator_max = -1ULL >> 1; + __ctl_set_bit(0, 53); + } } static inline void save_vector_registers(void) diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index eff5b31671d4..8ed753c72d9b 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -302,7 +302,8 @@ ENTRY(startup_kdump) xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 lctlg %c0,%c15,0x200(%r0) # initialize control registers - stck __LC_LAST_UPDATE_CLOCK + stcke __LC_BOOT_CLOCK + mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) l %r15,.Lstack-.LPG0(%r13) diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 31c91f24e562..0d8f2a858ced 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -21,8 +21,8 @@ ENTRY(startup_continue) xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid mvi __LC_LPP,0x80 # and set LPP_MAGIC .insn s,0xb2800000,__LC_LPP # load program parameter -0: larl %r1,sched_clock_base_cc - mvc 0(8,%r1),__LC_LAST_UPDATE_CLOCK +0: larl %r1,tod_clock_base + mvc 0(16,%r1),__LC_BOOT_CLOCK larl %r13,.LPG1 # get base lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 6dca93b29bed..a2fdff0e730b 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -105,7 +105,8 @@ void do_IRQ(struct pt_regs *regs, int irq) old_regs = set_irq_regs(regs); irq_enter(); - if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) + if (tod_after_eq(S390_lowcore.int_clock, + S390_lowcore.clock_comparator)) /* Serve timer interrupts first. */ clock_comparator_work(); generic_handle_irq(irq); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index bc1c95b7a4bd..e8b84894b650 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -323,7 +323,7 @@ static void __init setup_lowcore(void) lc->io_new_psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_MCHECK; lc->io_new_psw.addr = (unsigned long) io_int_handler; - lc->clock_comparator = -1ULL; + lc->clock_comparator = clock_comparator_max; lc->kernel_stack = ((unsigned long) &init_thread_union) + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->async_stack = (unsigned long) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 192efdfac918..15abecba068e 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -51,8 +51,15 @@ #include #include "entry.h" -u64 sched_clock_base_cc = -1; /* Force to data section. */ -EXPORT_SYMBOL_GPL(sched_clock_base_cc); +unsigned char tod_clock_base[16] __aligned(8) = { + /* Force to data section. */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; +EXPORT_SYMBOL_GPL(tod_clock_base); + +u64 clock_comparator_max = -1ULL; +EXPORT_SYMBOL_GPL(clock_comparator_max); static DEFINE_PER_CPU(struct clock_event_device, comparators); @@ -75,7 +82,7 @@ void __init time_early_init(void) struct ptff_qui qui; /* Initialize TOD steering parameters */ - tod_steering_end = sched_clock_base_cc; + tod_steering_end = *(unsigned long long *) &tod_clock_base[1]; vdso_data->ts_end = tod_steering_end; if (!test_facility(28)) @@ -111,22 +118,27 @@ unsigned long long monotonic_clock(void) } EXPORT_SYMBOL(monotonic_clock); -static void tod_to_timeval(__u64 todval, struct timespec64 *xt) +static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt) { - unsigned long long sec; + unsigned long long high, low, rem, sec, nsec; + + /* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */ + high = (*(unsigned long long *) clk) >> 4; + low = (*(unsigned long long *)&clk[7]) << 4; + /* Calculate seconds and nano-seconds */ + sec = high; + rem = do_div(sec, 1000000); + nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32; - sec = todval >> 12; - do_div(sec, 1000000); xt->tv_sec = sec; - todval -= (sec * 1000000) << 12; - xt->tv_nsec = ((todval * 1000) >> 12); + xt->tv_nsec = nsec; } void clock_comparator_work(void) { struct clock_event_device *cd; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; cd = this_cpu_ptr(&comparators); cd->event_handler(cd); } @@ -148,7 +160,7 @@ void init_cpu_timer(void) struct clock_event_device *cd; int cpu; - S390_lowcore.clock_comparator = -1ULL; + S390_lowcore.clock_comparator = clock_comparator_max; set_clock_comparator(S390_lowcore.clock_comparator); cpu = smp_processor_id(); @@ -179,7 +191,7 @@ static void clock_comparator_interrupt(struct ext_code ext_code, unsigned long param64) { inc_irq_stat(IRQEXT_CLK); - if (S390_lowcore.clock_comparator == -1ULL) + if (S390_lowcore.clock_comparator == clock_comparator_max) set_clock_comparator(S390_lowcore.clock_comparator); } @@ -197,18 +209,28 @@ static void stp_reset(void); void read_persistent_clock64(struct timespec64 *ts) { - __u64 clock; + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; - clock = get_tod_clock() - initial_leap_seconds; - tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + get_tod_clock_ext(clk); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); } void read_boot_clock64(struct timespec64 *ts) { - __u64 clock; + unsigned char clk[STORE_CLOCK_EXT_SIZE]; + __u64 delta; - clock = sched_clock_base_cc - initial_leap_seconds; - tod_to_timeval(clock - TOD_UNIX_EPOCH, ts); + delta = initial_leap_seconds + TOD_UNIX_EPOCH; + memcpy(clk, tod_clock_base, 16); + *(__u64 *) &clk[1] -= delta; + if (*(__u64 *) &clk[1] > delta) + clk[0]--; + ext_to_timespec64(clk, ts); } static u64 read_tod_clock(struct clocksource *cs) @@ -406,7 +428,10 @@ static void clock_sync_global(unsigned long long delta) struct ptff_qto qto; /* Fixup the monotonic sched clock. */ - sched_clock_base_cc += delta; + *(unsigned long long *) &tod_clock_base[1] += delta; + if (*(unsigned long long *) &tod_clock_base[1] < delta) + /* Epoch overflow */ + tod_clock_base[0]++; /* Adjust TOD steering parameters. */ vdso_data->tb_update_count++; now = get_tod_clock(); @@ -437,7 +462,7 @@ static void clock_sync_global(unsigned long long delta) static void clock_sync_local(unsigned long long delta) { /* Add the delta to the clock comparator. */ - if (S390_lowcore.clock_comparator != -1ULL) { + if (S390_lowcore.clock_comparator != clock_comparator_max) { S390_lowcore.clock_comparator += delta; set_clock_comparator(S390_lowcore.clock_comparator); } diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index 92e90e40b6fb..7f17555ad4d5 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -57,7 +57,7 @@ static void __udelay_enabled(unsigned long long usecs) end = get_tod_clock_fast() + (usecs << 12); do { clock_saved = 0; - if (end < S390_lowcore.clock_comparator) { + if (tod_after(S390_lowcore.clock_comparator, end)) { clock_saved = local_tick_disable(); set_clock_comparator(end); } From 6997c32365ac5a61b298a9e165ed32497c8cbc25 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 12 Apr 2017 14:17:25 +0200 Subject: [PATCH 13/50] s390: add support for IBM z14 machines Add detection for machine type 0x3906 and set the ELF platform name to z14. Add the miscellaneous-instruction-extension 2 facility to the list of facilities for z14. And allow to generate code that only runs on a z14 machine. Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 18 ++++++++++++++++++ arch/s390/Makefile | 6 ++++-- arch/s390/kernel/setup.c | 3 +++ arch/s390/tools/gen_facilities.c | 3 +++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 7eeb75d758c1..48af970320cb 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -222,6 +222,10 @@ config HAVE_MARCH_Z13_FEATURES def_bool n select HAVE_MARCH_ZEC12_FEATURES +config HAVE_MARCH_Z14_FEATURES + def_bool n + select HAVE_MARCH_Z13_FEATURES + choice prompt "Processor type" default MARCH_Z196 @@ -282,6 +286,14 @@ config MARCH_Z13 2964 series). The kernel will be slightly faster but will not work on older machines. +config MARCH_Z14 + bool "IBM z14" + select HAVE_MARCH_Z14_FEATURES + help + Select this to enable optimizations for IBM z14 (3906 series). + The kernel will be slightly faster but will not work on older + machines. + endchoice config MARCH_Z900_TUNE @@ -305,6 +317,9 @@ config MARCH_ZEC12_TUNE config MARCH_Z13_TUNE def_bool TUNE_Z13 || MARCH_Z13 && TUNE_DEFAULT +config MARCH_Z14_TUNE + def_bool TUNE_Z14 || MARCH_Z14 && TUNE_DEFAULT + choice prompt "Tune code generation" default TUNE_DEFAULT @@ -343,6 +358,9 @@ config TUNE_ZEC12 config TUNE_Z13 bool "IBM z13" +config TUNE_Z14 + bool "IBM z14" + endchoice config 64BIT diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 54e00526b8df..dac821cfcd43 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -31,7 +31,8 @@ mflags-$(CONFIG_MARCH_Z9_109) := -march=z9-109 mflags-$(CONFIG_MARCH_Z10) := -march=z10 mflags-$(CONFIG_MARCH_Z196) := -march=z196 mflags-$(CONFIG_MARCH_ZEC12) := -march=zEC12 -mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z13) := -march=z13 +mflags-$(CONFIG_MARCH_Z14) := -march=z14 export CC_FLAGS_MARCH := $(mflags-y) @@ -44,7 +45,8 @@ cflags-$(CONFIG_MARCH_Z9_109_TUNE) += -mtune=z9-109 cflags-$(CONFIG_MARCH_Z10_TUNE) += -mtune=z10 cflags-$(CONFIG_MARCH_Z196_TUNE) += -mtune=z196 cflags-$(CONFIG_MARCH_ZEC12_TUNE) += -mtune=zEC12 -cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z13_TUNE) += -mtune=z13 +cflags-$(CONFIG_MARCH_Z14_TUNE) += -mtune=z14 cflags-y += -Wa,-I$(srctree)/arch/$(ARCH)/include diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index e8b84894b650..a50238e17867 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -818,6 +818,9 @@ static int __init setup_hwcaps(void) case 0x2965: strcpy(elf_platform, "z13"); break; + case 0x3906: + strcpy(elf_platform, "z14"); + break; } /* diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 025ea20fc4b4..3e878ecf522c 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -53,6 +53,9 @@ static struct facility_def facility_defs[] = { #endif #ifdef CONFIG_HAVE_MARCH_Z13_FEATURES 53, /* load-and-zero-rightmost-byte, etc. */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z14_FEATURES + 58, /* miscellaneous-instruction-extension 2 */ #endif -1 /* END */ } From 9920decd64cdfa078efe968bf85de6bf50c73bb7 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 19 Jul 2017 12:39:10 +0530 Subject: [PATCH 14/50] s390/zcrypt_queue: constify attribute_group structures. attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 1361 96 0 1457 5b1 s390/crypto/zcrypt_queue.o File size After adding 'const': text data bss dec hex filename 1425 32 0 1457 5b1 s390/crypto/zcrypt_queue.o Signed-off-by: Arvind Yadav Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/zcrypt_queue.c b/drivers/s390/crypto/zcrypt_queue.c index a303f3b2c328..4742be0eec24 100644 --- a/drivers/s390/crypto/zcrypt_queue.c +++ b/drivers/s390/crypto/zcrypt_queue.c @@ -89,7 +89,7 @@ static struct attribute *zcrypt_queue_attrs[] = { NULL, }; -static struct attribute_group zcrypt_queue_attr_group = { +static const struct attribute_group zcrypt_queue_attr_group = { .attrs = zcrypt_queue_attrs, }; From 9731c0a9bc42f51187bf78007c79b667c9ccb427 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 19 Jul 2017 12:39:11 +0530 Subject: [PATCH 15/50] s390/zcrypt_card: constify attribute_group structures. attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 1019 160 0 1179 49b drivers/s390/crypto/zcrypt_card.o File size After adding 'const': text data bss dec hex filename 1083 96 0 1179 49b drivers/s390/crypto/zcrypt_card.o Signed-off-by: Arvind Yadav Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_card.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/zcrypt_card.c b/drivers/s390/crypto/zcrypt_card.c index 53436ea52230..f85dacf1c284 100644 --- a/drivers/s390/crypto/zcrypt_card.c +++ b/drivers/s390/crypto/zcrypt_card.c @@ -98,7 +98,7 @@ static struct attribute *zcrypt_card_attrs[] = { NULL, }; -static struct attribute_group zcrypt_card_attr_group = { +static const struct attribute_group zcrypt_card_attr_group = { .attrs = zcrypt_card_attrs, }; From 131716776b778b4a2053759547eddc7674be1e29 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 19 Jul 2017 12:39:12 +0530 Subject: [PATCH 16/50] s390/dasd: constify attribute_group structures. attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. Signed-off-by: Arvind Yadav Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd_devmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 779dce069cc5..7b539d92eeaf 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -1634,7 +1634,7 @@ static struct attribute * dasd_attrs[] = { NULL, }; -static struct attribute_group dasd_attr_group = { +static const struct attribute_group dasd_attr_group = { .attrs = dasd_attrs, }; From f460d113c6805e953cbcafb5e10b6c0edf5a55aa Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 19 Jul 2017 12:39:13 +0530 Subject: [PATCH 17/50] s390/cio: constify attribute_group structures. attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. Signed-off-by: Arvind Yadav Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 7be01a58b44f..489b583f263d 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -612,7 +612,7 @@ static struct attribute *io_subchannel_attrs[] = { NULL, }; -static struct attribute_group io_subchannel_attr_group = { +static const struct attribute_group io_subchannel_attr_group = { .attrs = io_subchannel_attrs, }; @@ -626,7 +626,7 @@ static struct attribute * ccwdev_attrs[] = { NULL, }; -static struct attribute_group ccwdev_attr_group = { +static const struct attribute_group ccwdev_attr_group = { .attrs = ccwdev_attrs, }; From cfe9a0428dda7122c3da1c65d252c37f37b1486a Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 19 Jul 2017 12:39:14 +0530 Subject: [PATCH 18/50] s390/qeth: constify attribute_group structures. attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 6763 1216 0 7979 1f2b drivers/s390/net/qeth_l3_sys.o File size After adding 'const': text data bss dec hex filename 7019 960 0 7971 1f2b drivers/s390/net/qeth_l3_sys.o Signed-off-by: Arvind Yadav Signed-off-by: Martin Schwidefsky --- drivers/s390/net/qeth_l3_sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c index f2f94f59e0fa..1a80ce41425e 100644 --- a/drivers/s390/net/qeth_l3_sys.c +++ b/drivers/s390/net/qeth_l3_sys.c @@ -350,7 +350,7 @@ static struct attribute *qeth_l3_device_attrs[] = { NULL, }; -static struct attribute_group qeth_l3_device_attr_group = { +static const struct attribute_group qeth_l3_device_attr_group = { .attrs = qeth_l3_device_attrs, }; @@ -680,7 +680,7 @@ static struct attribute *qeth_ipato_device_attrs[] = { NULL, }; -static struct attribute_group qeth_device_ipato_group = { +static const struct attribute_group qeth_device_ipato_group = { .name = "ipa_takeover", .attrs = qeth_ipato_device_attrs, }; @@ -843,7 +843,7 @@ static struct attribute *qeth_vipa_device_attrs[] = { NULL, }; -static struct attribute_group qeth_device_vipa_group = { +static const struct attribute_group qeth_device_vipa_group = { .name = "vipa", .attrs = qeth_vipa_device_attrs, }; @@ -1006,7 +1006,7 @@ static struct attribute *qeth_rxip_device_attrs[] = { NULL, }; -static struct attribute_group qeth_device_rxip_group = { +static const struct attribute_group qeth_device_rxip_group = { .name = "rxip", .attrs = qeth_rxip_device_attrs, }; From cb0259ca61b0dd4617fb62c821492ae76a6390b4 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 19 Jul 2017 12:39:15 +0530 Subject: [PATCH 19/50] s390/raw3270: constify attribute_group structures. attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 8069 816 16 8901 22c5 drivers/s390/char/raw3270.o File size After adding 'const': text data bss dec hex filename 8133 752 16 8901 22c5 drivers/s390/char/raw3270.o Signed-off-by: Arvind Yadav Signed-off-by: Martin Schwidefsky --- drivers/s390/char/raw3270.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/char/raw3270.c b/drivers/s390/char/raw3270.c index 710f2292911d..5d4f053d7c38 100644 --- a/drivers/s390/char/raw3270.c +++ b/drivers/s390/char/raw3270.c @@ -1082,7 +1082,7 @@ static struct attribute * raw3270_attrs[] = { NULL, }; -static struct attribute_group raw3270_attr_group = { +static const struct attribute_group raw3270_attr_group = { .attrs = raw3270_attrs, }; From fc2b0f52aea5daf19adb576112a7db6be7a0379c Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 19 Jul 2017 12:39:16 +0530 Subject: [PATCH 20/50] s390/tape: constify attribute_group structures. attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. File size before: text data bss dec hex filename 11511 656 16 12183 2f97 drivers/s390/char/tape_core.o File size After adding 'const': text data bss dec hex filename 11575 592 16 12183 2f97 drivers/s390/char/tape_core.o Signed-off-by: Arvind Yadav Signed-off-by: Martin Schwidefsky --- drivers/s390/char/tape_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/char/tape_core.c b/drivers/s390/char/tape_core.c index 3c379da2eef8..9dd4534823b3 100644 --- a/drivers/s390/char/tape_core.c +++ b/drivers/s390/char/tape_core.c @@ -175,7 +175,7 @@ static struct attribute *tape_attrs[] = { NULL }; -static struct attribute_group tape_attr_group = { +static const struct attribute_group tape_attr_group = { .attrs = tape_attrs, }; From 8351378f5873164ecc966c58fac43bab53216c94 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 19 Jul 2017 12:39:17 +0530 Subject: [PATCH 21/50] s390/sclp_ocf: constify attribute_group structures. attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. Signed-off-by: Arvind Yadav Signed-off-by: Martin Schwidefsky --- drivers/s390/char/sclp_ocf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/char/sclp_ocf.c b/drivers/s390/char/sclp_ocf.c index f59b71776bbd..f9cbb1ab047b 100644 --- a/drivers/s390/char/sclp_ocf.c +++ b/drivers/s390/char/sclp_ocf.c @@ -126,7 +126,7 @@ static struct attribute *ocf_attrs[] = { NULL, }; -static struct attribute_group ocf_attr_group = { +static const struct attribute_group ocf_attr_group = { .attrs = ocf_attrs, }; From 7f7e6e28cd3285ce5d5a3d88b334eda428dd7d66 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 19 Apr 2017 14:54:05 +0200 Subject: [PATCH 22/50] s390/spinlock: add niai spinlock hints The z14 machine introduces new mode of the next-instruction-access-intent NIAI instruction. With NIAI-8 it is possible to pin a cache-line on a CPU for a small amount of time, NIAI-7 releases the cache-line again. Finally NIAI-4 can be used to prevent the CPU to speculatively access memory beyond the compare-and-swap instruction to get the lock. Use these instruction in the spinlock code. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/spinlock.h | 9 ++-- arch/s390/lib/spinlock.c | 87 +++++++++++++++++++------------- 2 files changed, 56 insertions(+), 40 deletions(-) diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index f7838ecd83c6..339e450b0567 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -92,10 +92,11 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) { typecheck(int, lp->lock); asm volatile( - "st %1,%0\n" - : "+Q" (lp->lock) - : "d" (0) - : "cc", "memory"); +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0070\n" /* NIAI 7 */ +#endif + " st %1,%0\n" + : "=Q" (lp->lock) : "d" (0) : "cc", "memory"); } static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ffb15bd4c593..b12663d653d8 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -32,42 +32,63 @@ static int __init spin_retry_setup(char *str) } __setup("spin_retry=", spin_retry_setup); +static inline int arch_load_niai4(int *lock) +{ + int owner; + + asm volatile( +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0040\n" /* NIAI 4 */ +#endif + " l %0,%1\n" + : "=d" (owner) : "Q" (*lock) : "memory"); + return owner; +} + +static inline int arch_cmpxchg_niai8(int *lock, int old, int new) +{ + int expected = old; + + asm volatile( +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + " .long 0xb2fa0080\n" /* NIAI 8 */ +#endif + " cs %0,%3,%1\n" + : "=d" (old), "=Q" (*lock) + : "0" (old), "d" (new), "Q" (*lock) + : "cc", "memory"); + return expected == old; +} + void arch_spin_lock_wait(arch_spinlock_t *lp) { int cpu = SPINLOCK_LOCKVAL; - int owner, count, first_diag; + int owner, count; - first_diag = 1; + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_load_niai4(&lp->lock); + if (owner && arch_vcpu_is_preempted(~owner)) + smp_yield_cpu(~owner); + + count = spin_retry; while (1) { - owner = ACCESS_ONCE(lp->lock); + owner = arch_load_niai4(&lp->lock); /* Try to get the lock if it is free. */ if (!owner) { - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) return; continue; } - /* First iteration: check if the lock owner is running. */ - if (first_diag && arch_vcpu_is_preempted(~owner)) { - smp_yield_cpu(~owner); - first_diag = 0; + if (count-- >= 0) continue; - } - /* Loop for a while on the lock value. */ count = spin_retry; - do { - owner = ACCESS_ONCE(lp->lock); - } while (owner && count-- > 0); - if (!owner) - continue; /* * For multiple layers of hypervisors, e.g. z/VM + LPAR * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); - first_diag = 0; - } } } EXPORT_SYMBOL(arch_spin_lock_wait); @@ -75,42 +96,36 @@ EXPORT_SYMBOL(arch_spin_lock_wait); void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) { int cpu = SPINLOCK_LOCKVAL; - int owner, count, first_diag; + int owner, count; local_irq_restore(flags); - first_diag = 1; + + /* Pass the virtual CPU to the lock holder if it is not running */ + owner = arch_load_niai4(&lp->lock); + if (owner && arch_vcpu_is_preempted(~owner)) + smp_yield_cpu(~owner); + + count = spin_retry; while (1) { - owner = ACCESS_ONCE(lp->lock); + owner = arch_load_niai4(&lp->lock); /* Try to get the lock if it is free. */ if (!owner) { local_irq_disable(); - if (__atomic_cmpxchg_bool(&lp->lock, 0, cpu)) + if (arch_cmpxchg_niai8(&lp->lock, 0, cpu)) return; local_irq_restore(flags); continue; } - /* Check if the lock owner is running. */ - if (first_diag && arch_vcpu_is_preempted(~owner)) { - smp_yield_cpu(~owner); - first_diag = 0; + if (count-- >= 0) continue; - } - /* Loop for a while on the lock value. */ count = spin_retry; - do { - owner = ACCESS_ONCE(lp->lock); - } while (owner && count-- > 0); - if (!owner) - continue; /* * For multiple layers of hypervisors, e.g. z/VM + LPAR * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); - first_diag = 0; - } } } EXPORT_SYMBOL(arch_spin_lock_wait_flags); From 65a6d0c56c3b866de04777e386a29ebc8b42899a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 1 Aug 2017 07:50:10 +0200 Subject: [PATCH 23/50] s390/nmi: keep comments consistent The name of bit 36 of the machine check interruption code is "guarded storage registers validity". Add the missing "validity" part in order to be consistent with all other comments, which include this piece of information. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/nmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 9d91cf3e427f..c8e211b9a002 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -72,7 +72,7 @@ union mci { u64 ar : 1; /* 33 access register validity */ u64 da : 1; /* 34 delayed access exception */ u64 : 1; /* 35 */ - u64 gs : 1; /* 36 guarded storage registers */ + u64 gs : 1; /* 36 guarded storage registers validity */ u64 : 5; /* 37-41 */ u64 pr : 1; /* 42 tod programmable register validity */ u64 fc : 1; /* 43 fp control register validity */ From 2edf3fa5766f5aa1715a612d31bbd15b2211d64e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Aug 2017 07:11:57 +0200 Subject: [PATCH 24/50] s390: remove asm/mman.h and asm/types.h Both header files only include the corresponding uapi header file and therefore can be removed. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ebcdic.h | 4 +--- arch/s390/include/asm/mman.h | 11 ----------- arch/s390/include/asm/types.h | 11 ----------- 3 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 arch/s390/include/asm/mman.h delete mode 100644 arch/s390/include/asm/types.h diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h index c5befc5a3bf5..b71735eab23f 100644 --- a/arch/s390/include/asm/ebcdic.h +++ b/arch/s390/include/asm/ebcdic.h @@ -9,9 +9,7 @@ #ifndef _EBCDIC_H #define _EBCDIC_H -#ifndef _S390_TYPES_H -#include -#endif +#include extern __u8 _ascebc_500[256]; /* ASCII -> EBCDIC 500 conversion table */ extern __u8 _ebcasc_500[256]; /* EBCDIC 500 -> ASCII conversion table */ diff --git a/arch/s390/include/asm/mman.h b/arch/s390/include/asm/mman.h deleted file mode 100644 index b79813d9cf68..000000000000 --- a/arch/s390/include/asm/mman.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/mman.h" - */ -#ifndef __S390_MMAN_H__ -#define __S390_MMAN_H__ - -#include - -#endif /* __S390_MMAN_H__ */ diff --git a/arch/s390/include/asm/types.h b/arch/s390/include/asm/types.h deleted file mode 100644 index 6740f4f9781f..000000000000 --- a/arch/s390/include/asm/types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* - * S390 version - * - * Derived from "include/asm-i386/types.h" - */ -#ifndef _S390_TYPES_H -#define _S390_TYPES_H - -#include - -#endif /* _S390_TYPES_H */ From 3cb8f11c5d3ad853ce71cee3cad753cf81376fd8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Aug 2017 09:45:02 +0200 Subject: [PATCH 25/50] s390: use generic uapi/asm/swab.h clang doesn't like s390 specific inline assembler constraints. These are present in our arch specific uapi/asm/swab.h which again is required by some ebpf test cases. For current compiler versions the generic swab.h already makes use of gcc's builtin functions. Therefore we can simply remove our own header file and use the generic one. This will generate worse code if used with compilers before gcc 4.8, which has no __builtin_bswap16(); or before gcc v4.4, which has no __builtin_bswap[32|64](). For these cases a C implementation fallback would be used which generates more code, but is still correct (170KB extra code for gcc 4.3 with performance_defconfig). However given that we need (and want) to get rid of the inline assemblies anyway in order to be able to use clang, the above is just a minor drawback if old gcc compilers are used. With current compilers there is close to zero difference, except for three btrfs bit functions which generate more out-of-line code. The generated code looks still correct and also uses the s390 specific byteswap instructions. Reported-and-tested-by: Thomas Richter Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/uapi/asm/Kbuild | 1 + arch/s390/include/uapi/asm/swab.h | 89 ------------------------------- 2 files changed, 1 insertion(+), 89 deletions(-) delete mode 100644 arch/s390/include/uapi/asm/swab.h diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index ca62066895e0..098f28778a13 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -9,4 +9,5 @@ generic-y += param.h generic-y += poll.h generic-y += resource.h generic-y += sockios.h +generic-y += swab.h generic-y += termbits.h diff --git a/arch/s390/include/uapi/asm/swab.h b/arch/s390/include/uapi/asm/swab.h deleted file mode 100644 index da3bfe5cc161..000000000000 --- a/arch/s390/include/uapi/asm/swab.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef _S390_SWAB_H -#define _S390_SWAB_H - -/* - * S390 version - * Copyright IBM Corp. 1999 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) - */ - -#include - -#ifndef __s390x__ -# define __SWAB_64_THRU_32__ -#endif - -#ifdef __s390x__ -static inline __u64 __arch_swab64p(const __u64 *x) -{ - __u64 result; - - asm volatile("lrvg %0,%1" : "=d" (result) : "m" (*x)); - return result; -} -#define __arch_swab64p __arch_swab64p - -static inline __u64 __arch_swab64(__u64 x) -{ - __u64 result; - - asm volatile("lrvgr %0,%1" : "=d" (result) : "d" (x)); - return result; -} -#define __arch_swab64 __arch_swab64 - -static inline void __arch_swab64s(__u64 *x) -{ - *x = __arch_swab64p(x); -} -#define __arch_swab64s __arch_swab64s -#endif /* __s390x__ */ - -static inline __u32 __arch_swab32p(const __u32 *x) -{ - __u32 result; - - asm volatile( -#ifndef __s390x__ - " icm %0,8,%O1+3(%R1)\n" - " icm %0,4,%O1+2(%R1)\n" - " icm %0,2,%O1+1(%R1)\n" - " ic %0,%1" - : "=&d" (result) : "Q" (*x) : "cc"); -#else /* __s390x__ */ - " lrv %0,%1" - : "=d" (result) : "m" (*x)); -#endif /* __s390x__ */ - return result; -} -#define __arch_swab32p __arch_swab32p - -#ifdef __s390x__ -static inline __u32 __arch_swab32(__u32 x) -{ - __u32 result; - - asm volatile("lrvr %0,%1" : "=d" (result) : "d" (x)); - return result; -} -#define __arch_swab32 __arch_swab32 -#endif /* __s390x__ */ - -static inline __u16 __arch_swab16p(const __u16 *x) -{ - __u16 result; - - asm volatile( -#ifndef __s390x__ - " icm %0,2,%O1+1(%R1)\n" - " ic %0,%1\n" - : "=&d" (result) : "Q" (*x) : "cc"); -#else /* __s390x__ */ - " lrvh %0,%1" - : "=d" (result) : "m" (*x)); -#endif /* __s390x__ */ - return result; -} -#define __arch_swab16p __arch_swab16p - -#endif /* _S390_SWAB_H */ From 83a8842438c02f02de3c87ae9038ae0e9f808540 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Aug 2017 13:21:00 +0200 Subject: [PATCH 26/50] s390: use generic asm/unaligned.h And another header file for which we can use the generic variant, even though it doesn't look obvious at first glance. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/Kbuild | 1 + arch/s390/include/asm/unaligned.h | 13 ------------- 2 files changed, 1 insertion(+), 13 deletions(-) delete mode 100644 arch/s390/include/asm/unaligned.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index b3c88479feba..6e2c9f7e47fa 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -16,4 +16,5 @@ generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += preempt.h generic-y += trace_clock.h +generic-y += unaligned.h generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/unaligned.h b/arch/s390/include/asm/unaligned.h deleted file mode 100644 index da9627afe5d8..000000000000 --- a/arch/s390/include/asm/unaligned.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _ASM_S390_UNALIGNED_H -#define _ASM_S390_UNALIGNED_H - -/* - * The S390 can do unaligned accesses itself. - */ -#include -#include - -#define get_unaligned __get_unaligned_be -#define put_unaligned __put_unaligned_be - -#endif /* _ASM_S390_UNALIGNED_H */ From c7e85ae5eaab5c12fb4a2f65ffef1c3350072110 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Wed, 2 Aug 2017 21:37:22 +0530 Subject: [PATCH 27/50] s390/sclp: add const to bin_attribute structure Declare bin_attribute structure as const as it is only passed as an argument to the function sysfs_create_bin_file. This argument is of type const, so declare the structure as const. Cross compiled for s390 architecture. Signed-off-by: Bhumika Goyal Signed-off-by: Martin Schwidefsky --- drivers/s390/char/sclp_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/char/sclp_config.c b/drivers/s390/char/sclp_config.c index 1406fb688a26..7003d52c2191 100644 --- a/drivers/s390/char/sclp_config.c +++ b/drivers/s390/char/sclp_config.c @@ -135,7 +135,7 @@ static ssize_t sysfs_ofb_data_write(struct file *filp, struct kobject *kobj, return rc ?: count; } -static struct bin_attribute ofb_bin_attr = { +static const struct bin_attribute ofb_bin_attr = { .attr = { .name = "event_data", .mode = S_IWUSR, From ac34bbc3604bd7822ffb06b3e22151a6d99bc4c9 Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Wed, 2 Aug 2017 21:37:23 +0530 Subject: [PATCH 28/50] s390/cio: add const to bin_attribute structures Add const to bin_attribute structures as they are only passed to the functions device_{remove/create}_bin_file. The corresponding arguments are of type const, so declare the structures to be const. Cross compiled for s390 architecture. Signed-off-by: Bhumika Goyal Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/chp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index 432fc40990bd..f4166f80c4d4 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -143,7 +143,7 @@ static ssize_t chp_measurement_chars_read(struct file *filp, sizeof(chp->cmg_chars)); } -static struct bin_attribute chp_measurement_chars_attr = { +static const struct bin_attribute chp_measurement_chars_attr = { .attr = { .name = "measurement_chars", .mode = S_IRUSR, @@ -197,7 +197,7 @@ static ssize_t chp_measurement_read(struct file *filp, struct kobject *kobj, return count; } -static struct bin_attribute chp_measurement_attr = { +static const struct bin_attribute chp_measurement_attr = { .attr = { .name = "measurement", .mode = S_IRUSR, From 16f48641716000ae674159a021c93632d172f466 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 7 Aug 2017 15:16:15 +0200 Subject: [PATCH 29/50] s390/mm: add missing virt_to_pfn() etc. helper functions Add all the missing little helper functions like virt_to_pfn(), phys_to_page(), etc. While we had a couple of these helper functions like e.g. page_to_phys() the other functions were missing, which is quite annoying if one is looking for exactly such a function. Therefore finally add all those little helper functions. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/page.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index c68b21b12182..5d5c2b3500a4 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -152,16 +152,26 @@ static inline int devmem_is_allowed(unsigned long pfn) #endif /* !__ASSEMBLY__ */ -#define __PAGE_OFFSET 0x0UL -#define PAGE_OFFSET 0x0UL -#define __pa(x) (unsigned long)(x) -#define __va(x) (void *)(unsigned long)(x) -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) +#define __PAGE_OFFSET 0x0UL +#define PAGE_OFFSET 0x0UL + +#define __pa(x) ((unsigned long)(x)) +#define __va(x) ((void *)(unsigned long)(x)) + +#define virt_to_pfn(kaddr) (__pa(kaddr) >> PAGE_SHIFT) #define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT) + +#define virt_to_page(kaddr) pfn_to_page(virt_to_pfn(kaddr)) #define page_to_virt(page) pfn_to_virt(page_to_pfn(page)) +#define phys_to_pfn(kaddr) ((kaddr) >> PAGE_SHIFT) +#define pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) + +#define phys_to_page(kaddr) pfn_to_page(phys_to_pfn(kaddr)) +#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) + +#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) + #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) From 34ad7cdc1bb2ea65934d235be89fabf1bb40d824 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 7 Aug 2017 15:16:15 +0200 Subject: [PATCH 30/50] s390/mm: prevent memory offline for memory blocks with cma areas Memory blocks that contain areas for the contiguous memory allocator (cma) should not be allowed to go offline. Otherwise this would render cma completely useless. This might make sense on other architectures where memory might be taken offline due to hardware errors, but not on architectures which support memory hotplug for load balancing. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/init.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index c52a6b834f08..3b567838b905 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -167,6 +168,58 @@ unsigned long memory_block_size_bytes(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +#ifdef CONFIG_CMA + +/* Prevent memory blocks which contain cma regions from going offline */ + +struct s390_cma_mem_data { + unsigned long start; + unsigned long end; +}; + +static int s390_cma_check_range(struct cma *cma, void *data) +{ + struct s390_cma_mem_data *mem_data; + unsigned long start, end; + + mem_data = data; + start = cma_get_base(cma); + end = start + cma_get_size(cma); + if (end < mem_data->start) + return 0; + if (start >= mem_data->end) + return 0; + return -EBUSY; +} + +static int s390_cma_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct s390_cma_mem_data mem_data; + struct memory_notify *arg; + int rc = 0; + + arg = data; + mem_data.start = arg->start_pfn << PAGE_SHIFT; + mem_data.end = mem_data.start + (arg->nr_pages << PAGE_SHIFT); + if (action == MEM_GOING_OFFLINE) + rc = cma_for_each_area(s390_cma_check_range, &mem_data); + return notifier_from_errno(rc); +} + +static struct notifier_block s390_cma_mem_nb = { + .notifier_call = s390_cma_mem_notifier, +}; + +static int __init s390_cma_mem_init(void) +{ + return register_memory_notifier(&s390_cma_mem_nb); +} +device_initcall(s390_cma_mem_init); + +#endif /* CONFIG_CMA */ + int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); From 267239cc10f18251892a0783104df3dc22b620d5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 7 Aug 2017 15:16:15 +0200 Subject: [PATCH 31/50] s390/vmcp: fix uaccess check and avoid undefined behavior The vmcp device driver should return -EFAULT if get_user() fails, due to an invalid user space address. In addition the buffer size value from user space is passed unchecked to get_order(). The return value of get_order(0) undefined. Therefore explicitly test for zero before calling get_order() and also return -EFAULT if get_user() fails. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/char/vmcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 98749fa817da..66d5e9f83e0d 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -150,7 +150,9 @@ static long vmcp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) get_order(session->bufsize)); session->response=NULL; temp = get_user(session->bufsize, argp); - if (get_order(session->bufsize) > 8) { + if (temp) + session->bufsize = PAGE_SIZE; + if (!session->bufsize || get_order(session->bufsize) > 8) { session->bufsize = PAGE_SIZE; temp = -EINVAL; } From cd4386a931b6310b05559d2e28efda04d30ab593 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 7 Aug 2017 15:16:15 +0200 Subject: [PATCH 32/50] s390/cpcmd,vmcp: avoid GFP_DMA allocations According to the CP Programming Services manual Diagnose Code 8 "Virtual Console Function" can be used in all addressing modes. Also the input and output buffers do not have a limitation which specifies they need to be below the 2GB line. This is true at least since z/VM 5.4. Therefore remove the sam31/64 instructions and allow for simple GFP_KERNEL allocations. This makes it easier to allocate a 1MB page if the user requested such a large return buffer. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/cpcmd.h | 7 +++---- arch/s390/kernel/cpcmd.c | 13 ++++--------- drivers/s390/char/vmcp.c | 2 +- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/cpcmd.h b/arch/s390/include/asm/cpcmd.h index 3dfadb5d648f..ca2b0624ad46 100644 --- a/arch/s390/include/asm/cpcmd.h +++ b/arch/s390/include/asm/cpcmd.h @@ -10,9 +10,8 @@ /* * the lowlevel function for cpcmd - * the caller of __cpcmd has to ensure that the response buffer is below 2 GB */ -extern int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); +int __cpcmd(const char *cmd, char *response, int rlen, int *response_code); /* * cpcmd is the in-kernel interface for issuing CP commands @@ -25,8 +24,8 @@ extern int __cpcmd(const char *cmd, char *response, int rlen, int *response_code * response_code: return pointer for VM's error code * return value: the size of the response. The caller can check if the buffer * was large enough by comparing the return value and rlen - * NOTE: If the response buffer is not below 2 GB, cpcmd can sleep + * NOTE: If the response buffer is not in real storage, cpcmd can sleep */ -extern int cpcmd(const char *cmd, char *response, int rlen, int *response_code); +int cpcmd(const char *cmd, char *response, int rlen, int *response_code); #endif /* _ASM_S390_CPCMD_H */ diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c index 9f0e4a2785f7..63bc6603e0ed 100644 --- a/arch/s390/kernel/cpcmd.c +++ b/arch/s390/kernel/cpcmd.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -28,9 +29,7 @@ static int diag8_noresponse(int cmdlen) register unsigned long reg3 asm ("3") = cmdlen; asm volatile( - " sam31\n" " diag %1,%0,0x8\n" - " sam64\n" : "+d" (reg3) : "d" (reg2) : "cc"); return reg3; } @@ -43,9 +42,7 @@ static int diag8_response(int cmdlen, char *response, int *rlen) register unsigned long reg5 asm ("5") = *rlen; asm volatile( - " sam31\n" " diag %2,%0,0x8\n" - " sam64\n" " brc 8,1f\n" " agr %1,%4\n" "1:\n" @@ -57,7 +54,6 @@ static int diag8_response(int cmdlen, char *response, int *rlen) /* * __cpcmd has some restrictions over cpcmd - * - the response buffer must reside below 2GB (if any) * - __cpcmd is unlocked and therefore not SMP-safe */ int __cpcmd(const char *cmd, char *response, int rlen, int *response_code) @@ -88,13 +84,12 @@ EXPORT_SYMBOL(__cpcmd); int cpcmd(const char *cmd, char *response, int rlen, int *response_code) { + unsigned long flags; char *lowbuf; int len; - unsigned long flags; - if ((virt_to_phys(response) != (unsigned long) response) || - (((unsigned long)response + rlen) >> 31)) { - lowbuf = kmalloc(rlen, GFP_KERNEL | GFP_DMA); + if (is_vmalloc_or_module_addr(response)) { + lowbuf = kmalloc(rlen, GFP_KERNEL); if (!lowbuf) { pr_warn("The cpcmd kernel function failed to allocate a response buffer\n"); return -ENOMEM; diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 66d5e9f83e0d..b5e3a49745f9 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -98,7 +98,7 @@ vmcp_write(struct file *file, const char __user *buff, size_t count, } if (!session->response) session->response = (char *)__get_free_pages(GFP_KERNEL - | __GFP_RETRY_MAYFAIL | GFP_DMA, + | __GFP_RETRY_MAYFAIL, get_order(session->bufsize)); if (!session->response) { mutex_unlock(&session->mutex); From 3f4298427ad521fdc74fb991b17d84959513218a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 7 Aug 2017 15:16:15 +0200 Subject: [PATCH 33/50] s390/vmcp: make use of contiguous memory allocator If memory is fragmented it is unlikely that large order memory allocations succeed. This has been an issue with the vmcp device driver since a long time, since it requires large physical contiguous memory ares for large responses. To hopefully resolve this issue make use of the contiguous memory allocator (cma). This patch adds a vmcp specific vmcp cma area with a default size of 4MB. The size can be changed either via the VMCP_CMA_SIZE config option at compile time or with the "vmcp_cma" kernel parameter (e.g. "vmcp_cma=16m"). For any vmcp response buffers larger than 16k memory from the cma area will be allocated. If such an allocation fails, there is a fallback to the buddy allocator. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- .../admin-guide/kernel-parameters.txt | 4 + arch/s390/include/asm/setup.h | 6 ++ arch/s390/kernel/setup.c | 1 + drivers/s390/char/Kconfig | 11 +++ drivers/s390/char/vmcp.c | 74 +++++++++++++++++-- drivers/s390/char/vmcp.h | 3 +- 6 files changed, 90 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d9c171ce4190..5a2d5079139b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4375,6 +4375,10 @@ decrease the size and leave more room for directly mapped kernel RAM. + vmcp_cma=nn[MG] [KNL,S390] + Sets the memory size reserved for contiguous memory + allocations for the vmcp device driver. + vmhalt= [KNL,S390] Perform z/VM CP command after system halt. Format: diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 61da4bd6edad..490e035b3716 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -108,6 +108,12 @@ extern void pfault_fini(void); #define pfault_fini() do { } while (0) #endif /* CONFIG_PFAULT */ +#ifdef CONFIG_VMCP +void vmcp_cma_reserve(void); +#else +static inline void vmcp_cma_reserve(void) { } +#endif + void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault); void cmma_init(void); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index a50238e17867..164a1e16b53e 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -925,6 +925,7 @@ void __init setup_arch(char **cmdline_p) setup_memory_end(); setup_memory(); dma_contiguous_reserve(memory_end); + vmcp_cma_reserve(); check_initrd(); reserve_crashkernel(); diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig index b3f1c458905f..97c4c9fdd53d 100644 --- a/drivers/s390/char/Kconfig +++ b/drivers/s390/char/Kconfig @@ -169,10 +169,21 @@ config VMCP def_bool y prompt "Support for the z/VM CP interface" depends on S390 + select CMA help Select this option if you want to be able to interact with the control program on z/VM +config VMCP_CMA_SIZE + int "Memory in MiB reserved for z/VM CP interface" + default "4" + depends on VMCP + help + Specify the default amount of memory in MiB reserved for the z/VM CP + interface. If needed this memory is used for large contiguous memory + allocations. The default can be changed with the kernel command line + parameter "vmcp_cma". + config MONREADER def_tristate m prompt "API for reading z/VM monitor service records" diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index b5e3a49745f9..c202b407698f 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -17,15 +17,77 @@ #include #include #include +#include #include +#include +#include +#include #include #include #include -#include #include "vmcp.h" static debug_info_t *vmcp_debug; +static unsigned long vmcp_cma_size __initdata = CONFIG_VMCP_CMA_SIZE * 1024 * 1024; +static struct cma *vmcp_cma; + +static int __init early_parse_vmcp_cma(char *p) +{ + vmcp_cma_size = ALIGN(memparse(p, NULL), PAGE_SIZE); + return 0; +} +early_param("vmcp_cma", early_parse_vmcp_cma); + +void __init vmcp_cma_reserve(void) +{ + if (!MACHINE_IS_VM) + return; + cma_declare_contiguous(0, vmcp_cma_size, 0, 0, 0, false, "vmcp", &vmcp_cma); +} + +static void vmcp_response_alloc(struct vmcp_session *session) +{ + struct page *page = NULL; + int nr_pages, order; + + order = get_order(session->bufsize); + nr_pages = ALIGN(session->bufsize, PAGE_SIZE) >> PAGE_SHIFT; + /* + * For anything below order 3 allocations rely on the buddy + * allocator. If such low-order allocations can't be handled + * anymore the system won't work anyway. + */ + if (order > 2) + page = cma_alloc(vmcp_cma, nr_pages, 0, GFP_KERNEL); + if (page) { + session->response = (char *)page_to_phys(page); + session->cma_alloc = 1; + return; + } + session->response = (char *)__get_free_pages(GFP_KERNEL | __GFP_RETRY_MAYFAIL, order); +} + +static void vmcp_response_free(struct vmcp_session *session) +{ + int nr_pages, order; + struct page *page; + + if (!session->response) + return; + order = get_order(session->bufsize); + nr_pages = ALIGN(session->bufsize, PAGE_SIZE) >> PAGE_SHIFT; + if (session->cma_alloc) { + page = phys_to_page((unsigned long)session->response); + cma_release(vmcp_cma, page, nr_pages); + session->cma_alloc = 0; + goto out; + } + free_pages((unsigned long)session->response, order); +out: + session->response = NULL; +} + static int vmcp_open(struct inode *inode, struct file *file) { struct vmcp_session *session; @@ -51,7 +113,7 @@ static int vmcp_release(struct inode *inode, struct file *file) session = file->private_data; file->private_data = NULL; - free_pages((unsigned long)session->response, get_order(session->bufsize)); + vmcp_response_free(session); kfree(session); return 0; } @@ -97,9 +159,7 @@ vmcp_write(struct file *file, const char __user *buff, size_t count, return -ERESTARTSYS; } if (!session->response) - session->response = (char *)__get_free_pages(GFP_KERNEL - | __GFP_RETRY_MAYFAIL, - get_order(session->bufsize)); + vmcp_response_alloc(session); if (!session->response) { mutex_unlock(&session->mutex); kfree(cmd); @@ -146,9 +206,7 @@ static long vmcp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) mutex_unlock(&session->mutex); return put_user(temp, argp); case VMCP_SETBUF: - free_pages((unsigned long)session->response, - get_order(session->bufsize)); - session->response=NULL; + vmcp_response_free(session); temp = get_user(session->bufsize, argp); if (temp) session->bufsize = PAGE_SIZE; diff --git a/drivers/s390/char/vmcp.h b/drivers/s390/char/vmcp.h index 1e29b0418382..4e725edf449f 100644 --- a/drivers/s390/char/vmcp.h +++ b/drivers/s390/char/vmcp.h @@ -20,8 +20,9 @@ #define VMCP_GETSIZE _IOR(0x10, 3, int) struct vmcp_session { - unsigned int bufsize; char *response; + unsigned int bufsize; + unsigned int cma_alloc : 1; int resp_size; int resp_code; /* As we use copy_from/to_user, which might * From ef267938f07197fc011e3aada67ac70a3c65c2ff Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 7 Aug 2017 15:16:16 +0200 Subject: [PATCH 34/50] s390/vmcp: split vmcp header file and move to uapi Split the vmcp header file and move the device driver internal structure to the C file, and move the ioctl definitions to the uapi directory. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- .../s390/include/uapi/asm}/vmcp.h | 21 +++++++------------ drivers/s390/char/vmcp.c | 11 +++++++++- 2 files changed, 17 insertions(+), 15 deletions(-) rename {drivers/s390/char => arch/s390/include/uapi/asm}/vmcp.h (54%) diff --git a/drivers/s390/char/vmcp.h b/arch/s390/include/uapi/asm/vmcp.h similarity index 54% rename from drivers/s390/char/vmcp.h rename to arch/s390/include/uapi/asm/vmcp.h index 4e725edf449f..4caf71714a55 100644 --- a/drivers/s390/char/vmcp.h +++ b/arch/s390/include/uapi/asm/vmcp.h @@ -12,20 +12,13 @@ * The idea of this driver is based on cpint from Neale Ferguson */ +#ifndef _UAPI_ASM_VMCP_H +#define _UAPI_ASM_VMCP_H + #include -#include -#define VMCP_GETCODE _IOR(0x10, 1, int) -#define VMCP_SETBUF _IOW(0x10, 2, int) -#define VMCP_GETSIZE _IOR(0x10, 3, int) +#define VMCP_GETCODE _IOR(0x10, 1, int) +#define VMCP_SETBUF _IOW(0x10, 2, int) +#define VMCP_GETSIZE _IOR(0x10, 3, int) -struct vmcp_session { - char *response; - unsigned int bufsize; - unsigned int cma_alloc : 1; - int resp_size; - int resp_code; - /* As we use copy_from/to_user, which might * - * sleep and cannot use a spinlock */ - struct mutex mutex; -}; +#endif /* _UAPI_ASM_VMCP_H */ diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index c202b407698f..18b30119a70e 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -25,7 +25,16 @@ #include #include #include -#include "vmcp.h" +#include + +struct vmcp_session { + char *response; + unsigned int bufsize; + unsigned int cma_alloc : 1; + int resp_size; + int resp_code; + struct mutex mutex; +}; static debug_info_t *vmcp_debug; From 4ae48c046814d8b0386883aeb9060b08f46a2ec1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 7 Aug 2017 15:16:16 +0200 Subject: [PATCH 35/50] s390/vmcp: return -ENOTTY for unknown ioctl commands Return -ENOTTY for unknown ioctl commands instead of -ENOIOCTLCMD. This isn't that much of difference, since common code will translate -ENOIOCTLCMD to -ENOTTY anyway, but this way it seems to be more obvious what is happening (at least to me). Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/char/vmcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 18b30119a70e..30fd474c86b3 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -231,7 +231,7 @@ static long vmcp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return put_user(temp, argp); default: mutex_unlock(&session->mutex); - return -ENOIOCTLCMD; + return -ENOTTY; } } From 307957b643493c1bc038d4e4b717871184f13ddf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 7 Aug 2017 15:16:16 +0200 Subject: [PATCH 36/50] s390/vmcp: simplify vmcp_ioctl() vmcp_ioctl() has many different return statements and duplicates a lot of mutex_unlock() calls. Simplify this so that only one return statement and one mutex_unlock() call is left. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/char/vmcp.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 30fd474c86b3..0aa50afa5063 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -199,8 +199,8 @@ vmcp_write(struct file *file, const char __user *buff, size_t count, static long vmcp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct vmcp_session *session; + int ret = -ENOTTY; int __user *argp; - int temp; session = file->private_data; if (is_compat_task()) @@ -211,28 +211,26 @@ static long vmcp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return -ERESTARTSYS; switch (cmd) { case VMCP_GETCODE: - temp = session->resp_code; - mutex_unlock(&session->mutex); - return put_user(temp, argp); + ret = put_user(session->resp_code, argp); + break; case VMCP_SETBUF: vmcp_response_free(session); - temp = get_user(session->bufsize, argp); - if (temp) + ret = get_user(session->bufsize, argp); + if (ret) session->bufsize = PAGE_SIZE; if (!session->bufsize || get_order(session->bufsize) > 8) { session->bufsize = PAGE_SIZE; - temp = -EINVAL; + ret = -EINVAL; } - mutex_unlock(&session->mutex); - return temp; + break; case VMCP_GETSIZE: - temp = session->resp_size; - mutex_unlock(&session->mutex); - return put_user(temp, argp); + ret = put_user(session->resp_size, argp); + break; default: - mutex_unlock(&session->mutex); - return -ENOTTY; + break; } + mutex_unlock(&session->mutex); + return ret; } static const struct file_operations vmcp_fops = { From 5db23179998ded72cb8f58a296c0e99716d7df5b Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 3 Jul 2017 15:32:23 +0200 Subject: [PATCH 37/50] s390/pci: log changes to uid checking Some hypervisors allow to toggle uid checking at runtime. Log changes to uid checking in s390dbf. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci_clp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index bd534b4d40e3..0ae3936e266f 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -24,6 +24,14 @@ bool zpci_unique_uid; +static void update_uid_checking(bool new) +{ + if (zpci_unique_uid != new) + zpci_dbg(1, "uid checking:%d\n", new); + + zpci_unique_uid = new; +} + static inline void zpci_err_clp(unsigned int rsp, int rc) { struct { @@ -319,7 +327,7 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, void *data, goto out; } - zpci_unique_uid = rrb->response.uid_checking; + update_uid_checking(rrb->response.uid_checking); WARN_ON_ONCE(rrb->response.entry_size != sizeof(struct clp_fh_list_entry)); From a3c1a2194a7c776c08ad704cd8a3b3ea694c60e6 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 15 Jun 2017 17:13:15 +0200 Subject: [PATCH 38/50] s390/scm: use common completion path Since commit caf7df122721 ("block: remove the errors field from struct request") rq->errors can't be (mis)used by block device drivers to store the error condition for usage during async completion. Because of that I simply used async completion only for the non-error paths. This patch places the error within the private data of struct request and uses async completion for all paths again. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/block/scm_blk.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 0071febac9e6..2e7fd966c515 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -249,13 +249,13 @@ static void scm_request_requeue(struct scm_request *scmrq) static void scm_request_finish(struct scm_request *scmrq) { struct scm_blk_dev *bdev = scmrq->bdev; + int *error; int i; for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) { - if (scmrq->error) - blk_mq_end_request(scmrq->request[i], scmrq->error); - else - blk_mq_complete_request(scmrq->request[i]); + error = blk_mq_rq_to_pdu(scmrq->request[i]); + *error = scmrq->error; + blk_mq_complete_request(scmrq->request[i]); } atomic_dec(&bdev->queued_reqs); @@ -415,7 +415,9 @@ void scm_blk_irq(struct scm_device *scmdev, void *data, blk_status_t error) static void scm_blk_request_done(struct request *req) { - blk_mq_end_request(req, 0); + int *error = blk_mq_rq_to_pdu(req); + + blk_mq_end_request(req, *error); } static const struct block_device_operations scm_blk_devops = { @@ -448,6 +450,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) atomic_set(&bdev->queued_reqs, 0); bdev->tag_set.ops = &scm_mq_ops; + bdev->tag_set.cmd_size = sizeof(int); bdev->tag_set.nr_hw_queues = nr_requests; bdev->tag_set.queue_depth = nr_requests_per_io * nr_requests; bdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; From d2907225cf9621140664209037bbce5107e02c91 Mon Sep 17 00:00:00 2001 From: Stefan Haberland Date: Fri, 14 Jul 2017 10:33:18 +0200 Subject: [PATCH 39/50] s390/dasd: add average request times to dasd statistics Add average times to the DASD statistics interface. Signed-off-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 34 ++++++++++++++++++++++++++++++---- drivers/s390/block/dasd_int.h | 4 ++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 670ac0a4ef49..a97b2b819f51 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -801,11 +801,12 @@ static void dasd_profile_end(struct dasd_block *block, struct dasd_ccw_req *cqr, struct request *req) { - long strtime, irqtime, endtime, tottime; /* in microseconds */ - long tottimeps, sectors; + unsigned long strtime, irqtime, endtime, tottime; + unsigned long tottimeps, sectors; struct dasd_device *device; int sectors_ind, tottime_ind, tottimeps_ind, strtime_ind; int irqtime_ind, irqtimeps_ind, endtime_ind; + struct dasd_profile_info *data; device = cqr->startdev; if (!(dasd_global_profile_level || @@ -835,6 +836,11 @@ static void dasd_profile_end(struct dasd_block *block, spin_lock(&dasd_global_profile.lock); if (dasd_global_profile.data) { + data = dasd_global_profile.data; + data->dasd_sum_times += tottime; + data->dasd_sum_time_str += strtime; + data->dasd_sum_time_irq += irqtime; + data->dasd_sum_time_end += endtime; dasd_profile_end_add_data(dasd_global_profile.data, cqr->startdev != block->base, cqr->cpmode == 1, @@ -847,7 +853,12 @@ static void dasd_profile_end(struct dasd_block *block, spin_unlock(&dasd_global_profile.lock); spin_lock(&block->profile.lock); - if (block->profile.data) + if (block->profile.data) { + data = block->profile.data; + data->dasd_sum_times += tottime; + data->dasd_sum_time_str += strtime; + data->dasd_sum_time_irq += irqtime; + data->dasd_sum_time_end += endtime; dasd_profile_end_add_data(block->profile.data, cqr->startdev != block->base, cqr->cpmode == 1, @@ -856,10 +867,16 @@ static void dasd_profile_end(struct dasd_block *block, tottimeps_ind, strtime_ind, irqtime_ind, irqtimeps_ind, endtime_ind); + } spin_unlock(&block->profile.lock); spin_lock(&device->profile.lock); - if (device->profile.data) + if (device->profile.data) { + data = device->profile.data; + data->dasd_sum_times += tottime; + data->dasd_sum_time_str += strtime; + data->dasd_sum_time_irq += irqtime; + data->dasd_sum_time_end += endtime; dasd_profile_end_add_data(device->profile.data, cqr->startdev != block->base, cqr->cpmode == 1, @@ -868,6 +885,7 @@ static void dasd_profile_end(struct dasd_block *block, tottimeps_ind, strtime_ind, irqtime_ind, irqtimeps_ind, endtime_ind); + } spin_unlock(&device->profile.lock); } @@ -989,6 +1007,14 @@ static void dasd_stats_seq_print(struct seq_file *m, seq_printf(m, "total_sectors %u\n", data->dasd_io_sects); seq_printf(m, "total_pav %u\n", data->dasd_io_alias); seq_printf(m, "total_hpf %u\n", data->dasd_io_tpm); + seq_printf(m, "avg_total %lu\n", data->dasd_io_reqs ? + data->dasd_sum_times / data->dasd_io_reqs : 0UL); + seq_printf(m, "avg_build_to_ssch %lu\n", data->dasd_io_reqs ? + data->dasd_sum_time_str / data->dasd_io_reqs : 0UL); + seq_printf(m, "avg_ssch_to_irq %lu\n", data->dasd_io_reqs ? + data->dasd_sum_time_irq / data->dasd_io_reqs : 0UL); + seq_printf(m, "avg_irq_to_end %lu\n", data->dasd_io_reqs ? + data->dasd_sum_time_end / data->dasd_io_reqs : 0UL); seq_puts(m, "histogram_sectors "); dasd_stats_array(m, data->dasd_io_secs); seq_puts(m, "histogram_io_times "); diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index dca7cb1e6f65..43f383b2f87d 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -454,6 +454,10 @@ struct dasd_profile_info { unsigned int dasd_read_time2[32]; /* hist. of time from start to irq */ unsigned int dasd_read_time3[32]; /* hist. of time from irq to end */ unsigned int dasd_read_nr_req[32]; /* hist. of # of requests in chanq */ + unsigned long dasd_sum_times; /* sum of request times */ + unsigned long dasd_sum_time_str; /* sum of time from build to start */ + unsigned long dasd_sum_time_irq; /* sum of time from start to irq */ + unsigned long dasd_sum_time_end; /* sum of time from irq to end */ }; struct dasd_profile { From 673cfddf6e05749c8bdf044140415fcefd5e9546 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 11 Aug 2017 15:54:16 +0200 Subject: [PATCH 40/50] s390: fix 'novx' early parameter handling Specifying the 'novx' kernel parameter always results in a warning: Malformed early option 'novx' The reason for this is that the novx early parameter handling function always returns a non-zero value which means that an error occurred. Fix this and return the correct zero value instead. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/early.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index added6790460..ca8cd80e8feb 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -420,7 +420,7 @@ static int __init disable_vector_extension(char *str) { S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX; __ctl_clear_bit(0, 17); - return 1; + return 0; } early_param("novx", disable_vector_extension); From e1108e8f0d4b56f1310539fcb695f91f25302704 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 14 Aug 2017 07:54:32 +0200 Subject: [PATCH 41/50] s390/smp: convert cpuhp_setup_state() return code to zero on success cpuhp_setup_state() returns a state number on CPUHP_AP_ONLINE_DYN if no error occurred. Therefore convert the return code to zero before using it as exit code for s390_smp_init(). Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/smp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1020a11a24e5..1cee6753d47a 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -1181,6 +1181,7 @@ static int __init s390_smp_init(void) rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", smp_cpu_online, smp_cpu_pre_down); + rc = rc <= 0 ? rc : 0; out: return rc; } From 7bf76f0169538279b78536393639859eeb7d93f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Tue, 15 Aug 2017 16:40:18 +0200 Subject: [PATCH 42/50] s390/dasd: Change unsigned long long to unsigned long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unsigned long long and unsigned long were different in size for 31-bit. For 64-bit the size for both datatypes is 8 Bytes and since the support for 31-bit is long gone we can clean up a little and change everything to unsigned long. Change get_phys_clock() along the way to accept unsigned long as well so that the DASD code can be consistent. Acked-by: Heiko Carstens Signed-off-by: Jan Höppner Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/timex.h | 2 +- arch/s390/kernel/time.c | 2 +- drivers/s390/block/dasd.c | 2 +- drivers/s390/block/dasd_3990_erp.c | 2 +- drivers/s390/block/dasd_diag.c | 2 +- drivers/s390/block/dasd_eckd.c | 8 ++------ drivers/s390/block/dasd_eckd.h | 2 +- drivers/s390/block/dasd_erp.c | 2 +- drivers/s390/block/dasd_int.h | 12 ++++++------ drivers/s390/block/dasd_proc.c | 2 +- 10 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 0ea03c11458d..93f2eb3f277c 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -176,7 +176,7 @@ static inline cycles_t get_cycles(void) return (cycles_t) get_tod_clock() >> 2; } -int get_phys_clock(unsigned long long *clock); +int get_phys_clock(unsigned long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 15abecba068e..5cbd52169348 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -357,7 +357,7 @@ static unsigned long clock_sync_flags; * source. If the clock mode is local it will return -EOPNOTSUPP and * -EAGAIN if the clock is not in sync with the external reference. */ -int get_phys_clock(unsigned long long *clock) +int get_phys_clock(unsigned long *clock) { atomic_t *sw_ptr; unsigned int sw0, sw1; diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index a97b2b819f51..83f80710d555 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1665,7 +1665,7 @@ void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, { struct dasd_ccw_req *cqr, *next; struct dasd_device *device; - unsigned long long now; + unsigned long now; int nrf_suppressed = 0; int fp_suppressed = 0; u8 *sense = NULL; diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index 107cd3361e29..e448a0fc0c09 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -2231,7 +2231,7 @@ static void dasd_3990_erp_account_error(struct dasd_ccw_req *erp) struct dasd_device *device = erp->startdev; __u8 lpum = erp->refers->irb.esw.esw1.lpum; int pos = pathmask_to_pos(lpum); - unsigned long long clk; + unsigned long clk; if (!device->path_thrhld) return; diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 5667146c6a0a..98fb28e49d2c 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -235,7 +235,7 @@ static void dasd_ext_handler(struct ext_code ext_code, { struct dasd_ccw_req *cqr, *next; struct dasd_device *device; - unsigned long long expires; + unsigned long expires; unsigned long flags; addr_t ip; int rc; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index c3e5ad641b0b..8eafcd5fa004 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -3254,11 +3254,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( /* 1x prefix + one read/write ccw per track */ cplength = 1 + trkcount; - /* on 31-bit we need space for two 32 bit addresses per page - * on 64-bit one 64 bit address - */ - datasize = sizeof(struct PFX_eckd_data) + - cidaw * sizeof(unsigned long long); + datasize = sizeof(struct PFX_eckd_data) + cidaw * sizeof(unsigned long); /* Allocate the ccw request. */ cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, @@ -3856,7 +3852,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_raw(struct dasd_device *startdev, } size = ALIGN(size, 8); - datasize = size + cidaw * sizeof(unsigned long long); + datasize = size + cidaw * sizeof(unsigned long); /* Allocate the ccw request. */ cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, diff --git a/drivers/s390/block/dasd_eckd.h b/drivers/s390/block/dasd_eckd.h index fb1f537d986a..34e153a6b19c 100644 --- a/drivers/s390/block/dasd_eckd.h +++ b/drivers/s390/block/dasd_eckd.h @@ -165,7 +165,7 @@ struct DE_eckd_data { __u8 ga_extended; /* Global Attributes Extended */ struct ch_t beg_ext; struct ch_t end_ext; - unsigned long long ep_sys_time; /* Ext Parameter - System Time Stamp */ + unsigned long ep_sys_time; /* Ext Parameter - System Time Stamp */ __u8 ep_format; /* Extended Parameter format byte */ __u8 ep_prio; /* Extended Parameter priority I/O byte */ __u8 ep_reserved1; /* Extended Parameter Reserved */ diff --git a/drivers/s390/block/dasd_erp.c b/drivers/s390/block/dasd_erp.c index 9e3419124264..6389feb2fb7a 100644 --- a/drivers/s390/block/dasd_erp.c +++ b/drivers/s390/block/dasd_erp.c @@ -124,7 +124,7 @@ dasd_default_erp_action(struct dasd_ccw_req *cqr) struct dasd_ccw_req *dasd_default_erp_postaction(struct dasd_ccw_req *cqr) { int success; - unsigned long long startclk, stopclk; + unsigned long startclk, stopclk; struct dasd_device *startdev; BUG_ON(cqr->refers == NULL || cqr->function == NULL); diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 43f383b2f87d..66248795b973 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -196,10 +196,10 @@ struct dasd_ccw_req { void *function; /* originating ERP action */ /* these are for statistics only */ - unsigned long long buildclk; /* TOD-clock of request generation */ - unsigned long long startclk; /* TOD-clock of request start */ - unsigned long long stopclk; /* TOD-clock of request interrupt */ - unsigned long long endclk; /* TOD-clock of request termination */ + unsigned long buildclk; /* TOD-clock of request generation */ + unsigned long startclk; /* TOD-clock of request start */ + unsigned long stopclk; /* TOD-clock of request interrupt */ + unsigned long endclk; /* TOD-clock of request termination */ /* Callback that is called after reaching final status. */ void (*callback)(struct dasd_ccw_req *, void *data); @@ -423,7 +423,7 @@ struct dasd_path { u8 chpid; struct dasd_conf_data *conf_data; atomic_t error_count; - unsigned long long errorclk; + unsigned long errorclk; }; @@ -539,7 +539,7 @@ struct dasd_block { struct block_device *bdev; atomic_t open_count; - unsigned long long blocks; /* size of volume in blocks */ + unsigned long blocks; /* size of volume in blocks */ unsigned int bp_block; /* bytes per block */ unsigned int s2b_shift; /* log2 (bp_block/512) */ diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c index 70dc2c4cd3f7..7104d6765773 100644 --- a/drivers/s390/block/dasd_proc.c +++ b/drivers/s390/block/dasd_proc.c @@ -90,7 +90,7 @@ dasd_devices_show(struct seq_file *m, void *v) seq_printf(m, "n/f "); else seq_printf(m, - "at blocksize: %d, %lld blocks, %lld MB", + "at blocksize: %u, %lu blocks, %lu MB", block->bp_block, block->blocks, ((block->bp_block >> 9) * block->blocks) >> 11); From 41b0dbfac0da515ad62edf9256414c56ee217364 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Thu, 17 Aug 2017 14:50:13 +0200 Subject: [PATCH 43/50] s390/topology: Remove the unused parent_node() macro Commit a7be6e5a7f8d ("mm: drop useless local parameters of __register_one_node()") removes the last user of parent_node(). The parent_node() macro in S390 platform is unnecessary. Remove it for cleanup. Reported-by: Michael Ellerman Signed-off-by: Dou Liyang Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/topology.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index fa1bfce10370..5222da162b69 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -77,12 +77,6 @@ static inline const struct cpumask *cpumask_of_node(int node) return &node_to_cpumask_map[node]; } -/* - * Returns the number of the node containing node 'node'. This - * architecture is flat, so it is a pretty simple function! - */ -#define parent_node(node) (node) - #define pcibus_to_node(bus) __pcibus_to_node(bus) #define node_distance(a, b) __node_distance(a, b) From eb304e800d491d5168df61a999beebe8042e7e58 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 18 Aug 2017 08:35:33 +0200 Subject: [PATCH 44/50] s390/vmcp: simplify vmcp_response_free() Get rid of the goto and "out" label within vmcp_response_free() which I added. This just makes the code harder to read than necessary. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/char/vmcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/s390/char/vmcp.c b/drivers/s390/char/vmcp.c index 0aa50afa5063..7898bbcc28fc 100644 --- a/drivers/s390/char/vmcp.c +++ b/drivers/s390/char/vmcp.c @@ -90,10 +90,9 @@ static void vmcp_response_free(struct vmcp_session *session) page = phys_to_page((unsigned long)session->response); cma_release(vmcp_cma, page, nr_pages); session->cma_alloc = 0; - goto out; + } else { + free_pages((unsigned long)session->response, order); } - free_pages((unsigned long)session->response, order); -out: session->response = NULL; } From ba7944114ad3c2273611c95d6571c0924010a105 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 18 Aug 2017 09:02:01 +0200 Subject: [PATCH 45/50] s390/facilities: fix typo There really is no "general extension" facility available. Use the correct name "general instructions extension" facility instead. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/tools/gen_facilities.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 3e878ecf522c..29d72bf8ed2b 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -41,7 +41,7 @@ static struct facility_def facility_defs[] = { 27, /* mvcos */ 32, /* compare and swap and store */ 33, /* compare and swap and store 2 */ - 34, /* general extension facility */ + 34, /* general instructions extension */ 35, /* execute extensions */ #endif #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES From 8e58ab5c6563e3cc94029853471a36b87b4716eb Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 23 Aug 2017 12:13:51 +0200 Subject: [PATCH 46/50] s390/mm: use generic mm_hooks With git commit 3446c13b268af86391d06611327006b059b8bab1 "s390/mm: four page table levels vs. fork" s390 dropped its architecture specific version of arch_dup_mmap. Now all functions defined by include/asm-generic/mm_hooks.h are identical to the s390 versions. Use the generic header. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu_context.h | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 92c1eb79ada4..31eea6261488 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -12,6 +12,7 @@ #include #include #include +#include static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) @@ -133,30 +134,4 @@ static inline void activate_mm(struct mm_struct *prev, set_user_asce(next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ -} - -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} - -static inline void arch_unmap(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} - -static inline void arch_bprm_mm_init(struct mm_struct *mm, - struct vm_area_struct *vma) -{ -} - -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool execute, bool foreign) -{ - /* by default, allow everything */ - return true; -} #endif /* __S390_MMU_CONTEXT_H */ From d66bf801e0219ca9495fdb92574f842ba5462e04 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 21 Aug 2017 14:47:04 +0200 Subject: [PATCH 47/50] s390/uaccess: avoid mvcos jump label If the kernel is compiled for z10 or later machines the uaccess code inlines the mvcos instruction. The facility bit 27 which indicates the availability of MVCOS has to be set. The have_mvcos jump label will always be true. Make the generation of the have_mvcos jump label conditional on !CONFIG_HAVE_MARCH_Z10_FEATURES. Signed-off-by: Martin Schwidefsky --- arch/s390/lib/uaccess.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index b3bd3f23b8e8..4ea9106417ee 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -15,8 +15,30 @@ #include #include +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES static DEFINE_STATIC_KEY_FALSE(have_mvcos); +static int __init uaccess_init(void) +{ + if (test_facility(27)) + static_branch_enable(&have_mvcos); + return 0; +} +early_initcall(uaccess_init); + +static inline int copy_with_mvcos(void) +{ + if (static_branch_likely(&have_mvcos)) + return 1; + return 0; +} +#else +static inline int copy_with_mvcos(void) +{ + return 1; +} +#endif + static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, unsigned long size) { @@ -84,7 +106,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, unsigned long raw_copy_from_user(void *to, const void __user *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_from_user_mvcos(to, from, n); return copy_from_user_mvcp(to, from, n); } @@ -157,7 +179,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, unsigned long raw_copy_to_user(void __user *to, const void *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_to_user_mvcos(to, from, n); return copy_to_user_mvcs(to, from, n); } @@ -220,7 +242,7 @@ static inline unsigned long copy_in_user_mvc(void __user *to, const void __user unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return copy_in_user_mvcos(to, from, n); return copy_in_user_mvc(to, from, n); } @@ -292,7 +314,7 @@ static inline unsigned long clear_user_xc(void __user *to, unsigned long size) unsigned long __clear_user(void __user *to, unsigned long size) { - if (static_branch_likely(&have_mvcos)) + if (copy_with_mvcos()) return clear_user_mvcos(to, size); return clear_user_xc(to, size); } @@ -349,11 +371,3 @@ long __strncpy_from_user(char *dst, const char __user *src, long size) return done; } EXPORT_SYMBOL(__strncpy_from_user); - -static int __init uaccess_init(void) -{ - if (test_facility(27)) - static_branch_enable(&have_mvcos); - return 0; -} -early_initcall(uaccess_init); From 8b94dd9e0d4439f83fcc55f1f8020c86e79b623d Mon Sep 17 00:00:00 2001 From: Bhumika Goyal Date: Fri, 25 Aug 2017 18:40:29 +0530 Subject: [PATCH 48/50] s390/zcrypt: make CPRBX const Make this const as it is only used in a copy operation. Signed-off-by: Bhumika Goyal Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_msgtype6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c index 4fddb4319481..afd20cee7ea0 100644 --- a/drivers/s390/crypto/zcrypt_msgtype6.c +++ b/drivers/s390/crypto/zcrypt_msgtype6.c @@ -140,7 +140,7 @@ struct function_and_rules_block { * + 0x000A 'MRP ' (MCL3 'PK' or CEX2C 'PK') * - VUD block */ -static struct CPRBX static_cprbx = { +static const struct CPRBX static_cprbx = { .cprb_len = 0x00DC, .cprb_ver_id = 0x02, .func_id = {0x54, 0x32}, From 28b841b3a7cb07a4bfd436a15b31bc88509dcf9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B6ppner?= Date: Thu, 30 Jun 2016 13:28:57 +0200 Subject: [PATCH 49/50] s390/dasd: Add discard support for FBA devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The z/VM hypervisor provides virtual disks (VDISK) which are backed by main memory of the hypervisor. Those devices are seen as DASD FBA disks within the Linux guest. Whenever data is written to such a device, memory is allocated on-the-fly by z/VM accordingly. This memory, however, is not being freed if data on the device is deleted by the guest OS. In order to make memory usable after deletion again, add discard support to the FBA discipline. While at it, update comments regarding the DASD_FEATURE_* flags. Reviewed-by: Stefan Haberland Signed-off-by: Jan Höppner Signed-off-by: Martin Schwidefsky --- arch/s390/include/uapi/asm/dasd.h | 6 +- drivers/s390/block/dasd.c | 19 ++- drivers/s390/block/dasd_devmap.c | 1 + drivers/s390/block/dasd_fba.c | 202 +++++++++++++++++++++++++++++- drivers/s390/block/dasd_int.h | 3 + 5 files changed, 226 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/uapi/asm/dasd.h b/arch/s390/include/uapi/asm/dasd.h index 1340311dab77..ab5797cdc1b7 100644 --- a/arch/s390/include/uapi/asm/dasd.h +++ b/arch/s390/include/uapi/asm/dasd.h @@ -72,7 +72,10 @@ typedef struct dasd_information2_t { * 0x02: use diag discipline (diag) * 0x04: set the device initially online (internal use only) * 0x08: enable ERP related logging - * 0x20: give access to raw eckd data + * 0x10: allow I/O to fail on lost paths + * 0x20: allow I/O to fail when a lock was stolen + * 0x40: give access to raw eckd data + * 0x80: enable discard support */ #define DASD_FEATURE_DEFAULT 0x00 #define DASD_FEATURE_READONLY 0x01 @@ -82,6 +85,7 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_FAILFAST 0x10 #define DASD_FEATURE_FAILONSLCK 0x20 #define DASD_FEATURE_USERAW 0x40 +#define DASD_FEATURE_DISCARD 0x80 #define DASD_PARTN_BITS 2 diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 83f80710d555..9c97ad1ee121 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3178,7 +3178,9 @@ static int dasd_alloc_queue(struct dasd_block *block) */ static void dasd_setup_queue(struct dasd_block *block) { + unsigned int logical_block_size = block->bp_block; struct request_queue *q = block->request_queue; + unsigned int max_bytes, max_discard_sectors; int max; if (block->base->features & DASD_FEATURE_USERAW) { @@ -3195,7 +3197,7 @@ static void dasd_setup_queue(struct dasd_block *block) } queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); q->limits.max_dev_sectors = max; - blk_queue_logical_block_size(q, block->bp_block); + blk_queue_logical_block_size(q, logical_block_size); blk_queue_max_hw_sectors(q, max); blk_queue_max_segments(q, USHRT_MAX); /* with page sized segments we can translate each segement into @@ -3203,6 +3205,21 @@ static void dasd_setup_queue(struct dasd_block *block) */ blk_queue_max_segment_size(q, PAGE_SIZE); blk_queue_segment_boundary(q, PAGE_SIZE - 1); + + /* Only activate blocklayer discard support for devices that support it */ + if (block->base->features & DASD_FEATURE_DISCARD) { + q->limits.discard_granularity = logical_block_size; + q->limits.discard_alignment = PAGE_SIZE; + + /* Calculate max_discard_sectors and make it PAGE aligned */ + max_bytes = USHRT_MAX * logical_block_size; + max_bytes = ALIGN(max_bytes, PAGE_SIZE) - PAGE_SIZE; + max_discard_sectors = max_bytes / logical_block_size; + + blk_queue_max_discard_sectors(q, max_discard_sectors); + blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + } } /* diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 7b539d92eeaf..e38042ce94e6 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -1676,6 +1676,7 @@ dasd_set_feature(struct ccw_device *cdev, int feature, int flag) spin_unlock(&dasd_devmap_lock); return 0; } +EXPORT_SYMBOL(dasd_set_feature); int diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index 462cab5d4302..6168ccdb389c 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -174,6 +174,9 @@ dasd_fba_check_characteristics(struct dasd_device *device) if (readonly) set_bit(DASD_FLAG_DEVICE_RO, &device->flags); + /* FBA supports discard, set the according feature bit */ + dasd_set_feature(cdev, DASD_FEATURE_DISCARD, 1); + dev_info(&device->cdev->dev, "New FBA DASD %04X/%02X (CU %04X/%02X) with %d MB " "and %d B/blk%s\n", @@ -247,9 +250,192 @@ static void dasd_fba_check_for_device_change(struct dasd_device *device, dasd_generic_handle_state_change(device); }; -static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev, - struct dasd_block *block, - struct request *req) + +/* + * Builds a CCW with no data payload + */ +static void ccw_write_no_data(struct ccw1 *ccw) +{ + ccw->cmd_code = DASD_FBA_CCW_WRITE; + ccw->flags |= CCW_FLAG_SLI; + ccw->count = 0; +} + +/* + * Builds a CCW that writes only zeroes. + */ +static void ccw_write_zero(struct ccw1 *ccw, int count) +{ + ccw->cmd_code = DASD_FBA_CCW_WRITE; + ccw->flags |= CCW_FLAG_SLI; + ccw->count = count; + ccw->cda = (__u32) (addr_t) page_to_phys(ZERO_PAGE(0)); +} + +/* + * Helper function to count the amount of necessary CCWs within a given range + * with 4k alignment and command chaining in mind. + */ +static int count_ccws(sector_t first_rec, sector_t last_rec, + unsigned int blocks_per_page) +{ + sector_t wz_stop = 0, d_stop = 0; + int cur_pos = 0; + int count = 0; + + if (first_rec % blocks_per_page != 0) { + wz_stop = first_rec + blocks_per_page - + (first_rec % blocks_per_page) - 1; + if (wz_stop > last_rec) + wz_stop = last_rec; + cur_pos = wz_stop - first_rec + 1; + count++; + } + + if (last_rec - (first_rec + cur_pos) + 1 >= blocks_per_page) { + if ((last_rec - blocks_per_page + 1) % blocks_per_page != 0) + d_stop = last_rec - ((last_rec - blocks_per_page + 1) % + blocks_per_page); + else + d_stop = last_rec; + + cur_pos += d_stop - (first_rec + cur_pos) + 1; + count++; + } + + if (cur_pos == 0 || first_rec + cur_pos - 1 < last_rec) + count++; + + return count; +} + +/* + * This function builds a CCW request for block layer discard requests. + * Each page in the z/VM hypervisor that represents certain records of an FBA + * device will be padded with zeros. This is a special behaviour of the WRITE + * command which is triggered when no data payload is added to the CCW. + * + * Note: Due to issues in some z/VM versions, we can't fully utilise this + * special behaviour. We have to keep a 4k (or 8 block) alignment in mind to + * work around those issues and write actual zeroes to the unaligned parts in + * the request. This workaround might be removed in the future. + */ +static struct dasd_ccw_req *dasd_fba_build_cp_discard( + struct dasd_device *memdev, + struct dasd_block *block, + struct request *req) +{ + struct LO_fba_data *LO_data; + struct dasd_ccw_req *cqr; + struct ccw1 *ccw; + + sector_t wz_stop = 0, d_stop = 0; + sector_t first_rec, last_rec; + + unsigned int blksize = block->bp_block; + unsigned int blocks_per_page; + int wz_count = 0; + int d_count = 0; + int cur_pos = 0; /* Current position within the extent */ + int count = 0; + int cplength; + int datasize; + int nr_ccws; + + first_rec = blk_rq_pos(req) >> block->s2b_shift; + last_rec = + (blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift; + count = last_rec - first_rec + 1; + + blocks_per_page = BLOCKS_PER_PAGE(blksize); + nr_ccws = count_ccws(first_rec, last_rec, blocks_per_page); + + /* define extent + nr_ccws * locate record + nr_ccws * single CCW */ + cplength = 1 + 2 * nr_ccws; + datasize = sizeof(struct DE_fba_data) + + nr_ccws * (sizeof(struct LO_fba_data) + sizeof(struct ccw1)); + + cqr = dasd_smalloc_request(DASD_FBA_MAGIC, cplength, datasize, memdev); + if (IS_ERR(cqr)) + return cqr; + + ccw = cqr->cpaddr; + + define_extent(ccw++, cqr->data, WRITE, blksize, first_rec, count); + LO_data = cqr->data + sizeof(struct DE_fba_data); + + /* First part is not aligned. Calculate range to write zeroes. */ + if (first_rec % blocks_per_page != 0) { + wz_stop = first_rec + blocks_per_page - + (first_rec % blocks_per_page) - 1; + if (wz_stop > last_rec) + wz_stop = last_rec; + wz_count = wz_stop - first_rec + 1; + + ccw[-1].flags |= CCW_FLAG_CC; + locate_record(ccw++, LO_data++, WRITE, cur_pos, wz_count); + + ccw[-1].flags |= CCW_FLAG_CC; + ccw_write_zero(ccw++, wz_count * blksize); + + cur_pos = wz_count; + } + + /* We can do proper discard when we've got at least blocks_per_page blocks. */ + if (last_rec - (first_rec + cur_pos) + 1 >= blocks_per_page) { + /* is last record at page boundary? */ + if ((last_rec - blocks_per_page + 1) % blocks_per_page != 0) + d_stop = last_rec - ((last_rec - blocks_per_page + 1) % + blocks_per_page); + else + d_stop = last_rec; + + d_count = d_stop - (first_rec + cur_pos) + 1; + + ccw[-1].flags |= CCW_FLAG_CC; + locate_record(ccw++, LO_data++, WRITE, cur_pos, d_count); + + ccw[-1].flags |= CCW_FLAG_CC; + ccw_write_no_data(ccw++); + + cur_pos += d_count; + } + + /* We might still have some bits left which need to be zeroed. */ + if (cur_pos == 0 || first_rec + cur_pos - 1 < last_rec) { + if (d_stop != 0) + wz_count = last_rec - d_stop; + else if (wz_stop != 0) + wz_count = last_rec - wz_stop; + else + wz_count = count; + + ccw[-1].flags |= CCW_FLAG_CC; + locate_record(ccw++, LO_data++, WRITE, cur_pos, wz_count); + + ccw[-1].flags |= CCW_FLAG_CC; + ccw_write_zero(ccw++, wz_count * blksize); + } + + if (blk_noretry_request(req) || + block->base->features & DASD_FEATURE_FAILFAST) + set_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags); + + cqr->startdev = memdev; + cqr->memdev = memdev; + cqr->block = block; + cqr->expires = memdev->default_expires * HZ; /* default 5 minutes */ + cqr->retries = memdev->default_retries; + cqr->buildclk = get_tod_clock(); + cqr->status = DASD_CQR_FILLED; + + return cqr; +} + +static struct dasd_ccw_req *dasd_fba_build_cp_regular( + struct dasd_device *memdev, + struct dasd_block *block, + struct request *req) { struct dasd_fba_private *private = block->base->private; unsigned long *idaws; @@ -372,6 +558,16 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev, return cqr; } +static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device *memdev, + struct dasd_block *block, + struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD || req_op(req) == REQ_OP_WRITE_ZEROES) + return dasd_fba_build_cp_discard(memdev, block, req); + else + return dasd_fba_build_cp_regular(memdev, block, req); +} + static int dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) { diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 66248795b973..f9e25fc03d6b 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -167,6 +167,9 @@ do { \ printk(d_loglevel PRINTK_HEADER " " d_string "\n", d_args); \ } while(0) +/* Macro to calculate number of blocks per page */ +#define BLOCKS_PER_PAGE(blksize) (PAGE_SIZE / blksize) + struct dasd_ccw_req { unsigned int magic; /* Eye catcher */ struct list_head devlist; /* for dasd_device request queue */ From fa41ba0d08de7c975c3e94d0067553f9b934221f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 24 Aug 2017 12:55:08 +0200 Subject: [PATCH 50/50] s390/mm: avoid empty zero pages for KVM guests to avoid postcopy hangs Right now there is a potential hang situation for postcopy migrations, if the guest is enabling storage keys on the target system during the postcopy process. For storage key virtualization, we have to forbid the empty zero page as the storage key is a property of the physical page frame. As we enable storage key handling lazily we then drop all mappings for empty zero pages for lazy refaulting later on. This does not work with the postcopy migration, which relies on the empty zero page never triggering a fault again in the future. The reason is that postcopy migration will simply read a page on the target system if that page is a known zero page to fault in an empty zero page. At the same time postcopy remembers that this page was already transferred - so any future userfault on that page will NOT be retransmitted again to avoid races. If now the guest enters the storage key mode while in postcopy, we will break this assumption of postcopy. The solution is to disable the empty zero page for KVM guests early on and not during storage key enablement. With this change, the postcopy migration process is guaranteed to start after no zero pages are left. As guest pages are very likely not empty zero pages anyway the memory overhead is also pretty small. While at it this also adds proper page table locking to the zero page removal. Signed-off-by: Christian Borntraeger Acked-by: Janosch Frank Cc: stable@vger.kernel.org Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 2 +- arch/s390/mm/gmap.c | 39 +++++++++++++++++++++++++++------ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c9560e035207..dce708e061ea 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -505,7 +505,7 @@ static inline int mm_alloc_pgste(struct mm_struct *mm) * In the case that a guest uses storage keys * faults should no longer be backed by zero pages */ -#define mm_forbids_zeropage mm_use_skey +#define mm_forbids_zeropage mm_has_pgste static inline int mm_use_skey(struct mm_struct *mm) { #ifdef CONFIG_PGSTE diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 53292c03e312..9e1494e3d849 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2120,6 +2120,37 @@ static inline void thp_split_mm(struct mm_struct *mm) #endif } +/* + * Remove all empty zero pages from the mapping for lazy refaulting + * - This must be called after mm->context.has_pgste is set, to avoid + * future creation of zero pages + * - This must be called after THP was enabled + */ +static int __zap_zero_pages(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + unsigned long addr; + + for (addr = start; addr != end; addr += PAGE_SIZE) { + pte_t *ptep; + spinlock_t *ptl; + + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (is_zero_pfn(pte_pfn(*ptep))) + ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_unmap_unlock(ptep, ptl); + } + return 0; +} + +static inline void zap_zero_pages(struct mm_struct *mm) +{ + struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); +} + /* * switch on pgstes for its userspace process (for kvm) */ @@ -2137,6 +2168,7 @@ int s390_enable_sie(void) mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); + zap_zero_pages(mm); up_write(&mm->mmap_sem); return 0; } @@ -2149,13 +2181,6 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); static int __s390_enable_skey(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - /* - * Remove all zero page mappings, - * after establishing a policy to forbid zero page mappings - * following faults for that page will get fresh anonymous pages - */ - if (is_zero_pfn(pte_pfn(*pte))) - ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0;