From f85b2b297c16b6d9fa8d9f2f26b73b5571dfb859 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 30 Oct 2018 08:19:54 +0100 Subject: [PATCH 01/98] s390/qdio: clean up pci_out_supported() pci_out_supported() currently takes a single queue as parameter, even though Output IRQ support is a per-device feature. Adjust the parameter, so that the macro can also be used in code paths with no access to a queue struct. This allows us to remove the remaining open-coded checks for QIB_AC_OUTBOUND_PCI_SUPPORTED. Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio.h | 3 +-- drivers/s390/cio/qdio_main.c | 17 +++++++++-------- drivers/s390/cio/qdio_setup.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index a6f7c2986b94..2c29141005ca 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -341,8 +341,7 @@ static inline int multicast_outbound(struct qdio_q *q) (q->nr == q->irq_ptr->nr_output_qs - 1); } -#define pci_out_supported(q) \ - (q->irq_ptr->qib.ac & QIB_AC_OUTBOUND_PCI_SUPPORTED) +#define pci_out_supported(irq) ((irq)->qib.ac & QIB_AC_OUTBOUND_PCI_SUPPORTED) #define is_qebsm(q) (q->irq_ptr->sch_token != 0) #define need_siga_in(q) (q->irq_ptr->siga_flag.input) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 9537e656e927..02d515fa10a1 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -371,7 +371,7 @@ static inline int qdio_siga_input(struct qdio_q *q) static inline void qdio_sync_queues(struct qdio_q *q) { /* PCI capable outbound queues will also be scanned so sync them too */ - if (pci_out_supported(q)) + if (pci_out_supported(q->irq_ptr)) qdio_siga_sync_all(q); else qdio_siga_sync_q(q); @@ -718,7 +718,7 @@ static int get_outbound_buffer_frontier(struct qdio_q *q) if (need_siga_sync(q)) if (((queue_type(q) != QDIO_IQDIO_QFMT) && - !pci_out_supported(q)) || + !pci_out_supported(q->irq_ptr)) || (queue_type(q) == QDIO_IQDIO_QFMT && multicast_outbound(q))) qdio_siga_sync_q(q); @@ -843,9 +843,9 @@ static void __qdio_outbound_processing(struct qdio_q *q) if (qdio_outbound_q_moved(q)) qdio_kick_handler(q); - if (queue_type(q) == QDIO_ZFCP_QFMT) - if (!pci_out_supported(q) && !qdio_outbound_q_done(q)) - goto sched; + if (queue_type(q) == QDIO_ZFCP_QFMT && !pci_out_supported(q->irq_ptr) && + !qdio_outbound_q_done(q)) + goto sched; if (q->u.out.pci_out_enabled) return; @@ -883,13 +883,14 @@ void qdio_outbound_timer(struct timer_list *t) static inline void qdio_check_outbound_after_thinint(struct qdio_q *q) { + struct qdio_irq *irq = q->irq_ptr; struct qdio_q *out; int i; - if (!pci_out_supported(q)) + if (!pci_out_supported(irq)) return; - for_each_output_queue(q->irq_ptr, out, i) + for_each_output_queue(irq, out, i) if (!qdio_outbound_q_done(out)) qdio_tasklet_schedule(out); } @@ -976,7 +977,7 @@ static void qdio_int_handler_pci(struct qdio_irq *irq_ptr) } } - if (!(irq_ptr->qib.ac & QIB_AC_OUTBOUND_PCI_SUPPORTED)) + if (!pci_out_supported(irq_ptr)) return; for_each_output_queue(irq_ptr, q, i) { diff --git a/drivers/s390/cio/qdio_setup.c b/drivers/s390/cio/qdio_setup.c index a59887fad13e..99d7d2566a3a 100644 --- a/drivers/s390/cio/qdio_setup.c +++ b/drivers/s390/cio/qdio_setup.c @@ -523,7 +523,7 @@ void qdio_print_subchannel_info(struct qdio_irq *irq_ptr, irq_ptr->schid.sch_no, is_thinint_irq(irq_ptr), (irq_ptr->sch_token) ? 1 : 0, - (irq_ptr->qib.ac & QIB_AC_OUTBOUND_PCI_SUPPORTED) ? 1 : 0, + pci_out_supported(irq_ptr) ? 1 : 0, css_general_characteristics.aif_tdd, (irq_ptr->siga_flag.input) ? "R" : " ", (irq_ptr->siga_flag.output) ? "W" : " ", From 2f2f3839fb8d0fb548fa6ee2d3bec656dc61f1ac Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Tue, 30 Oct 2018 08:21:27 +0100 Subject: [PATCH 02/98] s390/qdio: clean up qdio_check_outbound_after_thinint() This helper is not thinint-specific, qdio_get_next_buffers() also calls it for non-thinint devices. So give it a more fitting name, and while at it adjust its parameter. Signed-off-by: Julian Wiedmann Reviewed-by: Benjamin Block Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_main.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 02d515fa10a1..c25a40ecc105 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -881,9 +881,8 @@ void qdio_outbound_timer(struct timer_list *t) qdio_tasklet_schedule(q); } -static inline void qdio_check_outbound_after_thinint(struct qdio_q *q) +static inline void qdio_check_outbound_pci_queues(struct qdio_irq *irq) { - struct qdio_irq *irq = q->irq_ptr; struct qdio_q *out; int i; @@ -901,11 +900,8 @@ static void __tiqdio_inbound_processing(struct qdio_q *q) if (need_siga_sync(q) && need_siga_sync_after_ai(q)) qdio_sync_queues(q); - /* - * The interrupt could be caused by a PCI request. Check the - * PCI capable outbound queues. - */ - qdio_check_outbound_after_thinint(q); + /* The interrupt could be caused by a PCI request: */ + qdio_check_outbound_pci_queues(q->irq_ptr); if (!qdio_inbound_q_moved(q)) return; @@ -1687,8 +1683,7 @@ int qdio_get_next_buffers(struct ccw_device *cdev, int nr, int *bufnr, if (need_siga_sync(q)) qdio_sync_queues(q); - /* check the PCI capable outbound queues. */ - qdio_check_outbound_after_thinint(q); + qdio_check_outbound_pci_queues(irq_ptr); if (!qdio_inbound_q_moved(q)) return 0; From 46a984ffb86c8542fa510656fa8cb33befe8ee8f Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 28 Mar 2019 11:21:47 +0100 Subject: [PATCH 03/98] s390/cpum_cf: Add support for CPU-MF SVN 6 Add support for the CPU-Measurement Facility counter second version number 6. This number is used to detect some more counters in the crypto counter set and the extended counter set. Signed-off-by: Thomas Richter Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_cf.c | 15 +++- arch/s390/kernel/perf_cpum_cf_events.c | 107 +++++++++++++++++-------- 2 files changed, 84 insertions(+), 38 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index e1c54d28713a..48d48b6187c0 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,8 +2,8 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2017 - * Author(s): Hendrik Brueckner + * Copyright IBM Corp. 2012, 2019 + * Author(s): Hendrik Brueckner */ #define KMSG_COMPONENT "cpum_cf" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -26,7 +26,7 @@ static enum cpumf_ctr_set get_counter_set(u64 event) set = CPUMF_CTR_SET_USER; else if (event < 128) set = CPUMF_CTR_SET_CRYPTO; - else if (event < 256) + else if (event < 288) set = CPUMF_CTR_SET_EXT; else if (event >= 448 && event < 496) set = CPUMF_CTR_SET_MT_DIAG; @@ -50,12 +50,19 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_CRYPTO: + if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && + hwc->config > 79) || + (cpuhw->info.csvn >= 6 && hwc->config > 83)) + err = -EOPNOTSUPP; + break; case CPUMF_CTR_SET_EXT: if (cpuhw->info.csvn < 1) err = -EOPNOTSUPP; if ((cpuhw->info.csvn == 1 && hwc->config > 159) || (cpuhw->info.csvn == 2 && hwc->config > 175) || - (cpuhw->info.csvn > 2 && hwc->config > 255)) + (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 + && hwc->config > 255) || + (cpuhw->info.csvn >= 6 && hwc->config > 287)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MT_DIAG: diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index b45238c89728..34cc96449b30 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -31,22 +31,26 @@ CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES, 0x0020); CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS, 0x0021); CPUMF_EVENT_ATTR(cf_fvn3, L1D_DIR_WRITES, 0x0004); CPUMF_EVENT_ATTR(cf_fvn3, L1D_PENALTY_CYCLES, 0x0005); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_FUNCTIONS, 0x0040); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_CYCLES, 0x0041); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS, 0x0042); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_CYCLES, 0x0043); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_FUNCTIONS, 0x0044); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_CYCLES, 0x0045); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS, 0x0046); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_CYCLES, 0x0047); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_FUNCTIONS, 0x0048); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_CYCLES, 0x0049); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS, 0x004a); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_CYCLES, 0x004b); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_FUNCTIONS, 0x004c); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_CYCLES, 0x004d); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS, 0x004e); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_FUNCTIONS, 0x0040); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_CYCLES, 0x0041); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS, 0x0042); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_BLOCKED_CYCLES, 0x0043); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_FUNCTIONS, 0x0044); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_CYCLES, 0x0045); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS, 0x0046); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_BLOCKED_CYCLES, 0x0047); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_FUNCTIONS, 0x0048); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_CYCLES, 0x0049); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS, 0x004a); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_BLOCKED_CYCLES, 0x004b); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_FUNCTIONS, 0x004c); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_CYCLES, 0x004d); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS, 0x004e); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_FUNCTION_COUNT, 0x0050); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_CYCLES_COUNT, 0x0051); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_BLOCKED_FUNCTION_COUNT, 0x0052); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_BLOCKED_CYCLES_COUNT, 0x0053); CPUMF_EVENT_ATTR(cf_z10, L1I_L2_SOURCED_WRITES, 0x0080); CPUMF_EVENT_ATTR(cf_z10, L1D_L2_SOURCED_WRITES, 0x0081); CPUMF_EVENT_ATTR(cf_z10, L1I_L3_LOCAL_WRITES, 0x0082); @@ -262,23 +266,47 @@ static struct attribute *cpumcf_fvn3_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_generic_pmu_event_attr[] __initdata = { - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, AES_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, AES_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_CYCLES), +static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_6, ECC_FUNCTION_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_CYCLES_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_BLOCKED_FUNCTION_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_BLOCKED_CYCLES_COUNT), NULL, }; @@ -562,7 +590,18 @@ __init const struct attribute_group **cpumf_cf_event_group(void) default: cfvn = none; } - csvn = cpumcf_svn_generic_pmu_event_attr; + + /* Determine version specific crypto set */ + switch (ci.csvn) { + case 1 ... 5: + csvn = cpumcf_svn_12345_pmu_event_attr; + break; + case 6: + csvn = cpumcf_svn_6_pmu_event_attr; + break; + default: + csvn = none; + } /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); From bf9921a9c15bad089c08b94c300a6cafa035a612 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 1 Apr 2019 19:10:45 +0200 Subject: [PATCH 04/98] s390: introduce .boot.preserved.data section Introduce .boot.preserve.data section which is similar to .boot.data and "shared" between the decompressor code and the decompressed kernel. The decompressor will store values in it, and copy over to the decompressed image before starting it. This method allows to avoid using pre-defined addresses and other hacks to pass values between those boot phases. Unlike .boot.data section .boot.preserved.data is NOT a part of init data, and hence will be preserved for the kernel life time. Signed-off-by: Gerald Schaefer Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/boot/compressed/decompressor.h | 2 ++ arch/s390/boot/compressed/vmlinux.lds.S | 1 + arch/s390/boot/startup.c | 4 ++++ arch/s390/include/asm/sections.h | 7 +++++++ arch/s390/include/asm/vmlinux.lds.h | 13 +++++++++++++ arch/s390/kernel/vmlinux.lds.S | 4 ++++ 6 files changed, 31 insertions(+) diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h index e1c1f2ec60f4..424cf524aac1 100644 --- a/arch/s390/boot/compressed/decompressor.h +++ b/arch/s390/boot/compressed/decompressor.h @@ -17,6 +17,8 @@ struct vmlinux_info { unsigned long bss_size; /* uncompressed image .bss size */ unsigned long bootdata_off; unsigned long bootdata_size; + unsigned long bootdata_preserved_off; + unsigned long bootdata_preserved_size; }; extern char _vmlinux_info[]; diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/compressed/vmlinux.lds.S index 7efc3938f595..1c8ef393c656 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/compressed/vmlinux.lds.S @@ -34,6 +34,7 @@ SECTIONS _edata = . ; } BOOT_DATA + BOOT_DATA_PRESERVED /* * uncompressed image info used by the decompressor it should match diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index bdfc5549a299..57d7f9446e29 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -6,6 +6,7 @@ #include "boot.h" extern char __boot_data_start[], __boot_data_end[]; +extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; void error(char *x) { @@ -43,6 +44,9 @@ static void copy_bootdata(void) if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size) error(".boot.data section size mismatch"); memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size); + if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size) + error(".boot.preserved.data section size mismatch"); + memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); } void startup_kernel(void) diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 7afe4620685c..29e55739b516 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -16,4 +16,11 @@ */ #define __bootdata(var) __section(.boot.data.var) var +/* + * .boot.preserved.data is similar to .boot.data, but it is not part of the + * .init section and thus will be preserved for later use in the decompressed + * kernel. + */ +#define __bootdata_preserved(var) __section(.boot.preserved.data.var) var + #endif diff --git a/arch/s390/include/asm/vmlinux.lds.h b/arch/s390/include/asm/vmlinux.lds.h index 2d127f900352..cbe670a6861b 100644 --- a/arch/s390/include/asm/vmlinux.lds.h +++ b/arch/s390/include/asm/vmlinux.lds.h @@ -18,3 +18,16 @@ *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.boot.data*))) \ __boot_data_end = .; \ } + +/* + * .boot.preserved.data is similar to .boot.data, but it is not part of the + * .init section and thus will be preserved for later use in the decompressed + * kernel. + */ +#define BOOT_DATA_PRESERVED \ + . = ALIGN(PAGE_SIZE); \ + .boot.preserved.data : { \ + __boot_data_preserved_start = .; \ + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.boot.preserved.data*))) \ + __boot_data_preserved_end = .; \ + } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 8429ab079715..6ef9c62bb01b 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -72,6 +72,7 @@ SECTIONS __end_ro_after_init = .; RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE) + BOOT_DATA_PRESERVED _edata = .; /* End of data section */ @@ -161,6 +162,9 @@ SECTIONS QUAD(__bss_stop - __bss_start) /* bss_size */ QUAD(__boot_data_start) /* bootdata_off */ QUAD(__boot_data_end - __boot_data_start) /* bootdata_size */ + QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ + QUAD(__boot_data_preserved_end - + __boot_data_preserved_start) /* bootdata_preserved_size */ } :NONE /* Debugging sections. */ From 1e941d39493f1820475d80729a03cd7ab8c3c86d Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 1 Apr 2019 19:10:51 +0200 Subject: [PATCH 05/98] s390: move ipl block to .boot.preserved.data section .boot.preserved.data is a better fit for ipl block than .boot.data which is discarded after init. Reusing .boot.preserved.data allows to simplify code a little bit and avoid copying data from .boot.data to persistent variables. Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/boot/ipl_parm.c | 24 ++++++++++++------------ arch/s390/include/asm/boot_data.h | 4 ++-- arch/s390/include/asm/ipl.h | 1 - arch/s390/kernel/early.c | 1 - arch/s390/kernel/ipl.c | 15 ++------------- 5 files changed, 16 insertions(+), 29 deletions(-) diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 36beb56de021..dc0438d08751 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -10,8 +10,8 @@ #include "boot.h" char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; -struct ipl_parameter_block __bootdata(early_ipl_block); -int __bootdata(early_ipl_block_valid); +struct ipl_parameter_block __bootdata_preserved(ipl_block); +int __bootdata_preserved(ipl_block_valid); unsigned long __bootdata(memory_end); int __bootdata(memory_end_set); @@ -45,10 +45,10 @@ void store_ipl_parmblock(void) { int rc; - rc = __diag308(DIAG308_STORE, &early_ipl_block); + rc = __diag308(DIAG308_STORE, &ipl_block); if (rc == DIAG308_RC_OK && - early_ipl_block.hdr.version <= IPL_MAX_SUPPORTED_VERSION) - early_ipl_block_valid = 1; + ipl_block.hdr.version <= IPL_MAX_SUPPORTED_VERSION) + ipl_block_valid = 1; } static size_t scpdata_length(const char *buf, size_t count) @@ -103,14 +103,14 @@ static void append_ipl_block_parm(void) delim = early_command_line + len; /* '\0' character position */ parm = early_command_line + len + 1; /* append right after '\0' */ - switch (early_ipl_block.hdr.pbt) { + switch (ipl_block.hdr.pbt) { case DIAG308_IPL_TYPE_CCW: rc = ipl_block_get_ascii_vmparm( - parm, COMMAND_LINE_SIZE - len - 1, &early_ipl_block); + parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; case DIAG308_IPL_TYPE_FCP: rc = ipl_block_get_ascii_scpdata( - parm, COMMAND_LINE_SIZE - len - 1, &early_ipl_block); + parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; } if (rc) { @@ -141,7 +141,7 @@ void setup_boot_command_line(void) strcpy(early_command_line, strim(COMMAND_LINE)); /* append IPL PARM data to the boot command line */ - if (early_ipl_block_valid) + if (ipl_block_valid) append_ipl_block_parm(); } @@ -234,9 +234,9 @@ void parse_boot_command_line(void) void setup_memory_end(void) { #ifdef CONFIG_CRASH_DUMP - if (!OLDMEM_BASE && early_ipl_block_valid && - early_ipl_block.hdr.pbt == DIAG308_IPL_TYPE_FCP && - early_ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) { + if (!OLDMEM_BASE && ipl_block_valid && + ipl_block.hdr.pbt == DIAG308_IPL_TYPE_FCP && + ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) { if (!sclp_early_get_hsa_size(&memory_end) && memory_end) memory_end_set = 1; } diff --git a/arch/s390/include/asm/boot_data.h b/arch/s390/include/asm/boot_data.h index 2d999ccb977a..d794802a2291 100644 --- a/arch/s390/include/asm/boot_data.h +++ b/arch/s390/include/asm/boot_data.h @@ -5,7 +5,7 @@ #include extern char early_command_line[COMMAND_LINE_SIZE]; -extern struct ipl_parameter_block early_ipl_block; -extern int early_ipl_block_valid; +extern struct ipl_parameter_block ipl_block; +extern int ipl_block_valid; #endif /* _ASM_S390_BOOT_DATA_H */ diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index a8389e2d2f03..6403cc9952bf 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -88,7 +88,6 @@ void __init save_area_add_regs(struct save_area *, void *regs); void __init save_area_add_vxrs(struct save_area *, __vector128 *vxrs); extern void s390_reset_system(void); -extern void ipl_store_parameters(void); extern size_t ipl_block_get_ascii_vmparm(char *dest, size_t size, const struct ipl_parameter_block *ipb); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index d6edf45f93b9..c196abda36c7 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -309,7 +309,6 @@ void __init startup_init(void) setup_facility_list(); detect_machine_type(); setup_arch_string(); - ipl_store_parameters(); setup_boot_command_line(); detect_diag9c(); detect_diag44(); diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 18a5d6317acc..d2e8dc60f2cf 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -119,11 +119,8 @@ static char *dump_type_str(enum dump_type type) } } -struct ipl_parameter_block __bootdata(early_ipl_block); -int __bootdata(early_ipl_block_valid); - -static int ipl_block_valid; -static struct ipl_parameter_block ipl_block; +int __bootdata_preserved(ipl_block_valid); +struct ipl_parameter_block __bootdata_preserved(ipl_block); static int reipl_capabilities = IPL_TYPE_UNKNOWN; @@ -1675,14 +1672,6 @@ void __init setup_ipl(void) atomic_notifier_chain_register(&panic_notifier_list, &on_panic_nb); } -void __init ipl_store_parameters(void) -{ - if (early_ipl_block_valid) { - memcpy(&ipl_block, &early_ipl_block, sizeof(ipl_block)); - ipl_block_valid = 1; - } -} - void s390_reset_system(void) { /* Disable prefixing */ From fd184e1a8b650159ca553c53dd0fee75df799b4a Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 1 Apr 2019 19:10:56 +0200 Subject: [PATCH 06/98] s390: introduce .boot.preserved.data section compile time validation Same as for .boot.data section make sure that .boot.preserved.data sections of vmlinux and arch/s390/compressed/vmlinux match before producing the compressed kernel image. Symbols presence, order and sizes are cross-checked. Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/boot/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index c844eaf24ed7..458d18c44c67 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -30,7 +30,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o string.o ebcdic.o obj-y += sclp_early_core.o mem.o ipl_vmparm.o cmdline.o ctype.o -targets := bzImage startup.a section_cmp.boot.data $(obj-y) +targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) subdir- := compressed OBJECTS := $(addprefix $(obj)/,$(obj-y)) @@ -48,7 +48,7 @@ define cmd_section_cmp touch $@ endef -$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data FORCE +$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE $(call if_changed,objcopy) $(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE From 5abb9351dfd937d43193f4d09af9c72bfe2c4180 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 1 Apr 2019 19:11:03 +0200 Subject: [PATCH 07/98] s390/uv: introduce guest side ultravisor code The Ultravisor Call Facility (stfle bit 158) defines an API to the Ultravisor (UV calls), a mini hypervisor located at machine level. With help of the Ultravisor, KVM will be able to run "protected" VMs, special VMs whose memory and management data are unavailable to KVM. The protected VMs can also request services from the Ultravisor. The guest api consists of UV calls to share and unshare memory with the kvm hypervisor. To enable this feature support PROTECTED_VIRTUALIZATION_GUEST kconfig option has been introduced. Co-developed-by: Janosch Frank Signed-off-by: Janosch Frank Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 11 ++++ arch/s390/boot/Makefile | 1 + arch/s390/boot/startup.c | 2 + arch/s390/boot/uv.c | 24 +++++++ arch/s390/include/asm/uv.h | 132 +++++++++++++++++++++++++++++++++++++ arch/s390/kernel/setup.c | 5 ++ 6 files changed, 175 insertions(+) create mode 100644 arch/s390/boot/uv.c create mode 100644 arch/s390/include/asm/uv.h diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b6e3d0653002..4ac8d49e206a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -837,6 +837,17 @@ config HAVE_PNETID menu "Virtualization" +config PROTECTED_VIRTUALIZATION_GUEST + def_bool n + prompt "Protected virtualization guest support" + help + Select this option, if you want to be able to run this + kernel as a protected virtualization KVM guest. + Protected virtualization capable machines have a mini hypervisor + located at machine level (an ultravisor). With help of the + Ultravisor, KVM will be able to run "protected" VMs, special + VMs whose memory and management data are unavailable to KVM. + config PFAULT def_bool y prompt "Pseudo page fault support" diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 458d18c44c67..a5ae68b2aa84 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -30,6 +30,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o string.o ebcdic.o obj-y += sclp_early_core.o mem.o ipl_vmparm.o cmdline.o ctype.o +obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) subdir- := compressed diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 57d7f9446e29..2bd4a62d436c 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "compressed/decompressor.h" #include "boot.h" @@ -53,6 +54,7 @@ void startup_kernel(void) { void *img; + uv_query_info(); rescue_initrd(); sclp_early_read_info(); store_ipl_parmblock(); diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c new file mode 100644 index 000000000000..ed007f4a6444 --- /dev/null +++ b/arch/s390/boot/uv.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +int __bootdata_preserved(prot_virt_guest); + +void uv_query_info(void) +{ + struct uv_cb_qui uvcb = { + .header.cmd = UVC_CMD_QUI, + .header.len = sizeof(uvcb) + }; + + if (!test_facility(158)) + return; + + if (uv_call(0, (uint64_t)&uvcb)) + return; + + if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && + test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) + prot_virt_guest = 1; +} diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h new file mode 100644 index 000000000000..ef3c00b049ab --- /dev/null +++ b/arch/s390/include/asm/uv.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ultravisor Interfaces + * + * Copyright IBM Corp. 2019 + * + * Author(s): + * Vasily Gorbik + * Janosch Frank + */ +#ifndef _ASM_S390_UV_H +#define _ASM_S390_UV_H + +#include +#include +#include +#include + +#define UVC_RC_EXECUTED 0x0001 +#define UVC_RC_INV_CMD 0x0002 +#define UVC_RC_INV_STATE 0x0003 +#define UVC_RC_INV_LEN 0x0005 +#define UVC_RC_NO_RESUME 0x0007 + +#define UVC_CMD_QUI 0x0001 +#define UVC_CMD_SET_SHARED_ACCESS 0x1000 +#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 + +/* Bits in installed uv calls */ +enum uv_cmds_inst { + BIT_UVC_CMD_QUI = 0, + BIT_UVC_CMD_SET_SHARED_ACCESS = 8, + BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, +}; + +struct uv_cb_header { + u16 len; + u16 cmd; /* Command Code */ + u16 rc; /* Response Code */ + u16 rrc; /* Return Reason Code */ +} __packed __aligned(8); + +struct uv_cb_qui { + struct uv_cb_header header; + u64 reserved08; + u64 inst_calls_list[4]; + u64 reserved30[15]; +} __packed __aligned(8); + +struct uv_cb_share { + struct uv_cb_header header; + u64 reserved08[3]; + u64 paddr; + u64 reserved28; +} __packed __aligned(8); + +static inline int uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + asm volatile( + "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" + " brc 3,0b\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc) + : [r1] "a" (r1), [r2] "a" (r2) + : "memory", "cc"); + return cc; +} + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +extern int prot_virt_guest; + +static inline int is_prot_virt_guest(void) +{ + return prot_virt_guest; +} + +static inline int share(unsigned long addr, u16 cmd) +{ + struct uv_cb_share uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .paddr = addr + }; + + if (!is_prot_virt_guest()) + return -ENOTSUPP; + /* + * Sharing is page wise, if we encounter addresses that are + * not page aligned, we assume something went wrong. If + * malloced structs are passed to this function, we could leak + * data to the hypervisor. + */ + BUG_ON(addr & ~PAGE_MASK); + + if (!uv_call(0, (u64)&uvcb)) + return 0; + return -EINVAL; +} + +/* + * Guest 2 request to the Ultravisor to make a page shared with the + * hypervisor for IO. + * + * @addr: Real or absolute address of the page to be shared + */ +static inline int uv_set_shared(unsigned long addr) +{ + return share(addr, UVC_CMD_SET_SHARED_ACCESS); +} + +/* + * Guest 2 request to the Ultravisor to make a page unshared. + * + * @addr: Real or absolute address of the page to be unshared + */ +static inline int uv_remove_shared(unsigned long addr) +{ + return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); +} + +void uv_query_info(void); +#else +#define is_prot_virt_guest() 0 +static inline int uv_set_shared(unsigned long addr) { return 0; } +static inline int uv_remove_shared(unsigned long addr) { return 0; } +static inline void uv_query_info(void) {} +#endif + +#endif /* _ASM_S390_UV_H */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 2c642af526ce..70197a68e6fa 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -70,6 +70,7 @@ #include #include #include +#include #include "entry.h" /* @@ -89,6 +90,10 @@ char elf_platform[ELF_PLATFORM_SIZE]; unsigned long int_hwcap = 0; +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +int __bootdata_preserved(prot_virt_guest); +#endif + int __bootdata(noexec_disabled); int __bootdata(memory_end_set); unsigned long __bootdata(memory_end); From db9492cef45efc347beed7b617dfdfac399f662b Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 1 Apr 2019 19:11:04 +0200 Subject: [PATCH 08/98] s390/protvirt: add memory sharing for diag 308 set/store Add sharing of ipl parameter block for diag 308 set/store calls to allow kvm access in protected virtualization environment. Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/boot/ipl_parm.c | 3 +++ arch/s390/kernel/ipl.c | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index dc0438d08751..385d0c7a9635 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "boot.h" char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; @@ -45,7 +46,9 @@ void store_ipl_parmblock(void) { int rc; + uv_set_shared(__pa(&ipl_block)); rc = __diag308(DIAG308_STORE, &ipl_block); + uv_remove_shared(__pa(&ipl_block)); if (rc == DIAG308_RC_OK && ipl_block.hdr.version <= IPL_MAX_SUPPORTED_VERSION) ipl_block_valid = 1; diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index d2e8dc60f2cf..cffe3374d201 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "entry.h" #define IPL_PARM_BLOCK_VERSION 0 @@ -863,15 +864,21 @@ static void __reipl_run(void *unused) { switch (reipl_type) { case IPL_TYPE_CCW: + uv_set_shared(__pa(reipl_block_ccw)); diag308(DIAG308_SET, reipl_block_ccw); + uv_remove_shared(__pa(reipl_block_ccw)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_FCP: + uv_set_shared(__pa(reipl_block_fcp)); diag308(DIAG308_SET, reipl_block_fcp); + uv_remove_shared(__pa(reipl_block_fcp)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_NSS: + uv_set_shared(__pa(reipl_block_nss)); diag308(DIAG308_SET, reipl_block_nss); + uv_remove_shared(__pa(reipl_block_nss)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_UNKNOWN: @@ -1142,7 +1149,9 @@ static struct kset *dump_kset; static void diag308_dump(void *dump_block) { + uv_set_shared(__pa(dump_block)); diag308(DIAG308_SET, dump_block); + uv_remove_shared(__pa(dump_block)); while (1) { if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302) break; From 093ddccb55157f909f203f9e50bce0c24431e791 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 1 Apr 2019 19:11:08 +0200 Subject: [PATCH 09/98] s390/protvirt: block kernel command line alteration Disallow kernel command line alteration via ipl parameter block if running in protected virtualization environment. Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/boot/ipl_parm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 385d0c7a9635..1900670a83fd 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -144,7 +144,7 @@ void setup_boot_command_line(void) strcpy(early_command_line, strim(COMMAND_LINE)); /* append IPL PARM data to the boot command line */ - if (ipl_block_valid) + if (!is_prot_virt_guest() && ipl_block_valid) append_ipl_block_parm(); } From 34298422cc442d65e0ee271e47f2926f63fe91e8 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 28 Mar 2019 13:48:44 +0100 Subject: [PATCH 10/98] s390/qdio: fix output of DSCI value in debug file The DSCI is a 1-byte field, placed at the start of an u32. So when printing it to a queue's debug state, limit the output to the part that's actually occupied by the DSCI. When the DSCI is set this gives us the expected output of '1', rather than the current (obscure) value of '16777216'. Suggested-by: Jens Remus Signed-off-by: Julian Wiedmann Reviewed-by: Jens Remus Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c index d2f98e5829d4..4d77d87538c9 100644 --- a/drivers/s390/cio/qdio_debug.c +++ b/drivers/s390/cio/qdio_debug.c @@ -128,8 +128,8 @@ static int qstat_show(struct seq_file *m, void *v) seq_printf(m, "polling: %d ack start: %d ack count: %d\n", q->u.in.polling, q->u.in.ack_start, q->u.in.ack_count); - seq_printf(m, "DSCI: %d IRQs disabled: %u\n", - *(u32 *)q->irq_ptr->dsci, + seq_printf(m, "DSCI: %x IRQs disabled: %u\n", + *(u8 *)q->irq_ptr->dsci, test_bit(QDIO_QUEUE_IRQS_DISABLED, &q->u.in.queue_irq_state)); } From b39544c6e02fdf86c394a9367b10e707b800f4d7 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 28 Mar 2019 10:39:25 +0100 Subject: [PATCH 11/98] s390/qdio: pass up count of ready-to-process SBALs When qdio_{in,out}bound_q_moved() scans a queue for pending work, it currently only returns a boolean to its caller. The interface to the upper-layer-drivers (qdio_kick_handler() and qdio_get_next_buffers()) then re-calculates the number of pending SBALs from the q->first_to_check and q->first_to_kick cursors. Refactor this so that whenever get_{in,out}bound_buffer_frontier() adjusted the queue's first_to_check cursor, it also returns the corresponding count of ready-to-process SBALs (and 0 else). A subsequent patch will then make use of this additional information. Signed-off-by: Julian Wiedmann Reviewed-by: Jens Remus Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_main.c | 77 ++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index c25a40ecc105..195f35256cc5 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -504,7 +504,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) */ count = min(atomic_read(&q->nr_buf_used), QDIO_MAX_BUFFERS_MASK); if (!count) - goto out; + return 0; /* * No siga sync here, as a PCI or we after a thin interrupt @@ -512,7 +512,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) */ count = get_buf_states(q, q->first_to_check, &state, count, 1, 0); if (!count) - goto out; + return 0; switch (state) { case SLSB_P_INPUT_PRIMED: @@ -522,7 +522,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) qperf_inc(q, inbound_queue_full); if (q->irq_ptr->perf_stat_enabled) account_sbals(q, count); - break; + return count; case SLSB_P_INPUT_ERROR: process_buffer_error(q, count); q->first_to_check = add_buf(q->first_to_check, count); @@ -530,7 +530,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) qperf_inc(q, inbound_queue_full); if (q->irq_ptr->perf_stat_enabled) account_sbals_error(q, count); - break; + return count; case SLSB_CU_INPUT_EMPTY: case SLSB_P_INPUT_NOT_INIT: case SLSB_P_INPUT_ACK: @@ -538,27 +538,26 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) q->q_stats.nr_sbal_nop++; DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in nop:%1d %#02x", q->nr, q->first_to_check); - break; + return 0; default: WARN_ON_ONCE(1); + return 0; } -out: - return q->first_to_check; } static int qdio_inbound_q_moved(struct qdio_q *q) { - int bufnr; + int count; - bufnr = get_inbound_buffer_frontier(q); + count = get_inbound_buffer_frontier(q); - if (bufnr != q->last_move) { - q->last_move = bufnr; + if (count) { + q->last_move = q->first_to_check; if (!is_thinint_irq(q->irq_ptr) && MACHINE_IS_LPAR) q->u.in.timestamp = get_tod_clock(); - return 1; - } else - return 0; + } + + return count; } static inline int qdio_inbound_q_done(struct qdio_q *q) @@ -678,9 +677,12 @@ static inline int qdio_tasklet_schedule(struct qdio_q *q) static void __qdio_inbound_processing(struct qdio_q *q) { + int count; + qperf_inc(q, tasklet_inbound); - if (!qdio_inbound_q_moved(q)) + count = qdio_inbound_q_moved(q); + if (count == 0) return; qdio_kick_handler(q); @@ -729,12 +731,12 @@ static int get_outbound_buffer_frontier(struct qdio_q *q) */ count = min(atomic_read(&q->nr_buf_used), QDIO_MAX_BUFFERS_MASK); if (!count) - goto out; + return 0; count = get_buf_states(q, q->first_to_check, &state, count, 0, q->u.out.use_cq); if (!count) - goto out; + return 0; switch (state) { case SLSB_P_OUTPUT_EMPTY: @@ -746,31 +748,28 @@ static int get_outbound_buffer_frontier(struct qdio_q *q) q->first_to_check = add_buf(q->first_to_check, count); if (q->irq_ptr->perf_stat_enabled) account_sbals(q, count); - - break; + return count; case SLSB_P_OUTPUT_ERROR: process_buffer_error(q, count); q->first_to_check = add_buf(q->first_to_check, count); atomic_sub(count, &q->nr_buf_used); if (q->irq_ptr->perf_stat_enabled) account_sbals_error(q, count); - break; + return count; case SLSB_CU_OUTPUT_PRIMED: /* the adapter has not fetched the output yet */ if (q->irq_ptr->perf_stat_enabled) q->q_stats.nr_sbal_nop++; DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "out primed:%1d", q->nr); - break; + return 0; case SLSB_P_OUTPUT_NOT_INIT: case SLSB_P_OUTPUT_HALTED: - break; + return 0; default: WARN_ON_ONCE(1); + return 0; } - -out: - return q->first_to_check; } /* all buffers processed? */ @@ -781,16 +780,16 @@ static inline int qdio_outbound_q_done(struct qdio_q *q) static inline int qdio_outbound_q_moved(struct qdio_q *q) { - int bufnr; + int count; - bufnr = get_outbound_buffer_frontier(q); + count = get_outbound_buffer_frontier(q); - if (bufnr != q->last_move) { - q->last_move = bufnr; + if (count) { + q->last_move = q->first_to_check; DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "out moved:%1d", q->nr); - return 1; - } else - return 0; + } + + return count; } static int qdio_kick_outbound_q(struct qdio_q *q, unsigned long aob) @@ -837,10 +836,13 @@ retry: static void __qdio_outbound_processing(struct qdio_q *q) { + int count; + qperf_inc(q, tasklet_outbound); WARN_ON_ONCE(atomic_read(&q->nr_buf_used) < 0); - if (qdio_outbound_q_moved(q)) + count = qdio_outbound_q_moved(q); + if (count) qdio_kick_handler(q); if (queue_type(q) == QDIO_ZFCP_QFMT && !pci_out_supported(q->irq_ptr) && @@ -896,6 +898,8 @@ static inline void qdio_check_outbound_pci_queues(struct qdio_irq *irq) static void __tiqdio_inbound_processing(struct qdio_q *q) { + int count; + qperf_inc(q, tasklet_inbound); if (need_siga_sync(q) && need_siga_sync_after_ai(q)) qdio_sync_queues(q); @@ -903,7 +907,8 @@ static void __tiqdio_inbound_processing(struct qdio_q *q) /* The interrupt could be caused by a PCI request: */ qdio_check_outbound_pci_queues(q->irq_ptr); - if (!qdio_inbound_q_moved(q)) + count = qdio_inbound_q_moved(q); + if (count == 0) return; qdio_kick_handler(q); @@ -1671,6 +1676,7 @@ int qdio_get_next_buffers(struct ccw_device *cdev, int nr, int *bufnr, struct qdio_q *q; int start, end; struct qdio_irq *irq_ptr = cdev->private->qdio_data; + int count; if (!irq_ptr) return -ENODEV; @@ -1685,7 +1691,8 @@ int qdio_get_next_buffers(struct ccw_device *cdev, int nr, int *bufnr, qdio_check_outbound_pci_queues(irq_ptr); - if (!qdio_inbound_q_moved(q)) + count = qdio_inbound_q_moved(q); + if (count == 0) return 0; /* Note: upper-layer MUST stop processing immediately here ... */ From 65e4f776385ac5cab021ad8d992e851375305906 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 28 Mar 2019 10:43:46 +0100 Subject: [PATCH 12/98] s390/qdio: simplify SBAL range calculation When passing a range of ready-to-process SBALs to the upper-layer driver, use the available 'count' instead of calculating the distance between the first_to_check and first_to_kick cursors. This simplifies the logic of the queue-scan path, and opens up the possibility of scanning all 128 SBALs in one go (as determining the reported count no longer requires wrap-around safe arithmetic on the queue's cursors). Signed-off-by: Julian Wiedmann Reviewed-by: Jens Remus Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_main.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 195f35256cc5..b6bc02efa0d0 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -636,17 +636,13 @@ static inline unsigned long qdio_aob_for_buffer(struct qdio_output_q *q, return phys_aob; } -static void qdio_kick_handler(struct qdio_q *q) +static void qdio_kick_handler(struct qdio_q *q, unsigned int count) { int start = q->first_to_kick; - int end = q->first_to_check; - int count; if (unlikely(q->irq_ptr->state != QDIO_IRQ_STATE_ACTIVE)) return; - count = sub_buf(end, start); - if (q->is_input_q) { qperf_inc(q, inbound_handler); DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "kih s:%02x c:%02x", start, count); @@ -662,7 +658,7 @@ static void qdio_kick_handler(struct qdio_q *q) q->irq_ptr->int_parm); /* for the next time */ - q->first_to_kick = end; + q->first_to_kick = add_buf(start, count); q->qdio_error = 0; } @@ -685,7 +681,7 @@ static void __qdio_inbound_processing(struct qdio_q *q) if (count == 0) return; - qdio_kick_handler(q); + qdio_kick_handler(q, count); if (!qdio_inbound_q_done(q)) { /* means poll time is not yet over */ @@ -843,7 +839,7 @@ static void __qdio_outbound_processing(struct qdio_q *q) count = qdio_outbound_q_moved(q); if (count) - qdio_kick_handler(q); + qdio_kick_handler(q, count); if (queue_type(q) == QDIO_ZFCP_QFMT && !pci_out_supported(q->irq_ptr) && !qdio_outbound_q_done(q)) @@ -911,7 +907,7 @@ static void __tiqdio_inbound_processing(struct qdio_q *q) if (count == 0) return; - qdio_kick_handler(q); + qdio_kick_handler(q, count); if (!qdio_inbound_q_done(q)) { qperf_inc(q, tasklet_inbound_resched); @@ -1674,7 +1670,6 @@ int qdio_get_next_buffers(struct ccw_device *cdev, int nr, int *bufnr, int *error) { struct qdio_q *q; - int start, end; struct qdio_irq *irq_ptr = cdev->private->qdio_data; int count; @@ -1699,15 +1694,14 @@ int qdio_get_next_buffers(struct ccw_device *cdev, int nr, int *bufnr, if (unlikely(q->irq_ptr->state != QDIO_IRQ_STATE_ACTIVE)) return -EIO; - start = q->first_to_kick; - end = q->first_to_check; - *bufnr = start; + *bufnr = q->first_to_kick; *error = q->qdio_error; /* for the next time */ - q->first_to_kick = end; + q->first_to_kick = add_buf(q->first_to_kick, count); q->qdio_error = 0; - return sub_buf(end, start); + + return count; } EXPORT_SYMBOL(qdio_get_next_buffers); From dccbbaff174df174bb30c21e05d7b732d013ea1a Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Thu, 28 Mar 2019 10:45:11 +0100 Subject: [PATCH 13/98] s390/qdio: eliminate queue's last_move cursor This cursor is used for debugging only. But since commit "s390/qdio: pass up count of ready-to-process SBALs" it effectively duplicates the first_to_check cursor, diverging for just a short moment when get_*_buffer_frontier() updates q->first_to_check. Signed-off-by: Julian Wiedmann Reviewed-by: Jens Remus Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio.h | 3 --- drivers/s390/cio/qdio_debug.c | 5 ++--- drivers/s390/cio/qdio_main.c | 11 +++-------- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h index 2c29141005ca..a06944399865 100644 --- a/drivers/s390/cio/qdio.h +++ b/drivers/s390/cio/qdio.h @@ -228,9 +228,6 @@ struct qdio_q { */ int first_to_check; - /* first_to_check of the last time */ - int last_move; - /* beginning position for calling the program */ int first_to_kick; diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c index 4d77d87538c9..35410e6eda2e 100644 --- a/drivers/s390/cio/qdio_debug.c +++ b/drivers/s390/cio/qdio_debug.c @@ -121,9 +121,8 @@ static int qstat_show(struct seq_file *m, void *v) seq_printf(m, "Timestamp: %Lx Last AI: %Lx\n", q->timestamp, last_ai_time); - seq_printf(m, "nr_used: %d ftc: %d last_move: %d\n", - atomic_read(&q->nr_buf_used), - q->first_to_check, q->last_move); + seq_printf(m, "nr_used: %d ftc: %d\n", + atomic_read(&q->nr_buf_used), q->first_to_check); if (q->is_input_q) { seq_printf(m, "polling: %d ack start: %d ack count: %d\n", q->u.in.polling, q->u.in.ack_start, diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index b6bc02efa0d0..0ccd3b30af78 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -551,11 +551,8 @@ static int qdio_inbound_q_moved(struct qdio_q *q) count = get_inbound_buffer_frontier(q); - if (count) { - q->last_move = q->first_to_check; - if (!is_thinint_irq(q->irq_ptr) && MACHINE_IS_LPAR) - q->u.in.timestamp = get_tod_clock(); - } + if (count && !is_thinint_irq(q->irq_ptr) && MACHINE_IS_LPAR) + q->u.in.timestamp = get_tod_clock(); return count; } @@ -780,10 +777,8 @@ static inline int qdio_outbound_q_moved(struct qdio_q *q) count = get_outbound_buffer_frontier(q); - if (count) { - q->last_move = q->first_to_check; + if (count) DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "out moved:%1d", q->nr); - } return count; } From 81a8f2beb32a5951ecf04385301f50879abc092b Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Sun, 7 Apr 2019 14:55:09 +0200 Subject: [PATCH 14/98] s390/mm: silence compiler warning when compiling without CONFIG_PGSTE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If CONFIG_PGSTE is not set (e.g. when compiling without KVM), GCC complains: CC arch/s390/mm/pgtable.o arch/s390/mm/pgtable.c:413:15: warning: ‘pmd_alloc_map’ defined but not used [-Wunused-function] static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) ^~~~~~~~~~~~~ Wrap the function with "#ifdef CONFIG_PGSTE" to silence the warning. Signed-off-by: Thomas Huth Reviewed-by: David Hildenbrand Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/mm/pgtable.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 8485d6dc2754..9ebd01219812 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -410,6 +410,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } +#ifdef CONFIG_PGSTE static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -427,6 +428,7 @@ static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) pmd = pmd_alloc(mm, pud, addr); return pmd; } +#endif pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) From e91012ee855ad9f5ef2ab106a3de51db93fe4d0c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2019 23:26:20 +0200 Subject: [PATCH 15/98] s390: cio: fix cio_irb declaration clang points out that the declaration of cio_irb does not match the definition exactly, it is missing the alignment attribute: ../drivers/s390/cio/cio.c:50:1: warning: section does not match previous declaration [-Wsection] DEFINE_PER_CPU_ALIGNED(struct irb, cio_irb); ^ ../include/linux/percpu-defs.h:150:2: note: expanded from macro 'DEFINE_PER_CPU_ALIGNED' DEFINE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ ^ ../include/linux/percpu-defs.h:93:9: note: expanded from macro 'DEFINE_PER_CPU_SECTION' extern __PCPU_ATTRS(sec) __typeof__(type) name; \ ^ ../include/linux/percpu-defs.h:49:26: note: expanded from macro '__PCPU_ATTRS' __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ ^ ../drivers/s390/cio/cio.h:118:1: note: previous attribute is here DECLARE_PER_CPU(struct irb, cio_irb); ^ ../include/linux/percpu-defs.h:111:2: note: expanded from macro 'DECLARE_PER_CPU' DECLARE_PER_CPU_SECTION(type, name, "") ^ ../include/linux/percpu-defs.h:87:9: note: expanded from macro 'DECLARE_PER_CPU_SECTION' extern __PCPU_ATTRS(sec) __typeof__(type) name ^ ../include/linux/percpu-defs.h:49:26: note: expanded from macro '__PCPU_ATTRS' __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ ^ Use DECLARE_PER_CPU_ALIGNED() here, to make the two match. Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index 9811fd8a0c73..92eabbb5f18d 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -115,7 +115,7 @@ struct subchannel { struct schib_config config; } __attribute__ ((aligned(8))); -DECLARE_PER_CPU(struct irb, cio_irb); +DECLARE_PER_CPU_ALIGNED(struct irb, cio_irb); #define to_subchannel(n) container_of(n, struct subchannel, dev) From e24e4712efad737ca09ff299276737331cd021d9 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 10 Apr 2019 12:28:41 +0200 Subject: [PATCH 16/98] s390/rseq: use trap4 for RSEQ_SIG Use trap4 as the guard instruction for the restartable sequence abort handler. Signed-off-by: Martin Schwidefsky --- tools/testing/selftests/rseq/rseq-s390.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h index 1069e85258ce..0afdf7957974 100644 --- a/tools/testing/selftests/rseq/rseq-s390.h +++ b/tools/testing/selftests/rseq/rseq-s390.h @@ -1,6 +1,13 @@ /* SPDX-License-Identifier: LGPL-2.1 OR MIT */ -#define RSEQ_SIG 0x53053053 +/* + * RSEQ_SIG uses the trap4 instruction. As Linux does not make use of the + * access-register mode nor the linkage stack this instruction will always + * cause a special-operation exception (the trap-enabled bit in the DUCT + * is and will stay 0). The instruction pattern is + * b2 ff 0f ff trap4 4095(%r0) + */ +#define RSEQ_SIG 0xB2FF0FFF #define rseq_smp_mb() __asm__ __volatile__ ("bcr 15,0" ::: "memory") #define rseq_smp_rmb() rseq_smp_mb() From 7aa0055e06475f7b486c0673bc59c6478bc055fa Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 10 Apr 2019 15:48:43 +0200 Subject: [PATCH 17/98] s390: fine-tune stack switch helper The CALL_ON_STACK helper currently does not work with clang and for calls without arguments. It does not initialize r2 although the constraint is "+&d". Rework the CALL_FMT_x and the CALL_ON_STACK macros to work with clang and produce optimal code in all cases. Reported-by: Arnd Bergmann Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/processor.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 81038ab357ce..0ee022247580 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -261,12 +261,12 @@ static __no_kasan_or_inline unsigned short stap(void) CALL_ARGS_4(arg1, arg2, arg3, arg4); \ register unsigned long r4 asm("6") = (unsigned long)(arg5) -#define CALL_FMT_0 -#define CALL_FMT_1 CALL_FMT_0, "0" (r2) -#define CALL_FMT_2 CALL_FMT_1, "d" (r3) -#define CALL_FMT_3 CALL_FMT_2, "d" (r4) -#define CALL_FMT_4 CALL_FMT_3, "d" (r5) -#define CALL_FMT_5 CALL_FMT_4, "d" (r6) +#define CALL_FMT_0 "=&d" (r2) : +#define CALL_FMT_1 "+&d" (r2) : +#define CALL_FMT_2 CALL_FMT_1 "d" (r3), +#define CALL_FMT_3 CALL_FMT_2 "d" (r4), +#define CALL_FMT_4 CALL_FMT_3 "d" (r5), +#define CALL_FMT_5 CALL_FMT_4 "d" (r6), #define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" #define CALL_CLOBBER_4 CALL_CLOBBER_5 @@ -286,10 +286,10 @@ static __no_kasan_or_inline unsigned short stap(void) " stg %[_prev],%[_bc](15)\n" \ " brasl 14,%[_fn]\n" \ " la 15,0(%[_prev])\n" \ - : "+&d" (r2), [_prev] "=&a" (prev) \ - : [_stack] "a" (stack), \ + : [_prev] "=&a" (prev), CALL_FMT_##nr \ + [_stack] "a" (stack), \ [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ - [_fn] "X" (fn) CALL_FMT_##nr : CALL_CLOBBER_##nr); \ + [_fn] "X" (fn) : CALL_CLOBBER_##nr); \ r2; \ }) From c1afcaec2af6d80d057b184eb86cbb018fce6517 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2019 23:26:14 +0200 Subject: [PATCH 18/98] s390: remove -fno-strength-reduce flag This was added as a workaround for really old compilers, and it prevents building with clang now. I can see no reason for keeping it, as it has already been removed for most architectures in the pre-git era, so let's remove it everywhere, rather than only for clang. Signed-off-by: Arnd Bergmann Signed-off-by: Martin Schwidefsky --- arch/s390/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e21053e5e0da..9c079a506325 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -111,7 +111,7 @@ endif cfi := $(call as-instr,.cfi_startproc\n.cfi_val_offset 15$(comma)-160\n.cfi_endproc,-DCONFIG_AS_CFI_VAL_OFFSET=1) KBUILD_CFLAGS += -mbackchain -msoft-float $(cflags-y) -KBUILD_CFLAGS += -pipe -fno-strength-reduce -Wno-sign-compare +KBUILD_CFLAGS += -pipe -Wno-sign-compare KBUILD_CFLAGS += -fno-asynchronous-unwind-tables $(cfi) KBUILD_AFLAGS += $(aflags-y) $(cfi) export KBUILD_AFLAGS_DECOMPRESSOR From 96ca7674ea66e9f0e2cb9f75588d4d16e5abb724 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2019 23:26:15 +0200 Subject: [PATCH 19/98] s390: don't build vdso32 with clang clang does not support 31 bit object files on s390, so skip the 32-bit vdso here, and only build it when using gcc to compile the kernel. Signed-off-by: Arnd Bergmann Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 3 +++ arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/vdso.c | 10 +++++----- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4ac8d49e206a..566f0b460d21 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -388,6 +388,9 @@ config COMPAT (and some other stuff like libraries and such) is needed for executing 31 bit applications. It is safe to say "Y". +config COMPAT_VDSO + def_bool COMPAT && !CC_IS_CLANG + config SYSVIPC_COMPAT def_bool y if COMPAT && SYSVIPC diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 8a62c7f72e1b..1222db6d4ee9 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -86,7 +86,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace.o # vdso obj-y += vdso64/ -obj-$(CONFIG_COMPAT) += vdso32/ +obj-$(CONFIG_COMPAT_VDSO) += vdso32/ chkbss := head64.o early_nobss.o include $(srctree)/arch/s390/scripts/Makefile.chkbss diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index e7920a68a12e..243d8b1185bf 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -29,7 +29,7 @@ #include #include -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO extern char vdso32_start, vdso32_end; static void *vdso32_kbase = &vdso32_start; static unsigned int vdso32_pages; @@ -55,7 +55,7 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, vdso_pagelist = vdso64_pagelist; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO if (vma->vm_mm->context.compat_mm) { vdso_pagelist = vdso32_pagelist; vdso_pages = vdso32_pages; @@ -76,7 +76,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm, unsigned long vdso_pages; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO if (vma->vm_mm->context.compat_mm) vdso_pages = vdso32_pages; #endif @@ -223,7 +223,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return 0; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO mm->context.compat_mm = is_compat_task(); if (mm->context.compat_mm) vdso_pages = vdso32_pages; @@ -280,7 +280,7 @@ static int __init vdso_init(void) int i; vdso_init_data(vdso_data); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO /* Calculate the size of the 32 bit vDSO */ vdso32_pages = ((&vdso32_end - &vdso32_start + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; From efb150df1de6462003890cbfea5fa02a28821530 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2019 23:26:21 +0200 Subject: [PATCH 20/98] s390: syscall_wrapper: avoid clang warning Building system calls with clang results in a warning about an alias from a global function to a static one: ../fs/namei.c:3847:1: warning: unused function '__se_sys_mkdirat' [-Wunused-function] SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) ^ ../include/linux/syscalls.h:219:36: note: expanded from macro 'SYSCALL_DEFINE3' #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) ^ ../include/linux/syscalls.h:228:2: note: expanded from macro 'SYSCALL_DEFINEx' __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) ^ ../arch/s390/include/asm/syscall_wrapper.h:126:18: note: expanded from macro '__SYSCALL_DEFINEx' asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ ^ :31:1: note: expanded from here __se_sys_mkdirat ^ The only reference to the static __se_sys_mkdirat() here is the alias, but this only gets evaluated later. Making this function global as well avoids the warning. Signed-off-by: Arnd Bergmann Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/syscall_wrapper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 5596c5c625d2..3c3d6fe8e2f0 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -119,8 +119,8 @@ "Type aliasing is used to sanitize syscall arguments");\ asmlinkage long __s390x_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_sys##name)))); \ - ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ - static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ + long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ __S390_SYS_STUBx(x, name, __VA_ARGS__) \ asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ From 0a113efc3b48e9a8952e3c6e19dfdcc48b62226f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2019 23:26:22 +0200 Subject: [PATCH 21/98] s390: make __load_psw_mask work with clang clang fails to use the %O and %R inline assembly modifiers the same way as gcc, leading to build failures with every use of __load_psw_mask(): /tmp/nmi-4a9f80.s: Assembler messages: /tmp/nmi-4a9f80.s:571: Error: junk at end of line: `+8(160(%r11))' /tmp/nmi-4a9f80.s:626: Error: junk at end of line: `+8(160(%r11))' Replace these with a more conventional way of passing the addresses that should work with both clang and gcc. Signed-off-by: Arnd Bergmann Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/processor.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 0ee022247580..8aa85e39f50b 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -339,10 +339,10 @@ static __no_kasan_or_inline void __load_psw_mask(unsigned long mask) asm volatile( " larl %0,1f\n" - " stg %0,%O1+8(%R1)\n" - " lpswe %1\n" + " stg %0,%1\n" + " lpswe %2\n" "1:" - : "=&d" (addr), "=Q" (psw) : "Q" (psw) : "memory", "cc"); + : "=&d" (addr), "=Q" (psw.addr) : "Q" (psw) : "memory", "cc"); } /* From 9a0ceb9cfbee7da4a005135c620603ea837878a9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2019 23:26:24 +0200 Subject: [PATCH 22/98] s390: make chkbss work with clang llvm skips an empty .bss section entirely, which makes the check fail with an unexpected error: /tmp/binutils-multi-test/bin/s390x-linux-gnu-objdump: section '.bss' mentioned in a -j option, but not found in any input file error: arch/s390/boot/compressed/decompressor.o .bss section is not empty ../arch/s390/scripts/Makefile.chkbss:20: recipe for target 'arch/s390/boot/compressed/decompressor.o.chkbss' failed Change the check so we first see if a .bss section exists before trying to read its size. Signed-off-by: Arnd Bergmann Signed-off-by: Martin Schwidefsky --- arch/s390/scripts/Makefile.chkbss | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/scripts/Makefile.chkbss b/arch/s390/scripts/Makefile.chkbss index cd7e8f4419f5..884a9caff5fb 100644 --- a/arch/s390/scripts/Makefile.chkbss +++ b/arch/s390/scripts/Makefile.chkbss @@ -11,7 +11,8 @@ chkbss: $(addprefix $(obj)/, $(chkbss-files)) quiet_cmd_chkbss = CHKBSS $< cmd_chkbss = \ - if ! $(OBJDUMP) -j .bss -w -h $< | awk 'END { if ($$3) exit 1 }'; then \ + if $(OBJDUMP) -h $< | grep -q "\.bss" && \ + ! $(OBJDUMP) -j .bss -w -h $< | awk 'END { if ($$3) exit 1 }'; then \ echo "error: $< .bss section is not empty" >&2; exit 1; \ fi; \ touch $@; From 475c8e9e89db2fcecf837ac41dcb6aed4ba72c61 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 9 Apr 2019 09:33:12 -0700 Subject: [PATCH 23/98] s390: Convert IS_ENABLED uses to __is_defined IS_ENABLED should be reserved for CONFIG_ uses so convert the uses of IS_ENABLED with a #define to __is_defined. Signed-off-by: Joe Perches Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/nospec-branch.c | 6 +++--- arch/s390/kernel/nospec-sysfs.c | 2 +- arch/s390/net/bpf_jit_comp.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index bdddaae96559..8e484130b9b8 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -37,7 +37,7 @@ static int __init nospec_report(void) { if (test_facility(156)) pr_info("Spectre V2 mitigation: etokens\n"); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) pr_info("Spectre V2 mitigation: execute trampolines\n"); if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) pr_info("Spectre V2 mitigation: limited branch prediction\n"); @@ -63,10 +63,10 @@ void __init nospec_auto_detect(void) * The machine supports etokens. * Disable expolines and disable nobp. */ - if (IS_ENABLED(CC_USING_EXPOLINE)) + if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; __clear_facility(82, S390_lowcore.alt_stfle_fac_list); - } else if (IS_ENABLED(CC_USING_EXPOLINE)) { + } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index e30e580ae362..48f472bf9290 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -15,7 +15,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (test_facility(156)) return sprintf(buf, "Mitigation: etokens\n"); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) return sprintf(buf, "Mitigation: execute trampolines\n"); if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) return sprintf(buf, "Mitigation: limited branch prediction\n"); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 51dd0267d014..5e7c63033159 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -455,7 +455,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) { + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { jit->r14_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r14 thunk */ if (test_facility(35)) { @@ -473,7 +473,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) /* br %r14 */ _EMIT2(0x07fe); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable && + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable && (jit->seen & SEEN_FUNC)) { jit->r1_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r1 thunk */ @@ -999,7 +999,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i /* lg %w1,(%l) */ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_W1, REG_0, REG_L, EMIT_CONST_U64(func)); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) { + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { /* brasl %r14,__s390_indirect_jump_r1 */ EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); } else { From 6e042492a272bdc507d37982b890dc5266d5bb01 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2019 23:26:23 +0200 Subject: [PATCH 24/98] s390: avoid __builtin_return_address(n) on clang llvm on s390 has problems with __builtin_return_address(n), with n>0, this results in a somewhat cryptic error message: fatal error: error in backend: Unsupported stack frame traversal count To work around it, use the direct return address directly. This is probably not ideal here, but gets things to compile and should only lead to inferior reporting, not to misbehavior of the generated code. Link: https://bugs.llvm.org/show_bug.cgi?id=41424 Signed-off-by: Arnd Bergmann Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ftrace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 5a3c95b11952..6e0ed0338785 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -13,7 +13,12 @@ #ifndef __ASSEMBLY__ +#ifdef CONFIG_CC_IS_CLANG +/* https://bugs.llvm.org/show_bug.cgi?id=41424 */ +#define ftrace_return_address(n) 0UL +#else #define ftrace_return_address(n) __builtin_return_address(n) +#endif void _mcount(void); void ftrace_caller(void); From 913140e221567b3ecd21b4242257a7e3fa279026 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Apr 2019 23:26:18 +0200 Subject: [PATCH 25/98] s390: zcrypt: initialize variables before_use The 'func_code' variable gets printed in debug statements without a prior initialization in multiple functions, as reported when building with clang: drivers/s390/crypto/zcrypt_api.c:659:6: warning: variable 'func_code' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (mex->outputdatalength < mex->inputdatalength) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/s390/crypto/zcrypt_api.c:725:29: note: uninitialized use occurs here trace_s390_zcrypt_rep(mex, func_code, rc, ^~~~~~~~~ drivers/s390/crypto/zcrypt_api.c:659:2: note: remove the 'if' if its condition is always false if (mex->outputdatalength < mex->inputdatalength) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/s390/crypto/zcrypt_api.c:654:24: note: initialize the variable 'func_code' to silence this warning unsigned int func_code; ^ Add initializations to all affected code paths to shut up the warning and make the warning output consistent. Signed-off-by: Arnd Bergmann Signed-off-by: Martin Schwidefsky --- drivers/s390/crypto/zcrypt_api.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c index 689c2af7026a..c31b2d31cd83 100644 --- a/drivers/s390/crypto/zcrypt_api.c +++ b/drivers/s390/crypto/zcrypt_api.c @@ -659,6 +659,7 @@ static long zcrypt_rsa_modexpo(struct ap_perms *perms, trace_s390_zcrypt_req(mex, TP_ICARSAMODEXPO); if (mex->outputdatalength < mex->inputdatalength) { + func_code = 0; rc = -EINVAL; goto out; } @@ -742,6 +743,7 @@ static long zcrypt_rsa_crt(struct ap_perms *perms, trace_s390_zcrypt_req(crt, TP_ICARSACRT); if (crt->outputdatalength < crt->inputdatalength) { + func_code = 0; rc = -EINVAL; goto out; } @@ -951,6 +953,7 @@ static long zcrypt_send_ep11_cprb(struct ap_perms *perms, targets = kcalloc(target_num, sizeof(*targets), GFP_KERNEL); if (!targets) { + func_code = 0; rc = -ENOMEM; goto out; } @@ -958,6 +961,7 @@ static long zcrypt_send_ep11_cprb(struct ap_perms *perms, uptr = (struct ep11_target_dev __force __user *) xcrb->targets; if (copy_from_user(targets, uptr, target_num * sizeof(*targets))) { + func_code = 0; rc = -EFAULT; goto out_free; } From 5b2ad270529ff8abfd8a324ceece1ae1704875a0 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 8 Apr 2019 11:19:54 +0200 Subject: [PATCH 26/98] s390/qdio: limit direct access to first_to_check cursor Refactor all the low-level helpers to take the first_to_check cursor as parameter, rather than accessing it directly. Signed-off-by: Julian Wiedmann Reviewed-by: Jens Remus Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_main.c | 56 +++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 0ccd3b30af78..045f2aad0b3c 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -415,7 +415,8 @@ static inline void account_sbals(struct qdio_q *q, unsigned int count) q->q_stats.nr_sbals[pos]++; } -static void process_buffer_error(struct qdio_q *q, int count) +static void process_buffer_error(struct qdio_q *q, unsigned int start, + int count) { unsigned char state = (q->is_input_q) ? SLSB_P_INPUT_NOT_INIT : SLSB_P_OUTPUT_NOT_INIT; @@ -424,29 +425,29 @@ static void process_buffer_error(struct qdio_q *q, int count) /* special handling for no target buffer empty */ if (queue_type(q) == QDIO_IQDIO_QFMT && !q->is_input_q && - q->sbal[q->first_to_check]->element[15].sflags == 0x10) { + q->sbal[start]->element[15].sflags == 0x10) { qperf_inc(q, target_full); - DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "OUTFULL FTC:%02x", - q->first_to_check); + DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "OUTFULL FTC:%02x", start); goto set; } DBF_ERROR("%4x BUF ERROR", SCH_NO(q)); DBF_ERROR((q->is_input_q) ? "IN:%2d" : "OUT:%2d", q->nr); - DBF_ERROR("FTC:%3d C:%3d", q->first_to_check, count); + DBF_ERROR("FTC:%3d C:%3d", start, count); DBF_ERROR("F14:%2x F15:%2x", - q->sbal[q->first_to_check]->element[14].sflags, - q->sbal[q->first_to_check]->element[15].sflags); + q->sbal[start]->element[14].sflags, + q->sbal[start]->element[15].sflags); set: /* * Interrupts may be avoided as long as the error is present * so change the buffer state immediately to avoid starvation. */ - set_buf_states(q, q->first_to_check, state, count); + set_buf_states(q, start, state, count); } -static inline void inbound_primed(struct qdio_q *q, int count) +static inline void inbound_primed(struct qdio_q *q, unsigned int start, + int count) { int new; @@ -457,7 +458,7 @@ static inline void inbound_primed(struct qdio_q *q, int count) if (!q->u.in.polling) { q->u.in.polling = 1; q->u.in.ack_count = count; - q->u.in.ack_start = q->first_to_check; + q->u.in.ack_start = start; return; } @@ -465,7 +466,7 @@ static inline void inbound_primed(struct qdio_q *q, int count) set_buf_states(q, q->u.in.ack_start, SLSB_P_INPUT_NOT_INIT, q->u.in.ack_count); q->u.in.ack_count = count; - q->u.in.ack_start = q->first_to_check; + q->u.in.ack_start = start; return; } @@ -473,7 +474,7 @@ static inline void inbound_primed(struct qdio_q *q, int count) * ACK the newest buffer. The ACK will be removed in qdio_stop_polling * or by the next inbound run. */ - new = add_buf(q->first_to_check, count - 1); + new = add_buf(start, count - 1); if (q->u.in.polling) { /* reset the previous ACK but first set the new one */ set_buf_state(q, new, SLSB_P_INPUT_ACK); @@ -488,11 +489,12 @@ static inline void inbound_primed(struct qdio_q *q, int count) if (!count) return; /* need to change ALL buffers to get more interrupts */ - set_buf_states(q, q->first_to_check, SLSB_P_INPUT_NOT_INIT, count); + set_buf_states(q, start, SLSB_P_INPUT_NOT_INIT, count); } static int get_inbound_buffer_frontier(struct qdio_q *q) { + unsigned int start = q->first_to_check; unsigned char state = 0; int count; @@ -510,22 +512,22 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) * No siga sync here, as a PCI or we after a thin interrupt * already sync'ed the queues. */ - count = get_buf_states(q, q->first_to_check, &state, count, 1, 0); + count = get_buf_states(q, start, &state, count, 1, 0); if (!count) return 0; switch (state) { case SLSB_P_INPUT_PRIMED: - inbound_primed(q, count); - q->first_to_check = add_buf(q->first_to_check, count); + inbound_primed(q, start, count); + q->first_to_check = add_buf(start, count); if (atomic_sub_return(count, &q->nr_buf_used) == 0) qperf_inc(q, inbound_queue_full); if (q->irq_ptr->perf_stat_enabled) account_sbals(q, count); return count; case SLSB_P_INPUT_ERROR: - process_buffer_error(q, count); - q->first_to_check = add_buf(q->first_to_check, count); + process_buffer_error(q, start, count); + q->first_to_check = add_buf(start, count); if (atomic_sub_return(count, &q->nr_buf_used) == 0) qperf_inc(q, inbound_queue_full); if (q->irq_ptr->perf_stat_enabled) @@ -537,7 +539,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) if (q->irq_ptr->perf_stat_enabled) q->q_stats.nr_sbal_nop++; DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in nop:%1d %#02x", - q->nr, q->first_to_check); + q->nr, start); return 0; default: WARN_ON_ONCE(1); @@ -559,6 +561,7 @@ static int qdio_inbound_q_moved(struct qdio_q *q) static inline int qdio_inbound_q_done(struct qdio_q *q) { + unsigned int start = q->first_to_check; unsigned char state = 0; if (!atomic_read(&q->nr_buf_used)) @@ -566,7 +569,7 @@ static inline int qdio_inbound_q_done(struct qdio_q *q) if (need_siga_sync(q)) qdio_siga_sync_q(q); - get_buf_state(q, q->first_to_check, &state, 0); + get_buf_state(q, start, &state, 0); if (state == SLSB_P_INPUT_PRIMED || state == SLSB_P_INPUT_ERROR) /* more work coming */ @@ -584,8 +587,7 @@ static inline int qdio_inbound_q_done(struct qdio_q *q) * has (probably) not moved (see qdio_inbound_processing). */ if (get_tod_clock_fast() > q->u.in.timestamp + QDIO_INPUT_THRESHOLD) { - DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in done:%02x", - q->first_to_check); + DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in done:%02x", start); return 1; } else return 0; @@ -706,6 +708,7 @@ void qdio_inbound_processing(unsigned long data) static int get_outbound_buffer_frontier(struct qdio_q *q) { + unsigned int start = q->first_to_check; unsigned char state = 0; int count; @@ -726,8 +729,7 @@ static int get_outbound_buffer_frontier(struct qdio_q *q) if (!count) return 0; - count = get_buf_states(q, q->first_to_check, &state, count, 0, - q->u.out.use_cq); + count = get_buf_states(q, start, &state, count, 0, q->u.out.use_cq); if (!count) return 0; @@ -738,13 +740,13 @@ static int get_outbound_buffer_frontier(struct qdio_q *q) "out empty:%1d %02x", q->nr, count); atomic_sub(count, &q->nr_buf_used); - q->first_to_check = add_buf(q->first_to_check, count); + q->first_to_check = add_buf(start, count); if (q->irq_ptr->perf_stat_enabled) account_sbals(q, count); return count; case SLSB_P_OUTPUT_ERROR: - process_buffer_error(q, count); - q->first_to_check = add_buf(q->first_to_check, count); + process_buffer_error(q, start, count); + q->first_to_check = add_buf(start, count); atomic_sub(count, &q->nr_buf_used); if (q->irq_ptr->perf_stat_enabled) account_sbals_error(q, count); From 6bcf74e2d15c8339f0e4a0f0613638873098e85f Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 8 Apr 2019 13:32:12 +0200 Subject: [PATCH 27/98] s390/qdio: consolidate index tracking for queue scan qdio.ko offers a small number of high-level functions to drive the scanning of a QDIO queue for ready-to-process SBALs: qdio_get_next_buffers(), __[ti]qdio_inbound_processing() and __qdio_outbound_processing(). Let each of those functions maintain the 'start' index for their current scan, and pass it to lower-level helpers as needed. This improves the code's overall layering, and allows us to eliminate the additional first_to_kick cursor with a follow-on patch. Signed-off-by: Julian Wiedmann Reviewed-by: Jens Remus Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_main.c | 55 ++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c index 045f2aad0b3c..cfce255521ac 100644 --- a/drivers/s390/cio/qdio_main.c +++ b/drivers/s390/cio/qdio_main.c @@ -492,9 +492,8 @@ static inline void inbound_primed(struct qdio_q *q, unsigned int start, set_buf_states(q, start, SLSB_P_INPUT_NOT_INIT, count); } -static int get_inbound_buffer_frontier(struct qdio_q *q) +static int get_inbound_buffer_frontier(struct qdio_q *q, unsigned int start) { - unsigned int start = q->first_to_check; unsigned char state = 0; int count; @@ -519,7 +518,6 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) switch (state) { case SLSB_P_INPUT_PRIMED: inbound_primed(q, start, count); - q->first_to_check = add_buf(start, count); if (atomic_sub_return(count, &q->nr_buf_used) == 0) qperf_inc(q, inbound_queue_full); if (q->irq_ptr->perf_stat_enabled) @@ -527,7 +525,6 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) return count; case SLSB_P_INPUT_ERROR: process_buffer_error(q, start, count); - q->first_to_check = add_buf(start, count); if (atomic_sub_return(count, &q->nr_buf_used) == 0) qperf_inc(q, inbound_queue_full); if (q->irq_ptr->perf_stat_enabled) @@ -547,11 +544,11 @@ static int get_inbound_buffer_frontier(struct qdio_q *q) } } -static int qdio_inbound_q_moved(struct qdio_q *q) +static int qdio_inbound_q_moved(struct qdio_q *q, unsigned int start) { int count; - count = get_inbound_buffer_frontier(q); + count = get_inbound_buffer_frontier(q, start); if (count && !is_thinint_irq(q->irq_ptr) && MACHINE_IS_LPAR) q->u.in.timestamp = get_tod_clock(); @@ -559,9 +556,8 @@ static int qdio_inbound_q_moved(struct qdio_q *q) return count; } -static inline int qdio_inbound_q_done(struct qdio_q *q) +static inline int qdio_inbound_q_done(struct qdio_q *q, unsigned int start) { - unsigned int start = q->first_to_check; unsigned char state = 0; if (!atomic_read(&q->nr_buf_used)) @@ -672,17 +668,20 @@ static inline int qdio_tasklet_schedule(struct qdio_q *q) static void __qdio_inbound_processing(struct qdio_q *q) { + unsigned int start = q->first_to_check; int count; qperf_inc(q, tasklet_inbound); - count = qdio_inbound_q_moved(q); + count = qdio_inbound_q_moved(q, start); if (count == 0) return; + start = add_buf(start, count); + q->first_to_check = start; qdio_kick_handler(q, count); - if (!qdio_inbound_q_done(q)) { + if (!qdio_inbound_q_done(q, start)) { /* means poll time is not yet over */ qperf_inc(q, tasklet_inbound_resched); if (!qdio_tasklet_schedule(q)) @@ -694,7 +693,7 @@ static void __qdio_inbound_processing(struct qdio_q *q) * We need to check again to not lose initiative after * resetting the ACK state. */ - if (!qdio_inbound_q_done(q)) { + if (!qdio_inbound_q_done(q, start)) { qperf_inc(q, tasklet_inbound_resched2); qdio_tasklet_schedule(q); } @@ -706,9 +705,8 @@ void qdio_inbound_processing(unsigned long data) __qdio_inbound_processing(q); } -static int get_outbound_buffer_frontier(struct qdio_q *q) +static int get_outbound_buffer_frontier(struct qdio_q *q, unsigned int start) { - unsigned int start = q->first_to_check; unsigned char state = 0; int count; @@ -740,13 +738,11 @@ static int get_outbound_buffer_frontier(struct qdio_q *q) "out empty:%1d %02x", q->nr, count); atomic_sub(count, &q->nr_buf_used); - q->first_to_check = add_buf(start, count); if (q->irq_ptr->perf_stat_enabled) account_sbals(q, count); return count; case SLSB_P_OUTPUT_ERROR: process_buffer_error(q, start, count); - q->first_to_check = add_buf(start, count); atomic_sub(count, &q->nr_buf_used); if (q->irq_ptr->perf_stat_enabled) account_sbals_error(q, count); @@ -773,11 +769,11 @@ static inline int qdio_outbound_q_done(struct qdio_q *q) return atomic_read(&q->nr_buf_used) == 0; } -static inline int qdio_outbound_q_moved(struct qdio_q *q) +static inline int qdio_outbound_q_moved(struct qdio_q *q, unsigned int start) { int count; - count = get_outbound_buffer_frontier(q); + count = get_outbound_buffer_frontier(q, start); if (count) DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "out moved:%1d", q->nr); @@ -829,14 +825,17 @@ retry: static void __qdio_outbound_processing(struct qdio_q *q) { + unsigned int start = q->first_to_check; int count; qperf_inc(q, tasklet_outbound); WARN_ON_ONCE(atomic_read(&q->nr_buf_used) < 0); - count = qdio_outbound_q_moved(q); - if (count) + count = qdio_outbound_q_moved(q, start); + if (count) { + q->first_to_check = add_buf(start, count); qdio_kick_handler(q, count); + } if (queue_type(q) == QDIO_ZFCP_QFMT && !pci_out_supported(q->irq_ptr) && !qdio_outbound_q_done(q)) @@ -891,6 +890,7 @@ static inline void qdio_check_outbound_pci_queues(struct qdio_irq *irq) static void __tiqdio_inbound_processing(struct qdio_q *q) { + unsigned int start = q->first_to_check; int count; qperf_inc(q, tasklet_inbound); @@ -900,13 +900,15 @@ static void __tiqdio_inbound_processing(struct qdio_q *q) /* The interrupt could be caused by a PCI request: */ qdio_check_outbound_pci_queues(q->irq_ptr); - count = qdio_inbound_q_moved(q); + count = qdio_inbound_q_moved(q, start); if (count == 0) return; + start = add_buf(start, count); + q->first_to_check = start; qdio_kick_handler(q, count); - if (!qdio_inbound_q_done(q)) { + if (!qdio_inbound_q_done(q, start)) { qperf_inc(q, tasklet_inbound_resched); if (!qdio_tasklet_schedule(q)) return; @@ -917,7 +919,7 @@ static void __tiqdio_inbound_processing(struct qdio_q *q) * We need to check again to not lose initiative after * resetting the ACK state. */ - if (!qdio_inbound_q_done(q)) { + if (!qdio_inbound_q_done(q, start)) { qperf_inc(q, tasklet_inbound_resched2); qdio_tasklet_schedule(q); } @@ -1637,7 +1639,7 @@ int qdio_start_irq(struct ccw_device *cdev, int nr) */ if (test_nonshared_ind(irq_ptr)) goto rescan; - if (!qdio_inbound_q_done(q)) + if (!qdio_inbound_q_done(q, q->first_to_check)) goto rescan; return 0; @@ -1668,11 +1670,13 @@ int qdio_get_next_buffers(struct ccw_device *cdev, int nr, int *bufnr, { struct qdio_q *q; struct qdio_irq *irq_ptr = cdev->private->qdio_data; + unsigned int start; int count; if (!irq_ptr) return -ENODEV; q = irq_ptr->input_qs[nr]; + start = q->first_to_check; /* * Cannot rely on automatic sync after interrupt since queues may @@ -1683,10 +1687,13 @@ int qdio_get_next_buffers(struct ccw_device *cdev, int nr, int *bufnr, qdio_check_outbound_pci_queues(irq_ptr); - count = qdio_inbound_q_moved(q); + count = qdio_inbound_q_moved(q, start); if (count == 0) return 0; + start = add_buf(start, count); + q->first_to_check = start; + /* Note: upper-layer MUST stop processing immediately here ... */ if (unlikely(q->irq_ptr->state != QDIO_IRQ_STATE_ACTIVE)) return -EIO; From d1874a0c2805fcfa9162c972d6b7541e57adb542 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 23 Apr 2019 10:51:12 +0200 Subject: [PATCH 28/98] s390/mm: make the pxd_offset functions more robust Change the way how pgd_offset, p4d_offset, pud_offset and pmd_offset walk the page tables. pgd_offset now always calculates the index for the top-level page table and adds it to the pgd, this is either a segment table offset for a 2-level setup, a region-3 offset for 3-levels, region-2 offset for 4-levels, or a region-1 offset for a 5-level setup. The other three functions p4d_offset, pud_offset and pmd_offset will only add the respective offset if they dereference the passed pointer. With the new way of walking the page tables a sequence like this from mm/gup.c now works: pgdp = pgd_offset(current->mm, addr); pgd = READ_ONCE(*pgdp); p4dp = p4d_offset(&pgd, addr); p4d = READ_ONCE(*p4dp); pudp = pud_offset(&p4d, addr); pud = READ_ONCE(*pudp); pmdp = pmd_offset(&pud, addr); pmd = READ_ONCE(*pmdp); Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 69 +++++++++++++++++++++------------ arch/s390/mm/gup.c | 33 ++++++---------- 2 files changed, 56 insertions(+), 46 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 76dc344edb8c..d48be9c27df9 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1204,42 +1204,67 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -#define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr)) - #define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN) #define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN) #define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN) #define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) +/* + * The pgd_offset function *always* adds the index for the top-level + * region/segment table. This is done to get a sequence like the + * following to work: + * pgdp = pgd_offset(current->mm, addr); + * pgd = READ_ONCE(*pgdp); + * p4dp = p4d_offset(&pgd, addr); + * ... + * The subsequent p4d_offset, pud_offset and pmd_offset functions + * only add an index if they dereferenced the pointer. + */ +static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address) +{ + unsigned long rste; + unsigned int shift; + + /* Get the first entry of the top level table */ + rste = pgd_val(*pgd); + /* Pick up the shift from the table type of the first entry */ + shift = ((rste & _REGION_ENTRY_TYPE_MASK) >> 2) * 11 + 20; + return pgd + ((address >> shift) & (PTRS_PER_PGD - 1)); +} + +#define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address) +#define pgd_offset_k(address) pgd_offset(&init_mm, address) + static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { - p4d_t *p4d = (p4d_t *) pgd; - - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) - p4d = (p4d_t *) pgd_deref(*pgd); - return p4d + p4d_index(address); + if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) + return (p4d_t *) pgd_deref(*pgd) + p4d_index(address); + return (p4d_t *) pgd; } static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { - pud_t *pud = (pud_t *) p4d; - - if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pud = (pud_t *) p4d_deref(*p4d); - return pud + pud_index(address); + if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) + return (pud_t *) p4d_deref(*p4d) + pud_index(address); + return (pud_t *) p4d; } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { - pmd_t *pmd = (pmd_t *) pud; - - if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pmd = (pmd_t *) pud_deref(*pud); - return pmd + pmd_index(address); + if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) + return (pmd_t *) pud_deref(*pud) + pmd_index(address); + return (pmd_t *) pud; } +static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) +{ + return (pte_t *) pmd_deref(*pmd) + pte_index(address); +} + +#define pte_offset_kernel(pmd, address) pte_offset(pmd, address) +#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) +#define pte_unmap(pte) do { } while (0) + #define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot)) #define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) #define pte_page(x) pfn_to_page(pte_pfn(x)) @@ -1249,12 +1274,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) -/* Find an entry in the lowest level page table.. */ -#define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr)) -#define pte_offset_kernel(pmd, address) pte_offset(pmd,address) -#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) -#define pte_unmap(pte) do { } while (0) - static inline pmd_t pmd_wrprotect(pmd_t pmd) { pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE; diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 2809d11c7a28..a5ffcb132191 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -18,7 +18,7 @@ * inlines everything into a single function which results in too much * register pressure. */ -static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, +static inline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { struct page *head, *page; @@ -27,7 +27,7 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, mask = (write ? _PAGE_PROTECT : 0) | _PAGE_INVALID | _PAGE_SPECIAL; - ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr); + ptep = pte_offset_map(&pmd, addr); do { pte = *ptep; barrier(); @@ -93,16 +93,13 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, } -static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, +static inline int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp, pmd; - pmdp = (pmd_t *) pudp; - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pmdp = (pmd_t *) pud_deref(pud); - pmdp += pmd_index(addr); + pmdp = pmd_offset(&pud, addr); do { pmd = *pmdp; barrier(); @@ -120,7 +117,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, if (!gup_huge_pmd(pmdp, pmd, addr, next, write, pages, nr)) return 0; - } else if (!gup_pte_range(pmdp, pmd, addr, next, + } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -166,16 +163,13 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, return 1; } -static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, +static inline int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp, pud; - pudp = (pud_t *) p4dp; - if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pudp = (pud_t *) p4d_deref(p4d); - pudp += pud_index(addr); + pudp = pud_offset(&p4d, addr); do { pud = *pudp; barrier(); @@ -186,7 +180,7 @@ static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, if (!gup_huge_pud(pudp, pud, addr, next, write, pages, nr)) return 0; - } else if (!gup_pmd_range(pudp, pud, addr, next, write, pages, + } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); @@ -194,23 +188,20 @@ static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, return 1; } -static inline int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, +static inline int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp, p4d; - p4dp = (p4d_t *) pgdp; - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) - p4dp = (p4d_t *) pgd_deref(pgd); - p4dp += p4d_index(addr); + p4dp = p4d_offset(&pgd, addr); do { p4d = *p4dp; barrier(); next = p4d_addr_end(addr, end); if (p4d_none(p4d)) return 0; - if (!gup_pud_range(p4dp, p4d, addr, next, write, pages, nr)) + if (!gup_pud_range(p4d, addr, next, write, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -253,7 +244,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; - if (!gup_p4d_range(pgdp, pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); From 1a42010cdc26bb7e5912984f3c91b8c6d55f089a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 23 Apr 2019 10:53:21 +0200 Subject: [PATCH 29/98] s390/mm: convert to the generic get_user_pages_fast code Define the gup_fast_permitted to check against the asce_limit of the mm attached to the current task, then replace the s390 specific gup code with the generic implementation in mm/gup.c. Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 1 + arch/s390/include/asm/pgtable.h | 12 ++ arch/s390/mm/Makefile | 2 +- arch/s390/mm/gup.c | 291 -------------------------------- 4 files changed, 14 insertions(+), 292 deletions(-) delete mode 100644 arch/s390/mm/gup.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 566f0b460d21..1c3fcf19c3af 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -149,6 +149,7 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS + select HAVE_GENERIC_GUP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index d48be9c27df9..394bec31cb97 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1265,6 +1265,18 @@ static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) #define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) #define pte_unmap(pte) do { } while (0) +static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +{ + unsigned long len, end; + + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (end < start) + return false; + return end <= current->mm->context.asce_limit; +} +#define gup_fast_permitted gup_fast_permitted + #define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot)) #define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) #define pte_page(x) pfn_to_page(pte_pfn(x)) diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index f5880bfd1b0c..3175413186b9 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -4,7 +4,7 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o gup.o pageattr.o pgtable.o pgalloc.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c deleted file mode 100644 index a5ffcb132191..000000000000 --- a/arch/s390/mm/gup.c +++ /dev/null @@ -1,291 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for s390 - * - * Copyright IBM Corp. 2010 - * Author(s): Martin Schwidefsky - */ -#include -#include -#include -#include -#include -#include -#include - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static inline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - pte_t *ptep, pte; - - mask = (write ? _PAGE_PROTECT : 0) | _PAGE_INVALID | _PAGE_SPECIAL; - - ptep = pte_offset_map(&pmd, addr); - do { - pte = *ptep; - barrier(); - /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) - return 0; - if ((pte_val(pte) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) - return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(head); - return 0; - } - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - - return 1; -} - -static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - int refs; - - mask = (write ? _SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID; - if ((pmd_val(pmd) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - - -static inline int gup_pmd_range(pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp, pmd; - - pmdp = pmd_offset(&pud, addr); - do { - pmd = *pmdp; - barrier(); - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) - return 0; - if (!gup_huge_pmd(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } else if (!gup_pte_range(pmd, addr, next, - write, pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - int refs; - - mask = (write ? _REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID; - if ((pud_val(pud) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); - - refs = 0; - head = pud_page(pud); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pud_val(pud) != pud_val(*pudp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -static inline int gup_pud_range(p4d_t p4d, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp, pud; - - pudp = pud_offset(&p4d, addr); - do { - pud = *pudp; - barrier(); - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pudp, pud, addr, next, write, pages, - nr)) - return 0; - } else if (!gup_pmd_range(pud, addr, next, write, pages, - nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -static inline int gup_p4d_range(pgd_t pgd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - p4d_t *p4dp, p4d; - - p4dp = p4d_offset(&pgd, addr); - do { - p4d = *p4dp; - barrier(); - next = p4d_addr_end(addr, end); - if (p4d_none(p4d)) - return 0; - if (!gup_pud_range(p4d, addr, next, write, pages, nr)) - return 0; - } while (p4dp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next, flags; - pgd_t *pgdp, pgd; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if ((end <= start) || (end > mm->context.asce_limit)) - return 0; - /* - * local_irq_save() doesn't prevent pagetable teardown, but does - * prevent the pagetables from being freed on s390. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the the page and take a ref on it. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd = *pgdp; - barrier(); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - int nr, ret; - - might_sleep(); - start &= PAGE_MASK; - nr = __get_user_pages_fast(start, nr_pages, write, pages); - if (nr == nr_pages) - return nr; - - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); - /* Have to be a bit careful with return values */ - if (nr > 0) - ret = (ret < 0) ? nr : ret + nr; - return ret; -} From 1c410fd6a561af452aba282b1cd3cabef2080d72 Mon Sep 17 00:00:00 2001 From: Thomas-Mich Richter Date: Tue, 23 Apr 2019 11:36:27 +0200 Subject: [PATCH 30/98] s390/cpum_cf_diag: Add support for CPU-MF SVN 6 Add support for the CPU-Measurement Facility counter second version number 6. This number is used to detect some more counters in the crypto counter set and the extended counter set. Signed-off-by: Thomas Richter Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_cf_diag.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index b6854812d2ed..d4e031f7b9c8 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -306,15 +306,20 @@ static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset, ctrset_size = 2; break; case CPUMF_CTR_SET_CRYPTO: - ctrset_size = 16; + if (info->csvn >= 1 && info->csvn <= 5) + ctrset_size = 16; + else if (info->csvn == 6) + ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: if (info->csvn == 1) ctrset_size = 32; else if (info->csvn == 2) ctrset_size = 48; - else if (info->csvn >= 3) + else if (info->csvn >= 3 && info->csvn <= 5) ctrset_size = 128; + else if (info->csvn == 6) + ctrset_size = 160; break; case CPUMF_CTR_SET_MT_DIAG: if (info->csvn > 3) From 01eb42afb45719cb41bb32c278e068073738899d Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 23 Apr 2019 15:36:36 +0200 Subject: [PATCH 31/98] s390/kasan: fix strncpy_from_user kasan checks arch/s390/lib/uaccess.c is built without kasan instrumentation. Kasan checks are performed explicitly in copy_from_user/copy_to_user functions. But since those functions could be inlined, calls from files like uaccess.c with instrumentation disabled won't generate kasan reports. This is currently the case with strncpy_from_user function which was revealed by newly added kasan test. Avoid inlining of copy_from_user/copy_to_user when the kernel is built with kasan support to make sure kasan checks are fully functional. Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/uaccess.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 007fcb9aeeb8..bd2fd9a7821d 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -55,8 +55,10 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n); unsigned long __must_check raw_copy_to_user(void __user *to, const void *from, unsigned long n); +#ifndef CONFIG_KASAN #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER +#endif #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES From c9f621524e70774688db3cec60d85fa4c7de52e3 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 24 Apr 2019 12:49:44 +0200 Subject: [PATCH 32/98] s390/mm: fix pxd_bad with folded page tables With git commit d1874a0c2805fcfa9162c972d6b7541e57adb542 "s390/mm: make the pxd_offset functions more robust" and a 2-level page table it can now happen that pgd_bad() gets asked to verify a large segment table entry. If the entry is marked as dirty pgd_bad() will incorrectly return true. Change the pgd_bad(), p4d_bad(), pud_bad() and pmd_bad() functions to first verify the table type, return false if the table level is lower than what the function is suppossed to check, return true if the table level is too high, and otherwise check the relevant region and segment table bits. pmd_bad() has to check against ~SEGMENT_ENTRY_BITS for normal page table pointers or ~SEGMENT_ENTRY_BITS_LARGE for large segment table entries. Same for pud_bad() which has to check against ~_REGION_ENTRY_BITS or ~_REGION_ENTRY_BITS_LARGE. Fixes: d1874a0c2805 ("s390/mm: make the pxd_offset functions more robust") Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 394bec31cb97..9f0195d5fa16 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -238,7 +238,7 @@ static inline int is_module_addr(void *addr) #define _REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */ #define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ #define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ -#define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ +#define _REGION_ENTRY_TYPE_MASK 0x0c /* region table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ #define _REGION_ENTRY_TYPE_R2 0x08 /* region second table type */ #define _REGION_ENTRY_TYPE_R3 0x04 /* region third table type */ @@ -277,6 +277,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ #define _SEGMENT_ENTRY_NOEXEC 0x100 /* segment no-execute bit */ #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ +#define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */ #define _SEGMENT_ENTRY (0) #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) @@ -614,15 +615,9 @@ static inline int pgd_none(pgd_t pgd) static inline int pgd_bad(pgd_t pgd) { - /* - * With dynamic page table levels the pgd can be a region table - * entry or a segment table entry. Check for the bit that are - * invalid for either table entry. - */ - unsigned long mask = - ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID & - ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH; - return (pgd_val(pgd) & mask) != 0; + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1) + return 0; + return (pgd_val(pgd) & ~_REGION_ENTRY_BITS) != 0; } static inline unsigned long pgd_pfn(pgd_t pgd) @@ -703,6 +698,8 @@ static inline int pmd_large(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { + if ((pmd_val(pmd) & _SEGMENT_ENTRY_TYPE_MASK) > 0) + return 1; if (pmd_large(pmd)) return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0; return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; @@ -710,8 +707,12 @@ static inline int pmd_bad(pmd_t pmd) static inline int pud_bad(pud_t pud) { - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3) - return pmd_bad(__pmd(pud_val(pud))); + unsigned long type = pud_val(pud) & _REGION_ENTRY_TYPE_MASK; + + if (type > _REGION_ENTRY_TYPE_R3) + return 1; + if (type < _REGION_ENTRY_TYPE_R3) + return 0; if (pud_large(pud)) return (pud_val(pud) & ~_REGION_ENTRY_BITS_LARGE) != 0; return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0; @@ -719,8 +720,12 @@ static inline int pud_bad(pud_t pud) static inline int p4d_bad(p4d_t p4d) { - if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) - return pud_bad(__pud(p4d_val(p4d))); + unsigned long type = p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK; + + if (type > _REGION_ENTRY_TYPE_R2) + return 1; + if (type < _REGION_ENTRY_TYPE_R2) + return 0; return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; } From 71189f263f8a3db7b72ca75be14e7309375e8707 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 21 Jan 2019 09:55:18 +0100 Subject: [PATCH 33/98] vfio-ccw: make it safe to access channel programs When we get a solicited interrupt, the start function may have been cleared by a csch, but we still have a channel program structure allocated. Make it safe to call the cp accessors in any case, so we can call them unconditionally. While at it, also make sure that functions called from other parts of the code return gracefully if the channel program structure has not been initialized (even though that is a bug in the caller). Reviewed-by: Eric Farman Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 21 ++++++++++++++++++++- drivers/s390/cio/vfio_ccw_cp.h | 2 ++ drivers/s390/cio/vfio_ccw_fsm.c | 5 +++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 384b3987eeb4..0e79799e9a71 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -362,6 +362,7 @@ static void cp_unpin_free(struct channel_program *cp) struct ccwchain *chain, *temp; int i; + cp->initialized = false; list_for_each_entry_safe(chain, temp, &cp->ccwchain_list, next) { for (i = 0; i < chain->ch_len; i++) { pfn_array_table_unpin_free(chain->ch_pat + i, @@ -732,6 +733,9 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) */ cp->orb.cmd.c64 = 1; + if (!ret) + cp->initialized = true; + return ret; } @@ -746,7 +750,8 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) */ void cp_free(struct channel_program *cp) { - cp_unpin_free(cp); + if (cp->initialized) + cp_unpin_free(cp); } /** @@ -791,6 +796,10 @@ int cp_prefetch(struct channel_program *cp) struct ccwchain *chain; int len, idx, ret; + /* this is an error in the caller */ + if (!cp->initialized) + return -EINVAL; + list_for_each_entry(chain, &cp->ccwchain_list, next) { len = chain->ch_len; for (idx = 0; idx < len; idx++) { @@ -826,6 +835,10 @@ union orb *cp_get_orb(struct channel_program *cp, u32 intparm, u8 lpm) struct ccwchain *chain; struct ccw1 *cpa; + /* this is an error in the caller */ + if (!cp->initialized) + return NULL; + orb = &cp->orb; orb->cmd.intparm = intparm; @@ -862,6 +875,9 @@ void cp_update_scsw(struct channel_program *cp, union scsw *scsw) u32 cpa = scsw->cmd.cpa; u32 ccw_head; + if (!cp->initialized) + return; + /* * LATER: * For now, only update the cmd.cpa part. We may need to deal with @@ -898,6 +914,9 @@ bool cp_iova_pinned(struct channel_program *cp, u64 iova) struct ccwchain *chain; int i; + if (!cp->initialized) + return false; + list_for_each_entry(chain, &cp->ccwchain_list, next) { for (i = 0; i < chain->ch_len; i++) if (pfn_array_table_iova_pinned(chain->ch_pat + i, diff --git a/drivers/s390/cio/vfio_ccw_cp.h b/drivers/s390/cio/vfio_ccw_cp.h index a4b74fb1aa57..3c20cd208da5 100644 --- a/drivers/s390/cio/vfio_ccw_cp.h +++ b/drivers/s390/cio/vfio_ccw_cp.h @@ -21,6 +21,7 @@ * @ccwchain_list: list head of ccwchains * @orb: orb for the currently processed ssch request * @mdev: the mediated device to perform page pinning/unpinning + * @initialized: whether this instance is actually initialized * * @ccwchain_list is the head of a ccwchain list, that contents the * translated result of the guest channel program that pointed out by @@ -30,6 +31,7 @@ struct channel_program { struct list_head ccwchain_list; union orb orb; struct device *mdev; + bool initialized; }; extern int cp_init(struct channel_program *cp, struct device *mdev, diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c index cab17865aafe..e7c9877c9f1e 100644 --- a/drivers/s390/cio/vfio_ccw_fsm.c +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -31,6 +31,10 @@ static int fsm_io_helper(struct vfio_ccw_private *private) private->state = VFIO_CCW_STATE_BUSY; orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm); + if (!orb) { + ret = -EIO; + goto out; + } /* Issue "Start Subchannel" */ ccode = ssch(sch->schid, orb); @@ -64,6 +68,7 @@ static int fsm_io_helper(struct vfio_ccw_private *private) default: ret = ccode; } +out: spin_unlock_irqrestore(sch->lock, flags); return ret; } From 690f6a1581c7c08e85451f62bcbfe40f29072842 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 29 Jan 2019 16:13:57 +0100 Subject: [PATCH 34/98] vfio-ccw: rework ssch state handling The flow for processing ssch requests can be improved by splitting the BUSY state: - CP_PROCESSING: We reject any user space requests while we are in the process of translating a channel program and submitting it to the hardware. Use -EAGAIN to signal user space that it should retry the request. - CP_PENDING: We have successfully submitted a request with ssch and are now expecting an interrupt. As we can't handle more than one channel program being processed, reject any further requests with -EBUSY. A final interrupt will move us out of this state. By making this a separate state, we make it possible to issue a halt or a clear while we're still waiting for the final interrupt for the ssch (in a follow-on patch). It also makes a lot of sense not to preemptively filter out writes to the io_region if we're in an incorrect state: the state machine will handle this correctly. Reviewed-by: Eric Farman Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_fsm.c | 19 ++++++++++++++----- drivers/s390/cio/vfio_ccw_ops.c | 2 -- drivers/s390/cio/vfio_ccw_private.h | 3 ++- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c index e7c9877c9f1e..b4a141fbd1a8 100644 --- a/drivers/s390/cio/vfio_ccw_fsm.c +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -28,7 +28,6 @@ static int fsm_io_helper(struct vfio_ccw_private *private) sch = private->sch; spin_lock_irqsave(sch->lock, flags); - private->state = VFIO_CCW_STATE_BUSY; orb = cp_get_orb(&private->cp, (u32)(addr_t)sch, sch->lpm); if (!orb) { @@ -46,6 +45,7 @@ static int fsm_io_helper(struct vfio_ccw_private *private) */ sch->schib.scsw.cmd.actl |= SCSW_ACTL_START_PEND; ret = 0; + private->state = VFIO_CCW_STATE_CP_PENDING; break; case 1: /* Status pending */ case 2: /* Busy */ @@ -107,6 +107,12 @@ static void fsm_io_busy(struct vfio_ccw_private *private, private->io_region->ret_code = -EBUSY; } +static void fsm_io_retry(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + private->io_region->ret_code = -EAGAIN; +} + static void fsm_disabled_irq(struct vfio_ccw_private *private, enum vfio_ccw_event event) { @@ -135,8 +141,7 @@ static void fsm_io_request(struct vfio_ccw_private *private, struct mdev_device *mdev = private->mdev; char *errstr = "request"; - private->state = VFIO_CCW_STATE_BUSY; - + private->state = VFIO_CCW_STATE_CP_PROCESSING; memcpy(scsw, io_region->scsw_area, sizeof(*scsw)); if (scsw->cmd.fctl & SCSW_FCTL_START_FUNC) { @@ -181,7 +186,6 @@ static void fsm_io_request(struct vfio_ccw_private *private, } err_out: - private->state = VFIO_CCW_STATE_IDLE; trace_vfio_ccw_io_fctl(scsw->cmd.fctl, get_schid(private), io_region->ret_code, errstr); } @@ -221,7 +225,12 @@ fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS] = { [VFIO_CCW_EVENT_IO_REQ] = fsm_io_request, [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, }, - [VFIO_CCW_STATE_BUSY] = { + [VFIO_CCW_STATE_CP_PROCESSING] = { + [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, + [VFIO_CCW_EVENT_IO_REQ] = fsm_io_retry, + [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, + }, + [VFIO_CCW_STATE_CP_PENDING] = { [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, [VFIO_CCW_EVENT_IO_REQ] = fsm_io_busy, [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index f673e106c041..3fdcc6dfe0bf 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -193,8 +193,6 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev, return -EINVAL; private = dev_get_drvdata(mdev_parent_dev(mdev)); - if (private->state != VFIO_CCW_STATE_IDLE) - return -EACCES; region = private->io_region; if (copy_from_user((void *)region + *ppos, buf, count)) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 08e9a7dc9176..50c52efb4fcb 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -63,7 +63,8 @@ enum vfio_ccw_state { VFIO_CCW_STATE_NOT_OPER, VFIO_CCW_STATE_STANDBY, VFIO_CCW_STATE_IDLE, - VFIO_CCW_STATE_BUSY, + VFIO_CCW_STATE_CP_PROCESSING, + VFIO_CCW_STATE_CP_PENDING, /* last element! */ NR_VFIO_CCW_STATES }; From 4f76617378ee97c557b526cb58d3c61eb0a9c963 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 8 Jan 2019 15:53:03 +0100 Subject: [PATCH 35/98] vfio-ccw: protect the I/O region Introduce a mutex to disallow concurrent reads or writes to the I/O region. This makes sure that the data the kernel or user space see is always consistent. The same mutex will be used to protect the async region as well. Reviewed-by: Eric Farman Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_drv.c | 3 +++ drivers/s390/cio/vfio_ccw_ops.c | 32 +++++++++++++++++++---------- drivers/s390/cio/vfio_ccw_private.h | 2 ++ 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 0b3b9de45c60..5ea0da1dd954 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -84,7 +84,9 @@ static void vfio_ccw_sch_io_todo(struct work_struct *work) if (is_final) cp_free(&private->cp); } + mutex_lock(&private->io_mutex); memcpy(private->io_region->irb_area, irb, sizeof(*irb)); + mutex_unlock(&private->io_mutex); if (private->io_trigger) eventfd_signal(private->io_trigger, 1); @@ -129,6 +131,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) private->sch = sch; dev_set_drvdata(&sch->dev, private); + mutex_init(&private->io_mutex); spin_lock_irq(sch->lock); private->state = VFIO_CCW_STATE_NOT_OPER; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 3fdcc6dfe0bf..025c8a832bc8 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -169,16 +169,20 @@ static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev, { struct vfio_ccw_private *private; struct ccw_io_region *region; + int ret; if (*ppos + count > sizeof(*region)) return -EINVAL; private = dev_get_drvdata(mdev_parent_dev(mdev)); + mutex_lock(&private->io_mutex); region = private->io_region; if (copy_to_user(buf, (void *)region + *ppos, count)) - return -EFAULT; - - return count; + ret = -EFAULT; + else + ret = count; + mutex_unlock(&private->io_mutex); + return ret; } static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev, @@ -188,23 +192,29 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev, { struct vfio_ccw_private *private; struct ccw_io_region *region; + int ret; if (*ppos + count > sizeof(*region)) return -EINVAL; private = dev_get_drvdata(mdev_parent_dev(mdev)); + if (!mutex_trylock(&private->io_mutex)) + return -EAGAIN; region = private->io_region; - if (copy_from_user((void *)region + *ppos, buf, count)) - return -EFAULT; - - vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ); - if (region->ret_code != 0) { - private->state = VFIO_CCW_STATE_IDLE; - return region->ret_code; + if (copy_from_user((void *)region + *ppos, buf, count)) { + ret = -EFAULT; + goto out_unlock; } - return count; + vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_IO_REQ); + if (region->ret_code != 0) + private->state = VFIO_CCW_STATE_IDLE; + ret = (region->ret_code != 0) ? region->ret_code : count; + +out_unlock: + mutex_unlock(&private->io_mutex); + return ret; } static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 50c52efb4fcb..32173cbd838d 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -28,6 +28,7 @@ * @mdev: pointer to the mediated device * @nb: notifier for vfio events * @io_region: MMIO region to input/output I/O arguments/results + * @io_mutex: protect against concurrent update of I/O regions * @cp: channel program for the current I/O operation * @irb: irb info received from interrupt * @scsw: scsw info @@ -42,6 +43,7 @@ struct vfio_ccw_private { struct mdev_device *mdev; struct notifier_block nb; struct ccw_io_region *io_region; + struct mutex io_mutex; struct channel_program cp; struct irb irb; From db8e5d17ac03a65e2e0ee0ba50bf61a99741d871 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 19 Jul 2018 17:53:08 +0200 Subject: [PATCH 36/98] vfio-ccw: add capabilities chain Allow to extend the regions used by vfio-ccw. The first user will be handling of halt and clear subchannel. Reviewed-by: Eric Farman Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_ops.c | 187 ++++++++++++++++++++++++---- drivers/s390/cio/vfio_ccw_private.h | 38 ++++++ include/uapi/linux/vfio.h | 2 + 3 files changed, 201 insertions(+), 26 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 025c8a832bc8..5b989faa011f 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -3,13 +3,17 @@ * Physical device callbacks for vfio_ccw * * Copyright IBM Corp. 2017 + * Copyright Red Hat, Inc. 2019 * * Author(s): Dong Jia Shi * Xiao Feng Ren + * Cornelia Huck */ #include #include +#include +#include #include "vfio_ccw_private.h" @@ -157,27 +161,33 @@ static void vfio_ccw_mdev_release(struct mdev_device *mdev) { struct vfio_ccw_private *private = dev_get_drvdata(mdev_parent_dev(mdev)); + int i; vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &private->nb); + + for (i = 0; i < private->num_regions; i++) + private->region[i].ops->release(private, &private->region[i]); + + private->num_regions = 0; + kfree(private->region); + private->region = NULL; } -static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev, - char __user *buf, - size_t count, - loff_t *ppos) +static ssize_t vfio_ccw_mdev_read_io_region(struct vfio_ccw_private *private, + char __user *buf, size_t count, + loff_t *ppos) { - struct vfio_ccw_private *private; + loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; struct ccw_io_region *region; int ret; - if (*ppos + count > sizeof(*region)) + if (pos + count > sizeof(*region)) return -EINVAL; - private = dev_get_drvdata(mdev_parent_dev(mdev)); mutex_lock(&private->io_mutex); region = private->io_region; - if (copy_to_user(buf, (void *)region + *ppos, count)) + if (copy_to_user(buf, (void *)region + pos, count)) ret = -EFAULT; else ret = count; @@ -185,24 +195,47 @@ static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev, return ret; } -static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev, - const char __user *buf, - size_t count, - loff_t *ppos) +static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev, + char __user *buf, + size_t count, + loff_t *ppos) { + unsigned int index = VFIO_CCW_OFFSET_TO_INDEX(*ppos); struct vfio_ccw_private *private; + + private = dev_get_drvdata(mdev_parent_dev(mdev)); + + if (index >= VFIO_CCW_NUM_REGIONS + private->num_regions) + return -EINVAL; + + switch (index) { + case VFIO_CCW_CONFIG_REGION_INDEX: + return vfio_ccw_mdev_read_io_region(private, buf, count, ppos); + default: + index -= VFIO_CCW_NUM_REGIONS; + return private->region[index].ops->read(private, buf, count, + ppos); + } + + return -EINVAL; +} + +static ssize_t vfio_ccw_mdev_write_io_region(struct vfio_ccw_private *private, + const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; struct ccw_io_region *region; int ret; - if (*ppos + count > sizeof(*region)) + if (pos + count > sizeof(*region)) return -EINVAL; - private = dev_get_drvdata(mdev_parent_dev(mdev)); if (!mutex_trylock(&private->io_mutex)) return -EAGAIN; region = private->io_region; - if (copy_from_user((void *)region + *ppos, buf, count)) { + if (copy_from_user((void *)region + pos, buf, count)) { ret = -EFAULT; goto out_unlock; } @@ -217,19 +250,52 @@ out_unlock: return ret; } -static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info) +static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev, + const char __user *buf, + size_t count, + loff_t *ppos) { + unsigned int index = VFIO_CCW_OFFSET_TO_INDEX(*ppos); + struct vfio_ccw_private *private; + + private = dev_get_drvdata(mdev_parent_dev(mdev)); + + if (index >= VFIO_CCW_NUM_REGIONS + private->num_regions) + return -EINVAL; + + switch (index) { + case VFIO_CCW_CONFIG_REGION_INDEX: + return vfio_ccw_mdev_write_io_region(private, buf, count, ppos); + default: + index -= VFIO_CCW_NUM_REGIONS; + return private->region[index].ops->write(private, buf, count, + ppos); + } + + return -EINVAL; +} + +static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info, + struct mdev_device *mdev) +{ + struct vfio_ccw_private *private; + + private = dev_get_drvdata(mdev_parent_dev(mdev)); info->flags = VFIO_DEVICE_FLAGS_CCW | VFIO_DEVICE_FLAGS_RESET; - info->num_regions = VFIO_CCW_NUM_REGIONS; + info->num_regions = VFIO_CCW_NUM_REGIONS + private->num_regions; info->num_irqs = VFIO_CCW_NUM_IRQS; return 0; } static int vfio_ccw_mdev_get_region_info(struct vfio_region_info *info, - u16 *cap_type_id, - void **cap_type) + struct mdev_device *mdev, + unsigned long arg) { + struct vfio_ccw_private *private; + int i; + + private = dev_get_drvdata(mdev_parent_dev(mdev)); switch (info->index) { case VFIO_CCW_CONFIG_REGION_INDEX: info->offset = 0; @@ -237,9 +303,55 @@ static int vfio_ccw_mdev_get_region_info(struct vfio_region_info *info, info->flags = VFIO_REGION_INFO_FLAG_READ | VFIO_REGION_INFO_FLAG_WRITE; return 0; - default: - return -EINVAL; + default: /* all other regions are handled via capability chain */ + { + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 }; + int ret; + + if (info->index >= + VFIO_CCW_NUM_REGIONS + private->num_regions) + return -EINVAL; + + info->index = array_index_nospec(info->index, + VFIO_CCW_NUM_REGIONS + + private->num_regions); + + i = info->index - VFIO_CCW_NUM_REGIONS; + + info->offset = VFIO_CCW_INDEX_TO_OFFSET(info->index); + info->size = private->region[i].size; + info->flags = private->region[i].flags; + + cap_type.type = private->region[i].type; + cap_type.subtype = private->region[i].subtype; + + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + + info->flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info->argsz < sizeof(*info) + caps.size) { + info->argsz = sizeof(*info) + caps.size; + info->cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(*info)); + if (copy_to_user((void __user *)arg + sizeof(*info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info->cap_offset = sizeof(*info); + } + + kfree(caps.buf); + } + } + return 0; } static int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info) @@ -316,6 +428,32 @@ static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev, } } +int vfio_ccw_register_dev_region(struct vfio_ccw_private *private, + unsigned int subtype, + const struct vfio_ccw_regops *ops, + size_t size, u32 flags, void *data) +{ + struct vfio_ccw_region *region; + + region = krealloc(private->region, + (private->num_regions + 1) * sizeof(*region), + GFP_KERNEL); + if (!region) + return -ENOMEM; + + private->region = region; + private->region[private->num_regions].type = VFIO_REGION_TYPE_CCW; + private->region[private->num_regions].subtype = subtype; + private->region[private->num_regions].ops = ops; + private->region[private->num_regions].size = size; + private->region[private->num_regions].flags = flags; + private->region[private->num_regions].data = data; + + private->num_regions++; + + return 0; +} + static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, unsigned int cmd, unsigned long arg) @@ -336,7 +474,7 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, if (info.argsz < minsz) return -EINVAL; - ret = vfio_ccw_mdev_get_device_info(&info); + ret = vfio_ccw_mdev_get_device_info(&info, mdev); if (ret) return ret; @@ -345,8 +483,6 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, case VFIO_DEVICE_GET_REGION_INFO: { struct vfio_region_info info; - u16 cap_type_id = 0; - void *cap_type = NULL; minsz = offsetofend(struct vfio_region_info, offset); @@ -356,8 +492,7 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, if (info.argsz < minsz) return -EINVAL; - ret = vfio_ccw_mdev_get_region_info(&info, &cap_type_id, - &cap_type); + ret = vfio_ccw_mdev_get_region_info(&info, mdev, arg); if (ret) return ret; diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 32173cbd838d..d888a2573470 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -3,9 +3,11 @@ * Private stuff for vfio_ccw driver * * Copyright IBM Corp. 2017 + * Copyright Red Hat, Inc. 2019 * * Author(s): Dong Jia Shi * Xiao Feng Ren + * Cornelia Huck */ #ifndef _VFIO_CCW_PRIVATE_H_ @@ -19,6 +21,38 @@ #include "css.h" #include "vfio_ccw_cp.h" +#define VFIO_CCW_OFFSET_SHIFT 10 +#define VFIO_CCW_OFFSET_TO_INDEX(off) (off >> VFIO_CCW_OFFSET_SHIFT) +#define VFIO_CCW_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_CCW_OFFSET_SHIFT) +#define VFIO_CCW_OFFSET_MASK (((u64)(1) << VFIO_CCW_OFFSET_SHIFT) - 1) + +/* capability chain handling similar to vfio-pci */ +struct vfio_ccw_private; +struct vfio_ccw_region; + +struct vfio_ccw_regops { + ssize_t (*read)(struct vfio_ccw_private *private, char __user *buf, + size_t count, loff_t *ppos); + ssize_t (*write)(struct vfio_ccw_private *private, + const char __user *buf, size_t count, loff_t *ppos); + void (*release)(struct vfio_ccw_private *private, + struct vfio_ccw_region *region); +}; + +struct vfio_ccw_region { + u32 type; + u32 subtype; + const struct vfio_ccw_regops *ops; + void *data; + size_t size; + u32 flags; +}; + +int vfio_ccw_register_dev_region(struct vfio_ccw_private *private, + unsigned int subtype, + const struct vfio_ccw_regops *ops, + size_t size, u32 flags, void *data); + /** * struct vfio_ccw_private * @sch: pointer to the subchannel @@ -29,6 +63,8 @@ * @nb: notifier for vfio events * @io_region: MMIO region to input/output I/O arguments/results * @io_mutex: protect against concurrent update of I/O regions + * @region: additional regions for other subchannel operations + * @num_regions: number of additional regions * @cp: channel program for the current I/O operation * @irb: irb info received from interrupt * @scsw: scsw info @@ -44,6 +80,8 @@ struct vfio_ccw_private { struct notifier_block nb; struct ccw_io_region *io_region; struct mutex io_mutex; + struct vfio_ccw_region *region; + int num_regions; struct channel_program cp; struct irb irb; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 02bb7ad6e986..56e2413d3e00 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -353,6 +353,8 @@ struct vfio_region_gfx_edid { #define VFIO_DEVICE_GFX_LINK_STATE_DOWN 2 }; +#define VFIO_REGION_TYPE_CCW (2) + /* * 10de vendor sub-type * From b0940857379eb930a4310625c4cd89254e4478b6 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Wed, 4 Apr 2018 13:03:38 +0200 Subject: [PATCH 37/98] s390/cio: export hsch to modules The vfio-ccw code will need this, and it matches treatment of ssch and csch. Reviewed-by: Pierre Morel Reviewed-by: Halil Pasic Signed-off-by: Cornelia Huck --- drivers/s390/cio/ioasm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/s390/cio/ioasm.c b/drivers/s390/cio/ioasm.c index 14d328338ce2..08eb10283b18 100644 --- a/drivers/s390/cio/ioasm.c +++ b/drivers/s390/cio/ioasm.c @@ -233,6 +233,7 @@ int hsch(struct subchannel_id schid) return ccode; } +EXPORT_SYMBOL(hsch); static inline int __xsch(struct subchannel_id schid) { From d5afd5d135c8cc43bd2568361b4c91f0bd488c3f Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 23 Jul 2018 16:03:27 +0200 Subject: [PATCH 38/98] vfio-ccw: add handling for async channel instructions Add a region to the vfio-ccw device that can be used to submit asynchronous I/O instructions. ssch continues to be handled by the existing I/O region; the new region handles hsch and csch. Interrupt status continues to be reported through the same channels as for ssch. Acked-by: Eric Farman Reviewed-by: Farhan Ali Signed-off-by: Cornelia Huck --- drivers/s390/cio/Makefile | 3 +- drivers/s390/cio/vfio_ccw_async.c | 88 ++++++++++++++++++++ drivers/s390/cio/vfio_ccw_drv.c | 46 ++++++++--- drivers/s390/cio/vfio_ccw_fsm.c | 119 +++++++++++++++++++++++++++- drivers/s390/cio/vfio_ccw_ops.c | 13 ++- drivers/s390/cio/vfio_ccw_private.h | 5 ++ include/uapi/linux/vfio.h | 2 + include/uapi/linux/vfio_ccw.h | 12 +++ 8 files changed, 270 insertions(+), 18 deletions(-) create mode 100644 drivers/s390/cio/vfio_ccw_async.c diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile index f230516abb96..f6a8db04177c 100644 --- a/drivers/s390/cio/Makefile +++ b/drivers/s390/cio/Makefile @@ -20,5 +20,6 @@ obj-$(CONFIG_CCWGROUP) += ccwgroup.o qdio-objs := qdio_main.o qdio_thinint.o qdio_debug.o qdio_setup.o obj-$(CONFIG_QDIO) += qdio.o -vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o vfio_ccw_fsm.o +vfio_ccw-objs += vfio_ccw_drv.o vfio_ccw_cp.o vfio_ccw_ops.o vfio_ccw_fsm.o \ + vfio_ccw_async.o obj-$(CONFIG_VFIO_CCW) += vfio_ccw.o diff --git a/drivers/s390/cio/vfio_ccw_async.c b/drivers/s390/cio/vfio_ccw_async.c new file mode 100644 index 000000000000..8c1d2357ef5b --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_async.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Async I/O region for vfio_ccw + * + * Copyright Red Hat, Inc. 2019 + * + * Author(s): Cornelia Huck + */ + +#include +#include + +#include "vfio_ccw_private.h" + +static ssize_t vfio_ccw_async_region_read(struct vfio_ccw_private *private, + char __user *buf, size_t count, + loff_t *ppos) +{ + unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; + loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; + struct ccw_cmd_region *region; + int ret; + + if (pos + count > sizeof(*region)) + return -EINVAL; + + mutex_lock(&private->io_mutex); + region = private->region[i].data; + if (copy_to_user(buf, (void *)region + pos, count)) + ret = -EFAULT; + else + ret = count; + mutex_unlock(&private->io_mutex); + return ret; +} + +static ssize_t vfio_ccw_async_region_write(struct vfio_ccw_private *private, + const char __user *buf, size_t count, + loff_t *ppos) +{ + unsigned int i = VFIO_CCW_OFFSET_TO_INDEX(*ppos) - VFIO_CCW_NUM_REGIONS; + loff_t pos = *ppos & VFIO_CCW_OFFSET_MASK; + struct ccw_cmd_region *region; + int ret; + + if (pos + count > sizeof(*region)) + return -EINVAL; + + if (!mutex_trylock(&private->io_mutex)) + return -EAGAIN; + + region = private->region[i].data; + if (copy_from_user((void *)region + pos, buf, count)) { + ret = -EFAULT; + goto out_unlock; + } + + vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_ASYNC_REQ); + + ret = region->ret_code ? region->ret_code : count; + +out_unlock: + mutex_unlock(&private->io_mutex); + return ret; +} + +static void vfio_ccw_async_region_release(struct vfio_ccw_private *private, + struct vfio_ccw_region *region) +{ + +} + +const struct vfio_ccw_regops vfio_ccw_async_region_ops = { + .read = vfio_ccw_async_region_read, + .write = vfio_ccw_async_region_write, + .release = vfio_ccw_async_region_release, +}; + +int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private) +{ + return vfio_ccw_register_dev_region(private, + VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD, + &vfio_ccw_async_region_ops, + sizeof(struct ccw_cmd_region), + VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE, + private->cmd_region); +} diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 5ea0da1dd954..c39d01943a6a 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -3,9 +3,11 @@ * VFIO based Physical Subchannel device driver * * Copyright IBM Corp. 2017 + * Copyright Red Hat, Inc. 2019 * * Author(s): Dong Jia Shi * Xiao Feng Ren + * Cornelia Huck */ #include @@ -23,6 +25,7 @@ struct workqueue_struct *vfio_ccw_work_q; static struct kmem_cache *vfio_ccw_io_region; +static struct kmem_cache *vfio_ccw_cmd_region; /* * Helpers @@ -110,7 +113,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) { struct pmcw *pmcw = &sch->schib.pmcw; struct vfio_ccw_private *private; - int ret; + int ret = -ENOMEM; if (pmcw->qf) { dev_warn(&sch->dev, "vfio: ccw: does not support QDIO: %s\n", @@ -124,10 +127,13 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, GFP_KERNEL | GFP_DMA); - if (!private->io_region) { - kfree(private); - return -ENOMEM; - } + if (!private->io_region) + goto out_free; + + private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, + GFP_KERNEL | GFP_DMA); + if (!private->cmd_region) + goto out_free; private->sch = sch; dev_set_drvdata(&sch->dev, private); @@ -155,7 +161,10 @@ out_disable: cio_disable_subchannel(sch); out_free: dev_set_drvdata(&sch->dev, NULL); - kmem_cache_free(vfio_ccw_io_region, private->io_region); + if (private->cmd_region) + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); + if (private->io_region) + kmem_cache_free(vfio_ccw_io_region, private->io_region); kfree(private); return ret; } @@ -170,6 +179,7 @@ static int vfio_ccw_sch_remove(struct subchannel *sch) dev_set_drvdata(&sch->dev, NULL); + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); kmem_cache_free(vfio_ccw_io_region, private->io_region); kfree(private); @@ -244,7 +254,7 @@ static struct css_driver vfio_ccw_sch_driver = { static int __init vfio_ccw_sch_init(void) { - int ret; + int ret = -ENOMEM; vfio_ccw_work_q = create_singlethread_workqueue("vfio-ccw"); if (!vfio_ccw_work_q) @@ -254,20 +264,30 @@ static int __init vfio_ccw_sch_init(void) sizeof(struct ccw_io_region), 0, SLAB_ACCOUNT, 0, sizeof(struct ccw_io_region), NULL); - if (!vfio_ccw_io_region) { - destroy_workqueue(vfio_ccw_work_q); - return -ENOMEM; - } + if (!vfio_ccw_io_region) + goto out_err; + + vfio_ccw_cmd_region = kmem_cache_create_usercopy("vfio_ccw_cmd_region", + sizeof(struct ccw_cmd_region), 0, + SLAB_ACCOUNT, 0, + sizeof(struct ccw_cmd_region), NULL); + if (!vfio_ccw_cmd_region) + goto out_err; isc_register(VFIO_CCW_ISC); ret = css_driver_register(&vfio_ccw_sch_driver); if (ret) { isc_unregister(VFIO_CCW_ISC); - kmem_cache_destroy(vfio_ccw_io_region); - destroy_workqueue(vfio_ccw_work_q); + goto out_err; } return ret; + +out_err: + kmem_cache_destroy(vfio_ccw_cmd_region); + kmem_cache_destroy(vfio_ccw_io_region); + destroy_workqueue(vfio_ccw_work_q); + return ret; } static void __exit vfio_ccw_sch_exit(void) diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c index b4a141fbd1a8..49d9d3da0282 100644 --- a/drivers/s390/cio/vfio_ccw_fsm.c +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -3,8 +3,10 @@ * Finite state machine for vfio-ccw device handling * * Copyright IBM Corp. 2017 + * Copyright Red Hat, Inc. 2019 * * Author(s): Dong Jia Shi + * Cornelia Huck */ #include @@ -73,6 +75,75 @@ out: return ret; } +static int fsm_do_halt(struct vfio_ccw_private *private) +{ + struct subchannel *sch; + unsigned long flags; + int ccode; + int ret; + + sch = private->sch; + + spin_lock_irqsave(sch->lock, flags); + + /* Issue "Halt Subchannel" */ + ccode = hsch(sch->schid); + + switch (ccode) { + case 0: + /* + * Initialize device status information + */ + sch->schib.scsw.cmd.actl |= SCSW_ACTL_HALT_PEND; + ret = 0; + break; + case 1: /* Status pending */ + case 2: /* Busy */ + ret = -EBUSY; + break; + case 3: /* Device not operational */ + ret = -ENODEV; + break; + default: + ret = ccode; + } + spin_unlock_irqrestore(sch->lock, flags); + return ret; +} + +static int fsm_do_clear(struct vfio_ccw_private *private) +{ + struct subchannel *sch; + unsigned long flags; + int ccode; + int ret; + + sch = private->sch; + + spin_lock_irqsave(sch->lock, flags); + + /* Issue "Clear Subchannel" */ + ccode = csch(sch->schid); + + switch (ccode) { + case 0: + /* + * Initialize device status information + */ + sch->schib.scsw.cmd.actl = SCSW_ACTL_CLEAR_PEND; + /* TODO: check what else we might need to clear */ + ret = 0; + break; + case 3: /* Device not operational */ + ret = -ENODEV; + break; + default: + ret = ccode; + } + spin_unlock_irqrestore(sch->lock, flags); + return ret; +} + static void fsm_notoper(struct vfio_ccw_private *private, enum vfio_ccw_event event) { @@ -113,6 +184,24 @@ static void fsm_io_retry(struct vfio_ccw_private *private, private->io_region->ret_code = -EAGAIN; } +static void fsm_async_error(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + struct ccw_cmd_region *cmd_region = private->cmd_region; + + pr_err("vfio-ccw: FSM: %s request from state:%d\n", + cmd_region->command == VFIO_CCW_ASYNC_CMD_HSCH ? "halt" : + cmd_region->command == VFIO_CCW_ASYNC_CMD_CSCH ? "clear" : + "", private->state); + cmd_region->ret_code = -EIO; +} + +static void fsm_async_retry(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + private->cmd_region->ret_code = -EAGAIN; +} + static void fsm_disabled_irq(struct vfio_ccw_private *private, enum vfio_ccw_event event) { @@ -176,11 +265,11 @@ static void fsm_io_request(struct vfio_ccw_private *private, } return; } else if (scsw->cmd.fctl & SCSW_FCTL_HALT_FUNC) { - /* XXX: Handle halt. */ + /* halt is handled via the async cmd region */ io_region->ret_code = -EOPNOTSUPP; goto err_out; } else if (scsw->cmd.fctl & SCSW_FCTL_CLEAR_FUNC) { - /* XXX: Handle clear. */ + /* clear is handled via the async cmd region */ io_region->ret_code = -EOPNOTSUPP; goto err_out; } @@ -190,6 +279,27 @@ err_out: io_region->ret_code, errstr); } +/* + * Deal with an async request from userspace. + */ +static void fsm_async_request(struct vfio_ccw_private *private, + enum vfio_ccw_event event) +{ + struct ccw_cmd_region *cmd_region = private->cmd_region; + + switch (cmd_region->command) { + case VFIO_CCW_ASYNC_CMD_HSCH: + cmd_region->ret_code = fsm_do_halt(private); + break; + case VFIO_CCW_ASYNC_CMD_CSCH: + cmd_region->ret_code = fsm_do_clear(private); + break; + default: + /* should not happen? */ + cmd_region->ret_code = -EINVAL; + } +} + /* * Got an interrupt for a normal io (state busy). */ @@ -213,26 +323,31 @@ fsm_func_t *vfio_ccw_jumptable[NR_VFIO_CCW_STATES][NR_VFIO_CCW_EVENTS] = { [VFIO_CCW_STATE_NOT_OPER] = { [VFIO_CCW_EVENT_NOT_OPER] = fsm_nop, [VFIO_CCW_EVENT_IO_REQ] = fsm_io_error, + [VFIO_CCW_EVENT_ASYNC_REQ] = fsm_async_error, [VFIO_CCW_EVENT_INTERRUPT] = fsm_disabled_irq, }, [VFIO_CCW_STATE_STANDBY] = { [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, [VFIO_CCW_EVENT_IO_REQ] = fsm_io_error, + [VFIO_CCW_EVENT_ASYNC_REQ] = fsm_async_error, [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, }, [VFIO_CCW_STATE_IDLE] = { [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, [VFIO_CCW_EVENT_IO_REQ] = fsm_io_request, + [VFIO_CCW_EVENT_ASYNC_REQ] = fsm_async_request, [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, }, [VFIO_CCW_STATE_CP_PROCESSING] = { [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, [VFIO_CCW_EVENT_IO_REQ] = fsm_io_retry, + [VFIO_CCW_EVENT_ASYNC_REQ] = fsm_async_retry, [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, }, [VFIO_CCW_STATE_CP_PENDING] = { [VFIO_CCW_EVENT_NOT_OPER] = fsm_notoper, [VFIO_CCW_EVENT_IO_REQ] = fsm_io_busy, + [VFIO_CCW_EVENT_ASYNC_REQ] = fsm_async_request, [VFIO_CCW_EVENT_INTERRUPT] = fsm_irq, }, }; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 5b989faa011f..3e2802a4a43d 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -150,11 +150,20 @@ static int vfio_ccw_mdev_open(struct mdev_device *mdev) struct vfio_ccw_private *private = dev_get_drvdata(mdev_parent_dev(mdev)); unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; + int ret; private->nb.notifier_call = vfio_ccw_mdev_notifier; - return vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, - &events, &private->nb); + ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &events, &private->nb); + if (ret) + return ret; + + ret = vfio_ccw_register_async_dev_regions(private); + if (ret) + vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + &private->nb); + return ret; } static void vfio_ccw_mdev_release(struct mdev_device *mdev) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index d888a2573470..f1092c3dc1b1 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -53,6 +53,8 @@ int vfio_ccw_register_dev_region(struct vfio_ccw_private *private, const struct vfio_ccw_regops *ops, size_t size, u32 flags, void *data); +int vfio_ccw_register_async_dev_regions(struct vfio_ccw_private *private); + /** * struct vfio_ccw_private * @sch: pointer to the subchannel @@ -64,6 +66,7 @@ int vfio_ccw_register_dev_region(struct vfio_ccw_private *private, * @io_region: MMIO region to input/output I/O arguments/results * @io_mutex: protect against concurrent update of I/O regions * @region: additional regions for other subchannel operations + * @cmd_region: MMIO region for asynchronous I/O commands other than START * @num_regions: number of additional regions * @cp: channel program for the current I/O operation * @irb: irb info received from interrupt @@ -81,6 +84,7 @@ struct vfio_ccw_private { struct ccw_io_region *io_region; struct mutex io_mutex; struct vfio_ccw_region *region; + struct ccw_cmd_region *cmd_region; int num_regions; struct channel_program cp; @@ -116,6 +120,7 @@ enum vfio_ccw_event { VFIO_CCW_EVENT_NOT_OPER, VFIO_CCW_EVENT_IO_REQ, VFIO_CCW_EVENT_INTERRUPT, + VFIO_CCW_EVENT_ASYNC_REQ, /* last element! */ NR_VFIO_CCW_EVENTS }; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 56e2413d3e00..8f10748dac79 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -354,6 +354,8 @@ struct vfio_region_gfx_edid { }; #define VFIO_REGION_TYPE_CCW (2) +/* ccw sub-types */ +#define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) /* * 10de vendor sub-type diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h index 2ec5f367ff78..cbecbf0cd54f 100644 --- a/include/uapi/linux/vfio_ccw.h +++ b/include/uapi/linux/vfio_ccw.h @@ -12,6 +12,7 @@ #include +/* used for START SUBCHANNEL, always present */ struct ccw_io_region { #define ORB_AREA_SIZE 12 __u8 orb_area[ORB_AREA_SIZE]; @@ -22,4 +23,15 @@ struct ccw_io_region { __u32 ret_code; } __packed; +/* + * used for processing commands that trigger asynchronous actions + * Note: this is controlled by a capability + */ +#define VFIO_CCW_ASYNC_CMD_HSCH (1 << 0) +#define VFIO_CCW_ASYNC_CMD_CSCH (1 << 1) +struct ccw_cmd_region { + __u32 command; + __u32 ret_code; +} __packed; + #endif From cea5dde42a83b5f0a039da672f8686455936b8d8 Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Mon, 8 Apr 2019 17:05:31 -0400 Subject: [PATCH 39/98] vfio-ccw: Do not call flush_workqueue while holding the spinlock Currently we call flush_workqueue while holding the subchannel spinlock. But flush_workqueue function can go to sleep, so do not call the function while holding the spinlock. Fixes the following bug: [ 285.203430] BUG: scheduling while atomic: bash/14193/0x00000002 [ 285.203434] INFO: lockdep is turned off. .... [ 285.203485] Preemption disabled at: [ 285.203488] [<000003ff80243e5c>] vfio_ccw_sch_quiesce+0xbc/0x120 [vfio_ccw] [ 285.203496] CPU: 7 PID: 14193 Comm: bash Tainted: G W .... [ 285.203504] Call Trace: [ 285.203510] ([<0000000000113772>] show_stack+0x82/0xd0) [ 285.203514] [<0000000000b7a102>] dump_stack+0x92/0xd0 [ 285.203518] [<000000000017b8be>] __schedule_bug+0xde/0xf8 [ 285.203524] [<0000000000b95b5a>] __schedule+0x7a/0xc38 [ 285.203528] [<0000000000b9678a>] schedule+0x72/0xb0 [ 285.203533] [<0000000000b9bfbc>] schedule_timeout+0x34/0x528 [ 285.203538] [<0000000000b97608>] wait_for_common+0x118/0x1b0 [ 285.203544] [<0000000000166d6a>] flush_workqueue+0x182/0x548 [ 285.203550] [<000003ff80243e6e>] vfio_ccw_sch_quiesce+0xce/0x120 [vfio_ccw] [ 285.203556] [<000003ff80245278>] vfio_ccw_mdev_reset+0x38/0x70 [vfio_ccw] [ 285.203562] [<000003ff802458b0>] vfio_ccw_mdev_remove+0x40/0x78 [vfio_ccw] [ 285.203567] [<000003ff801a499c>] mdev_device_remove_ops+0x3c/0x80 [mdev] [ 285.203573] [<000003ff801a4d5c>] mdev_device_remove+0xc4/0x130 [mdev] [ 285.203578] [<000003ff801a5074>] remove_store+0x6c/0xa8 [mdev] [ 285.203582] [<000000000046f494>] kernfs_fop_write+0x14c/0x1f8 [ 285.203588] [<00000000003c1530>] __vfs_write+0x38/0x1a8 [ 285.203593] [<00000000003c187c>] vfs_write+0xb4/0x198 [ 285.203597] [<00000000003c1af2>] ksys_write+0x5a/0xb0 [ 285.203601] [<0000000000b9e270>] system_call+0xdc/0x2d8 Signed-off-by: Farhan Ali Reviewed-by: Eric Farman Reviewed-by: Pierre Morel Message-Id: <626bab8bb2958ae132452e1ddaf1b20882ad5a9d.1554756534.git.alifm@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index c39d01943a6a..d72aa8c760c5 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -57,9 +57,9 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch) wait_for_completion_timeout(&completion, 3*HZ); - spin_lock_irq(sch->lock); private->completion = NULL; flush_workqueue(vfio_ccw_work_q); + spin_lock_irq(sch->lock); ret = cio_cancel_halt_clear(sch, &iretry); }; From b49bdc8602b7c9c7a977758bee4125683f73e59f Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Mon, 8 Apr 2019 17:05:33 -0400 Subject: [PATCH 40/98] vfio-ccw: Release any channel program when releasing/removing vfio-ccw mdev When releasing the vfio-ccw mdev, we currently do not release any existing channel program and its pinned pages. This can lead to the following warning: [1038876.561565] WARNING: CPU: 2 PID: 144727 at drivers/vfio/vfio_iommu_type1.c:1494 vfio_sanity_check_pfn_list+0x40/0x70 [vfio_iommu_type1] .... 1038876.561921] Call Trace: [1038876.561935] ([<00000009897fb870>] 0x9897fb870) [1038876.561949] [<000003ff8013bf62>] vfio_iommu_type1_detach_group+0xda/0x2f0 [vfio_iommu_type1] [1038876.561965] [<000003ff8007b634>] __vfio_group_unset_container+0x64/0x190 [vfio] [1038876.561978] [<000003ff8007b87e>] vfio_group_put_external_user+0x26/0x38 [vfio] [1038876.562024] [<000003ff806fc608>] kvm_vfio_group_put_external_user+0x40/0x60 [kvm] [1038876.562045] [<000003ff806fcb9e>] kvm_vfio_destroy+0x5e/0xd0 [kvm] [1038876.562065] [<000003ff806f63fc>] kvm_put_kvm+0x2a4/0x3d0 [kvm] [1038876.562083] [<000003ff806f655e>] kvm_vm_release+0x36/0x48 [kvm] [1038876.562098] [<00000000003c2dc4>] __fput+0x144/0x228 [1038876.562113] [<000000000016ee82>] task_work_run+0x8a/0xd8 [1038876.562125] [<000000000014c7a8>] do_exit+0x5d8/0xd90 [1038876.562140] [<000000000014d084>] do_group_exit+0xc4/0xc8 [1038876.562155] [<000000000015c046>] get_signal+0x9ae/0xa68 [1038876.562169] [<0000000000108d66>] do_signal+0x66/0x768 [1038876.562185] [<0000000000b9e37e>] system_call+0x1ea/0x2d8 [1038876.562195] 2 locks held by qemu-system-s39/144727: [1038876.562205] #0: 00000000537abaf9 (&container->group_lock){++++}, at: __vfio_group_unset_container+0x3c/0x190 [vfio] [1038876.562230] #1: 00000000670008b5 (&iommu->lock){+.+.}, at: vfio_iommu_type1_detach_group+0x36/0x2f0 [vfio_iommu_type1] [1038876.562250] Last Breaking-Event-Address: [1038876.562262] [<000003ff8013aa24>] vfio_sanity_check_pfn_list+0x3c/0x70 [vfio_iommu_type1] [1038876.562272] irq event stamp: 4236481 [1038876.562287] hardirqs last enabled at (4236489): [<00000000001cee7a>] console_unlock+0x6d2/0x740 [1038876.562299] hardirqs last disabled at (4236496): [<00000000001ce87e>] console_unlock+0xd6/0x740 [1038876.562311] softirqs last enabled at (4234162): [<0000000000b9fa1e>] __do_softirq+0x556/0x598 [1038876.562325] softirqs last disabled at (4234153): [<000000000014e4cc>] irq_exit+0xac/0x108 [1038876.562337] ---[ end trace 6c96d467b1c3ca06 ]--- Similarly we do not free the channel program when we are removing the vfio-ccw device. Let's fix this by resetting the device and freeing the channel program and pinned pages in the release path. For the remove path we can just quiesce the device, since in the remove path the mediated device is going away for good and so we don't need to do a full reset. Signed-off-by: Farhan Ali Message-Id: Acked-by: Eric Farman Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_ops.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 3e2802a4a43d..5eb61116ca6f 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -134,11 +134,12 @@ static int vfio_ccw_mdev_remove(struct mdev_device *mdev) if ((private->state != VFIO_CCW_STATE_NOT_OPER) && (private->state != VFIO_CCW_STATE_STANDBY)) { - if (!vfio_ccw_mdev_reset(mdev)) + if (!vfio_ccw_sch_quiesce(private->sch)) private->state = VFIO_CCW_STATE_STANDBY; /* The state will be NOT_OPER on error. */ } + cp_free(&private->cp); private->mdev = NULL; atomic_inc(&private->avail); @@ -172,6 +173,14 @@ static void vfio_ccw_mdev_release(struct mdev_device *mdev) dev_get_drvdata(mdev_parent_dev(mdev)); int i; + if ((private->state != VFIO_CCW_STATE_NOT_OPER) && + (private->state != VFIO_CCW_STATE_STANDBY)) { + if (!vfio_ccw_mdev_reset(mdev)) + private->state = VFIO_CCW_STATE_STANDBY; + /* The state will be NOT_OPER on error. */ + } + + cp_free(&private->cp); vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &private->nb); From d1ffa760d22aa1d8190478e5ef555c59a771db27 Mon Sep 17 00:00:00 2001 From: Farhan Ali Date: Tue, 16 Apr 2019 17:23:14 -0400 Subject: [PATCH 41/98] vfio-ccw: Prevent quiesce function going into an infinite loop The quiesce function calls cio_cancel_halt_clear() and if we get an -EBUSY we go into a loop where we: - wait for any interrupts - flush all I/O in the workqueue - retry cio_cancel_halt_clear During the period where we are waiting for interrupts or flushing all I/O, the channel subsystem could have completed a halt/clear action and turned off the corresponding activity control bits in the subchannel status word. This means the next time we call cio_cancel_halt_clear(), we will again start by calling cancel subchannel and so we can be stuck between calling cancel and halt forever. Rather than calling cio_cancel_halt_clear() immediately after waiting, let's try to disable the subchannel. If we succeed in disabling the subchannel then we know nothing else can happen with the device. Suggested-by: Eric Farman Signed-off-by: Farhan Ali Message-Id: <4d5a4b98ab1b41ac6131b5c36de18b76c5d66898.1555449329.git.alifm@linux.ibm.com> Reviewed-by: Eric Farman Acked-by: Halil Pasic Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_drv.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index d72aa8c760c5..ee8767f5845a 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -43,26 +43,30 @@ int vfio_ccw_sch_quiesce(struct subchannel *sch) if (ret != -EBUSY) goto out_unlock; + iretry = 255; do { - iretry = 255; ret = cio_cancel_halt_clear(sch, &iretry); - while (ret == -EBUSY) { - /* - * Flush all I/O and wait for - * cancel/halt/clear completion. - */ - private->completion = &completion; - spin_unlock_irq(sch->lock); + if (ret == -EIO) { + pr_err("vfio_ccw: could not quiesce subchannel 0.%x.%04x!\n", + sch->schid.ssid, sch->schid.sch_no); + break; + } + + /* + * Flush all I/O and wait for + * cancel/halt/clear completion. + */ + private->completion = &completion; + spin_unlock_irq(sch->lock); + + if (ret == -EBUSY) wait_for_completion_timeout(&completion, 3*HZ); - private->completion = NULL; - flush_workqueue(vfio_ccw_work_q); - spin_lock_irq(sch->lock); - ret = cio_cancel_halt_clear(sch, &iretry); - }; - + private->completion = NULL; + flush_workqueue(vfio_ccw_work_q); + spin_lock_irq(sch->lock); ret = cio_disable_subchannel(sch); } while (ret == -EBUSY); out_unlock: From 23d1aee92b42587e696ab48a9da774819ecd3718 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Thu, 11 Apr 2019 15:47:34 +0200 Subject: [PATCH 42/98] s390/crypto: rework generate_entropy function for pseudo random dd Here is a rework of the generate_entropy function of the pseudo random device driver exploiting the prno CPACF instruction. George Spelvin pointed out some issues with the existing implementation. One point was, that the buffer used to store the stckf values is 2 pages which are initially filled with get_random_bytes() for each 64 byte junk produced by the function. Another point was that the stckf values only carry entropy in the LSB and thus a buffer of 2 pages is not really needed. Then there was a comment about the use of the kimd cpacf function without proper initialization. The rework addresses these points and now one page is used and only one half of this is filled with get_random_bytes() on each chunk of 64 bytes requested data. The other half of the page is filled with stckf values exored into with an overlap of 4 bytes. This can be done due to the fact that only the lower 4 bytes carry entropy we need. For more details about the algorithm used, see the header of the function. The generate_entropy() function now uses the cpacf function klmd with proper initialization of the parameter block to perform the sha512 hash. George also pointed out some issues with the internal buffers used for seeding and reads. These buffers are now zeroed with memzero_implicit after use. Signed-off-by: Harald Freudenberger Reported-by: George Spelvin Suggested-by: George Spelvin Reviewed-by: Patrick Steuer Signed-off-by: Martin Schwidefsky --- arch/s390/crypto/prng.c | 84 ++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 27 deletions(-) diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index a97a1802cfb4..fba37906045b 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -115,46 +115,68 @@ static const u8 initial_parm_block[32] __initconst = { /* * generate_entropy: - * This algorithm produces 64 bytes of entropy data based on 1024 - * individual stckf() invocations assuming that each stckf() value - * contributes 0.25 bits of entropy. So the caller gets 256 bit - * entropy per 64 byte or 4 bits entropy per byte. + * This function fills a given buffer with random bytes. The entropy within + * the random bytes given back is assumed to have at least 50% - meaning + * a 64 bytes buffer has at least 64 * 8 / 2 = 256 bits of entropy. + * Within the function the entropy generation is done in junks of 64 bytes. + * So the caller should also ask for buffer fill in multiples of 64 bytes. + * The generation of the entropy is based on the assumption that every stckf() + * invocation produces 0.5 bits of entropy. To accumulate 256 bits of entropy + * at least 512 stckf() values are needed. The entropy relevant part of the + * stckf value is bit 51 (counting starts at the left with bit nr 0) so + * here we use the lower 4 bytes and exor the values into 2k of bufferspace. + * To be on the save side, if there is ever a problem with stckf() the + * other half of the page buffer is filled with bytes from urandom via + * get_random_bytes(), so this function consumes 2k of urandom for each + * requested 64 bytes output data. Finally the buffer page is condensed into + * a 64 byte value by hashing with a SHA512 hash. */ static int generate_entropy(u8 *ebuf, size_t nbytes) { int n, ret = 0; - u8 *pg, *h, hash[64]; + u8 *pg, pblock[80] = { + /* 8 x 64 bit init values */ + 0x6A, 0x09, 0xE6, 0x67, 0xF3, 0xBC, 0xC9, 0x08, + 0xBB, 0x67, 0xAE, 0x85, 0x84, 0xCA, 0xA7, 0x3B, + 0x3C, 0x6E, 0xF3, 0x72, 0xFE, 0x94, 0xF8, 0x2B, + 0xA5, 0x4F, 0xF5, 0x3A, 0x5F, 0x1D, 0x36, 0xF1, + 0x51, 0x0E, 0x52, 0x7F, 0xAD, 0xE6, 0x82, 0xD1, + 0x9B, 0x05, 0x68, 0x8C, 0x2B, 0x3E, 0x6C, 0x1F, + 0x1F, 0x83, 0xD9, 0xAB, 0xFB, 0x41, 0xBD, 0x6B, + 0x5B, 0xE0, 0xCD, 0x19, 0x13, 0x7E, 0x21, 0x79, + /* 128 bit counter total message bit length */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 }; - /* allocate 2 pages */ - pg = (u8 *) __get_free_pages(GFP_KERNEL, 1); + /* allocate one page stckf buffer */ + pg = (u8 *) __get_free_page(GFP_KERNEL); if (!pg) { prng_errorflag = PRNG_GEN_ENTROPY_FAILED; return -ENOMEM; } + /* fill the ebuf in chunks of 64 byte each */ while (nbytes) { - /* fill pages with urandom bytes */ - get_random_bytes(pg, 2*PAGE_SIZE); - /* exor pages with 1024 stckf values */ - for (n = 0; n < 2 * PAGE_SIZE / sizeof(u64); n++) { - u64 *p = ((u64 *)pg) + n; + /* fill lower 2k with urandom bytes */ + get_random_bytes(pg, PAGE_SIZE / 2); + /* exor upper 2k with 512 stckf values, offset 4 bytes each */ + for (n = 0; n < 512; n++) { + int offset = (PAGE_SIZE / 2) + (n * 4) - 4; + u64 *p = (u64 *)(pg + offset); *p ^= get_tod_clock_fast(); } - n = (nbytes < sizeof(hash)) ? nbytes : sizeof(hash); - if (n < sizeof(hash)) - h = hash; - else - h = ebuf; - /* hash over the filled pages */ - cpacf_kimd(CPACF_KIMD_SHA_512, h, pg, 2*PAGE_SIZE); - if (n < sizeof(hash)) - memcpy(ebuf, hash, n); + /* hash over the filled page */ + cpacf_klmd(CPACF_KLMD_SHA_512, pblock, pg, PAGE_SIZE); + n = (nbytes < 64) ? nbytes : 64; + memcpy(ebuf, pblock, n); ret += n; ebuf += n; nbytes -= n; } - free_pages((unsigned long)pg, 1); + memzero_explicit(pblock, sizeof(pblock)); + memzero_explicit(pg, PAGE_SIZE); + free_page((unsigned long)pg); return ret; } @@ -345,7 +367,7 @@ static int __init prng_sha512_selftest(void) static int __init prng_sha512_instantiate(void) { int ret, datalen; - u8 seed[64 + 32 + 16]; + u8 seed[128 + 16]; pr_debug("prng runs in SHA-512 mode " "with chunksize=%d and reseed_limit=%u\n", @@ -368,16 +390,22 @@ static int __init prng_sha512_instantiate(void) if (ret) goto outfree; - /* generate initial seed bytestring, with 256 + 128 bits entropy */ - ret = generate_entropy(seed, 64 + 32); - if (ret != 64 + 32) + /* + * generate initial seed bytestring, we need at least + * 256 + 128 bits entropy. However, the generate_entropy() + * function anyway works in 64 byte junks so we pull + * 2*64 bytes here. + */ + ret = generate_entropy(seed, 128); + if (ret != 128) goto outfree; /* followed by 16 bytes of unique nonce */ - get_tod_clock_ext(seed + 64 + 32); + get_tod_clock_ext(seed + 128); /* initial seed of the prno drng */ cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, &prng_data->prnows, NULL, 0, seed, sizeof(seed)); + memzero_explicit(seed, sizeof(seed)); /* if fips mode is enabled, generate a first block of random bytes for the FIPS 140-2 Conditional Self Test */ @@ -416,6 +444,7 @@ static int prng_sha512_reseed(void) /* do a reseed of the prno drng with this bytestring */ cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, &prng_data->prnows, NULL, 0, seed, sizeof(seed)); + memzero_explicit(seed, sizeof(seed)); return 0; } @@ -592,6 +621,7 @@ static ssize_t prng_sha512_read(struct file *file, char __user *ubuf, ret = -EFAULT; break; } + memzero_explicit(p, n); ubuf += n; nbytes -= n; ret += n; From 769f020b6c9283d61c59de3559375ec7e961a424 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Tue, 16 Apr 2019 13:41:26 +0200 Subject: [PATCH 43/98] s390/crypto: use TRNG for seeding/reseeding With the z14 machine there came also a CPACF hardware extension which provides a True Random Number Generator. This TRNG can be accessed with a new subfunction code within the CPACF prno instruction and provides random data with very high entropy. So if there is a TRNG available, let's use it for initial seeding and reseeding instead of the current implementation which tries to generate entropy based on stckf (store clock fast) jitters. For details about the amount of data needed and pulled for seeding and reseeding there can be explaining comments in the code found. Signed-off-by: Harald Freudenberger Signed-off-by: Martin Schwidefsky --- arch/s390/crypto/prng.c | 67 +++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 20 deletions(-) diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index fba37906045b..12cca467af7d 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -61,6 +61,7 @@ static unsigned int prng_reseed_limit; module_param_named(reseed_limit, prng_reseed_limit, int, 0); MODULE_PARM_DESC(prng_reseed_limit, "PRNG reseed limit"); +static bool trng_available; /* * Any one who considers arithmetical methods of producing random digits is, @@ -366,7 +367,7 @@ static int __init prng_sha512_selftest(void) static int __init prng_sha512_instantiate(void) { - int ret, datalen; + int ret, datalen, seedlen; u8 seed[128 + 16]; pr_debug("prng runs in SHA-512 mode " @@ -390,21 +391,35 @@ static int __init prng_sha512_instantiate(void) if (ret) goto outfree; - /* - * generate initial seed bytestring, we need at least - * 256 + 128 bits entropy. However, the generate_entropy() - * function anyway works in 64 byte junks so we pull - * 2*64 bytes here. - */ - ret = generate_entropy(seed, 128); - if (ret != 128) - goto outfree; - /* followed by 16 bytes of unique nonce */ - get_tod_clock_ext(seed + 128); + /* generate initial seed, we need at least 256 + 128 bits entropy. */ + if (trng_available) { + /* + * Trng available, so use it. The trng works in chunks of + * 32 bytes and produces 100% entropy. So we pull 64 bytes + * which gives us 512 bits entropy. + */ + seedlen = 2 * 32; + cpacf_trng(NULL, 0, seed, seedlen); + } else { + /* + * No trng available, so use the generate_entropy() function. + * This function works in 64 byte junks and produces + * 50% entropy. So we pull 2*64 bytes which gives us 512 bits + * of entropy. + */ + seedlen = 2 * 64; + ret = generate_entropy(seed, seedlen); + if (ret != seedlen) + goto outfree; + } - /* initial seed of the prno drng */ + /* append the seed by 16 bytes of unique nonce */ + get_tod_clock_ext(seed + seedlen); + seedlen += 16; + + /* now initial seed of the prno drng */ cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, - &prng_data->prnows, NULL, 0, seed, sizeof(seed)); + &prng_data->prnows, NULL, 0, seed, seedlen); memzero_explicit(seed, sizeof(seed)); /* if fips mode is enabled, generate a first block of random @@ -433,17 +448,25 @@ static void prng_sha512_deinstantiate(void) static int prng_sha512_reseed(void) { - int ret; + int ret, seedlen; u8 seed[64]; - /* fetch 256 bits of fresh entropy */ - ret = generate_entropy(seed, sizeof(seed)); - if (ret != sizeof(seed)) - return ret; + /* We need at least 256 bits of fresh entropy for reseeding */ + if (trng_available) { + /* trng produces 256 bits entropy in 32 bytes */ + seedlen = 32; + cpacf_trng(NULL, 0, seed, seedlen); + } else { + /* generate_entropy() produces 256 bits entropy in 64 bytes */ + seedlen = 64; + ret = generate_entropy(seed, seedlen); + if (ret != sizeof(seed)) + return ret; + } /* do a reseed of the prno drng with this bytestring */ cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, - &prng_data->prnows, NULL, 0, seed, sizeof(seed)); + &prng_data->prnows, NULL, 0, seed, seedlen); memzero_explicit(seed, sizeof(seed)); return 0; @@ -803,6 +826,10 @@ static int __init prng_init(void) if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) return -EOPNOTSUPP; + /* check if TRNG subfunction is available */ + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + trng_available = true; + /* choose prng mode */ if (prng_mode != PRNG_MODE_TDES) { /* check for MSA5 support for PRNO operations */ From a8fd61688dfad6fdce95fa64cacd8a66595697b8 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 5 Feb 2019 16:15:01 +0100 Subject: [PATCH 44/98] s390: report new CPU capabilities Add hardware capability bits and features tags to /proc/cpuinfo for 4 new CPU features: "Vector-Enhancements Facility 2" (tag "vxe2", hwcap 2^15) "Vector-Packed-Decimal-Enhancement Facility" (tag "vxp", hwcap 2^16) "Enhanced-Sort Facility" (tag "sort", hwcap 2^17) "Deflate-Conversion Facility" (tag "dflt", hwcap 2^18) Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/elf.h | 4 ++++ arch/s390/kernel/processor.c | 3 ++- arch/s390/kernel/setup.c | 8 ++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index f74639a05f0f..5775fc22f410 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -107,6 +107,10 @@ #define HWCAP_S390_VXRS_BCD 4096 #define HWCAP_S390_VXRS_EXT 8192 #define HWCAP_S390_GS 16384 +#define HWCAP_S390_VXRS_EXT2 32768 +#define HWCAP_S390_VXRS_PDE 65536 +#define HWCAP_S390_SORT 131072 +#define HWCAP_S390_DFLT 262144 /* Internal bits, not exposed via elf */ #define HWCAP_INT_SIE 1UL diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 6fe2e1875058..5de13307b703 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -109,7 +109,8 @@ static void show_cpu_summary(struct seq_file *m, void *v) { static const char *hwcap_str[] = { "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs" + "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs", + "vxe2", "vxp", "sort", "dflt" }; static const char * const int_hwcap_str[] = { "sie" diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 70197a68e6fa..12d136e567c4 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -919,7 +919,15 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_S390_VXRS_EXT; if (test_facility(135)) elf_hwcap |= HWCAP_S390_VXRS_BCD; + if (test_facility(148)) + elf_hwcap |= HWCAP_S390_VXRS_EXT2; + if (test_facility(152)) + elf_hwcap |= HWCAP_S390_VXRS_PDE; } + if (test_facility(150)) + elf_hwcap |= HWCAP_S390_SORT; + if (test_facility(151)) + elf_hwcap |= HWCAP_S390_DFLT; /* * Guarded storage support HWCAP_S390_GS is bit 12. From 86c74d869d321bee4753dc3f8c3d1c3809d8ed8a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 20 Feb 2019 14:08:26 +0100 Subject: [PATCH 45/98] s390/ipl: make ipl_info less confusing The ipl_info union in struct ipl_parameter_block has the same name as the struct ipl_info. This does not help while reading the code and the union in struct ipl_parameter_block does not need to be named. Drop the name from the union. Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/boot/ipl_parm.c | 14 ++--- arch/s390/include/asm/ipl.h | 2 +- arch/s390/kernel/ipl.c | 110 +++++++++++++++++----------------- arch/s390/kernel/ipl_vmparm.c | 8 +-- 4 files changed, 66 insertions(+), 68 deletions(-) diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 1900670a83fd..849cf786db91 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -71,26 +71,26 @@ static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size, size_t i; int has_lowercase; - count = min(size - 1, scpdata_length(ipb->ipl_info.fcp.scp_data, - ipb->ipl_info.fcp.scp_data_len)); + count = min(size - 1, scpdata_length(ipb->fcp.scp_data, + ipb->fcp.scp_data_len)); if (!count) goto out; has_lowercase = 0; for (i = 0; i < count; i++) { - if (!isascii(ipb->ipl_info.fcp.scp_data[i])) { + if (!isascii(ipb->fcp.scp_data[i])) { count = 0; goto out; } - if (!has_lowercase && islower(ipb->ipl_info.fcp.scp_data[i])) + if (!has_lowercase && islower(ipb->fcp.scp_data[i])) has_lowercase = 1; } if (has_lowercase) - memcpy(dest, ipb->ipl_info.fcp.scp_data, count); + memcpy(dest, ipb->fcp.scp_data, count); else for (i = 0; i < count; i++) - dest[i] = tolower(ipb->ipl_info.fcp.scp_data[i]); + dest[i] = tolower(ipb->fcp.scp_data[i]); out: dest[count] = '\0'; return count; @@ -239,7 +239,7 @@ void setup_memory_end(void) #ifdef CONFIG_CRASH_DUMP if (!OLDMEM_BASE && ipl_block_valid && ipl_block.hdr.pbt == DIAG308_IPL_TYPE_FCP && - ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) { + ipl_block.fcp.opt == DIAG308_IPL_OPT_DUMP) { if (!sclp_early_get_hsa_size(&memory_end) && memory_end) memory_end_set = 1; } diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 6403cc9952bf..3530b613234f 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -78,7 +78,7 @@ struct ipl_parameter_block { struct ipl_block_fcp fcp; struct ipl_block_ccw ccw; char raw[PAGE_SIZE - sizeof(struct ipl_list_hdr)]; - } ipl_info; + }; } __packed __aligned(PAGE_SIZE); struct save_area; diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index cffe3374d201..d452f403a429 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -248,7 +248,7 @@ static __init enum ipl_type get_ipl_type(void) case DIAG308_IPL_TYPE_CCW: return IPL_TYPE_CCW; case DIAG308_IPL_TYPE_FCP: - if (ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) + if (ipl_block.fcp.opt == DIAG308_IPL_OPT_DUMP) return IPL_TYPE_FCP_DUMP; else return IPL_TYPE_FCP; @@ -285,12 +285,11 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, { switch (ipl_info.type) { case IPL_TYPE_CCW: - return sprintf(page, "0.%x.%04x\n", ipl_block.ipl_info.ccw.ssid, - ipl_block.ipl_info.ccw.devno); + return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, + ipl_block.ccw.devno); case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: - return sprintf(page, "0.0.%04x\n", - ipl_block.ipl_info.fcp.devno); + return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); default: return 0; } @@ -314,8 +313,8 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { - unsigned int size = ipl_block.ipl_info.fcp.scp_data_len; - void *scp_data = &ipl_block.ipl_info.fcp.scp_data; + unsigned int size = ipl_block.fcp.scp_data_len; + void *scp_data = &ipl_block.fcp.scp_data; return memory_read_from_buffer(buf, count, &off, scp_data, size); } @@ -331,13 +330,13 @@ static struct bin_attribute *ipl_fcp_bin_attrs[] = { /* FCP ipl device attributes */ DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n", - (unsigned long long)ipl_block.ipl_info.fcp.wwpn); + (unsigned long long)ipl_block.fcp.wwpn); DEFINE_IPL_ATTR_RO(ipl_fcp, lun, "0x%016llx\n", - (unsigned long long)ipl_block.ipl_info.fcp.lun); + (unsigned long long)ipl_block.fcp.lun); DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n", - (unsigned long long)ipl_block.ipl_info.fcp.bootprog); + (unsigned long long)ipl_block.fcp.bootprog); DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", - (unsigned long long)ipl_block.ipl_info.fcp.br_lba); + (unsigned long long)ipl_block.fcp.br_lba); static ssize_t ipl_ccw_loadparm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) @@ -493,14 +492,14 @@ static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, if (!(isalnum(buf[i]) || isascii(buf[i]) || isprint(buf[i]))) return -EINVAL; - memset(ipb->ipl_info.ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); - ipb->ipl_info.ccw.vm_parm_len = ip_len; + memset(ipb->ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); + ipb->ccw.vm_parm_len = ip_len; if (ip_len > 0) { - ipb->ipl_info.ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; - memcpy(ipb->ipl_info.ccw.vm_parm, buf, ip_len); - ASCEBC(ipb->ipl_info.ccw.vm_parm, ip_len); + ipb->ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; + memcpy(ipb->ccw.vm_parm, buf, ip_len); + ASCEBC(ipb->ccw.vm_parm, ip_len); } else { - ipb->ipl_info.ccw.vm_flags &= ~DIAG308_VM_FLAGS_VP_VALID; + ipb->ccw.vm_flags &= ~DIAG308_VM_FLAGS_VP_VALID; } return len; @@ -547,8 +546,8 @@ static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { - size_t size = reipl_block_fcp->ipl_info.fcp.scp_data_len; - void *scp_data = reipl_block_fcp->ipl_info.fcp.scp_data; + size_t size = reipl_block_fcp->fcp.scp_data_len; + void *scp_data = reipl_block_fcp->fcp.scp_data; return memory_read_from_buffer(buf, count, &off, scp_data, size); } @@ -564,15 +563,15 @@ static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, if (off) return -EINVAL; - memcpy(reipl_block_fcp->ipl_info.fcp.scp_data, buf, count); + memcpy(reipl_block_fcp->fcp.scp_data, buf, count); if (scpdata_len % 8) { padding = 8 - (scpdata_len % 8); - memset(reipl_block_fcp->ipl_info.fcp.scp_data + scpdata_len, + memset(reipl_block_fcp->fcp.scp_data + scpdata_len, 0, padding); scpdata_len += padding; } - reipl_block_fcp->ipl_info.fcp.scp_data_len = scpdata_len; + reipl_block_fcp->fcp.scp_data_len = scpdata_len; reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN + scpdata_len; reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN + scpdata_len; @@ -588,15 +587,15 @@ static struct bin_attribute *reipl_fcp_bin_attrs[] = { }; DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n", - reipl_block_fcp->ipl_info.fcp.wwpn); + reipl_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n", - reipl_block_fcp->ipl_info.fcp.lun); + reipl_block_fcp->fcp.lun); DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", - reipl_block_fcp->ipl_info.fcp.bootprog); + reipl_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(reipl_fcp, br_lba, "%lld\n", "%lld\n", - reipl_block_fcp->ipl_info.fcp.br_lba); + reipl_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", - reipl_block_fcp->ipl_info.fcp.devno); + reipl_block_fcp->fcp.devno); static void reipl_get_ascii_loadparm(char *loadparm, struct ipl_parameter_block *ibp) @@ -678,7 +677,7 @@ static struct attribute_group reipl_fcp_attr_group = { }; /* CCW reipl device attributes */ -DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ipl_info.ccw); +DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); /* NSS wrapper */ static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, @@ -740,7 +739,7 @@ static struct attribute_group reipl_ccw_attr_group_lpar = { static void reipl_get_ascii_nss_name(char *dst, struct ipl_parameter_block *ipb) { - memcpy(dst, ipb->ipl_info.ccw.nss_name, NSS_NAME_SIZE); + memcpy(dst, ipb->ccw.nss_name, NSS_NAME_SIZE); EBCASC(dst, NSS_NAME_SIZE); dst[NSS_NAME_SIZE] = 0; } @@ -768,15 +767,15 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj, if (nss_len > NSS_NAME_SIZE) return -EINVAL; - memset(reipl_block_nss->ipl_info.ccw.nss_name, 0x40, NSS_NAME_SIZE); + memset(reipl_block_nss->ccw.nss_name, 0x40, NSS_NAME_SIZE); if (nss_len > 0) { - reipl_block_nss->ipl_info.ccw.vm_flags |= + reipl_block_nss->ccw.vm_flags |= DIAG308_VM_FLAGS_NSS_VALID; - memcpy(reipl_block_nss->ipl_info.ccw.nss_name, buf, nss_len); - ASCEBC(reipl_block_nss->ipl_info.ccw.nss_name, nss_len); - EBC_TOUPPER(reipl_block_nss->ipl_info.ccw.nss_name, nss_len); + memcpy(reipl_block_nss->ccw.nss_name, buf, nss_len); + ASCEBC(reipl_block_nss->ccw.nss_name, nss_len); + EBC_TOUPPER(reipl_block_nss->ccw.nss_name, nss_len); } else { - reipl_block_nss->ipl_info.ccw.vm_flags &= + reipl_block_nss->ccw.vm_flags &= ~DIAG308_VM_FLAGS_NSS_VALID; } @@ -916,13 +915,12 @@ static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) /* VM PARM */ if (MACHINE_IS_VM && ipl_block_valid && - (ipl_block.ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID)) { + (ipl_block.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID)) { - ipb->ipl_info.ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; - ipb->ipl_info.ccw.vm_parm_len = - ipl_block.ipl_info.ccw.vm_parm_len; - memcpy(ipb->ipl_info.ccw.vm_parm, - ipl_block.ipl_info.ccw.vm_parm, DIAG308_VMPARM_SIZE); + ipb->ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; + ipb->ccw.vm_parm_len = ipl_block.ccw.vm_parm_len; + memcpy(ipb->ccw.vm_parm, + ipl_block.ccw.vm_parm, DIAG308_VMPARM_SIZE); } } @@ -962,8 +960,8 @@ static int __init reipl_ccw_init(void) reipl_block_ccw_init(reipl_block_ccw); if (ipl_info.type == IPL_TYPE_CCW) { - reipl_block_ccw->ipl_info.ccw.ssid = ipl_block.ipl_info.ccw.ssid; - reipl_block_ccw->ipl_info.ccw.devno = ipl_block.ipl_info.ccw.devno; + reipl_block_ccw->ccw.ssid = ipl_block.ccw.ssid; + reipl_block_ccw->ccw.devno = ipl_block.ccw.devno; reipl_block_ccw_fill_parms(reipl_block_ccw); } @@ -1008,7 +1006,7 @@ static int __init reipl_fcp_init(void) reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; reipl_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; - reipl_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_IPL; + reipl_block_fcp->fcp.opt = DIAG308_IPL_OPT_IPL; } reipl_capabilities |= IPL_TYPE_FCP; return 0; @@ -1074,15 +1072,15 @@ static struct shutdown_action __refdata reipl_action = { /* FCP dump device attributes */ DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n", - dump_block_fcp->ipl_info.fcp.wwpn); + dump_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n", - dump_block_fcp->ipl_info.fcp.lun); + dump_block_fcp->fcp.lun); DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", - dump_block_fcp->ipl_info.fcp.bootprog); + dump_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", - dump_block_fcp->ipl_info.fcp.br_lba); + dump_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", - dump_block_fcp->ipl_info.fcp.devno); + dump_block_fcp->fcp.devno); static struct attribute *dump_fcp_attrs[] = { &sys_dump_fcp_device_attr.attr, @@ -1099,7 +1097,7 @@ static struct attribute_group dump_fcp_attr_group = { }; /* CCW dump device attributes */ -DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ipl_info.ccw); +DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw); static struct attribute *dump_ccw_attrs[] = { &sys_dump_ccw_device_attr.attr, @@ -1219,7 +1217,7 @@ static int __init dump_fcp_init(void) dump_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; dump_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; dump_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; - dump_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_DUMP; + dump_block_fcp->fcp.opt = DIAG308_IPL_OPT_DUMP; dump_capabilities |= DUMP_TYPE_FCP; return 0; } @@ -1663,15 +1661,15 @@ void __init setup_ipl(void) ipl_info.type = get_ipl_type(); switch (ipl_info.type) { case IPL_TYPE_CCW: - ipl_info.data.ccw.dev_id.ssid = ipl_block.ipl_info.ccw.ssid; - ipl_info.data.ccw.dev_id.devno = ipl_block.ipl_info.ccw.devno; + ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid; + ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno; break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: ipl_info.data.fcp.dev_id.ssid = 0; - ipl_info.data.fcp.dev_id.devno = ipl_block.ipl_info.fcp.devno; - ipl_info.data.fcp.wwpn = ipl_block.ipl_info.fcp.wwpn; - ipl_info.data.fcp.lun = ipl_block.ipl_info.fcp.lun; + ipl_info.data.fcp.dev_id.devno = ipl_block.fcp.devno; + ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn; + ipl_info.data.fcp.lun = ipl_block.fcp.lun; break; case IPL_TYPE_NSS: case IPL_TYPE_UNKNOWN: diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c index 411838c0a0af..41613c1617ff 100644 --- a/arch/s390/kernel/ipl_vmparm.c +++ b/arch/s390/kernel/ipl_vmparm.c @@ -11,11 +11,11 @@ size_t ipl_block_get_ascii_vmparm(char *dest, size_t size, char has_lowercase = 0; len = 0; - if ((ipb->ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID) && - (ipb->ipl_info.ccw.vm_parm_len > 0)) { + if ((ipb->ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID) && + (ipb->ccw.vm_parm_len > 0)) { - len = min_t(size_t, size - 1, ipb->ipl_info.ccw.vm_parm_len); - memcpy(dest, ipb->ipl_info.ccw.vm_parm, len); + len = min_t(size_t, size - 1, ipb->ccw.vm_parm_len); + memcpy(dest, ipb->ccw.vm_parm, len); /* If at least one character is lowercase, we assume mixed * case; otherwise we convert everything to lowercase. */ From 5f1207fbe74450eb887155ba432bfc079312b0fe Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 20 Feb 2019 14:26:51 +0100 Subject: [PATCH 46/98] s390/ipl: provide uapi header for list directed IPL The IPL parameter block is used as an interface between Linux and the machine to query and change the boot device and boot options. To be able to create IPL parameter block in user space and pass it as segment to kexec provide an uapi header with proper structure definitions for the block. Signed-off-by: Martin Schwidefsky --- arch/s390/boot/ipl_parm.c | 10 +-- arch/s390/include/asm/ipl.h | 103 +++++++------------------------ arch/s390/include/uapi/asm/ipl.h | 94 ++++++++++++++++++++++++++++ arch/s390/kernel/ipl.c | 80 ++++++++++++------------ arch/s390/kernel/ipl_vmparm.c | 2 +- 5 files changed, 161 insertions(+), 128 deletions(-) create mode 100644 arch/s390/include/uapi/asm/ipl.h diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 849cf786db91..96777092fb14 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -106,12 +106,12 @@ static void append_ipl_block_parm(void) delim = early_command_line + len; /* '\0' character position */ parm = early_command_line + len + 1; /* append right after '\0' */ - switch (ipl_block.hdr.pbt) { - case DIAG308_IPL_TYPE_CCW: + switch (ipl_block.pb0_hdr.pbt) { + case IPL_PBT_CCW: rc = ipl_block_get_ascii_vmparm( parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; - case DIAG308_IPL_TYPE_FCP: + case IPL_PBT_FCP: rc = ipl_block_get_ascii_scpdata( parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; @@ -238,8 +238,8 @@ void setup_memory_end(void) { #ifdef CONFIG_CRASH_DUMP if (!OLDMEM_BASE && ipl_block_valid && - ipl_block.hdr.pbt == DIAG308_IPL_TYPE_FCP && - ipl_block.fcp.opt == DIAG308_IPL_OPT_DUMP) { + ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && + ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) { if (!sclp_early_get_hsa_size(&memory_end) && memory_end) memory_end_set = 1; } diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 3530b613234f..878f6fd5f2e7 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -12,74 +12,34 @@ #include #include #include +#include -#define NSS_NAME_SIZE 8 +struct ipl_parameter_block { + struct ipl_pl_hdr hdr; + union { + struct ipl_pb_hdr pb0_hdr; + struct ipl_pb0_common common; + struct ipl_pb0_fcp fcp; + struct ipl_pb0_ccw ccw; + char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)]; + }; +} __packed __aligned(PAGE_SIZE); -#define IPL_PARM_BLK_FCP_LEN (sizeof(struct ipl_list_hdr) + \ - sizeof(struct ipl_block_fcp)) +#define NSS_NAME_SIZE 8 -#define IPL_PARM_BLK0_FCP_LEN (sizeof(struct ipl_block_fcp) + 16) - -#define IPL_PARM_BLK_CCW_LEN (sizeof(struct ipl_list_hdr) + \ - sizeof(struct ipl_block_ccw)) - -#define IPL_PARM_BLK0_CCW_LEN (sizeof(struct ipl_block_ccw) + 16) +#define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_fcp)) +#define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp)) +#define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_ccw)) +#define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw)) #define IPL_MAX_SUPPORTED_VERSION (0) -struct ipl_list_hdr { - u32 len; - u8 reserved1[3]; - u8 version; - u32 blk0_len; - u8 pbt; - u8 flags; - u16 reserved2; - u8 loadparm[8]; -} __attribute__((packed)); - -struct ipl_block_fcp { - u8 reserved1[305-1]; - u8 opt; - u8 reserved2[3]; - u16 reserved3; - u16 devno; - u8 reserved4[4]; - u64 wwpn; - u64 lun; - u32 bootprog; - u8 reserved5[12]; - u64 br_lba; - u32 scp_data_len; - u8 reserved6[260]; - u8 scp_data[]; -} __attribute__((packed)); - -#define DIAG308_VMPARM_SIZE 64 -#define DIAG308_SCPDATA_SIZE (PAGE_SIZE - (sizeof(struct ipl_list_hdr) + \ - offsetof(struct ipl_block_fcp, scp_data))) - -struct ipl_block_ccw { - u8 reserved1[84]; - u16 reserved2 : 13; - u8 ssid : 3; - u16 devno; - u8 vm_flags; - u8 reserved3[3]; - u32 vm_parm_len; - u8 nss_name[8]; - u8 vm_parm[DIAG308_VMPARM_SIZE]; - u8 reserved4[8]; -} __attribute__((packed)); - -struct ipl_parameter_block { - struct ipl_list_hdr hdr; - union { - struct ipl_block_fcp fcp; - struct ipl_block_ccw ccw; - char raw[PAGE_SIZE - sizeof(struct ipl_list_hdr)]; - }; -} __packed __aligned(PAGE_SIZE); +#define DIAG308_VMPARM_SIZE (64) +#define DIAG308_SCPDATA_OFFSET offsetof(struct ipl_parameter_block, \ + fcp.scp_data) +#define DIAG308_SCPDATA_SIZE (PAGE_SIZE - DIAG308_SCPDATA_OFFSET) struct save_area; struct save_area * __init save_area_alloc(bool is_boot_cpu); @@ -132,25 +92,6 @@ enum diag308_subcode { DIAG308_STORE = 6, }; -enum diag308_ipl_type { - DIAG308_IPL_TYPE_FCP = 0, - DIAG308_IPL_TYPE_CCW = 2, -}; - -enum diag308_opt { - DIAG308_IPL_OPT_IPL = 0x10, - DIAG308_IPL_OPT_DUMP = 0x20, -}; - -enum diag308_flags { - DIAG308_FLAGS_LP_VALID = 0x80, -}; - -enum diag308_vm_flags { - DIAG308_VM_FLAGS_NSS_VALID = 0x80, - DIAG308_VM_FLAGS_VP_VALID = 0x40, -}; - enum diag308_rc { DIAG308_RC_OK = 0x0001, DIAG308_RC_NOCONFIG = 0x0102, diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h new file mode 100644 index 000000000000..3b513b39fca0 --- /dev/null +++ b/arch/s390/include/uapi/asm/ipl.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_UAPI_IPL_H +#define _ASM_S390_UAPI_IPL_H + +#include + +/* IPL Parameter List header */ +struct ipl_pl_hdr { + __u32 len; + __u8 reserved1[3]; + __u8 version; +} __packed; + +/* IPL Parameter Block header */ +struct ipl_pb_hdr { + __u32 len; + __u8 pbt; +} __packed; + +/* IPL Parameter Block types */ +enum ipl_pbt { + IPL_PBT_FCP = 0, + IPL_PBT_SCP_DATA = 1, + IPL_PBT_CCW = 2, +}; + +/* IPL Parameter Block 0 with common fields */ +struct ipl_pb0_common { + __u32 len; + __u8 pbt; + __u8 flags; + __u8 reserved1[2]; + __u8 loadparm[8]; + __u8 reserved2[84]; +} __packed; + +#define IPL_PB0_FLAG_LOADPARM 0x80 + +/* IPL Parameter Block 0 for FCP */ +struct ipl_pb0_fcp { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u8 loadparm[8]; + __u8 reserved2[304]; + __u8 opt; + __u8 reserved3[3]; + __u8 cssid; + __u8 reserved4[1]; + __u16 devno; + __u8 reserved5[4]; + __u64 wwpn; + __u64 lun; + __u32 bootprog; + __u8 reserved6[12]; + __u64 br_lba; + __u32 scp_data_len; + __u8 reserved7[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_FCP_OPT_IPL 0x10 +#define IPL_PB0_FCP_OPT_DUMP 0x20 + +/* IPL Parameter Block 0 for CCW */ +struct ipl_pb0_ccw { + __u32 len; + __u8 pbt; + __u8 flags; + __u8 reserved1[2]; + __u8 loadparm[8]; + __u8 reserved2[84]; + __u16 reserved3 : 13; + __u8 ssid : 3; + __u16 devno; + __u8 vm_flags; + __u8 reserved4[3]; + __u32 vm_parm_len; + __u8 nss_name[8]; + __u8 vm_parm[64]; + __u8 reserved5[8]; +} __packed; + +#define IPL_PB0_CCW_VM_FLAG_NSS 0x80 +#define IPL_PB0_CCW_VM_FLAG_VP 0x40 + +/* IPL Parameter Block 1 for additional SCP data */ +struct ipl_pb1_scp_data { + __u32 len; + __u8 pbt; + __u8 scp_data[]; +} __packed; + +#endif diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index d452f403a429..f9718bc67cd4 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -244,11 +244,11 @@ static __init enum ipl_type get_ipl_type(void) if (!ipl_block_valid) return IPL_TYPE_UNKNOWN; - switch (ipl_block.hdr.pbt) { - case DIAG308_IPL_TYPE_CCW: + switch (ipl_block.pb0_hdr.pbt) { + case IPL_PBT_CCW: return IPL_TYPE_CCW; - case DIAG308_IPL_TYPE_FCP: - if (ipl_block.fcp.opt == DIAG308_IPL_OPT_DUMP) + case IPL_PBT_FCP: + if (ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) return IPL_TYPE_FCP_DUMP; else return IPL_TYPE_FCP; @@ -272,7 +272,7 @@ static ssize_t ipl_vm_parm_show(struct kobject *kobj, { char parm[DIAG308_VMPARM_SIZE + 1] = {}; - if (ipl_block_valid && (ipl_block.hdr.pbt == DIAG308_IPL_TYPE_CCW)) + if (ipl_block_valid && (ipl_block.pb0_hdr.pbt == IPL_PBT_CCW)) ipl_block_get_ascii_vmparm(parm, sizeof(parm), &ipl_block); return sprintf(page, "%s\n", parm); } @@ -495,11 +495,11 @@ static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, memset(ipb->ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); ipb->ccw.vm_parm_len = ip_len; if (ip_len > 0) { - ipb->ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; + ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; memcpy(ipb->ccw.vm_parm, buf, ip_len); ASCEBC(ipb->ccw.vm_parm, ip_len); } else { - ipb->ccw.vm_flags &= ~DIAG308_VM_FLAGS_VP_VALID; + ipb->ccw.vm_flags &= ~IPL_PB0_CCW_VM_FLAG_VP; } return len; @@ -571,9 +571,9 @@ static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, scpdata_len += padding; } + reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN + scpdata_len; + reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN + scpdata_len; reipl_block_fcp->fcp.scp_data_len = scpdata_len; - reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN + scpdata_len; - reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN + scpdata_len; return count; } @@ -600,7 +600,7 @@ DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", static void reipl_get_ascii_loadparm(char *loadparm, struct ipl_parameter_block *ibp) { - memcpy(loadparm, ibp->hdr.loadparm, LOADPARM_LEN); + memcpy(loadparm, ibp->common.loadparm, LOADPARM_LEN); EBCASC(loadparm, LOADPARM_LEN); loadparm[LOADPARM_LEN] = 0; strim(loadparm); @@ -635,11 +635,11 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, return -EINVAL; } /* initialize loadparm with blanks */ - memset(ipb->hdr.loadparm, ' ', LOADPARM_LEN); + memset(ipb->common.loadparm, ' ', LOADPARM_LEN); /* copy and convert to ebcdic */ - memcpy(ipb->hdr.loadparm, buf, lp_len); - ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN); - ipb->hdr.flags |= DIAG308_FLAGS_LP_VALID; + memcpy(ipb->common.loadparm, buf, lp_len); + ASCEBC(ipb->common.loadparm, LOADPARM_LEN); + ipb->common.flags |= IPL_PB0_FLAG_LOADPARM; return len; } @@ -769,14 +769,12 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj, memset(reipl_block_nss->ccw.nss_name, 0x40, NSS_NAME_SIZE); if (nss_len > 0) { - reipl_block_nss->ccw.vm_flags |= - DIAG308_VM_FLAGS_NSS_VALID; + reipl_block_nss->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_NSS; memcpy(reipl_block_nss->ccw.nss_name, buf, nss_len); ASCEBC(reipl_block_nss->ccw.nss_name, nss_len); EBC_TOUPPER(reipl_block_nss->ccw.nss_name, nss_len); } else { - reipl_block_nss->ccw.vm_flags &= - ~DIAG308_VM_FLAGS_NSS_VALID; + reipl_block_nss->ccw.vm_flags &= ~IPL_PB0_CCW_VM_FLAG_NSS; } return len; @@ -896,10 +894,10 @@ static void reipl_run(struct shutdown_trigger *trigger) static void reipl_block_ccw_init(struct ipl_parameter_block *ipb) { - ipb->hdr.len = IPL_PARM_BLK_CCW_LEN; + ipb->hdr.len = IPL_BP_CCW_LEN; ipb->hdr.version = IPL_PARM_BLOCK_VERSION; - ipb->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; - ipb->hdr.pbt = DIAG308_IPL_TYPE_CCW; + ipb->pb0_hdr.len = IPL_BP0_CCW_LEN; + ipb->pb0_hdr.pbt = IPL_PBT_CCW; } static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) @@ -907,17 +905,17 @@ static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) /* LOADPARM */ /* check if read scp info worked and set loadparm */ if (sclp_ipl_info.is_valid) - memcpy(ipb->hdr.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); + memcpy(ipb->ccw.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); else /* read scp info failed: set empty loadparm (EBCDIC blanks) */ - memset(ipb->hdr.loadparm, 0x40, LOADPARM_LEN); - ipb->hdr.flags = DIAG308_FLAGS_LP_VALID; + memset(ipb->ccw.loadparm, 0x40, LOADPARM_LEN); + ipb->ccw.flags = IPL_PB0_FLAG_LOADPARM; /* VM PARM */ if (MACHINE_IS_VM && ipl_block_valid && - (ipl_block.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID)) { + (ipl_block.ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP)) { - ipb->ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; + ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; ipb->ccw.vm_parm_len = ipl_block.ccw.vm_parm_len; memcpy(ipb->ccw.vm_parm, ipl_block.ccw.vm_parm, DIAG308_VMPARM_SIZE); @@ -999,14 +997,14 @@ static int __init reipl_fcp_init(void) * is invalid in the SCSI IPL parameter block, so take it * always from sclp_ipl_info. */ - memcpy(reipl_block_fcp->hdr.loadparm, sclp_ipl_info.loadparm, + memcpy(reipl_block_fcp->fcp.loadparm, sclp_ipl_info.loadparm, LOADPARM_LEN); } else { - reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; + reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN; reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; - reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; - reipl_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; - reipl_block_fcp->fcp.opt = DIAG308_IPL_OPT_IPL; + reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN; + reipl_block_fcp->fcp.pbt = IPL_PBT_FCP; + reipl_block_fcp->fcp.opt = IPL_PB0_FCP_OPT_IPL; } reipl_capabilities |= IPL_TYPE_FCP; return 0; @@ -1024,10 +1022,10 @@ static int __init reipl_type_init(void) /* * If we have an OS info reipl block, this will be used */ - if (reipl_block->hdr.pbt == DIAG308_IPL_TYPE_FCP) { + if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) { memcpy(reipl_block_fcp, reipl_block, size); reipl_type = IPL_TYPE_FCP; - } else if (reipl_block->hdr.pbt == DIAG308_IPL_TYPE_CCW) { + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { memcpy(reipl_block_ccw, reipl_block, size); reipl_type = IPL_TYPE_CCW; } @@ -1191,10 +1189,10 @@ static int __init dump_ccw_init(void) free_page((unsigned long)dump_block_ccw); return rc; } - dump_block_ccw->hdr.len = IPL_PARM_BLK_CCW_LEN; + dump_block_ccw->hdr.len = IPL_BP_CCW_LEN; dump_block_ccw->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_ccw->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; - dump_block_ccw->hdr.pbt = DIAG308_IPL_TYPE_CCW; + dump_block_ccw->ccw.len = IPL_BP0_CCW_LEN; + dump_block_ccw->ccw.pbt = IPL_PBT_CCW; dump_capabilities |= DUMP_TYPE_CCW; return 0; } @@ -1213,11 +1211,11 @@ static int __init dump_fcp_init(void) free_page((unsigned long)dump_block_fcp); return rc; } - dump_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; + dump_block_fcp->hdr.len = IPL_BP_FCP_LEN; dump_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; - dump_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; - dump_block_fcp->fcp.opt = DIAG308_IPL_OPT_DUMP; + dump_block_fcp->fcp.len = IPL_BP0_FCP_LEN; + dump_block_fcp->fcp.pbt = IPL_PBT_FCP; + dump_block_fcp->fcp.opt = IPL_PB0_FCP_OPT_DUMP; dump_capabilities |= DUMP_TYPE_FCP; return 0; } @@ -1576,7 +1574,7 @@ static int __init s390_ipl_init(void) * READ SCP info provides the correct value. */ if (memcmp(sclp_ipl_info.loadparm, str, sizeof(str)) == 0 && ipl_block_valid) - memcpy(sclp_ipl_info.loadparm, ipl_block.hdr.loadparm, LOADPARM_LEN); + memcpy(sclp_ipl_info.loadparm, ipl_block.ccw.loadparm, LOADPARM_LEN); shutdown_actions_init(); shutdown_triggers_init(); return 0; diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c index 41613c1617ff..af43535a976d 100644 --- a/arch/s390/kernel/ipl_vmparm.c +++ b/arch/s390/kernel/ipl_vmparm.c @@ -11,7 +11,7 @@ size_t ipl_block_get_ascii_vmparm(char *dest, size_t size, char has_lowercase = 0; len = 0; - if ((ipb->ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID) && + if ((ipb->ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP) && (ipb->ccw.vm_parm_len > 0)) { len = min_t(size_t, size - 1, ipb->ccw.vm_parm_len); From d29af5b7a886033e6a4eb5f0a9a25cd00da63ae8 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 20 Feb 2019 16:14:16 +0100 Subject: [PATCH 47/98] s390/ipl: add definitions for the IPL report block To transport the information required for secure boot a new IPL report will be created at boot time. It will be written to memory right after the IPL parameter block. To work with the IPL report a couple of additional structure definitions are added the the uapi/ipl.h header. Signed-off-by: Martin Schwidefsky --- arch/s390/include/uapi/asm/ipl.h | 62 +++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h index 3b513b39fca0..fd32b1cd80d2 100644 --- a/arch/s390/include/uapi/asm/ipl.h +++ b/arch/s390/include/uapi/asm/ipl.h @@ -7,10 +7,15 @@ /* IPL Parameter List header */ struct ipl_pl_hdr { __u32 len; - __u8 reserved1[3]; + __u8 flags; + __u8 reserved1[2]; __u8 version; } __packed; +#define IPL_PL_FLAG_IPLPS 0x80 +#define IPL_PL_FLAG_SIPL 0x40 +#define IPL_PL_FLAG_IPLSR 0x20 + /* IPL Parameter Block header */ struct ipl_pb_hdr { __u32 len; @@ -91,4 +96,59 @@ struct ipl_pb1_scp_data { __u8 scp_data[]; } __packed; +/* IPL Report List header */ +struct ipl_rl_hdr { + __u32 len; + __u8 flags; + __u8 reserved1[2]; + __u8 version; + __u8 reserved2[8]; +} __packed; + +/* IPL Report Block header */ +struct ipl_rb_hdr { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; +} __packed; + +/* IPL Report Block types */ +enum ipl_rbt { + IPL_RBT_CERTIFICATES = 1, + IPL_RBT_COMPONENTS = 2, +}; + +/* IPL Report Block for the certificate list */ +struct ipl_rb_certificate_entry { + __u64 addr; + __u64 len; +} __packed; + +struct ipl_rb_certificates { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; + struct ipl_rb_certificate_entry entries[]; +} __packed; + +/* IPL Report Block for the component list */ +struct ipl_rb_component_entry { + __u64 addr; + __u64 len; + __u8 flags; + __u8 reserved1[5]; + __u16 certificate_index; + __u8 reserved2[8]; +}; + +#define IPL_RB_COMPONENT_FLAG_SIGNED 0x80 +#define IPL_RB_COMPONENT_FLAG_VERIFIED 0x40 + +struct ipl_rb_components { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; + struct ipl_rb_component_entry entries[]; +} __packed; + #endif From 9641b8cc733f70a5400aa7e6831de4542c46a94c Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 21 Feb 2019 14:23:04 +0100 Subject: [PATCH 48/98] s390/ipl: read IPL report at early boot Read the IPL Report block provided by secure-boot, add the entries of the certificate list to the system key ring and print the list of components. PR: Adjust to Vasilys bootdata_preserved patch set. Preserve ipl_cert_list for later use in kexec_file. Signed-off-by: Martin Schwidefsky Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/boot/Makefile | 5 +- arch/s390/boot/boot.h | 2 + arch/s390/boot/ipl_report.c | 165 ++++++++++++++++++ arch/s390/boot/startup.c | 20 +-- arch/s390/include/asm/boot_data.h | 7 + arch/s390/kernel/ipl.c | 19 ++ arch/s390/kernel/setup.c | 45 +++++ security/integrity/Kconfig | 11 +- security/integrity/Makefile | 8 +- .../integrity/platform_certs/load_ipl_s390.c | 36 ++++ 10 files changed, 301 insertions(+), 17 deletions(-) create mode 100644 arch/s390/boot/ipl_report.c create mode 100644 security/integrity/platform_certs/load_ipl_s390.c diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index a5ae68b2aa84..1f8fd68beae3 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -28,8 +28,9 @@ endif CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char -obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o string.o ebcdic.o -obj-y += sclp_early_core.o mem.o ipl_vmparm.o cmdline.o ctype.o +obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o +obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o +obj-y += ctype.o obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) subdir- := compressed diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 82bc06346e05..ca395fcff15e 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -10,4 +10,6 @@ void parse_boot_command_line(void); void setup_memory_end(void); void print_missing_facilities(void); +unsigned long read_ipl_report(unsigned long safe_offset); + #endif /* BOOT_BOOT_H */ diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c new file mode 100644 index 000000000000..0b4965573656 --- /dev/null +++ b/arch/s390/boot/ipl_report.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include "boot.h" + +int __bootdata_preserved(ipl_secure_flag); + +unsigned long __bootdata_preserved(ipl_cert_list_addr); +unsigned long __bootdata_preserved(ipl_cert_list_size); + +unsigned long __bootdata(early_ipl_comp_list_addr); +unsigned long __bootdata(early_ipl_comp_list_size); + +#define for_each_rb_entry(entry, rb) \ + for (entry = rb->entries; \ + (void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \ + entry++) + +static inline bool intersects(unsigned long addr0, unsigned long size0, + unsigned long addr1, unsigned long size1) +{ + return addr0 + size0 > addr1 && addr1 + size1 > addr0; +} + +static unsigned long find_bootdata_space(struct ipl_rb_components *comps, + struct ipl_rb_certificates *certs, + unsigned long safe_addr) +{ + struct ipl_rb_certificate_entry *cert; + struct ipl_rb_component_entry *comp; + size_t size; + + /* + * Find the length for the IPL report boot data + */ + early_ipl_comp_list_size = 0; + for_each_rb_entry(comp, comps) + early_ipl_comp_list_size += sizeof(*comp); + ipl_cert_list_size = 0; + for_each_rb_entry(cert, certs) + ipl_cert_list_size += sizeof(unsigned int) + cert->len; + size = ipl_cert_list_size + early_ipl_comp_list_size; + + /* + * Start from safe_addr to find a free memory area large + * enough for the IPL report boot data. This area is used + * for ipl_cert_list_addr/ipl_cert_list_size and + * early_ipl_comp_list_addr/early_ipl_comp_list_size. It must + * not overlap with any component or any certificate. + */ +repeat: + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && + intersects(INITRD_START, INITRD_SIZE, safe_addr, size)) + safe_addr = INITRD_START + INITRD_SIZE; + for_each_rb_entry(comp, comps) + if (intersects(safe_addr, size, comp->addr, comp->len)) { + safe_addr = comp->addr + comp->len; + goto repeat; + } + for_each_rb_entry(cert, certs) + if (intersects(safe_addr, size, cert->addr, cert->len)) { + safe_addr = cert->addr + cert->len; + goto repeat; + } + early_ipl_comp_list_addr = safe_addr; + ipl_cert_list_addr = safe_addr + early_ipl_comp_list_size; + + return safe_addr + size; +} + +static void copy_components_bootdata(struct ipl_rb_components *comps) +{ + struct ipl_rb_component_entry *comp, *ptr; + + ptr = (struct ipl_rb_component_entry *) early_ipl_comp_list_addr; + for_each_rb_entry(comp, comps) + memcpy(ptr++, comp, sizeof(*ptr)); +} + +static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) +{ + struct ipl_rb_certificate_entry *cert; + void *ptr; + + ptr = (void *) ipl_cert_list_addr; + for_each_rb_entry(cert, certs) { + *(unsigned int *) ptr = cert->len; + ptr += sizeof(unsigned int); + memcpy(ptr, (void *) cert->addr, cert->len); + ptr += cert->len; + } +} + +unsigned long read_ipl_report(unsigned long safe_addr) +{ + struct ipl_rb_certificates *certs; + struct ipl_rb_components *comps; + struct ipl_pl_hdr *pl_hdr; + struct ipl_rl_hdr *rl_hdr; + struct ipl_rb_hdr *rb_hdr; + unsigned long tmp; + void *rl_end; + + /* + * Check if there is a IPL report by looking at the copy + * of the IPL parameter information block. + */ + if (!ipl_block_valid || + !(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)) + return safe_addr; + ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL); + /* + * There is an IPL report, to find it load the pointer to the + * IPL parameter information block from lowcore and skip past + * the IPL parameter list, then align the address to a double + * word boundary. + */ + tmp = (unsigned long) S390_lowcore.ipl_parmblock_ptr; + pl_hdr = (struct ipl_pl_hdr *) tmp; + tmp = (tmp + pl_hdr->len + 7) & -8UL; + rl_hdr = (struct ipl_rl_hdr *) tmp; + /* Walk through the IPL report blocks in the IPL Report list */ + certs = NULL; + comps = NULL; + rl_end = (void *) rl_hdr + rl_hdr->len; + rb_hdr = (void *) rl_hdr + sizeof(*rl_hdr); + while ((void *) rb_hdr + sizeof(*rb_hdr) < rl_end && + (void *) rb_hdr + rb_hdr->len <= rl_end) { + + switch (rb_hdr->rbt) { + case IPL_RBT_CERTIFICATES: + certs = (struct ipl_rb_certificates *) rb_hdr; + break; + case IPL_RBT_COMPONENTS: + comps = (struct ipl_rb_components *) rb_hdr; + break; + default: + break; + } + + rb_hdr = (void *) rb_hdr + rb_hdr->len; + } + + /* + * With either the component list or the certificate list + * missing the kernel will stay ignorant of secure IPL. + */ + if (!comps || !certs) + return safe_addr; + + /* + * Copy component and certificate list to a safe area + * where the decompressed kernel can find them. + */ + safe_addr = find_bootdata_space(comps, certs, safe_addr); + copy_components_bootdata(comps); + copy_certificates_bootdata(certs); + + return safe_addr; +} diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 2bd4a62d436c..90898976a941 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -25,19 +25,16 @@ unsigned long mem_safe_offset(void) } #endif -static void rescue_initrd(void) +static void rescue_initrd(unsigned long addr) { - unsigned long min_initrd_addr; - if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) return; if (!INITRD_START || !INITRD_SIZE) return; - min_initrd_addr = mem_safe_offset(); - if (min_initrd_addr <= INITRD_START) + if (addr <= INITRD_START) return; - memmove((void *)min_initrd_addr, (void *)INITRD_START, INITRD_SIZE); - INITRD_START = min_initrd_addr; + memmove((void *)addr, (void *)INITRD_START, INITRD_SIZE); + INITRD_START = addr; } static void copy_bootdata(void) @@ -52,12 +49,15 @@ static void copy_bootdata(void) void startup_kernel(void) { + unsigned long safe_addr; void *img; - uv_query_info(); - rescue_initrd(); - sclp_early_read_info(); store_ipl_parmblock(); + safe_addr = mem_safe_offset(); + safe_addr = read_ipl_report(safe_addr); + uv_query_info(); + rescue_initrd(safe_addr); + sclp_early_read_info(); setup_boot_command_line(); parse_boot_command_line(); setup_memory_end(); diff --git a/arch/s390/include/asm/boot_data.h b/arch/s390/include/asm/boot_data.h index d794802a2291..f7eed27b3220 100644 --- a/arch/s390/include/asm/boot_data.h +++ b/arch/s390/include/asm/boot_data.h @@ -7,5 +7,12 @@ extern char early_command_line[COMMAND_LINE_SIZE]; extern struct ipl_parameter_block ipl_block; extern int ipl_block_valid; +extern int ipl_secure_flag; + +extern unsigned long ipl_cert_list_addr; +extern unsigned long ipl_cert_list_size; + +extern unsigned long early_ipl_comp_list_addr; +extern unsigned long early_ipl_comp_list_size; #endif /* _ASM_S390_BOOT_DATA_H */ diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index f9718bc67cd4..0567de4005b4 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -122,6 +122,13 @@ static char *dump_type_str(enum dump_type type) int __bootdata_preserved(ipl_block_valid); struct ipl_parameter_block __bootdata_preserved(ipl_block); +int __bootdata_preserved(ipl_secure_flag); + +unsigned long __bootdata_preserved(ipl_cert_list_addr); +unsigned long __bootdata_preserved(ipl_cert_list_size); + +unsigned long __bootdata(early_ipl_comp_list_addr); +unsigned long __bootdata(early_ipl_comp_list_size); static int reipl_capabilities = IPL_TYPE_UNKNOWN; @@ -267,6 +274,15 @@ static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); +static ssize_t ipl_secure_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%i\n", !!ipl_secure_flag); +} + +static struct kobj_attribute sys_ipl_secure_attr = + __ATTR(secure, 0444, ipl_secure_show, NULL); + static ssize_t ipl_vm_parm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -362,6 +378,7 @@ static struct attribute *ipl_fcp_attrs[] = { &sys_ipl_fcp_bootprog_attr.attr, &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_secure_attr.attr, NULL, }; @@ -377,6 +394,7 @@ static struct attribute *ipl_ccw_attrs_vm[] = { &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, + &sys_ipl_secure_attr.attr, NULL, }; @@ -384,6 +402,7 @@ static struct attribute *ipl_ccw_attrs_lpar[] = { &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_secure_attr.attr, NULL, }; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 12d136e567c4..ffc87520aca9 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -50,6 +50,7 @@ #include #include +#include #include #include #include @@ -741,6 +742,15 @@ static void __init reserve_initrd(void) #endif } +/* + * Reserve the memory area used to pass the certificate lists + */ +static void __init reserve_certificate_list(void) +{ + if (ipl_cert_list_addr) + memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size); +} + static void __init reserve_mem_detect_info(void) { unsigned long start, size; @@ -1035,6 +1045,38 @@ static void __init setup_control_program_code(void) asm volatile("diag %0,0,0x318\n" : : "d" (diag318_info.val)); } +/* + * Print the component list from the IPL report + */ +static void __init log_component_list(void) +{ + struct ipl_rb_component_entry *ptr, *end; + char *str; + + if (!early_ipl_comp_list_addr) + return; + if (ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR) + pr_info("Linux is running with Secure-IPL enabled\n"); + else + pr_info("Linux is running with Secure-IPL disabled\n"); + ptr = (void *) early_ipl_comp_list_addr; + end = (void *) ptr + early_ipl_comp_list_size; + pr_info("The IPL report contains the following components:\n"); + while (ptr < end) { + if (ptr->flags & IPL_RB_COMPONENT_FLAG_SIGNED) { + if (ptr->flags & IPL_RB_COMPONENT_FLAG_VERIFIED) + str = "signed, verified"; + else + str = "signed, verification failed"; + } else { + str = "not signed"; + } + pr_info("%016llx - %016llx (%s)\n", + ptr->addr, ptr->addr + ptr->len, str); + ptr++; + } +} + /* * Setup function called from init/main.c just after the banner * was printed. @@ -1055,6 +1097,8 @@ void __init setup_arch(char **cmdline_p) else pr_info("Linux is running as a guest in 64-bit mode\n"); + log_component_list(); + /* Have one command line that is parsed and saved in /proc/cmdline */ /* boot_command_line has been already set up in early.c */ *cmdline_p = boot_command_line; @@ -1086,6 +1130,7 @@ void __init setup_arch(char **cmdline_p) reserve_oldmem(); reserve_kernel(); reserve_initrd(); + reserve_certificate_list(); reserve_mem_detect_info(); memblock_allow_resize(); diff --git a/security/integrity/Kconfig b/security/integrity/Kconfig index 2ea4ec9991d5..3ba1168b1756 100644 --- a/security/integrity/Kconfig +++ b/security/integrity/Kconfig @@ -55,13 +55,22 @@ config INTEGRITY_PLATFORM_KEYRING bool "Provide keyring for platform/firmware trusted keys" depends on INTEGRITY_ASYMMETRIC_KEYS depends on SYSTEM_BLACKLIST_KEYRING - depends on EFI help Provide a separate, distinct keyring for platform trusted keys, which the kernel automatically populates during initialization from values provided by the platform for verifying the kexec'ed kerned image and, possibly, the initramfs signature. +config LOAD_UEFI_KEYS + depends on INTEGRITY_PLATFORM_KEYRING + depends on EFI + def_bool y + +config LOAD_IPL_KEYS + depends on INTEGRITY_PLATFORM_KEYRING + depends on S390 + def_bool y + config INTEGRITY_AUDIT bool "Enables integrity auditing support " depends on AUDIT diff --git a/security/integrity/Makefile b/security/integrity/Makefile index 86df9aba8c0f..19faace69644 100644 --- a/security/integrity/Makefile +++ b/security/integrity/Makefile @@ -9,10 +9,10 @@ integrity-y := iint.o integrity-$(CONFIG_INTEGRITY_AUDIT) += integrity_audit.o integrity-$(CONFIG_INTEGRITY_SIGNATURE) += digsig.o integrity-$(CONFIG_INTEGRITY_ASYMMETRIC_KEYS) += digsig_asymmetric.o -integrity-$(CONFIG_INTEGRITY_PLATFORM_KEYRING) += platform_certs/platform_keyring.o \ - platform_certs/efi_parser.o \ - platform_certs/load_uefi.o -obj-$(CONFIG_LOAD_UEFI_KEYS) += platform_certs/load_uefi.o +integrity-$(CONFIG_INTEGRITY_PLATFORM_KEYRING) += platform_certs/platform_keyring.o +integrity-$(CONFIG_LOAD_UEFI_KEYS) += platform_certs/efi_parser.o \ + platform_certs/load_uefi.o +integrity-$(CONFIG_LOAD_IPL_KEYS) += platform_certs/load_ipl_s390.o $(obj)/load_uefi.o: KBUILD_CFLAGS += -fshort-wchar subdir-$(CONFIG_IMA) += ima diff --git a/security/integrity/platform_certs/load_ipl_s390.c b/security/integrity/platform_certs/load_ipl_s390.c new file mode 100644 index 000000000000..e769dcb7ea94 --- /dev/null +++ b/security/integrity/platform_certs/load_ipl_s390.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../integrity.h" + +/* + * Load the certs contained in the IPL report created by the machine loader + * into the platform trusted keyring. + */ +static int __init load_ipl_certs(void) +{ + void *ptr, *end; + unsigned int len; + + if (!ipl_cert_list_addr) + return 0; + /* Copy the certificates to the system keyring */ + ptr = (void *) ipl_cert_list_addr; + end = ptr + ipl_cert_list_size; + while ((void *) ptr < end) { + len = *(unsigned int *) ptr; + ptr += sizeof(unsigned int); + add_to_platform_keyring("IPL:db", ptr, len); + ptr += len; + } + return 0; +} +late_initcall(load_ipl_certs); From 937347ac56bfca10c76153ac700e88a4b41f7130 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 25 Feb 2019 17:23:39 +0100 Subject: [PATCH 49/98] s390/ipl: add helper functions to create an IPL report PR: Adjusted to the use in kexec_file later. Signed-off-by: Martin Schwidefsky Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ipl.h | 27 ++++++++ arch/s390/kernel/ipl.c | 134 ++++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+) diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 878f6fd5f2e7..bf62af49af06 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -81,6 +81,33 @@ extern struct ipl_info ipl_info; extern void setup_ipl(void); extern void set_os_info_reipl_block(void); +struct ipl_report { + struct ipl_parameter_block *ipib; + struct list_head components; + struct list_head certificates; + size_t size; +}; + +struct ipl_report_component { + struct list_head list; + struct ipl_rb_component_entry entry; +}; + +struct ipl_report_certificate { + struct list_head list; + struct ipl_rb_certificate_entry entry; + void *key; +}; + +struct kexec_buf; +struct ipl_report *ipl_report_init(struct ipl_parameter_block *ipib); +void *ipl_report_finish(struct ipl_report *report); +int ipl_report_free(struct ipl_report *report); +int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, + unsigned char flags, unsigned short cert); +int ipl_report_add_certificate(struct ipl_report *report, void *key, + unsigned long addr, unsigned long len); + /* * DIAG 308 support */ diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 0567de4005b4..6f2bb64cf70e 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -1705,3 +1705,137 @@ void s390_reset_system(void) __ctl_clear_bit(0, 28); diag308_reset(); } + +#ifdef CONFIG_KEXEC_FILE + +int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, + unsigned char flags, unsigned short cert) +{ + struct ipl_report_component *comp; + + comp = vzalloc(sizeof(*comp)); + if (!comp) + return -ENOMEM; + list_add_tail(&comp->list, &report->components); + + comp->entry.addr = kbuf->mem; + comp->entry.len = kbuf->memsz; + comp->entry.flags = flags; + comp->entry.certificate_index = cert; + + report->size += sizeof(comp->entry); + + return 0; +} + +int ipl_report_add_certificate(struct ipl_report *report, void *key, + unsigned long addr, unsigned long len) +{ + struct ipl_report_certificate *cert; + + cert = vzalloc(sizeof(*cert)); + if (!cert) + return -ENOMEM; + list_add_tail(&cert->list, &report->certificates); + + cert->entry.addr = addr; + cert->entry.len = len; + cert->key = key; + + report->size += sizeof(cert->entry); + report->size += cert->entry.len; + + return 0; +} + +struct ipl_report *ipl_report_init(struct ipl_parameter_block *ipib) +{ + struct ipl_report *report; + + report = vzalloc(sizeof(*report)); + if (!report) + return ERR_PTR(-ENOMEM); + + report->ipib = ipib; + INIT_LIST_HEAD(&report->components); + INIT_LIST_HEAD(&report->certificates); + + report->size = ALIGN(ipib->hdr.len, 8); + report->size += sizeof(struct ipl_rl_hdr); + report->size += sizeof(struct ipl_rb_components); + report->size += sizeof(struct ipl_rb_certificates); + + return report; +} + +void *ipl_report_finish(struct ipl_report *report) +{ + struct ipl_report_certificate *cert; + struct ipl_report_component *comp; + struct ipl_rb_certificates *certs; + struct ipl_parameter_block *ipib; + struct ipl_rb_components *comps; + struct ipl_rl_hdr *rl_hdr; + void *buf, *ptr; + + buf = vzalloc(report->size); + if (!buf) + return ERR_PTR(-ENOMEM); + ptr = buf; + + memcpy(ptr, report->ipib, report->ipib->hdr.len); + ipib = ptr; + if (ipl_secure_flag) + ipib->hdr.flags |= IPL_PL_FLAG_SIPL; + ipib->hdr.flags |= IPL_PL_FLAG_IPLSR; + ptr += report->ipib->hdr.len; + ptr = PTR_ALIGN(ptr, 8); + + rl_hdr = ptr; + ptr += sizeof(*rl_hdr); + + comps = ptr; + comps->rbt = IPL_RBT_COMPONENTS; + ptr += sizeof(*comps); + list_for_each_entry(comp, &report->components, list) { + memcpy(ptr, &comp->entry, sizeof(comp->entry)); + ptr += sizeof(comp->entry); + } + comps->len = ptr - (void *)comps; + + certs = ptr; + certs->rbt = IPL_RBT_CERTIFICATES; + ptr += sizeof(*certs); + list_for_each_entry(cert, &report->certificates, list) { + memcpy(ptr, &cert->entry, sizeof(cert->entry)); + ptr += sizeof(cert->entry); + } + certs->len = ptr - (void *)certs; + rl_hdr->len = ptr - (void *)rl_hdr; + + list_for_each_entry(cert, &report->certificates, list) { + memcpy(ptr, cert->key, cert->entry.len); + ptr += cert->entry.len; + } + + BUG_ON(ptr > buf + report->size); + return buf; +} + +int ipl_report_free(struct ipl_report *report) +{ + struct ipl_report_component *comp, *ncomp; + struct ipl_report_certificate *cert, *ncert; + + list_for_each_entry_safe(comp, ncomp, &report->components, list) + vfree(comp); + + list_for_each_entry_safe(cert, ncert, &report->certificates, list) + vfree(cert); + + vfree(report); + + return 0; +} + +#endif From f6780686525cc69a16a893cac0dd6adfbf25b7ff Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 17 Apr 2019 16:32:27 +0200 Subject: [PATCH 50/98] s390/boot: pad bzImage to 4K In order to be able to sign the bzImage independent of the block size of the IPL device, align the bzImage to 4096 bytes. Signed-off-by: Martin Schwidefsky --- arch/s390/boot/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 1f8fd68beae3..c1993c57300f 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -50,6 +50,7 @@ define cmd_section_cmp touch $@ endef +OBJCOPYFLAGS_bzImage := --pad-to $$(readelf -s $(obj)/compressed/vmlinux | awk '/\<_end\>/ {print or(strtonum("0x"$$2),4095)+1}') $(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE $(call if_changed,objcopy) From f3df44e7c9869b7691a4a0b57fa39ca47060b424 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Thu, 14 Mar 2019 13:50:32 +0100 Subject: [PATCH 51/98] s390/zcore: Rename ipl_block to mitigate name collision With git commit 1e941d39493f1820475d80729a03cd7ab8c3c86d "s390: move ipl block to .boot.preserved.data section" the earl_ipl_block got renamed to ipl_block and became publicly available via boot_data.h. This might cause problems with zcore, which has it's own ipl_block variable. Thus rename the ipl_block in zcore to prevent name collision and highlight that it's only used locally. Signed-off-by: Philipp Rudo Fixes: 1e941d39493f ("s390: move ipl block to .boot.preserved.data section") Signed-off-by: Martin Schwidefsky --- drivers/s390/char/zcore.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 76d3c50bf078..f75d3bfb5af3 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -51,7 +51,7 @@ static struct dentry *zcore_dir; static struct dentry *zcore_memmap_file; static struct dentry *zcore_reipl_file; static struct dentry *zcore_hsa_file; -static struct ipl_parameter_block *ipl_block; +static struct ipl_parameter_block *zcore_ipl_block; static char hsa_buf[PAGE_SIZE] __aligned(PAGE_SIZE); @@ -182,8 +182,8 @@ static const struct file_operations zcore_memmap_fops = { static ssize_t zcore_reipl_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - if (ipl_block) { - diag308(DIAG308_SET, ipl_block); + if (zcore_ipl_block) { + diag308(DIAG308_SET, zcore_ipl_block); diag308(DIAG308_LOAD_CLEAR, NULL); } return count; @@ -265,18 +265,20 @@ static int __init zcore_reipl_init(void) return rc; if (ipib_info.ipib == 0) return 0; - ipl_block = (void *) __get_free_page(GFP_KERNEL); - if (!ipl_block) + zcore_ipl_block = (void *) __get_free_page(GFP_KERNEL); + if (!zcore_ipl_block) return -ENOMEM; if (ipib_info.ipib < sclp.hsa_size) - rc = memcpy_hsa_kernel(ipl_block, ipib_info.ipib, PAGE_SIZE); + rc = memcpy_hsa_kernel(zcore_ipl_block, ipib_info.ipib, + PAGE_SIZE); else - rc = memcpy_real(ipl_block, (void *) ipib_info.ipib, PAGE_SIZE); - if (rc || (__force u32)csum_partial(ipl_block, ipl_block->hdr.len, 0) != + rc = memcpy_real(zcore_ipl_block, (void *) ipib_info.ipib, + PAGE_SIZE); + if (rc || (__force u32)csum_partial(zcore_ipl_block, zcore_ipl_block->hdr.len, 0) != ipib_info.checksum) { TRACE("Checksum does not match\n"); - free_page((unsigned long) ipl_block); - ipl_block = NULL; + free_page((unsigned long) zcore_ipl_block); + zcore_ipl_block = NULL; } return 0; } From 6339a3889ad4d0dd930ed7a1e873fb81d3e690f7 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Mon, 1 Apr 2019 12:13:42 +0200 Subject: [PATCH 52/98] s390/kexec_file: Fix potential segment overlap in ELF loader When loading an ELF image via kexec_file the segment alignment is ignored in the calculation for the load address of the next segment. When there are multiple segments this can lead to segment overlap and thus load failure. Signed-off-by: Philipp Rudo Fixes: 8be018827154 ("s390/kexec_file: Add ELF loader") Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/kexec_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 5a286b012043..1d1c77c647d2 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -53,7 +53,7 @@ static int kexec_file_add_elf_kernel(struct kimage *image, if (ret) return ret; - data->memsz += buf.memsz; + data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; } return 0; From 729829d775c9a5217abc784b2f16087d79c4eec8 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Mon, 1 Apr 2019 12:48:43 +0200 Subject: [PATCH 53/98] s390/kexec_file: Fix detection of text segment in ELF loader To register data for the next kernel (command line, oldmem_base, etc.) the current kernel needs to find the ELF segment that contains head.S. This is currently done by checking ifor 'phdr->p_paddr == 0'. This works fine for the current kernel build but in theory the first few pages could be skipped. Make the detection more robust by checking if the entry point lies within the segment. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/kexec_elf.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 1d1c77c647d2..5cf340b778f1 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -19,10 +19,15 @@ static int kexec_file_add_elf_kernel(struct kimage *image, struct kexec_buf buf; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; + Elf_Addr entry; int i, ret; ehdr = (Elf_Ehdr *)kernel; buf.image = image; + if (image->type == KEXEC_TYPE_CRASH) + entry = STARTUP_KDUMP_OFFSET; + else + entry = ehdr->e_entry; phdr = (void *)ehdr + ehdr->e_phoff; for (i = 0; i < ehdr->e_phnum; i++, phdr++) { @@ -35,7 +40,7 @@ static int kexec_file_add_elf_kernel(struct kimage *image, buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); buf.memsz = phdr->p_memsz; - if (phdr->p_paddr == 0) { + if (entry - phdr->p_paddr < phdr->p_memsz) { data->kernel_buf = buf.buffer; data->memsz += STARTUP_NORMAL_OFFSET; From 61f3f8fc223524692aaa87cf46d95ca3015e83a9 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Tue, 5 Mar 2019 17:18:13 +0100 Subject: [PATCH 54/98] s390/purgatory: Reduce purgatory size The purgatory is compiled into the vmlinux and keept in memory all the time during runtime. Thus any section not needed to load the purgatory unnecessarily bloats up its foot print in file- and memorysize. Reduce the purgatory size by stripping the unneeded sections from the purgatory. This reduces the purgatories size by ~33%. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/purgatory/Makefile | 14 +++++--- arch/s390/purgatory/purgatory.lds.S | 54 +++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 arch/s390/purgatory/purgatory.lds.S diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index ce6a3f75065b..175ff6c71735 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -4,7 +4,7 @@ OBJECT_FILES_NON_STANDARD := y purgatory-y := head.o purgatory.o string.o sha256.o mem.o -targets += $(purgatory-y) purgatory.ro kexec-purgatory.c +targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro kexec-purgatory.c PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE @@ -16,8 +16,6 @@ $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(obj)/string.o: $(srctree)/arch/s390/lib/string.c FORCE $(call if_changed_rule,cc_o_c) -LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -LDFLAGS_purgatory.ro += -z nodefaultlib KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding @@ -25,9 +23,17 @@ KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) -$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE +LDFLAGS_purgatory := -r --no-undefined -nostdlib -z nodefaultlib -T +$(obj)/purgatory: $(obj)/purgatory.lds $(PURGATORY_OBJS) FORCE $(call if_changed,ld) +OBJCOPYFLAGS_purgatory.ro := -O elf64-s390 +OBJCOPYFLAGS_purgatory.ro += --remove-section='*debug*' +OBJCOPYFLAGS_purgatory.ro += --remove-section='.comment' +OBJCOPYFLAGS_purgatory.ro += --remove-section='.note.*' +$(obj)/purgatory.ro: $(obj)/purgatory FORCE + $(call if_changed,objcopy) + quiet_cmd_bin2c = BIN2C $@ cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ diff --git a/arch/s390/purgatory/purgatory.lds.S b/arch/s390/purgatory/purgatory.lds.S new file mode 100644 index 000000000000..482eb4fbcef1 --- /dev/null +++ b/arch/s390/purgatory/purgatory.lds.S @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) + +ENTRY(purgatory_start) + +SECTIONS +{ + . = 0; + .head.text : { + _head = . ; + HEAD_TEXT + _ehead = . ; + } + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + + . = ALIGN(256); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(8); /* For convenience during zeroing */ + _ebss = .; + } + _end = .; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.eh_frame) + *(*__ksymtab*) + *(___kcrctab*) + } +} From 4c0f032d496385fa8071e404a1bc33f4abbc2f81 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Tue, 5 Mar 2019 18:08:36 +0100 Subject: [PATCH 55/98] s390/purgatory: Omit use of bin2c Omit use of script/bin2c hack. Directly include into assembler file instead. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/purgatory/Makefile | 9 +++------ arch/s390/purgatory/kexec-purgatory.S | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 arch/s390/purgatory/kexec-purgatory.S diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index 175ff6c71735..b20934d91986 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -4,7 +4,7 @@ OBJECT_FILES_NON_STANDARD := y purgatory-y := head.o purgatory.o string.o sha256.o mem.o -targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro kexec-purgatory.c +targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE @@ -34,10 +34,7 @@ OBJCOPYFLAGS_purgatory.ro += --remove-section='.note.*' $(obj)/purgatory.ro: $(obj)/purgatory FORCE $(call if_changed,objcopy) -quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ - -$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE - $(call if_changed,bin2c) +$(obj)/kexec-purgatory.o: $(obj)/kexec-purgatory.S $(obj)/purgatory.ro FORCE + $(call if_changed_rule,as_o_S) obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += kexec-purgatory.o diff --git a/arch/s390/purgatory/kexec-purgatory.S b/arch/s390/purgatory/kexec-purgatory.S new file mode 100644 index 000000000000..8293753100ae --- /dev/null +++ b/arch/s390/purgatory/kexec-purgatory.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + + .section .rodata, "a" + + .align 8 +kexec_purgatory: + .globl kexec_purgatory + .incbin "arch/s390/purgatory/purgatory.ro" +.Lkexec_purgatroy_end: + + .align 8 +kexec_purgatory_size: + .globl kexec_purgatory_size + .quad .Lkexec_purgatroy_end - kexec_purgatory From d0d249d75dda1b101624316a52d117be07b8ccff Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Wed, 6 Mar 2019 17:36:26 +0100 Subject: [PATCH 56/98] s390/kexec_file: Simplify parmarea access Access the parmarea in head.S via a struct instead of individual offsets. While at it make the fields in the parmarea .quads. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/boot/head.S | 11 ++++++----- arch/s390/include/asm/kexec.h | 7 ++++--- arch/s390/include/asm/setup.h | 10 ++++++++++ arch/s390/kernel/kexec_elf.c | 1 + arch/s390/kernel/kexec_image.c | 1 + arch/s390/kernel/machine_kexec_file.c | 23 ++++++----------------- 6 files changed, 28 insertions(+), 25 deletions(-) diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index ce2cbbc41742..d585c4dbdac7 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -323,13 +323,14 @@ ENTRY(startup_kdump) # # params at 10400 (setup.h) +# Must be keept in sync with struct parmarea in setup.h # .org PARMAREA - .long 0,0 # IPL_DEVICE - .long 0,0 # INITRD_START - .long 0,0 # INITRD_SIZE - .long 0,0 # OLDMEM_BASE - .long 0,0 # OLDMEM_SIZE + .quad 0 # IPL_DEVICE + .quad 0 # INITRD_START + .quad 0 # INITRD_SIZE + .quad 0 # OLDMEM_BASE + .quad 0 # OLDMEM_SIZE .org COMMAND_LINE .byte "root=/dev/ram0 ro" diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 825dd0f7f221..08dc2b74858d 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -11,6 +11,7 @@ #include #include +#include /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. * I.e. Maximum page that is mapped directly into kernel memory, @@ -51,11 +52,11 @@ struct s390_load_data { /* Pointer to the kernel buffer. Used to register cmdline etc.. */ void *kernel_buf; + /* Parmarea in the kernel buffer. */ + struct parmarea *parm; + /* Total size of loaded segments in memory. Used as an offset. */ size_t memsz; - - /* Load address of initrd. Used to register INITRD_START in kernel. */ - unsigned long initrd_load_addr; }; int kexec_file_add_purgatory(struct kimage *image, diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index efda97804aa4..b603cc09c895 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -65,6 +65,16 @@ #define OLDMEM_SIZE (*(unsigned long *) (OLDMEM_SIZE_OFFSET)) #define COMMAND_LINE ((char *) (COMMAND_LINE_OFFSET)) +struct parmarea { + unsigned long ipl_device; /* 0x10400 */ + unsigned long initrd_start; /* 0x10408 */ + unsigned long initrd_size; /* 0x10410 */ + unsigned long oldmem_base; /* 0x10418 */ + unsigned long oldmem_size; /* 0x10420 */ + char pad1[0x10480 - 0x10428]; /* 0x10428 - 0x10480 */ + char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */ +}; + extern int noexec_disabled; extern int memory_end_set; extern unsigned long memory_end; diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 5cf340b778f1..1cdf90767cba 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -42,6 +42,7 @@ static int kexec_file_add_elf_kernel(struct kimage *image, if (entry - phdr->p_paddr < phdr->p_memsz) { data->kernel_buf = buf.buffer; + data->parm = buf.buffer + PARMAREA; data->memsz += STARTUP_NORMAL_OFFSET; buf.buffer += STARTUP_NORMAL_OFFSET; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index 3800852595e8..d9025adc2bbb 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -32,6 +32,7 @@ static int kexec_file_add_image_kernel(struct kimage *image, ret = kexec_add_buffer(&buf); data->kernel_buf = kernel; + data->parm = (void *)kernel + PARMAREA; data->memsz += buf.memsz + STARTUP_NORMAL_OFFSET; return ret; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 32023b4f9dc0..8a85ecd8428c 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -25,24 +25,12 @@ int *kexec_file_update_kernel(struct kimage *image, if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) return ERR_PTR(-EINVAL); - if (image->cmdline_buf_len) - memcpy(data->kernel_buf + COMMAND_LINE_OFFSET, - image->cmdline_buf, image->cmdline_buf_len); + memcpy(data->parm->command_line, image->cmdline_buf, + image->cmdline_buf_len); if (image->type == KEXEC_TYPE_CRASH) { - loc = (unsigned long *)(data->kernel_buf + OLDMEM_BASE_OFFSET); - *loc = crashk_res.start; - - loc = (unsigned long *)(data->kernel_buf + OLDMEM_SIZE_OFFSET); - *loc = crashk_res.end - crashk_res.start + 1; - } - - if (image->initrd_buf) { - loc = (unsigned long *)(data->kernel_buf + INITRD_START_OFFSET); - *loc = data->initrd_load_addr; - - loc = (unsigned long *)(data->kernel_buf + INITRD_SIZE_OFFSET); - *loc = image->initrd_buf_len; + data->parm->oldmem_base = crashk_res.start; + data->parm->oldmem_size = crashk_res.end - crashk_res.start + 1; } return NULL; @@ -127,7 +115,8 @@ int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, buf.mem += crashk_res.start; buf.memsz = buf.bufsz; - data->initrd_load_addr = buf.mem; + data->parm->initrd_start = buf.mem; + data->parm->initrd_size = buf.memsz; data->memsz += buf.memsz; ret = kexec_add_buffer(&buf); From 8e4964261374aaec9f4a83de076ceb11c8cdc044 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Thu, 7 Mar 2019 12:48:03 +0100 Subject: [PATCH 57/98] s390/kexec_file: Unify loader code s390_image_load and s390_elf_load have the same code to load the different components. Combine this functionality in one shared function. While at it move kexec_file_update_kernel into the new function as well. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/kexec.h | 10 ++--- arch/s390/kernel/kexec_elf.c | 31 +++---------- arch/s390/kernel/kexec_image.c | 37 ++++------------ arch/s390/kernel/machine_kexec_file.c | 64 +++++++++++++++++---------- 4 files changed, 58 insertions(+), 84 deletions(-) diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 08dc2b74858d..a38a57ec6d8f 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -59,13 +59,9 @@ struct s390_load_data { size_t memsz; }; -int kexec_file_add_purgatory(struct kimage *image, - struct s390_load_data *data); -int kexec_file_add_initrd(struct kimage *image, - struct s390_load_data *data, - char *initrd, unsigned long initrd_len); -int *kexec_file_update_kernel(struct kimage *iamge, - struct s390_load_data *data); +void *kexec_file_add_components(struct kimage *image, + int (*add_kernel)(struct kimage *image, + struct s390_load_data *data)); extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 1cdf90767cba..c74ff6b54344 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -12,16 +12,17 @@ #include #include -static int kexec_file_add_elf_kernel(struct kimage *image, - struct s390_load_data *data, - char *kernel, unsigned long kernel_len) +static int kexec_file_add_kernel_elf(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; Elf_Addr entry; + void *kernel; int i, ret; + kernel = image->kernel_buf; ehdr = (Elf_Ehdr *)kernel; buf.image = image; if (image->type == KEXEC_TYPE_CRASH) @@ -62,7 +63,7 @@ static int kexec_file_add_elf_kernel(struct kimage *image, data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; } - return 0; + return data->memsz ? 0 : -EINVAL; } static void *s390_elf_load(struct kimage *image, @@ -70,11 +71,10 @@ static void *s390_elf_load(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len) { - struct s390_load_data data = {0}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; size_t size; - int i, ret; + int i; /* image->fobs->probe already checked for valid ELF magic number. */ ehdr = (Elf_Ehdr *)kernel; @@ -107,24 +107,7 @@ static void *s390_elf_load(struct kimage *image, if (size > kernel_len) return ERR_PTR(-EINVAL); - ret = kexec_file_add_elf_kernel(image, &data, kernel, kernel_len); - if (ret) - return ERR_PTR(ret); - - if (!data.memsz) - return ERR_PTR(-EINVAL); - - if (initrd) { - ret = kexec_file_add_initrd(image, &data, initrd, initrd_len); - if (ret) - return ERR_PTR(ret); - } - - ret = kexec_file_add_purgatory(image, &data); - if (ret) - return ERR_PTR(ret); - - return kexec_file_update_kernel(image, &data); + return kexec_file_add_components(image, kexec_file_add_kernel_elf); } static int s390_elf_probe(const char *buf, unsigned long len) diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index d9025adc2bbb..d7e65eeae22f 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -12,30 +12,26 @@ #include #include -static int kexec_file_add_image_kernel(struct kimage *image, - struct s390_load_data *data, - char *kernel, unsigned long kernel_len) +static int kexec_file_add_kernel_image(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; - int ret; buf.image = image; - buf.buffer = kernel + STARTUP_NORMAL_OFFSET; - buf.bufsz = kernel_len - STARTUP_NORMAL_OFFSET; + buf.buffer = image->kernel_buf + STARTUP_NORMAL_OFFSET; + buf.bufsz = image->kernel_buf_len - STARTUP_NORMAL_OFFSET; buf.mem = STARTUP_NORMAL_OFFSET; if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; buf.memsz = buf.bufsz; - ret = kexec_add_buffer(&buf); - - data->kernel_buf = kernel; - data->parm = (void *)kernel + PARMAREA; + data->kernel_buf = image->kernel_buf; + data->parm = image->kernel_buf + PARMAREA; data->memsz += buf.memsz + STARTUP_NORMAL_OFFSET; - return ret; + return kexec_add_buffer(&buf); } static void *s390_image_load(struct kimage *image, @@ -43,24 +39,7 @@ static void *s390_image_load(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len) { - struct s390_load_data data = {0}; - int ret; - - ret = kexec_file_add_image_kernel(image, &data, kernel, kernel_len); - if (ret) - return ERR_PTR(ret); - - if (initrd) { - ret = kexec_file_add_initrd(image, &data, initrd, initrd_len); - if (ret) - return ERR_PTR(ret); - } - - ret = kexec_file_add_purgatory(image, &data); - if (ret) - return ERR_PTR(ret); - - return kexec_file_update_kernel(image, &data); + return kexec_file_add_components(image, kexec_file_add_kernel_image); } static int s390_image_probe(const char *buf, unsigned long len) diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 8a85ecd8428c..08409d61aeca 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -17,25 +17,6 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { NULL, }; -int *kexec_file_update_kernel(struct kimage *image, - struct s390_load_data *data) -{ - unsigned long *loc; - - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) - return ERR_PTR(-EINVAL); - - memcpy(data->parm->command_line, image->cmdline_buf, - image->cmdline_buf_len); - - if (image->type == KEXEC_TYPE_CRASH) { - data->parm->oldmem_base = crashk_res.start; - data->parm->oldmem_size = crashk_res.end - crashk_res.start + 1; - } - - return NULL; -} - static int kexec_file_update_purgatory(struct kimage *image) { u64 entry, type; @@ -78,7 +59,8 @@ static int kexec_file_update_purgatory(struct kimage *image) return ret; } -int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) +static int kexec_file_add_purgatory(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; int ret; @@ -98,16 +80,16 @@ int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) return ret; } -int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, - char *initrd, unsigned long initrd_len) +static int kexec_file_add_initrd(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; int ret; buf.image = image; - buf.buffer = initrd; - buf.bufsz = initrd_len; + buf.buffer = image->initrd_buf; + buf.bufsz = image->initrd_buf_len; data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; @@ -123,6 +105,40 @@ int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, return ret; } +void *kexec_file_add_components(struct kimage *image, + int (*add_kernel)(struct kimage *image, + struct s390_load_data *data)) +{ + struct s390_load_data data = {0}; + int ret; + + ret = add_kernel(image, &data); + if (ret) + return ERR_PTR(ret); + + if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) + return ERR_PTR(-EINVAL); + memcpy(data.parm->command_line, image->cmdline_buf, + image->cmdline_buf_len); + + if (image->type == KEXEC_TYPE_CRASH) { + data.parm->oldmem_base = crashk_res.start; + data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; + } + + if (image->initrd_buf) { + ret = kexec_file_add_initrd(image, &data); + if (ret) + return ERR_PTR(ret); + } + + ret = kexec_file_add_purgatory(image, &data); + if (ret) + return ERR_PTR(ret); + + return NULL; +} + int arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, From 653beba24d4cd281b078eab48c9bce956939061c Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Thu, 7 Mar 2019 15:56:34 +0100 Subject: [PATCH 58/98] s390/kexec_file: Load new kernel to absolute 0 The leading 64 kB of a kernel image doesn't contain any data needed to boot the new kernel when it was loaded via kexec_file. Thus kexec_file currently strips them off before loading the image. Keep the leading 64 kB in order to be able to pass a ipl_report to the next kernel. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/kexec.h | 6 ++++++ arch/s390/kernel/kexec_elf.c | 16 ++++------------ arch/s390/kernel/kexec_image.c | 9 +++++---- arch/s390/kernel/machine_kexec_file.c | 12 ++++++++++-- arch/s390/kernel/relocate_kernel.S | 3 +++ 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index a38a57ec6d8f..9ec077b0fb4d 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -43,6 +43,9 @@ /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 +/* Allow kexec_file to load a segment to 0 */ +#define KEXEC_BUF_MEM_UNKNOWN -1 + /* Provide a dummy definition to avoid build failures. */ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { } @@ -52,6 +55,9 @@ struct s390_load_data { /* Pointer to the kernel buffer. Used to register cmdline etc.. */ void *kernel_buf; + /* Load address of the kernel_buf. */ + unsigned long kernel_mem; + /* Parmarea in the kernel buffer. */ struct parmarea *parm; diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index c74ff6b54344..42bcd93f4318 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -39,28 +39,20 @@ static int kexec_file_add_kernel_elf(struct kimage *image, buf.bufsz = phdr->p_filesz; buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; buf.memsz = phdr->p_memsz; + data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; if (entry - phdr->p_paddr < phdr->p_memsz) { data->kernel_buf = buf.buffer; + data->kernel_mem = buf.mem; data->parm = buf.buffer + PARMAREA; - data->memsz += STARTUP_NORMAL_OFFSET; - - buf.buffer += STARTUP_NORMAL_OFFSET; - buf.bufsz -= STARTUP_NORMAL_OFFSET; - - buf.mem += STARTUP_NORMAL_OFFSET; - buf.memsz -= STARTUP_NORMAL_OFFSET; } - if (image->type == KEXEC_TYPE_CRASH) - buf.mem += crashk_res.start; - ret = kexec_add_buffer(&buf); if (ret) return ret; - - data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; } return data->memsz ? 0 : -EINVAL; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index d7e65eeae22f..7281540605b7 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -19,17 +19,18 @@ static int kexec_file_add_kernel_image(struct kimage *image, buf.image = image; - buf.buffer = image->kernel_buf + STARTUP_NORMAL_OFFSET; - buf.bufsz = image->kernel_buf_len - STARTUP_NORMAL_OFFSET; + buf.buffer = image->kernel_buf; + buf.bufsz = image->kernel_buf_len; - buf.mem = STARTUP_NORMAL_OFFSET; + buf.mem = 0; if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; buf.memsz = buf.bufsz; data->kernel_buf = image->kernel_buf; + data->kernel_mem = buf.mem; data->parm = image->kernel_buf + PARMAREA; - data->memsz += buf.memsz + STARTUP_NORMAL_OFFSET; + data->memsz += buf.memsz; return kexec_add_buffer(&buf); } diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 08409d61aeca..0e2a5a7a1b7c 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -17,7 +17,8 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { NULL, }; -static int kexec_file_update_purgatory(struct kimage *image) +static int kexec_file_update_purgatory(struct kimage *image, + struct s390_load_data *data) { u64 entry, type; int ret; @@ -76,7 +77,7 @@ static int kexec_file_add_purgatory(struct kimage *image, if (ret) return ret; - ret = kexec_file_update_purgatory(image); + ret = kexec_file_update_purgatory(image, data); return ret; } @@ -136,6 +137,13 @@ void *kexec_file_add_components(struct kimage *image, if (ret) return ERR_PTR(ret); + if (data.kernel_mem == 0) { + unsigned long restart_psw = 0x0008000080000000UL; + restart_psw += image->start; + memcpy(data.kernel_buf, &restart_psw, sizeof(restart_psw)); + image->start = 0; + } + return NULL; } diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index c97c2d40fe15..1b56f087ce2c 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -58,10 +58,13 @@ ENTRY(relocate_kernel) j .base .done: sgr %r0,%r0 # clear register r0 + cghi %r3,0 + je .diag la %r4,load_psw-.base(%r13) # load psw-address into the register o %r3,4(%r4) # or load address into psw st %r3,4(%r4) mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 + .diag: diag %r0,%r0,0x308 .align 8 From e23a8020ce4e094e10d717d39a8ce799243bf8c1 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Tue, 26 Feb 2019 10:50:39 +0100 Subject: [PATCH 59/98] s390/kexec_file: Signature verification prototype Add kernel signature verification to kexec_file. The verification is based on module signature verification and works with kernel images signed via scripts/sign-file. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 11 ++++ arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/performance_defconfig | 1 + arch/s390/defconfig | 1 + arch/s390/include/asm/kexec.h | 1 + arch/s390/kernel/kexec_elf.c | 3 + arch/s390/kernel/kexec_image.c | 3 + arch/s390/kernel/machine_kexec_file.c | 74 +++++++++++++++++++++++++ 8 files changed, 95 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 1c3fcf19c3af..21e851b0a989 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -553,6 +553,17 @@ config ARCH_HAS_KEXEC_PURGATORY def_bool y depends on KEXEC_FILE +config KEXEC_VERIFY_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on KEXEC_FILE && SYSTEM_DATA_VERIFICATION + help + This option makes kernel signature verification mandatory for + the kexec_file_load() syscall. + + In addition to that option, you need to enable signature + verification for the corresponding kernel image type being + loaded in order for this to work. + config ARCH_RANDOM def_bool y prompt "s390 architectural random number generation API" diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 9824c7bad9d4..b0920b35f87b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -64,6 +64,7 @@ CONFIG_NUMA=y CONFIG_PREEMPT=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 4fcbe5792744..09aa5cb14873 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -65,6 +65,7 @@ CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 4d58a92b5d97..c59b922cb6c5 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -39,6 +39,7 @@ CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 9ec077b0fb4d..59026899e766 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -65,6 +65,7 @@ struct s390_load_data { size_t memsz; }; +int s390_verify_sig(const char *kernel, unsigned long kernel_len); void *kexec_file_add_components(struct kimage *image, int (*add_kernel)(struct kimage *image, struct s390_load_data *data)); diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 42bcd93f4318..01e45d89b636 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -125,4 +125,7 @@ static int s390_elf_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_elf_ops = { .probe = s390_elf_probe, .load = s390_elf_load, +#ifdef CONFIG_KEXEC_VERIFY_SIG + .verify_sig = s390_verify_sig, +#endif /* CONFIG_KEXEC_VERIFY_SIG */ }; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index 7281540605b7..c378bbac5b35 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -54,4 +54,7 @@ static int s390_image_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_image_ops = { .probe = s390_image_probe, .load = s390_image_load, +#ifdef CONFIG_KEXEC_VERIFY_SIG + .verify_sig = s390_verify_sig, +#endif /* CONFIG_KEXEC_VERIFY_SIG */ }; diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 0e2a5a7a1b7c..e0f6e618e1bf 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -8,7 +8,11 @@ */ #include +#include #include +#include +#include +#include #include const struct kexec_file_ops * const kexec_file_loaders[] = { @@ -17,6 +21,76 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { NULL, }; +#ifdef CONFIG_KEXEC_VERIFY_SIG +/* + * Module signature information block. + * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; + +#define PKEY_ID_PKCS7 2 + +int s390_verify_sig(const char *kernel, unsigned long kernel_len) +{ + const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; + struct module_signature *ms; + unsigned long sig_len; + + /* Skip signature verification when not secure IPLed. */ + if (!ipl_secure_flag) + return 0; + + if (marker_len > kernel_len) + return -EKEYREJECTED; + + if (memcmp(kernel + kernel_len - marker_len, MODULE_SIG_STRING, + marker_len)) + return -EKEYREJECTED; + kernel_len -= marker_len; + + ms = (void *)kernel + kernel_len - sizeof(*ms); + kernel_len -= sizeof(*ms); + + sig_len = be32_to_cpu(ms->sig_len); + if (sig_len >= kernel_len) + return -EKEYREJECTED; + kernel_len -= sig_len; + + if (ms->id_type != PKEY_ID_PKCS7) + return -EKEYREJECTED; + + if (ms->algo != 0 || + ms->hash != 0 || + ms->signer_len != 0 || + ms->key_id_len != 0 || + ms->__pad[0] != 0 || + ms->__pad[1] != 0 || + ms->__pad[2] != 0) { + return -EBADMSG; + } + + return verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); +} +#endif /* CONFIG_KEXEC_VERIFY_SIG */ + static int kexec_file_update_purgatory(struct kimage *image, struct s390_load_data *data) { From 99feaa717e558cf4f2ad0faf53acac3cf9cc7438 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Mon, 18 Mar 2019 12:53:47 +0100 Subject: [PATCH 60/98] s390/kexec_file: Create ipl report and pass to next kernel Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ipl.h | 2 + arch/s390/include/asm/kexec.h | 2 + arch/s390/kernel/kexec_elf.c | 5 ++ arch/s390/kernel/kexec_image.c | 5 ++ arch/s390/kernel/machine_kexec_file.c | 81 ++++++++++++++++++++++++--- 5 files changed, 86 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index bf62af49af06..420d39ebdc46 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -36,6 +36,8 @@ struct ipl_parameter_block { #define IPL_MAX_SUPPORTED_VERSION (0) +#define IPL_RB_CERT_UNKNOWN ((unsigned short)-1) + #define DIAG308_VMPARM_SIZE (64) #define DIAG308_SCPDATA_OFFSET offsetof(struct ipl_parameter_block, \ fcp.scp_data) diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 59026899e766..305d3465574f 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -63,6 +63,8 @@ struct s390_load_data { /* Total size of loaded segments in memory. Used as an offset. */ size_t memsz; + + struct ipl_report *report; }; int s390_verify_sig(const char *kernel, unsigned long kernel_len); diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 01e45d89b636..6d0635ceddd0 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -10,6 +10,7 @@ #include #include #include +#include #include static int kexec_file_add_kernel_elf(struct kimage *image, @@ -50,6 +51,10 @@ static int kexec_file_add_kernel_elf(struct kimage *image, data->parm = buf.buffer + PARMAREA; } + ipl_report_add_component(data->report, &buf, + IPL_RB_COMPONENT_FLAG_SIGNED | + IPL_RB_COMPONENT_FLAG_VERIFIED, + IPL_RB_CERT_UNKNOWN); ret = kexec_add_buffer(&buf); if (ret) return ret; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index c378bbac5b35..58318bf89fd9 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -10,6 +10,7 @@ #include #include #include +#include #include static int kexec_file_add_kernel_image(struct kimage *image, @@ -32,6 +33,10 @@ static int kexec_file_add_kernel_image(struct kimage *image, data->parm = image->kernel_buf + PARMAREA; data->memsz += buf.memsz; + ipl_report_add_component(data->report, &buf, + IPL_RB_COMPONENT_FLAG_SIGNED | + IPL_RB_COMPONENT_FLAG_VERIFIED, + IPL_RB_CERT_UNKNOWN); return kexec_add_buffer(&buf); } diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index e0f6e618e1bf..48cab9600ed9 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -150,9 +151,9 @@ static int kexec_file_add_purgatory(struct kimage *image, ret = kexec_load_purgatory(image, &buf); if (ret) return ret; + data->memsz += buf.memsz; - ret = kexec_file_update_purgatory(image, data); - return ret; + return kexec_file_update_purgatory(image, data); } static int kexec_file_add_initrd(struct kimage *image, @@ -177,7 +178,60 @@ static int kexec_file_add_initrd(struct kimage *image, data->memsz += buf.memsz; ret = kexec_add_buffer(&buf); - return ret; + if (ret) + return ret; + + return ipl_report_add_component(data->report, &buf, 0, 0); +} + +static int kexec_file_add_ipl_report(struct kimage *image, + struct s390_load_data *data) +{ + __u32 *lc_ipl_parmblock_ptr; + unsigned int len, ncerts; + struct kexec_buf buf; + unsigned long addr; + void *ptr, *end; + + buf.image = image; + + data->memsz = ALIGN(data->memsz, PAGE_SIZE); + buf.mem = data->memsz; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ptr = (void *)ipl_cert_list_addr; + end = ptr + ipl_cert_list_size; + ncerts = 0; + while (ptr < end) { + ncerts++; + len = *(unsigned int *)ptr; + ptr += sizeof(len); + ptr += len; + } + + addr = data->memsz + data->report->size; + addr += ncerts * sizeof(struct ipl_rb_certificate_entry); + ptr = (void *)ipl_cert_list_addr; + while (ptr < end) { + len = *(unsigned int *)ptr; + ptr += sizeof(len); + ipl_report_add_certificate(data->report, ptr, addr, len); + addr += len; + ptr += len; + } + + buf.buffer = ipl_report_finish(data->report); + buf.bufsz = data->report->size; + buf.memsz = buf.bufsz; + + data->memsz += buf.memsz; + + lc_ipl_parmblock_ptr = + data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); + *lc_ipl_parmblock_ptr = (__u32)buf.mem; + + return kexec_add_buffer(&buf); } void *kexec_file_add_components(struct kimage *image, @@ -187,12 +241,18 @@ void *kexec_file_add_components(struct kimage *image, struct s390_load_data data = {0}; int ret; + data.report = ipl_report_init(&ipl_block); + if (IS_ERR(data.report)) + return data.report; + ret = add_kernel(image, &data); if (ret) - return ERR_PTR(ret); + goto out; - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) - return ERR_PTR(-EINVAL); + if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { + ret = -EINVAL; + goto out; + } memcpy(data.parm->command_line, image->cmdline_buf, image->cmdline_buf_len); @@ -204,12 +264,12 @@ void *kexec_file_add_components(struct kimage *image, if (image->initrd_buf) { ret = kexec_file_add_initrd(image, &data); if (ret) - return ERR_PTR(ret); + goto out; } ret = kexec_file_add_purgatory(image, &data); if (ret) - return ERR_PTR(ret); + goto out; if (data.kernel_mem == 0) { unsigned long restart_psw = 0x0008000080000000UL; @@ -218,7 +278,10 @@ void *kexec_file_add_components(struct kimage *image, image->start = 0; } - return NULL; + ret = kexec_file_add_ipl_report(image, &data); +out: + ipl_report_free(data.report); + return ERR_PTR(ret); } int arch_kexec_apply_relocations_add(struct purgatory_info *pi, From 268a78404973594d1a7ec3a2b6a2474e0543a435 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Tue, 26 Mar 2019 15:45:53 +0100 Subject: [PATCH 61/98] s390/kexec_file: Disable kexec_load when IPLed secure A kernel loaded via kexec_load cannot be verified. Thus disable kexec_load systemcall in kernels which where IPLed securely. Use the IMA mechanism to do so. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/Makefile | 2 ++ arch/s390/kernel/ima_arch.c | 14 ++++++++++++++ include/linux/ima.h | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 arch/s390/kernel/ima_arch.c diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 1222db6d4ee9..d28acd7ba81e 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -77,6 +77,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o +obj-$(CONFIG_IMA) += ima_arch.o + obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o diff --git a/arch/s390/kernel/ima_arch.c b/arch/s390/kernel/ima_arch.c new file mode 100644 index 000000000000..f3c3e6e1c5d3 --- /dev/null +++ b/arch/s390/kernel/ima_arch.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +bool arch_ima_get_secureboot(void) +{ + return ipl_secure_flag; +} + +const char * const *arch_get_ima_policy(void) +{ + return NULL; +} diff --git a/include/linux/ima.h b/include/linux/ima.h index dc12fbcf484c..fd9f7cf4cdf5 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -31,7 +31,7 @@ extern void ima_post_path_mknod(struct dentry *dentry); extern void ima_add_kexec_buffer(struct kimage *image); #endif -#if defined(CONFIG_X86) && defined(CONFIG_EFI) +#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390) extern bool arch_ima_get_secureboot(void); extern const char * const *arch_get_ima_policy(void); #else From c9896acc7851109d4e84c1e3a54cb1b9794dea6b Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Mon, 8 Apr 2019 14:24:08 +0200 Subject: [PATCH 62/98] s390/ipl: Provide has_secure sysfs attribute Provide an interface for userspace so it can find out if a machine is capeable of doing secure boot. The interface is, for example, needed for zipl so it can find out which file format it can/should write to disk. Signed-off-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/sclp.h | 2 ++ arch/s390/kernel/ipl.c | 17 +++++++++++++++++ drivers/s390/char/sclp.h | 4 +++- drivers/s390/char/sclp_early.c | 2 ++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index ef4c9dec06a4..ef5d8fa92122 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -79,6 +79,8 @@ struct sclp_info { unsigned char has_kss : 1; unsigned char has_gisaf : 1; unsigned char has_diag318 : 1; + unsigned char has_sipl : 1; + unsigned char has_sipl_g2 : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 6f2bb64cf70e..e123c0df83f1 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -283,6 +283,20 @@ static ssize_t ipl_secure_show(struct kobject *kobj, static struct kobj_attribute sys_ipl_secure_attr = __ATTR(secure, 0444, ipl_secure_show, NULL); +static ssize_t ipl_has_secure_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + if (MACHINE_IS_LPAR) + return sprintf(page, "%i\n", !!sclp.has_sipl); + else if (MACHINE_IS_VM) + return sprintf(page, "%i\n", !!sclp.has_sipl_g2); + else + return sprintf(page, "%i\n", 0); +} + +static struct kobj_attribute sys_ipl_has_secure_attr = + __ATTR(has_secure, 0444, ipl_has_secure_show, NULL); + static ssize_t ipl_vm_parm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -379,6 +393,7 @@ static struct attribute *ipl_fcp_attrs[] = { &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; @@ -395,6 +410,7 @@ static struct attribute *ipl_ccw_attrs_vm[] = { &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; @@ -403,6 +419,7 @@ static struct attribute *ipl_ccw_attrs_lpar[] = { &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 367e9d384d85..287382dc21c5 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -197,7 +197,9 @@ struct read_info_sccb { u32 hmfai; /* 124-127 */ u8 _pad_128[134 - 128]; /* 128-133 */ u8 byte_134; /* 134 */ - u8 _pad_135[4096 - 135]; /* 135-4095 */ + u8 _pad_135; /* 135 */ + u16 cbl; /* 136-137 */ + u8 _pad_138[4096 - 138]; /* 138-4095 */ } __packed __aligned(PAGE_SIZE); struct read_storage_sccb { diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 8332788681c4..dae9de3d7c96 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -40,6 +40,8 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) sclp.has_gisaf = !!(sccb->fac118 & 0x08); sclp.has_hvs = !!(sccb->fac119 & 0x80); sclp.has_kss = !!(sccb->fac98 & 0x01); + sclp.has_sipl = !!(sccb->cbl & 0x02); + sclp.has_sipl_g2 = !!(sccb->cbl & 0x04); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; if (sccb->fac91 & 0x40) From 6324b4de6dca40361399d3e9a2f2f1cbe8e7e11e Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 12 Feb 2019 16:23:13 +0100 Subject: [PATCH 63/98] s390/pci: mark command line parser data __initdata No point to keep that around. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index dc9bc82c072c..3262d7ea66d0 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -958,7 +958,7 @@ static void zpci_mem_exit(void) kmem_cache_destroy(zdev_fmb_cache); } -static unsigned int s390_pci_probe = 1; +static unsigned int s390_pci_probe __initdata = 1; static unsigned int s390_pci_initialized; char * __init pcibios_setup(char *str) From 066ee72aecdcb8cb2fd5968c852f304d7b88e2be Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 12 Feb 2019 18:41:49 +0100 Subject: [PATCH 64/98] s390/pci: remove unused define No users of pr_debug in that file. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 3262d7ea66d0..d88cee13425a 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -37,8 +37,6 @@ #include #include -#define DEBUG /* enable pr_debug */ - #define SIC_IRQ_MODE_ALL 0 #define SIC_IRQ_MODE_SINGLE 1 From c840927cf5f24d080236775e4c3a934e778069f5 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 12 Feb 2019 18:39:46 +0100 Subject: [PATCH 65/98] s390/pci: move everything irq related to pci_irq.c Move everything interrupt related from pci.c to pci_irq.c. No functional change. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci.h | 6 + arch/s390/pci/Makefile | 2 +- arch/s390/pci/pci.c | 216 ----------------------------------- arch/s390/pci/pci_irq.c | 222 ++++++++++++++++++++++++++++++++++++ 4 files changed, 229 insertions(+), 217 deletions(-) create mode 100644 arch/s390/pci/pci_irq.c diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 4e0efebc56a9..71517451750b 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -26,6 +26,9 @@ int pci_proc_domain(struct pci_bus *); #define ZPCI_BUS_NR 0 /* default bus number */ #define ZPCI_DEVFN 0 /* default device number */ +#define ZPCI_NR_DMA_SPACES 1 +#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS + /* PCI Function Controls */ #define ZPCI_FC_FN_ENABLED 0x80 #define ZPCI_FC_ERROR 0x40 @@ -219,6 +222,9 @@ struct zpci_dev *get_zdev_by_fid(u32); int zpci_dma_init(void); void zpci_dma_exit(void); +int __init zpci_irq_init(void); +void __init zpci_irq_exit(void); + /* FMB */ int zpci_fmb_enable_device(struct zpci_dev *); int zpci_fmb_disable_device(struct zpci_dev *); diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index 22d0871291ee..748626a33028 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -3,5 +3,5 @@ # Makefile for the s390 PCI subsystem. # -obj-$(CONFIG_PCI) += pci.o pci_dma.o pci_clp.o pci_sysfs.o \ +obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ pci_event.o pci_debug.o pci_insn.o pci_mmio.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index d88cee13425a..716e773af788 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -24,11 +24,8 @@ #include #include #include -#include -#include #include #include -#include #include #include @@ -37,28 +34,13 @@ #include #include -#define SIC_IRQ_MODE_ALL 0 -#define SIC_IRQ_MODE_SINGLE 1 - -#define ZPCI_NR_DMA_SPACES 1 -#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS - /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); -static struct irq_chip zpci_irq_chip = { - .name = "zPCI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, -}; - static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); static DEFINE_SPINLOCK(zpci_domain_lock); -static struct airq_iv *zpci_aisb_iv; -static struct airq_iv *zpci_aibv[ZPCI_NR_DEVICES]; - #define ZPCI_IOMAP_ENTRIES \ min(((unsigned long) ZPCI_NR_DEVICES * PCI_BAR_COUNT / 2), \ ZPCI_IOMAP_MAX_ENTRIES) @@ -121,39 +103,6 @@ int pci_proc_domain(struct pci_bus *bus) } EXPORT_SYMBOL_GPL(pci_proc_domain); -/* Modify PCI: Register adapter interruptions */ -static int zpci_set_airq(struct zpci_dev *zdev) -{ - u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); - struct zpci_fib fib = {0}; - u8 status; - - fib.isc = PCI_ISC; - fib.sum = 1; /* enable summary notifications */ - fib.noi = airq_iv_end(zdev->aibv); - fib.aibv = (unsigned long) zdev->aibv->vector; - fib.aibvo = 0; /* each zdev has its own interrupt vector */ - fib.aisb = (unsigned long) zpci_aisb_iv->vector + (zdev->aisb/64)*8; - fib.aisbo = zdev->aisb & 63; - - return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; -} - -/* Modify PCI: Unregister adapter interruptions */ -static int zpci_clear_airq(struct zpci_dev *zdev) -{ - u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); - struct zpci_fib fib = {0}; - u8 cc, status; - - cc = zpci_mod_fc(req, &fib, &status); - if (cc == 3 || (cc == 1 && status == 24)) - /* Function already gone or IRQs already deregistered. */ - cc = 0; - - return cc ? -EIO : 0; -} - /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, u64 base, u64 limit, u64 iota) @@ -352,136 +301,6 @@ static struct pci_ops pci_root_ops = { .write = pci_write, }; -static void zpci_irq_handler(struct airq_struct *airq) -{ - unsigned long si, ai; - struct airq_iv *aibv; - int irqs_on = 0; - - inc_irq_stat(IRQIO_PCI); - for (si = 0;;) { - /* Scan adapter summary indicator bit vector */ - si = airq_iv_scan(zpci_aisb_iv, si, airq_iv_end(zpci_aisb_iv)); - if (si == -1UL) { - if (irqs_on++) - /* End of second scan with interrupts on. */ - break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC)) - break; - si = 0; - continue; - } - - /* Scan the adapter interrupt vector for this device. */ - aibv = zpci_aibv[si]; - for (ai = 0;;) { - ai = airq_iv_scan(aibv, ai, airq_iv_end(aibv)); - if (ai == -1UL) - break; - inc_irq_stat(IRQIO_MSI); - airq_iv_lock(aibv, ai); - generic_handle_irq(airq_iv_get_data(aibv, ai)); - airq_iv_unlock(aibv, ai); - } - } -} - -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - struct zpci_dev *zdev = to_zpci(pdev); - unsigned int hwirq, msi_vecs; - unsigned long aisb; - struct msi_desc *msi; - struct msi_msg msg; - int rc, irq; - - zdev->aisb = -1UL; - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; - msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); - - /* Allocate adapter summary indicator bit */ - aisb = airq_iv_alloc_bit(zpci_aisb_iv); - if (aisb == -1UL) - return -EIO; - zdev->aisb = aisb; - - /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); - if (!zdev->aibv) - return -ENOMEM; - - /* Wire up shortcut pointer */ - zpci_aibv[aisb] = zdev->aibv; - - /* Request MSI interrupts */ - hwirq = 0; - for_each_pci_msi_entry(msi, pdev) { - if (hwirq >= msi_vecs) - break; - irq = irq_alloc_desc(0); /* Alloc irq on node 0 */ - if (irq < 0) - return -ENOMEM; - rc = irq_set_msi_desc(irq, msi); - if (rc) - return rc; - irq_set_chip_and_handler(irq, &zpci_irq_chip, - handle_simple_irq); - msg.data = hwirq; - msg.address_lo = zdev->msi_addr & 0xffffffff; - msg.address_hi = zdev->msi_addr >> 32; - pci_write_msi_msg(irq, &msg); - airq_iv_set_data(zdev->aibv, hwirq, irq); - hwirq++; - } - - /* Enable adapter interrupts */ - rc = zpci_set_airq(zdev); - if (rc) - return rc; - - return (msi_vecs == nvec) ? 0 : msi_vecs; -} - -void arch_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - int rc; - - /* Disable adapter interrupts */ - rc = zpci_clear_airq(zdev); - if (rc) - return; - - /* Release MSI interrupts */ - for_each_pci_msi_entry(msi, pdev) { - if (!msi->irq) - continue; - if (msi->msi_attrib.is_msix) - __pci_msix_desc_mask_irq(msi, 1); - else - __pci_msi_desc_mask_irq(msi, 1, 1); - irq_set_msi_desc(msi->irq, NULL); - irq_free_desc(msi->irq); - msi->msg.address_lo = 0; - msi->msg.address_hi = 0; - msi->msg.data = 0; - msi->irq = 0; - } - - if (zdev->aisb != -1UL) { - zpci_aibv[zdev->aisb] = NULL; - airq_iv_free_bit(zpci_aisb_iv, zdev->aisb); - zdev->aisb = -1UL; - } - if (zdev->aibv) { - airq_iv_release(zdev->aibv); - zdev->aibv = NULL; - } -} - #ifdef CONFIG_PCI_IOV static struct resource iov_res = { .name = "PCI IOV res", @@ -531,41 +350,6 @@ static void zpci_unmap_resources(struct pci_dev *pdev) } } -static struct airq_struct zpci_airq = { - .handler = zpci_irq_handler, - .isc = PCI_ISC, -}; - -static int __init zpci_irq_init(void) -{ - int rc; - - rc = register_adapter_interrupt(&zpci_airq); - if (rc) - goto out; - /* Set summary to 1 to be called every time for the ISC. */ - *zpci_airq.lsi_ptr = 1; - - rc = -ENOMEM; - zpci_aisb_iv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); - if (!zpci_aisb_iv) - goto out_airq; - - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC); - return 0; - -out_airq: - unregister_adapter_interrupt(&zpci_airq); -out: - return rc; -} - -static void zpci_irq_exit(void) -{ - airq_iv_release(zpci_aisb_iv); - unregister_adapter_interrupt(&zpci_airq); -} - static int zpci_alloc_iomap(struct zpci_dev *zdev) { unsigned long entry; diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c new file mode 100644 index 000000000000..3b129cd35b6e --- /dev/null +++ b/arch/s390/pci/pci_irq.c @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include + +#include +#include + +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 + +static struct airq_iv *zpci_aisb_iv; +static struct airq_iv *zpci_aibv[ZPCI_NR_DEVICES]; + +/* Modify PCI: Register adapter interruptions */ +static int zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {0}; + u8 status; + + fib.isc = PCI_ISC; + fib.sum = 1; /* enable summary notifications */ + fib.noi = airq_iv_end(zdev->aibv); + fib.aibv = (unsigned long) zdev->aibv->vector; + fib.aibvo = 0; /* each zdev has its own interrupt vector */ + fib.aisb = (unsigned long) zpci_aisb_iv->vector + (zdev->aisb/64)*8; + fib.aisbo = zdev->aisb & 63; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister adapter interruptions */ +static int zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {0}; + u8 cc, status; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +static struct irq_chip zpci_irq_chip = { + .name = "zPCI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, +}; + +static void zpci_irq_handler(struct airq_struct *airq) +{ + unsigned long si, ai; + struct airq_iv *aibv; + int irqs_on = 0; + + inc_irq_stat(IRQIO_PCI); + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(zpci_aisb_iv, si, airq_iv_end(zpci_aisb_iv)); + if (si == -1UL) { + if (irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC)) + break; + si = 0; + continue; + } + + /* Scan the adapter interrupt vector for this device. */ + aibv = zpci_aibv[si]; + for (ai = 0;;) { + ai = airq_iv_scan(aibv, ai, airq_iv_end(aibv)); + if (ai == -1UL) + break; + inc_irq_stat(IRQIO_MSI); + airq_iv_lock(aibv, ai); + generic_handle_irq(airq_iv_get_data(aibv, ai)); + airq_iv_unlock(aibv, ai); + } + } +} + +int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct zpci_dev *zdev = to_zpci(pdev); + unsigned int hwirq, msi_vecs; + unsigned long aisb; + struct msi_desc *msi; + struct msi_msg msg; + int rc, irq; + + zdev->aisb = -1UL; + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); + + /* Allocate adapter summary indicator bit */ + aisb = airq_iv_alloc_bit(zpci_aisb_iv); + if (aisb == -1UL) + return -EIO; + zdev->aisb = aisb; + + /* Create adapter interrupt vector */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + if (!zdev->aibv) + return -ENOMEM; + + /* Wire up shortcut pointer */ + zpci_aibv[aisb] = zdev->aibv; + + /* Request MSI interrupts */ + hwirq = 0; + for_each_pci_msi_entry(msi, pdev) { + if (hwirq >= msi_vecs) + break; + irq = irq_alloc_desc(0); /* Alloc irq on node 0 */ + if (irq < 0) + return -ENOMEM; + rc = irq_set_msi_desc(irq, msi); + if (rc) + return rc; + irq_set_chip_and_handler(irq, &zpci_irq_chip, + handle_simple_irq); + msg.data = hwirq; + msg.address_lo = zdev->msi_addr & 0xffffffff; + msg.address_hi = zdev->msi_addr >> 32; + pci_write_msi_msg(irq, &msg); + airq_iv_set_data(zdev->aibv, hwirq, irq); + hwirq++; + } + + /* Enable adapter interrupts */ + rc = zpci_set_airq(zdev); + if (rc) + return rc; + + return (msi_vecs == nvec) ? 0 : msi_vecs; +} + +void arch_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + struct msi_desc *msi; + int rc; + + /* Disable adapter interrupts */ + rc = zpci_clear_airq(zdev); + if (rc) + return; + + /* Release MSI interrupts */ + for_each_pci_msi_entry(msi, pdev) { + if (!msi->irq) + continue; + if (msi->msi_attrib.is_msix) + __pci_msix_desc_mask_irq(msi, 1); + else + __pci_msi_desc_mask_irq(msi, 1, 1); + irq_set_msi_desc(msi->irq, NULL); + irq_free_desc(msi->irq); + msi->msg.address_lo = 0; + msi->msg.address_hi = 0; + msi->msg.data = 0; + msi->irq = 0; + } + + if (zdev->aisb != -1UL) { + zpci_aibv[zdev->aisb] = NULL; + airq_iv_free_bit(zpci_aisb_iv, zdev->aisb); + zdev->aisb = -1UL; + } + if (zdev->aibv) { + airq_iv_release(zdev->aibv); + zdev->aibv = NULL; + } +} + +static struct airq_struct zpci_airq = { + .handler = zpci_irq_handler, + .isc = PCI_ISC, +}; + +int __init zpci_irq_init(void) +{ + int rc; + + rc = register_adapter_interrupt(&zpci_airq); + if (rc) + goto out; + /* Set summary to 1 to be called every time for the ISC. */ + *zpci_airq.lsi_ptr = 1; + + rc = -ENOMEM; + zpci_aisb_iv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + if (!zpci_aisb_iv) + goto out_airq; + + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC); + return 0; + +out_airq: + unregister_adapter_interrupt(&zpci_airq); +out: + return rc; +} + +void __init zpci_irq_exit(void) +{ + airq_iv_release(zpci_aisb_iv); + unregister_adapter_interrupt(&zpci_airq); +} From 0a9fddfaa8ea0f66564329ce89390c8dd38b2df0 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 12 Feb 2019 12:37:50 +0100 Subject: [PATCH 66/98] s390/sclp: detect DIRQ facility Detect the adapter CPU directed interruption facility. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/sclp.h | 1 + drivers/s390/char/sclp.h | 2 +- drivers/s390/char/sclp_early.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index ef5d8fa92122..f577c5f6031a 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -81,6 +81,7 @@ struct sclp_info { unsigned char has_diag318 : 1; unsigned char has_sipl : 1; unsigned char has_sipl_g2 : 1; + unsigned char has_dirq : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 287382dc21c5..043f32ba3749 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -197,7 +197,7 @@ struct read_info_sccb { u32 hmfai; /* 124-127 */ u8 _pad_128[134 - 128]; /* 128-133 */ u8 byte_134; /* 134 */ - u8 _pad_135; /* 135 */ + u8 cpudirq; /* 135 */ u16 cbl; /* 136-137 */ u8 _pad_138[4096 - 138]; /* 138-4095 */ } __packed __aligned(PAGE_SIZE); diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index dae9de3d7c96..fdad2a980129 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -95,6 +95,7 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) sclp.mtid_prev = (sccb->fac42 & 0x80) ? (sccb->fac66 & 31) : 0; sclp.hmfai = sccb->hmfai; + sclp.has_dirq = !!(sccb->cpudirq & 0x80); } /* From 30e63ef2ef43f014bf2039bd57cc917780d6a44b Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Sun, 28 Oct 2018 11:51:56 +0100 Subject: [PATCH 67/98] s390/airq: recognize directed interrupts Add an extra parameter for airq handlers to recognize floating vs. directed interrupts. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/airq.h | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/s390/pci/pci_irq.c | 2 +- drivers/s390/cio/airq.c | 2 +- drivers/s390/cio/cio.h | 2 +- drivers/s390/cio/qdio_thinint.c | 4 ++-- drivers/s390/crypto/ap_bus.c | 4 ++-- drivers/s390/virtio/virtio_ccw.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index fcf539efb32f..91e78df365c7 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -14,7 +14,7 @@ struct airq_struct { struct hlist_node list; /* Handler queueing. */ - void (*handler)(struct airq_struct *); /* Thin-interrupt handler */ + void (*handler)(struct airq_struct *airq, bool floating); u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ u8 lsi_mask; /* Local-Summary-Indicator mask */ u8 isc; /* Interrupt-subclass */ diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 82162867f378..37503ae62486 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3194,7 +3194,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq) +static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) { inc_irq_stat(IRQIO_GAL); process_gib_alert_list(); diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 3b129cd35b6e..55e22e392376 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -56,7 +56,7 @@ static struct irq_chip zpci_irq_chip = { .irq_mask = pci_msi_mask_irq, }; -static void zpci_irq_handler(struct airq_struct *airq) +static void zpci_irq_handler(struct airq_struct *airq, bool floating) { unsigned long si, ai; struct airq_iv *aibv; diff --git a/drivers/s390/cio/airq.c b/drivers/s390/cio/airq.c index a45011e4529e..e045e79f061c 100644 --- a/drivers/s390/cio/airq.c +++ b/drivers/s390/cio/airq.c @@ -95,7 +95,7 @@ static irqreturn_t do_airq_interrupt(int irq, void *dummy) rcu_read_lock(); hlist_for_each_entry_rcu(airq, head, list) if ((*airq->lsi_ptr & airq->lsi_mask) != 0) - airq->handler(airq); + airq->handler(airq, !tpi_info->directed_irq); rcu_read_unlock(); return IRQ_HANDLED; diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index 92eabbb5f18d..06a91743335a 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -51,7 +51,7 @@ struct tpi_info { struct subchannel_id schid; u32 intparm; u32 adapter_IO:1; - u32 :1; + u32 directed_irq:1; u32 isc:3; u32 :27; u32 type:3; diff --git a/drivers/s390/cio/qdio_thinint.c b/drivers/s390/cio/qdio_thinint.c index 07dea602205b..28d59ac2204c 100644 --- a/drivers/s390/cio/qdio_thinint.c +++ b/drivers/s390/cio/qdio_thinint.c @@ -40,7 +40,7 @@ static LIST_HEAD(tiq_list); static DEFINE_MUTEX(tiq_list_lock); /* Adapter interrupt definitions */ -static void tiqdio_thinint_handler(struct airq_struct *airq); +static void tiqdio_thinint_handler(struct airq_struct *airq, bool floating); static struct airq_struct tiqdio_airq = { .handler = tiqdio_thinint_handler, @@ -179,7 +179,7 @@ static inline void tiqdio_call_inq_handlers(struct qdio_irq *irq) * tiqdio_thinint_handler - thin interrupt handler for qdio * @airq: pointer to adapter interrupt descriptor */ -static void tiqdio_thinint_handler(struct airq_struct *airq) +static void tiqdio_thinint_handler(struct airq_struct *airq, bool floating) { u32 si_used = clear_shared_ind(); struct qdio_q *q; diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 1546389d71db..cc30e4f07fff 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -116,7 +116,7 @@ static int user_set_domain; static struct bus_type ap_bus_type; /* Adapter interrupt definitions */ -static void ap_interrupt_handler(struct airq_struct *airq); +static void ap_interrupt_handler(struct airq_struct *airq, bool floating); static int ap_airq_flag; @@ -393,7 +393,7 @@ static enum hrtimer_restart ap_poll_timeout(struct hrtimer *unused) * ap_interrupt_handler() - Schedule ap_tasklet on interrupt * @airq: pointer to adapter interrupt descriptor */ -static void ap_interrupt_handler(struct airq_struct *airq) +static void ap_interrupt_handler(struct airq_struct *airq, bool floating) { inc_irq_stat(IRQIO_APB); if (!ap_suspend_flag) diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 74c328321889..991420caa4f2 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -182,7 +182,7 @@ static void drop_airq_indicator(struct virtqueue *vq, struct airq_info *info) write_unlock_irqrestore(&info->lock, flags); } -static void virtio_airq_handler(struct airq_struct *airq) +static void virtio_airq_handler(struct airq_struct *airq, bool floating) { struct airq_info *info = container_of(airq, struct airq_info, airq); unsigned long ai; From b1f548645cb587c14f5e751b9163939d0723885f Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 14 Feb 2019 12:56:50 +0100 Subject: [PATCH 68/98] s390/pci: clarify interrupt vector usage Rename and clarify the usage of the interrupt bit vectors. Also change the array of the per-function bit vectors to be dynamically allocated. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/pci/pci_irq.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 55e22e392376..0170db93be82 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -14,8 +14,15 @@ #define SIC_IRQ_MODE_ALL 0 #define SIC_IRQ_MODE_SINGLE 1 -static struct airq_iv *zpci_aisb_iv; -static struct airq_iv *zpci_aibv[ZPCI_NR_DEVICES]; +/* + * summary bit vector - one summary bit per function + */ +static struct airq_iv *zpci_sbv; + +/* + * interrupt bit vectors - one vector per function + */ +static struct airq_iv **zpci_ibv; /* Modify PCI: Register adapter interruptions */ static int zpci_set_airq(struct zpci_dev *zdev) @@ -29,7 +36,7 @@ static int zpci_set_airq(struct zpci_dev *zdev) fib.noi = airq_iv_end(zdev->aibv); fib.aibv = (unsigned long) zdev->aibv->vector; fib.aibvo = 0; /* each zdev has its own interrupt vector */ - fib.aisb = (unsigned long) zpci_aisb_iv->vector + (zdev->aisb/64)*8; + fib.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8; fib.aisbo = zdev->aisb & 63; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; @@ -65,7 +72,7 @@ static void zpci_irq_handler(struct airq_struct *airq, bool floating) inc_irq_stat(IRQIO_PCI); for (si = 0;;) { /* Scan adapter summary indicator bit vector */ - si = airq_iv_scan(zpci_aisb_iv, si, airq_iv_end(zpci_aisb_iv)); + si = airq_iv_scan(zpci_sbv, si, airq_iv_end(zpci_sbv)); if (si == -1UL) { if (irqs_on++) /* End of second scan with interrupts on. */ @@ -78,7 +85,7 @@ static void zpci_irq_handler(struct airq_struct *airq, bool floating) } /* Scan the adapter interrupt vector for this device. */ - aibv = zpci_aibv[si]; + aibv = zpci_ibv[si]; for (ai = 0;;) { ai = airq_iv_scan(aibv, ai, airq_iv_end(aibv)); if (ai == -1UL) @@ -106,7 +113,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); /* Allocate adapter summary indicator bit */ - aisb = airq_iv_alloc_bit(zpci_aisb_iv); + aisb = airq_iv_alloc_bit(zpci_sbv); if (aisb == -1UL) return -EIO; zdev->aisb = aisb; @@ -117,7 +124,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) return -ENOMEM; /* Wire up shortcut pointer */ - zpci_aibv[aisb] = zdev->aibv; + zpci_ibv[aisb] = zdev->aibv; /* Request MSI interrupts */ hwirq = 0; @@ -176,8 +183,8 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) } if (zdev->aisb != -1UL) { - zpci_aibv[zdev->aisb] = NULL; - airq_iv_free_bit(zpci_aisb_iv, zdev->aisb); + zpci_ibv[zdev->aisb] = NULL; + airq_iv_free_bit(zpci_sbv, zdev->aisb); zdev->aisb = -1UL; } if (zdev->aibv) { @@ -202,13 +209,19 @@ int __init zpci_irq_init(void) *zpci_airq.lsi_ptr = 1; rc = -ENOMEM; - zpci_aisb_iv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); - if (!zpci_aisb_iv) + zpci_ibv = kcalloc(ZPCI_NR_DEVICES, sizeof(*zpci_ibv), GFP_KERNEL); + if (!zpci_ibv) goto out_airq; + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + if (!zpci_sbv) + goto out_free; + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC); return 0; +out_free: + kfree(zpci_ibv); out_airq: unregister_adapter_interrupt(&zpci_airq); out: @@ -217,6 +230,7 @@ out: void __init zpci_irq_exit(void) { - airq_iv_release(zpci_aisb_iv); + airq_iv_release(zpci_sbv); + kfree(zpci_ibv); unregister_adapter_interrupt(&zpci_airq); } From 414cbd1e3d14ec0e60666a0fb9d8ae2d77eb7c63 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 27 Feb 2019 13:56:08 +0100 Subject: [PATCH 69/98] s390/airq: provide cacheline aligned ivs Provide the ability to create cachesize aligned interrupt vectors. These will be used for per-CPU interrupt vectors. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/airq.h | 10 +++++---- drivers/s390/cio/airq.c | 39 +++++++++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index 91e78df365c7..c10d2ee2dfda 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -35,13 +35,15 @@ struct airq_iv { unsigned int *data; /* 32 bit value associated with each bit */ unsigned long bits; /* Number of bits in the vector */ unsigned long end; /* Number of highest allocated bit + 1 */ + unsigned long flags; /* Allocation flags */ spinlock_t lock; /* Lock to protect alloc & free */ }; -#define AIRQ_IV_ALLOC 1 /* Use an allocation bit mask */ -#define AIRQ_IV_BITLOCK 2 /* Allocate the lock bit mask */ -#define AIRQ_IV_PTR 4 /* Allocate the ptr array */ -#define AIRQ_IV_DATA 8 /* Allocate the data array */ +#define AIRQ_IV_ALLOC 1 /* Use an allocation bit mask */ +#define AIRQ_IV_BITLOCK 2 /* Allocate the lock bit mask */ +#define AIRQ_IV_PTR 4 /* Allocate the ptr array */ +#define AIRQ_IV_DATA 8 /* Allocate the data array */ +#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); void airq_iv_release(struct airq_iv *iv); diff --git a/drivers/s390/cio/airq.c b/drivers/s390/cio/airq.c index e045e79f061c..4534afc63591 100644 --- a/drivers/s390/cio/airq.c +++ b/drivers/s390/cio/airq.c @@ -27,6 +27,8 @@ static DEFINE_SPINLOCK(airq_lists_lock); static struct hlist_head airq_lists[MAX_ISC+1]; +static struct kmem_cache *airq_iv_cache; + /** * register_adapter_interrupt() - register adapter interrupt handler * @airq: pointer to adapter interrupt descriptor @@ -129,10 +131,21 @@ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags) if (!iv) goto out; iv->bits = bits; + iv->flags = flags; size = BITS_TO_LONGS(bits) * sizeof(unsigned long); - iv->vector = kzalloc(size, GFP_KERNEL); - if (!iv->vector) - goto out_free; + + if (flags & AIRQ_IV_CACHELINE) { + if ((cache_line_size() * BITS_PER_BYTE) < bits) + goto out_free; + + iv->vector = kmem_cache_zalloc(airq_iv_cache, GFP_KERNEL); + if (!iv->vector) + goto out_free; + } else { + iv->vector = kzalloc(size, GFP_KERNEL); + if (!iv->vector) + goto out_free; + } if (flags & AIRQ_IV_ALLOC) { iv->avail = kmalloc(size, GFP_KERNEL); if (!iv->avail) @@ -165,7 +178,10 @@ out_free: kfree(iv->ptr); kfree(iv->bitlock); kfree(iv->avail); - kfree(iv->vector); + if (iv->flags & AIRQ_IV_CACHELINE) + kmem_cache_free(airq_iv_cache, iv->vector); + else + kfree(iv->vector); kfree(iv); out: return NULL; @@ -181,7 +197,10 @@ void airq_iv_release(struct airq_iv *iv) kfree(iv->data); kfree(iv->ptr); kfree(iv->bitlock); - kfree(iv->vector); + if (iv->flags & AIRQ_IV_CACHELINE) + kmem_cache_free(airq_iv_cache, iv->vector); + else + kfree(iv->vector); kfree(iv->avail); kfree(iv); } @@ -275,3 +294,13 @@ unsigned long airq_iv_scan(struct airq_iv *iv, unsigned long start, return bit; } EXPORT_SYMBOL(airq_iv_scan); + +static int __init airq_init(void) +{ + airq_iv_cache = kmem_cache_create("airq_iv_cache", cache_line_size(), + cache_line_size(), 0, NULL); + if (!airq_iv_cache) + return -ENOMEM; + return 0; +} +subsys_initcall(airq_init); From e979ce7bced2ee019b5b1a040295484bd7f23680 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 27 Sep 2018 13:57:12 +0200 Subject: [PATCH 70/98] s390/pci: provide support for CPU directed interrupts Up until now all interrupts on s390 have been floating. For MSI interrupts we've used a global summary bit vector (with a bit for each function) and a per-function interrupt bit vector (with a bit per MSI). This patch introduces a new IRQ delivery mode: CPU directed interrupts. In this new mode a per-CPU interrupt bit vector is used (with a bit per MSI per function). Further it is now possible to direct an IRQ to a specific CPU so we can finally support IRQ affinity. If an interrupt can't be delivered because the appointed CPU is occupied by a hypervisor the interrupt is delivered floating. For this a global summary bit vector is used (with a bit per CPU). Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci.h | 2 + arch/s390/include/asm/pci_clp.h | 6 +- arch/s390/include/asm/pci_insn.h | 74 ++++++- arch/s390/pci/pci_insn.c | 10 +- arch/s390/pci/pci_irq.c | 337 ++++++++++++++++++++++++++----- 5 files changed, 367 insertions(+), 62 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 71517451750b..a285754d0742 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -115,6 +115,8 @@ struct zpci_dev { /* IRQ stuff */ u64 msi_addr; /* MSI address */ unsigned int max_msi; /* maximum number of MSI's */ + unsigned int msi_first_bit; + unsigned int msi_nr_irqs; struct airq_iv *aibv; /* adapter interrupt bit vector */ unsigned long aisb; /* number of the summary bit */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index b3b31b31f0d3..d2d824a91e66 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -118,7 +118,11 @@ struct clp_rsp_query_pci_grp { u8 refresh : 1; /* TLB refresh mode */ u16 reserved2; u16 mui; - u64 reserved3; + u16 : 16; + u16 maxfaal; + u16 : 4; + u16 dnoi : 12; + u16 maxcpu; u64 dasm; /* dma address space mask */ u64 msia; /* MSI address */ u64 reserved4; diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index ba22a6ea51a1..ab1031070503 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -38,6 +38,8 @@ #define ZPCI_MOD_FC_RESET_ERROR 7 #define ZPCI_MOD_FC_RESET_BLOCK 9 #define ZPCI_MOD_FC_SET_MEASURE 10 +#define ZPCI_MOD_FC_REG_INT_D 16 +#define ZPCI_MOD_FC_DEREG_INT_D 17 /* FIB function controls */ #define ZPCI_FIB_FC_ENABLED 0x80 @@ -51,16 +53,7 @@ #define ZPCI_FIB_FC_LS_BLOCKED 0x20 #define ZPCI_FIB_FC_DMAAS_REG 0x10 -/* Function Information Block */ -struct zpci_fib { - u32 fmt : 8; /* format */ - u32 : 24; - u32 : 32; - u8 fc; /* function controls */ - u64 : 56; - u64 pba; /* PCI base address */ - u64 pal; /* PCI address limit */ - u64 iota; /* I/O Translation Anchor */ +struct zpci_fib_fmt0 { u32 : 1; u32 isc : 3; /* Interrupt subclass */ u32 noi : 12; /* Number of interrupts */ @@ -72,16 +65,75 @@ struct zpci_fib { u32 : 32; u64 aibv; /* Adapter int bit vector address */ u64 aisb; /* Adapter int summary bit address */ +}; + +struct zpci_fib_fmt1 { + u32 : 4; + u32 noi : 12; + u32 : 16; + u32 dibvo : 16; + u32 : 16; + u64 : 64; + u64 : 64; +}; + +/* Function Information Block */ +struct zpci_fib { + u32 fmt : 8; /* format */ + u32 : 24; + u32 : 32; + u8 fc; /* function controls */ + u64 : 56; + u64 pba; /* PCI base address */ + u64 pal; /* PCI address limit */ + u64 iota; /* I/O Translation Anchor */ + union { + struct zpci_fib_fmt0 fmt0; + struct zpci_fib_fmt1 fmt1; + }; u64 fmb_addr; /* Function measurement block address and key */ u32 : 32; u32 gd; } __packed __aligned(8); +/* directed interruption information block */ +struct zpci_diib { + u32 : 1; + u32 isc : 3; + u32 : 28; + u16 : 16; + u16 nr_cpus; + u64 disb_addr; + u64 : 64; + u64 : 64; +} __packed __aligned(8); + +/* cpu directed interruption information block */ +struct zpci_cdiib { + u64 : 64; + u64 dibv_addr; + u64 : 64; + u64 : 64; + u64 : 64; +} __packed __aligned(8); + +union zpci_sic_iib { + struct zpci_diib diib; + struct zpci_cdiib cdiib; +}; + u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status); int zpci_refresh_trans(u64 fn, u64 addr, u64 range); int zpci_load(u64 *data, u64 req, u64 offset); int zpci_store(u64 data, u64 req, u64 offset); int zpci_store_block(const u64 *data, u64 req, u64 offset); -int zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc); +int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); + +static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return __zpci_set_irq_ctrl(ctl, isc, &iib); +} #endif diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index f069929e8211..4b2ca068d40e 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -96,13 +96,15 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -int zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc) +int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) { if (!test_facility(72)) return -EIO; - asm volatile ( - " .insn rsy,0xeb00000000d1,%[ctl],%[isc],%[u]\n" - : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [u] "Q" (*unused)); + + asm volatile( + ".insn rsy,0xeb00000000d1,%[ctl],%[isc],%[iib]\n" + : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [iib] "Q" (*iib)); + return 0; } diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 0170db93be82..4bfd902f27f4 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -7,20 +7,31 @@ #include #include #include +#include #include #include +static enum {FLOATING, DIRECTED} irq_delivery; + #define SIC_IRQ_MODE_ALL 0 #define SIC_IRQ_MODE_SINGLE 1 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 18 /* - * summary bit vector - one summary bit per function + * summary bit vector + * FLOATING - summary bit per function + * DIRECTED - summary bit per cpu (only used in fallback path) */ static struct airq_iv *zpci_sbv; /* - * interrupt bit vectors - one vector per function + * interrupt bit vectors + * FLOATING - interrupt bit vector per function + * DIRECTED - interrupt bit vector per cpu */ static struct airq_iv **zpci_ibv; @@ -31,13 +42,13 @@ static int zpci_set_airq(struct zpci_dev *zdev) struct zpci_fib fib = {0}; u8 status; - fib.isc = PCI_ISC; - fib.sum = 1; /* enable summary notifications */ - fib.noi = airq_iv_end(zdev->aibv); - fib.aibv = (unsigned long) zdev->aibv->vector; - fib.aibvo = 0; /* each zdev has its own interrupt vector */ - fib.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8; - fib.aisbo = zdev->aisb & 63; + fib.fmt0.isc = PCI_ISC; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = (unsigned long) zdev->aibv->vector; + fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ + fib.fmt0.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8; + fib.fmt0.aisbo = zdev->aisb & 63; return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; } @@ -57,13 +68,134 @@ static int zpci_clear_airq(struct zpci_dev *zdev) return cc ? -EIO : 0; } +/* Modify PCI: Register CPU directed interruptions */ +static int zpci_set_directed_irq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT_D); + struct zpci_fib fib = {0}; + u8 status; + + fib.fmt = 1; + fib.fmt1.noi = zdev->msi_nr_irqs; + fib.fmt1.dibvo = zdev->msi_first_bit; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister CPU directed interruptions */ +static int zpci_clear_directed_irq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT_D); + struct zpci_fib fib = {0}; + u8 cc, status; + + fib.fmt = 1; + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + struct msi_desc *entry = irq_get_msi_desc(data->irq); + struct msi_msg msg = entry->msg; + + msg.address_lo &= 0xff0000ff; + msg.address_lo |= (cpumask_first(dest) << 8); + pci_write_msi_msg(data->irq, &msg); + + return IRQ_SET_MASK_OK; +} + static struct irq_chip zpci_irq_chip = { .name = "zPCI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, + .irq_set_affinity = zpci_set_irq_affinity, }; -static void zpci_irq_handler(struct airq_struct *airq, bool floating) +static void zpci_handle_cpu_local_irq(bool rescan) +{ + struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + unsigned long bit; + int irqs_on = 0; + + for (bit = 0;;) { + /* Scan the directed IRQ bit vector */ + bit = airq_iv_scan(dibv, bit, airq_iv_end(dibv)); + if (bit == -1UL) { + if (!rescan || irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC)) + break; + bit = 0; + continue; + } + inc_irq_stat(IRQIO_MSI); + generic_handle_irq(airq_iv_get_data(dibv, bit)); + } +} + +struct cpu_irq_data { + call_single_data_t csd; + atomic_t scheduled; +}; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_irq_data, irq_data); + +static void zpci_handle_remote_irq(void *data) +{ + atomic_t *scheduled = data; + + do { + zpci_handle_cpu_local_irq(false); + } while (atomic_dec_return(scheduled)); +} + +static void zpci_handle_fallback_irq(void) +{ + struct cpu_irq_data *cpu_data; + unsigned long cpu; + int irqs_on = 0; + + for (cpu = 0;;) { + cpu = airq_iv_scan(zpci_sbv, cpu, airq_iv_end(zpci_sbv)); + if (cpu == -1UL) { + if (irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + break; + cpu = 0; + continue; + } + cpu_data = &per_cpu(irq_data, cpu); + if (atomic_inc_return(&cpu_data->scheduled) > 1) + continue; + + cpu_data->csd.func = zpci_handle_remote_irq; + cpu_data->csd.info = &cpu_data->scheduled; + cpu_data->csd.flags = 0; + smp_call_function_single_async(cpu, &cpu_data->csd); + } +} + +static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) +{ + inc_irq_stat(IRQIO_PCI); + if (floating) + zpci_handle_fallback_irq(); + else + zpci_handle_cpu_local_irq(true); +} + +static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) { unsigned long si, ai; struct airq_iv *aibv; @@ -78,7 +210,7 @@ static void zpci_irq_handler(struct airq_struct *airq, bool floating) /* End of second scan with interrupts on. */ break; /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC)) + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) break; si = 0; continue; @@ -101,54 +233,79 @@ static void zpci_irq_handler(struct airq_struct *airq, bool floating) int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { struct zpci_dev *zdev = to_zpci(pdev); - unsigned int hwirq, msi_vecs; - unsigned long aisb; + unsigned int hwirq, msi_vecs, cpu; + unsigned long bit; struct msi_desc *msi; struct msi_msg msg; int rc, irq; zdev->aisb = -1UL; + zdev->msi_first_bit = -1U; if (type == PCI_CAP_ID_MSI && nvec > 1) return 1; msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); - /* Allocate adapter summary indicator bit */ - aisb = airq_iv_alloc_bit(zpci_sbv); - if (aisb == -1UL) - return -EIO; - zdev->aisb = aisb; + if (irq_delivery == DIRECTED) { + /* Allocate cpu vector bits */ + bit = airq_iv_alloc(zpci_ibv[0], msi_vecs); + if (bit == -1UL) + return -EIO; + } else { + /* Allocate adapter summary indicator bit */ + bit = airq_iv_alloc_bit(zpci_sbv); + if (bit == -1UL) + return -EIO; + zdev->aisb = bit; - /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); - if (!zdev->aibv) - return -ENOMEM; + /* Create adapter interrupt vector */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + if (!zdev->aibv) + return -ENOMEM; - /* Wire up shortcut pointer */ - zpci_ibv[aisb] = zdev->aibv; + /* Wire up shortcut pointer */ + zpci_ibv[bit] = zdev->aibv; + /* Each function has its own interrupt vector */ + bit = 0; + } /* Request MSI interrupts */ - hwirq = 0; + hwirq = bit; for_each_pci_msi_entry(msi, pdev) { - if (hwirq >= msi_vecs) + rc = -EIO; + if (hwirq - bit >= msi_vecs) break; - irq = irq_alloc_desc(0); /* Alloc irq on node 0 */ + irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, msi->affinity); if (irq < 0) return -ENOMEM; rc = irq_set_msi_desc(irq, msi); if (rc) return rc; irq_set_chip_and_handler(irq, &zpci_irq_chip, - handle_simple_irq); + handle_percpu_irq); msg.data = hwirq; - msg.address_lo = zdev->msi_addr & 0xffffffff; + if (irq_delivery == DIRECTED) { + msg.address_lo = zdev->msi_addr & 0xff0000ff; + msg.address_lo |= msi->affinity ? + (cpumask_first(&msi->affinity->mask) << 8) : 0; + for_each_possible_cpu(cpu) { + airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); + } + } else { + msg.address_lo = zdev->msi_addr & 0xffffffff; + airq_iv_set_data(zdev->aibv, hwirq, irq); + } msg.address_hi = zdev->msi_addr >> 32; pci_write_msi_msg(irq, &msg); - airq_iv_set_data(zdev->aibv, hwirq, irq); hwirq++; } - /* Enable adapter interrupts */ - rc = zpci_set_airq(zdev); + zdev->msi_first_bit = bit; + zdev->msi_nr_irqs = msi_vecs; + + if (irq_delivery == DIRECTED) + rc = zpci_set_directed_irq(zdev); + else + rc = zpci_set_airq(zdev); if (rc) return rc; @@ -161,8 +318,11 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) struct msi_desc *msi; int rc; - /* Disable adapter interrupts */ - rc = zpci_clear_airq(zdev); + /* Disable interrupts */ + if (irq_delivery == DIRECTED) + rc = zpci_clear_directed_irq(zdev); + else + rc = zpci_clear_airq(zdev); if (rc) return; @@ -191,37 +351,114 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) airq_iv_release(zdev->aibv); zdev->aibv = NULL; } + + if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U) + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); } static struct airq_struct zpci_airq = { - .handler = zpci_irq_handler, + .handler = zpci_floating_irq_handler, .isc = PCI_ISC, }; +static void __init cpu_enable_directed_irq(void *unused) +{ + union zpci_sic_iib iib = {{0}}; + + iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; + + __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC); +} + +static int __init zpci_directed_irq_init(void) +{ + union zpci_sic_iib iib = {{0}}; + unsigned int cpu; + + zpci_sbv = airq_iv_create(num_possible_cpus(), 0); + if (!zpci_sbv) + return -ENOMEM; + + iib.diib.isc = PCI_ISC; + iib.diib.nr_cpus = num_possible_cpus(); + iib.diib.disb_addr = (u64) zpci_sbv->vector; + __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + + zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), + GFP_KERNEL); + if (!zpci_ibv) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + /* + * Per CPU IRQ vectors look the same but bit-allocation + * is only done on the first vector. + */ + zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, + AIRQ_IV_DATA | + AIRQ_IV_CACHELINE | + (!cpu ? AIRQ_IV_ALLOC : 0)); + if (!zpci_ibv[cpu]) + return -ENOMEM; + } + on_each_cpu(cpu_enable_directed_irq, NULL, 1); + + zpci_irq_chip.irq_set_affinity = zpci_set_irq_affinity; + + return 0; +} + +static int __init zpci_floating_irq_init(void) +{ + zpci_ibv = kcalloc(ZPCI_NR_DEVICES, sizeof(*zpci_ibv), GFP_KERNEL); + if (!zpci_ibv) + return -ENOMEM; + + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + if (!zpci_sbv) + goto out_free; + + return 0; + +out_free: + kfree(zpci_ibv); + return -ENOMEM; +} + int __init zpci_irq_init(void) { int rc; + irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; + if (irq_delivery == DIRECTED) + zpci_airq.handler = zpci_directed_irq_handler; + rc = register_adapter_interrupt(&zpci_airq); if (rc) goto out; /* Set summary to 1 to be called every time for the ISC. */ *zpci_airq.lsi_ptr = 1; - rc = -ENOMEM; - zpci_ibv = kcalloc(ZPCI_NR_DEVICES, sizeof(*zpci_ibv), GFP_KERNEL); - if (!zpci_ibv) + switch (irq_delivery) { + case FLOATING: + rc = zpci_floating_irq_init(); + break; + case DIRECTED: + rc = zpci_directed_irq_init(); + break; + } + + if (rc) goto out_airq; - zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); - if (!zpci_sbv) - goto out_free; + /* + * Enable floating IRQs (with suppression after one IRQ). When using + * directed IRQs this enables the fallback path. + */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC); - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC); return 0; - -out_free: - kfree(zpci_ibv); out_airq: unregister_adapter_interrupt(&zpci_airq); out: @@ -230,7 +467,15 @@ out: void __init zpci_irq_exit(void) { - airq_iv_release(zpci_sbv); + unsigned int cpu; + + if (irq_delivery == DIRECTED) { + for_each_possible_cpu(cpu) { + airq_iv_release(zpci_ibv[cpu]); + } + } kfree(zpci_ibv); + if (zpci_sbv) + airq_iv_release(zpci_sbv); unregister_adapter_interrupt(&zpci_airq); } From 914b7dd07ee8713c69c31ddb3e19a76852a846ac Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 14 Feb 2017 18:13:09 +0100 Subject: [PATCH 71/98] s390: show statistics for MSI IRQs Improve /proc/interrupts on s390 to show statistics for individual MSI interrupts. Signed-off-by: Sebastian Ott Acked-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/irq.h | 6 ++--- arch/s390/kernel/irq.c | 47 ++++++++++++++++++++++++++++++------- arch/s390/pci/pci_irq.c | 2 +- drivers/s390/cio/cio.c | 2 +- 4 files changed, 43 insertions(+), 14 deletions(-) diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index afaf5e3c57fd..d15943944054 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -47,7 +47,6 @@ enum interruption_class { IRQEXT_CMC, IRQEXT_FTP, IRQIO_CIO, - IRQIO_QAI, IRQIO_DAS, IRQIO_C15, IRQIO_C70, @@ -55,12 +54,13 @@ enum interruption_class { IRQIO_VMR, IRQIO_LCS, IRQIO_CTC, - IRQIO_APB, IRQIO_ADM, IRQIO_CSC, + IRQIO_VIR, + IRQIO_QAI, + IRQIO_APB, IRQIO_PCI, IRQIO_MSI, - IRQIO_VIR, IRQIO_VAI, IRQIO_GAL, NMI_NMI, diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 0cd5a5f96729..f586f94d3947 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -73,7 +73,6 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, - {.irq = IRQIO_QAI, .name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"}, {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, @@ -81,14 +80,15 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, - {.irq = IRQIO_APB, .name = "APB", .desc = "[I/O] AP Bus"}, {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, - {.irq = IRQIO_PCI, .name = "PCI", .desc = "[I/O] PCI Interrupt" }, - {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" }, {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, - {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, - {.irq = IRQIO_GAL, .name = "GAL", .desc = "[I/O] GIB Alert"}, + {.irq = IRQIO_QAI, .name = "QAI", .desc = "[AIO] QDIO Adapter Interrupt"}, + {.irq = IRQIO_APB, .name = "APB", .desc = "[AIO] AP Bus"}, + {.irq = IRQIO_PCI, .name = "PCI", .desc = "[AIO] PCI Interrupt"}, + {.irq = IRQIO_MSI, .name = "MSI", .desc = "[AIO] MSI Interrupt"}, + {.irq = IRQIO_VAI, .name = "VAI", .desc = "[AIO] Virtual I/O Devices AI"}, + {.irq = IRQIO_GAL, .name = "GAL", .desc = "[AIO] GIB Alert"}, {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; @@ -116,6 +116,34 @@ void do_IRQ(struct pt_regs *regs, int irq) set_irq_regs(old_regs); } +static void show_msi_interrupt(struct seq_file *p, int irq) +{ + struct irq_desc *desc; + unsigned long flags; + int cpu; + + irq_lock_sparse(); + desc = irq_to_desc(irq); + if (!desc) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); + seq_printf(p, "%3d: ", irq); + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + + if (desc->irq_data.chip) + seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->action) + seq_printf(p, " %s", desc->action->name); + + seq_putc(p, '\n'); + raw_spin_unlock_irqrestore(&desc->lock, flags); +out: + irq_unlock_sparse(); +} + /* * show_interrupts is needed by /proc/interrupts. */ @@ -128,7 +156,7 @@ int show_interrupts(struct seq_file *p, void *v) if (index == 0) { seq_puts(p, " "); for_each_online_cpu(cpu) - seq_printf(p, "CPU%d ", cpu); + seq_printf(p, "CPU%-8d", cpu); seq_putc(p, '\n'); } if (index < NR_IRQS_BASE) { @@ -139,9 +167,10 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); goto out; } - if (index > NR_IRQS_BASE) + if (index < nr_irqs) { + show_msi_interrupt(p, index); goto out; - + } for (index = 0; index < NR_ARCH_IRQS; index++) { seq_printf(p, "%s: ", irqclass_sub_desc[index].name); irq = irqclass_sub_desc[index].irq; diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 4bfd902f27f4..e7e3eab9a2b8 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -112,7 +112,7 @@ static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *de } static struct irq_chip zpci_irq_chip = { - .name = "zPCI", + .name = "PCI-MSI", .irq_unmask = pci_msi_unmask_irq, .irq_mask = pci_msi_mask_irq, .irq_set_affinity = zpci_set_irq_affinity, diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index de744ca158fd..18f5458f90e8 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -564,7 +564,7 @@ static irqreturn_t do_cio_interrupt(int irq, void *dummy) } static struct irqaction io_interrupt = { - .name = "IO", + .name = "I/O", .handler = do_cio_interrupt, }; From 07e3ec3acb80726f60b7ab924b1b0f1498148b56 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 22 Nov 2018 14:08:33 +0100 Subject: [PATCH 72/98] s390/pci: gather statistics for floating vs directed irqs Gather statistics to distinguish floating and directed interrupts. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/irq.h | 3 ++- arch/s390/kernel/irq.c | 3 ++- arch/s390/pci/pci_irq.c | 10 ++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index d15943944054..9f75d67b8c20 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -59,7 +59,8 @@ enum interruption_class { IRQIO_VIR, IRQIO_QAI, IRQIO_APB, - IRQIO_PCI, + IRQIO_PCF, + IRQIO_PCD, IRQIO_MSI, IRQIO_VAI, IRQIO_GAL, diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index f586f94d3947..150964f91183 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -85,7 +85,8 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, {.irq = IRQIO_QAI, .name = "QAI", .desc = "[AIO] QDIO Adapter Interrupt"}, {.irq = IRQIO_APB, .name = "APB", .desc = "[AIO] AP Bus"}, - {.irq = IRQIO_PCI, .name = "PCI", .desc = "[AIO] PCI Interrupt"}, + {.irq = IRQIO_PCF, .name = "PCF", .desc = "[AIO] PCI Floating Interrupt"}, + {.irq = IRQIO_PCD, .name = "PCD", .desc = "[AIO] PCI Directed Interrupt"}, {.irq = IRQIO_MSI, .name = "MSI", .desc = "[AIO] MSI Interrupt"}, {.irq = IRQIO_VAI, .name = "VAI", .desc = "[AIO] Virtual I/O Devices AI"}, {.irq = IRQIO_GAL, .name = "GAL", .desc = "[AIO] GIB Alert"}, diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index e7e3eab9a2b8..c73ab855a2ca 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -188,11 +188,13 @@ static void zpci_handle_fallback_irq(void) static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) { - inc_irq_stat(IRQIO_PCI); - if (floating) + if (floating) { + inc_irq_stat(IRQIO_PCF); zpci_handle_fallback_irq(); - else + } else { + inc_irq_stat(IRQIO_PCD); zpci_handle_cpu_local_irq(true); + } } static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) @@ -201,7 +203,7 @@ static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) struct airq_iv *aibv; int irqs_on = 0; - inc_irq_stat(IRQIO_PCI); + inc_irq_stat(IRQIO_PCF); for (si = 0;;) { /* Scan adapter summary indicator bit vector */ si = airq_iv_scan(zpci_sbv, si, airq_iv_end(zpci_sbv)); From fbfe07d440f2c55070a0358f66560bb4f9fb92e7 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 26 Feb 2019 16:07:32 +0100 Subject: [PATCH 73/98] s390/pci: add parameter to force floating irqs Provide a kernel parameter to force the usage of floating interrupts. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- Documentation/admin-guide/kernel-parameters.txt | 1 + arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci.c | 5 +++++ arch/s390/pci/pci_irq.c | 3 +++ 4 files changed, 10 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b8ee90bb644..abc1b6e305c4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3394,6 +3394,7 @@ bridges without forcing it upstream. Note: this removes isolation between devices and may put more devices in an IOMMU group. + force_floating [S390] Force usage of floating interrupts. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index a285754d0742..328013219aec 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -163,6 +163,7 @@ static inline bool zdev_enabled(struct zpci_dev *zdev) } extern const struct attribute_group *zpci_attr_groups[]; +extern unsigned int s390_pci_force_floating __initdata; /* ----------------------------------------------------------------------------- Prototypes diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 716e773af788..db096fd16f46 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -741,6 +741,7 @@ static void zpci_mem_exit(void) } static unsigned int s390_pci_probe __initdata = 1; +unsigned int s390_pci_force_floating __initdata; static unsigned int s390_pci_initialized; char * __init pcibios_setup(char *str) @@ -749,6 +750,10 @@ char * __init pcibios_setup(char *str) s390_pci_probe = 0; return NULL; } + if (!strcmp(str, "force_floating")) { + s390_pci_force_floating = 1; + return NULL; + } return str; } diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index c73ab855a2ca..d80616ae8dd8 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -433,6 +433,9 @@ int __init zpci_irq_init(void) int rc; irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; + if (s390_pci_force_floating) + irq_delivery = FLOATING; + if (irq_delivery == DIRECTED) zpci_airq.handler = zpci_directed_irq_handler; From 81deca12c202aa240a28f561a161ac3387a985db Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Sun, 14 Apr 2019 16:25:54 +0200 Subject: [PATCH 74/98] s390/pci: move io address mapping code to pci_insn.c This is a preparation patch for usage of new pci instructions. No functional change. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci_insn.h | 8 ++++-- arch/s390/include/asm/pci_io.h | 49 +++++++++++++------------------- arch/s390/pci/pci.c | 4 +-- arch/s390/pci/pci_insn.c | 38 ++++++++++++++++++++++--- drivers/s390/net/ism.h | 2 +- 5 files changed, 61 insertions(+), 40 deletions(-) diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index ab1031070503..71f19604ec61 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -124,9 +124,11 @@ union zpci_sic_iib { u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status); int zpci_refresh_trans(u64 fn, u64 addr, u64 range); -int zpci_load(u64 *data, u64 req, u64 offset); -int zpci_store(u64 data, u64 req, u64 offset); -int zpci_store_block(const u64 *data, u64 req, u64 offset); +int __zpci_load(u64 *data, u64 req, u64 offset); +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len); +int __zpci_store(u64 data, u64 req, u64 offset); +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); +int __zpci_store_block(const u64 *data, u64 req, u64 offset); int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h index cbb9cb9c6547..cd060b5dd8fd 100644 --- a/arch/s390/include/asm/pci_io.h +++ b/arch/s390/include/asm/pci_io.h @@ -37,12 +37,10 @@ extern struct zpci_iomap_entry *zpci_iomap_start; #define zpci_read(LENGTH, RETTYPE) \ static inline RETTYPE zpci_read_##RETTYPE(const volatile void __iomem *addr) \ { \ - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; \ - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, LENGTH); \ u64 data; \ int rc; \ \ - rc = zpci_load(&data, req, ZPCI_OFFSET(addr)); \ + rc = zpci_load(&data, addr, LENGTH); \ if (rc) \ data = -1ULL; \ return (RETTYPE) data; \ @@ -52,11 +50,9 @@ static inline RETTYPE zpci_read_##RETTYPE(const volatile void __iomem *addr) \ static inline void zpci_write_##VALTYPE(VALTYPE val, \ const volatile void __iomem *addr) \ { \ - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; \ - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, LENGTH); \ u64 data = (VALTYPE) val; \ \ - zpci_store(data, req, ZPCI_OFFSET(addr)); \ + zpci_store(addr, data, LENGTH); \ } zpci_read(8, u64) @@ -68,36 +64,38 @@ zpci_write(4, u32) zpci_write(2, u16) zpci_write(1, u8) -static inline int zpci_write_single(u64 req, const u64 *data, u64 offset, u8 len) +static inline int zpci_write_single(volatile void __iomem *dst, const void *src, + unsigned long len) { u64 val; switch (len) { case 1: - val = (u64) *((u8 *) data); + val = (u64) *((u8 *) src); break; case 2: - val = (u64) *((u16 *) data); + val = (u64) *((u16 *) src); break; case 4: - val = (u64) *((u32 *) data); + val = (u64) *((u32 *) src); break; case 8: - val = (u64) *((u64 *) data); + val = (u64) *((u64 *) src); break; default: val = 0; /* let FW report error */ break; } - return zpci_store(val, req, offset); + return zpci_store(dst, val, len); } -static inline int zpci_read_single(u64 req, u64 *dst, u64 offset, u8 len) +static inline int zpci_read_single(void *dst, const volatile void __iomem *src, + unsigned long len) { u64 data; int cc; - cc = zpci_load(&data, req, offset); + cc = zpci_load(&data, src, len); if (cc) goto out; @@ -119,10 +117,8 @@ out: return cc; } -static inline int zpci_write_block(u64 req, const u64 *data, u64 offset) -{ - return zpci_store_block(data, req, offset); -} +int zpci_write_block(volatile void __iomem *dst, const void *src, + unsigned long len); static inline u8 zpci_get_max_write_size(u64 src, u64 dst, int len, int max) { @@ -140,18 +136,15 @@ static inline int zpci_memcpy_fromio(void *dst, const volatile void __iomem *src, unsigned long n) { - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(src)]; - u64 req, offset = ZPCI_OFFSET(src); int size, rc = 0; while (n > 0) { size = zpci_get_max_write_size((u64 __force) src, (u64) dst, n, 8); - req = ZPCI_CREATE_REQ(entry->fh, entry->bar, size); - rc = zpci_read_single(req, dst, offset, size); + rc = zpci_read_single(dst, src, size); if (rc) break; - offset += size; + src += size; dst += size; n -= size; } @@ -161,8 +154,6 @@ static inline int zpci_memcpy_fromio(void *dst, static inline int zpci_memcpy_toio(volatile void __iomem *dst, const void *src, unsigned long n) { - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(dst)]; - u64 req, offset = ZPCI_OFFSET(dst); int size, rc = 0; if (!src) @@ -171,16 +162,14 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst, while (n > 0) { size = zpci_get_max_write_size((u64 __force) dst, (u64) src, n, 128); - req = ZPCI_CREATE_REQ(entry->fh, entry->bar, size); - if (size > 8) /* main path */ - rc = zpci_write_block(req, src, offset); + rc = zpci_write_block(dst, src, size); else - rc = zpci_write_single(req, src, offset, size); + rc = zpci_write_single(dst, src, size); if (rc) break; - offset += size; src += size; + dst += size; n -= size; } return rc; diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index db096fd16f46..89d15a7f2e9a 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -188,7 +188,7 @@ static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len) u64 data; int rc; - rc = zpci_load(&data, req, offset); + rc = __zpci_load(&data, req, offset); if (!rc) { data = le64_to_cpu((__force __le64) data); data >>= (8 - len) * 8; @@ -206,7 +206,7 @@ static int zpci_cfg_store(struct zpci_dev *zdev, int offset, u32 val, u8 len) data <<= (8 - len) * 8; data = (__force u64) cpu_to_le64(data); - rc = zpci_store(data, req, offset); + rc = __zpci_store(data, req, offset); return rc; } diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 4b2ca068d40e..6dbd13bb7c1c 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ @@ -142,7 +143,7 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_load(u64 *data, u64 req, u64 offset) +int __zpci_load(u64 *data, u64 req, u64 offset) { u8 status; int cc; @@ -158,6 +159,15 @@ int zpci_load(u64 *data, u64 req, u64 offset) return (cc > 0) ? -EIO : cc; } +EXPORT_SYMBOL_GPL(__zpci_load); + +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + + return __zpci_load(data, req, ZPCI_OFFSET(addr)); +} EXPORT_SYMBOL_GPL(zpci_load); /* PCI Store */ @@ -180,7 +190,7 @@ static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_store(u64 data, u64 req, u64 offset) +int __zpci_store(u64 data, u64 req, u64 offset) { u8 status; int cc; @@ -196,6 +206,15 @@ int zpci_store(u64 data, u64 req, u64 offset) return (cc > 0) ? -EIO : cc; } +EXPORT_SYMBOL_GPL(__zpci_store); + +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + + return __zpci_store(data, req, ZPCI_OFFSET(addr)); +} EXPORT_SYMBOL_GPL(zpci_store); /* PCI Store Block */ @@ -216,7 +235,7 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_store_block(const u64 *data, u64 req, u64 offset) +int __zpci_store_block(const u64 *data, u64 req, u64 offset) { u8 status; int cc; @@ -232,4 +251,15 @@ int zpci_store_block(const u64 *data, u64 req, u64 offset) return (cc > 0) ? -EIO : cc; } -EXPORT_SYMBOL_GPL(zpci_store_block); +EXPORT_SYMBOL_GPL(__zpci_store_block); + +int zpci_write_block(volatile void __iomem *dst, + const void *src, unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(dst)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 offset = ZPCI_OFFSET(dst); + + return __zpci_store_block(src, req, offset); +} +EXPORT_SYMBOL_GPL(zpci_write_block); diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 0aab90817326..e8bf3d38af53 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -215,7 +215,7 @@ static inline int __ism_move(struct ism_dev *ism, u64 dmb_req, void *data, struct zpci_dev *zdev = to_zpci(ism->pdev); u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, size); - return zpci_write_block(req, data, dmb_req); + return __zpci_store_block(data, req, dmb_req); } #endif /* S390_ISM_H */ From c475f1770a5e062652a6a877afcbd4e22b18039f Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 23 Apr 2019 11:57:46 +0200 Subject: [PATCH 75/98] s390/ism: move oddities of device IO to wrapper function ISM devices are special in how they access PCI memory space. Provide wrappers for handling commands to the device. No functional change. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- drivers/s390/net/ism.h | 27 +++++++++++++++++++++++++-- drivers/s390/net/ism_drv.c | 20 ++++++-------------- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index e8bf3d38af53..66eac2b9704d 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -6,6 +6,7 @@ #include #include #include +#include #define UTIL_STR_LEN 16 @@ -194,8 +195,6 @@ struct ism_dev { struct pci_dev *pdev; struct smcd_dev *smcd; - void __iomem *ctl; - struct ism_sba *sba; dma_addr_t sba_dma_addr; DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); @@ -209,6 +208,30 @@ struct ism_dev { #define ISM_CREATE_REQ(dmb, idx, sf, offset) \ ((dmb) | (idx) << 24 | (sf) << 23 | (offset)) +static inline void __ism_read_cmd(struct ism_dev *ism, void *data, + unsigned long offset, unsigned long len) +{ + struct zpci_dev *zdev = to_zpci(ism->pdev); + u64 req = ZPCI_CREATE_REQ(zdev->fh, 2, 8); + + while (len > 0) { + __zpci_load(data, req, offset); + offset += 8; + data += 8; + len -= 8; + } +} + +static inline void __ism_write_cmd(struct ism_dev *ism, void *data, + unsigned long offset, unsigned long len) +{ + struct zpci_dev *zdev = to_zpci(ism->pdev); + u64 req = ZPCI_CREATE_REQ(zdev->fh, 2, len); + + if (len) + __zpci_store_block(data, req, offset); +} + static inline int __ism_move(struct ism_dev *ism, u64 dmb_req, void *data, unsigned int size) { diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 3e132592c1fe..4fc2056bd227 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -38,19 +38,18 @@ static int ism_cmd(struct ism_dev *ism, void *cmd) struct ism_req_hdr *req = cmd; struct ism_resp_hdr *resp = cmd; - memcpy_toio(ism->ctl + sizeof(*req), req + 1, req->len - sizeof(*req)); - memcpy_toio(ism->ctl, req, sizeof(*req)); + __ism_write_cmd(ism, req + 1, sizeof(*req), req->len - sizeof(*req)); + __ism_write_cmd(ism, req, 0, sizeof(*req)); WRITE_ONCE(resp->ret, ISM_ERROR); - memcpy_fromio(resp, ism->ctl, sizeof(*resp)); + __ism_read_cmd(ism, resp, 0, sizeof(*resp)); if (resp->ret) { debug_text_event(ism_debug_info, 0, "cmd failure"); debug_event(ism_debug_info, 0, resp, sizeof(*resp)); goto out; } - memcpy_fromio(resp + 1, ism->ctl + sizeof(*resp), - resp->len - sizeof(*resp)); + __ism_read_cmd(ism, resp + 1, sizeof(*resp), resp->len - sizeof(*resp)); out: return resp->ret; } @@ -512,13 +511,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_disable; - ism->ctl = pci_iomap(pdev, 2, 0); - if (!ism->ctl) - goto err_resource; - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (ret) - goto err_unmap; + goto err_resource; dma_set_seg_boundary(&pdev->dev, SZ_1M - 1); dma_set_max_seg_size(&pdev->dev, SZ_1M); @@ -527,7 +522,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) ism->smcd = smcd_alloc_dev(&pdev->dev, dev_name(&pdev->dev), &ism_ops, ISM_NR_DMBS); if (!ism->smcd) - goto err_unmap; + goto err_resource; ism->smcd->priv = ism; ret = ism_dev_init(ism); @@ -538,8 +533,6 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) err_free: smcd_free_dev(ism->smcd); -err_unmap: - pci_iounmap(pdev, ism->ctl); err_resource: pci_release_mem_regions(pdev); err_disable: @@ -568,7 +561,6 @@ static void ism_remove(struct pci_dev *pdev) ism_dev_exit(ism); smcd_free_dev(ism->smcd); - pci_iounmap(pdev, ism->ctl); pci_release_mem_regions(pdev); pci_disable_device(pdev); dev_set_drvdata(&pdev->dev, NULL); From 71ba41c9b1d91042960e9d92a5c8f52dc8531eda Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Sun, 14 Apr 2019 15:38:01 +0200 Subject: [PATCH 76/98] s390/pci: provide support for MIO instructions Provide support for PCI I/O instructions that work on mapped IO addresses. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/io.h | 17 ++-- arch/s390/include/asm/pci.h | 3 + arch/s390/include/asm/pci_clp.h | 14 +++- arch/s390/include/asm/pci_insn.h | 5 ++ arch/s390/pci/pci.c | 132 ++++++++++++++++++++++++++++--- arch/s390/pci/pci_clp.c | 25 ++++-- arch/s390/pci/pci_insn.c | 129 +++++++++++++++++++++++++++++- 7 files changed, 294 insertions(+), 31 deletions(-) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index f34d729347e4..ca421614722f 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -30,14 +30,8 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define ioremap_wc ioremap_nocache #define ioremap_wt ioremap_nocache -static inline void __iomem *ioremap(unsigned long offset, unsigned long size) -{ - return (void __iomem *) offset; -} - -static inline void iounmap(volatile void __iomem *addr) -{ -} +void __iomem *ioremap(unsigned long offset, unsigned long size); +void iounmap(volatile void __iomem *addr); static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -57,14 +51,17 @@ static inline void ioport_unmap(void __iomem *p) * the corresponding device and create the mapping cookie. */ #define pci_iomap pci_iomap +#define pci_iomap_range pci_iomap_range #define pci_iounmap pci_iounmap -#define pci_iomap_wc pci_iomap -#define pci_iomap_wc_range pci_iomap_range +#define pci_iomap_wc pci_iomap_wc +#define pci_iomap_wc_range pci_iomap_wc_range #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) +#define mmiowb() zpci_barrier() + #define __raw_readb zpci_read_u8 #define __raw_readw zpci_read_u16 #define __raw_readl zpci_read_u32 diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 328013219aec..305befd55326 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -86,6 +86,8 @@ enum zpci_state { struct zpci_bar_struct { struct resource *res; /* bus resource */ + void __iomem *mio_wb; + void __iomem *mio_wt; u32 val; /* bar start & 3 flag bits */ u16 map_idx; /* index into bar mapping array */ u8 size; /* order 2 exponent */ @@ -135,6 +137,7 @@ struct zpci_dev { struct iommu_device iommu_dev; /* IOMMU core handle */ char res_name[16]; + bool mio_capable; struct zpci_bar_struct bars[PCI_BAR_COUNT]; u64 start_dma; /* Start of available DMA addresses */ diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index d2d824a91e66..3ec52a05d500 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -43,6 +43,8 @@ struct clp_fh_list_entry { #define CLP_SET_ENABLE_PCI_FN 0 /* Yes, 0 enables it */ #define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ +#define CLP_SET_ENABLE_MIO 2 +#define CLP_SET_DISABLE_MIO 3 #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 @@ -80,7 +82,8 @@ struct clp_req_query_pci { struct clp_rsp_query_pci { struct clp_rsp_hdr hdr; u16 vfn; /* virtual fn number */ - u16 : 7; + u16 : 6; + u16 mio_addr_avail : 1; u16 util_str_avail : 1; /* utility string available? */ u16 pfgid : 8; /* pci function group id */ u32 fid; /* pci function id */ @@ -96,6 +99,15 @@ struct clp_rsp_query_pci { u32 reserved[11]; u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ + u32 reserved2[16]; + u32 mio_valid : 6; + u32 : 26; + u32 : 32; + struct { + u64 wb; + u64 wt; + } addr[PCI_BAR_COUNT]; + u32 reserved3[6]; } __packed; /* Query PCI function group request */ diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 71f19604ec61..61cf9531f68f 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -2,6 +2,8 @@ #ifndef _ASM_S390_PCI_INSN_H #define _ASM_S390_PCI_INSN_H +#include + /* Load/Store status codes */ #define ZPCI_PCI_ST_FUNC_NOT_ENABLED 4 #define ZPCI_PCI_ST_FUNC_IN_ERR 8 @@ -122,6 +124,8 @@ union zpci_sic_iib { struct zpci_cdiib cdiib; }; +DECLARE_STATIC_KEY_FALSE(have_mio); + u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status); int zpci_refresh_trans(u64 fn, u64 addr, u64 range); int __zpci_load(u64 *data, u64 req, u64 offset); @@ -129,6 +133,7 @@ int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len); int __zpci_store(u64 data, u64 req, u64 offset); int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); int __zpci_store_block(const u64 *data, u64 req, u64 offset); +void zpci_barrier(void); int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 89d15a7f2e9a..dff8f4526c8d 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,8 @@ static unsigned long *zpci_iomap_bitmap; struct zpci_iomap_entry *zpci_iomap_start; EXPORT_SYMBOL_GPL(zpci_iomap_start); +DEFINE_STATIC_KEY_FALSE(have_mio); + static struct kmem_cache *zdev_fmb_cache; struct zpci_dev *get_zdev_by_fid(u32 fid) @@ -223,18 +226,48 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } +void __iomem *ioremap(unsigned long ioaddr, unsigned long size) +{ + struct vm_struct *area; + unsigned long offset; + + if (!size) + return NULL; + + if (!static_branch_unlikely(&have_mio)) + return (void __iomem *) ioaddr; + + offset = ioaddr & ~PAGE_MASK; + ioaddr &= PAGE_MASK; + size = PAGE_ALIGN(size + offset); + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + + if (ioremap_page_range((unsigned long) area->addr, + (unsigned long) area->addr + size, + ioaddr, PAGE_KERNEL)) { + vunmap(area->addr); + return NULL; + } + return (void __iomem *) ((unsigned long) area->addr + offset); +} +EXPORT_SYMBOL(ioremap); + +void iounmap(volatile void __iomem *addr) +{ + if (static_branch_likely(&have_mio)) + vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); +} +EXPORT_SYMBOL(iounmap); + /* Create a virtual mapping cookie for a PCI BAR */ -void __iomem *pci_iomap_range(struct pci_dev *pdev, - int bar, - unsigned long offset, - unsigned long max) +static void __iomem *pci_iomap_range_fh(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) { struct zpci_dev *zdev = to_zpci(pdev); int idx; - if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) - return NULL; - idx = zdev->bars[bar].map_idx; spin_lock(&zpci_iomap_lock); /* Detect overrun */ @@ -245,6 +278,30 @@ void __iomem *pci_iomap_range(struct pci_dev *pdev, return (void __iomem *) ZPCI_ADDR(idx) + offset; } + +static void __iomem *pci_iomap_range_mio(struct pci_dev *pdev, int bar, + unsigned long offset, + unsigned long max) +{ + unsigned long barsize = pci_resource_len(pdev, bar); + struct zpci_dev *zdev = to_zpci(pdev); + void __iomem *iova; + + iova = ioremap((unsigned long) zdev->bars[bar].mio_wt, barsize); + return iova ? iova + offset : iova; +} + +void __iomem *pci_iomap_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + return NULL; + + if (static_branch_likely(&have_mio)) + return pci_iomap_range_mio(pdev, bar, offset, max); + else + return pci_iomap_range_fh(pdev, bar, offset, max); +} EXPORT_SYMBOL(pci_iomap_range); void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) @@ -253,7 +310,37 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) } EXPORT_SYMBOL(pci_iomap); -void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) +static void __iomem *pci_iomap_wc_range_mio(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + unsigned long barsize = pci_resource_len(pdev, bar); + struct zpci_dev *zdev = to_zpci(pdev); + void __iomem *iova; + + iova = ioremap((unsigned long) zdev->bars[bar].mio_wb, barsize); + return iova ? iova + offset : iova; +} + +void __iomem *pci_iomap_wc_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + return NULL; + + if (static_branch_likely(&have_mio)) + return pci_iomap_wc_range_mio(pdev, bar, offset, max); + else + return pci_iomap_range_fh(pdev, bar, offset, max); +} +EXPORT_SYMBOL(pci_iomap_wc_range); + +void __iomem *pci_iomap_wc(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + return pci_iomap_wc_range(dev, bar, 0, maxlen); +} +EXPORT_SYMBOL(pci_iomap_wc); + +static void pci_iounmap_fh(struct pci_dev *pdev, void __iomem *addr) { unsigned int idx = ZPCI_IDX(addr); @@ -266,6 +353,19 @@ void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) } spin_unlock(&zpci_iomap_lock); } + +static void pci_iounmap_mio(struct pci_dev *pdev, void __iomem *addr) +{ + iounmap(addr); +} + +void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) +{ + if (static_branch_likely(&have_mio)) + pci_iounmap_mio(pdev, addr); + else + pci_iounmap_fh(pdev, addr); +} EXPORT_SYMBOL(pci_iounmap); static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, @@ -312,6 +412,7 @@ static struct resource iov_res = { static void zpci_map_resources(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); resource_size_t len; int i; @@ -319,8 +420,13 @@ static void zpci_map_resources(struct pci_dev *pdev) len = pci_resource_len(pdev, i); if (!len) continue; - pdev->resource[i].start = - (resource_size_t __force) pci_iomap(pdev, i, 0); + + if (static_branch_likely(&have_mio)) + pdev->resource[i].start = + (resource_size_t __force) zdev->bars[i].mio_wb; + else + pdev->resource[i].start = + (resource_size_t __force) pci_iomap(pdev, i, 0); pdev->resource[i].end = pdev->resource[i].start + len - 1; } @@ -341,6 +447,9 @@ static void zpci_unmap_resources(struct pci_dev *pdev) resource_size_t len; int i; + if (static_branch_likely(&have_mio)) + return; + for (i = 0; i < PCI_BAR_COUNT; i++) { len = pci_resource_len(pdev, i); if (!len) @@ -772,6 +881,9 @@ static int __init pci_base_init(void) if (!test_facility(69) || !test_facility(71)) return 0; + if (test_facility(153)) + static_branch_enable(&have_mio); + rc = zpci_debug_init(); if (rc) goto out; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index eeb7450db18c..3a36b07a5571 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -163,7 +163,14 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, memcpy(zdev->util_str, response->util_str, sizeof(zdev->util_str)); } + zdev->mio_capable = response->mio_addr_avail; + for (i = 0; i < PCI_BAR_COUNT; i++) { + if (!(response->mio_valid & (1 << (PCI_BAR_COUNT - i - 1)))) + continue; + zdev->bars[i].mio_wb = (void __iomem *) response->addr[i].wb; + zdev->bars[i].mio_wt = (void __iomem *) response->addr[i].wt; + } return 0; } @@ -279,11 +286,18 @@ int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as) int rc; rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN); - if (!rc) - /* Success -> store enabled handle in zdev */ - zdev->fh = fh; + zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + if (rc) + goto out; - zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, zdev->fh, rc); + zdev->fh = fh; + if (zdev->mio_capable) { + rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_MIO); + zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + if (rc) + clp_disable_fh(zdev); + } +out: return rc; } @@ -296,11 +310,10 @@ int clp_disable_fh(struct zpci_dev *zdev) return 0; rc = clp_set_pci_fn(&fh, 0, CLP_SET_DISABLE_PCI_FN); + zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); if (!rc) - /* Success -> store disabled handle in zdev */ zdev->fh = fh; - zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, zdev->fh, rc); return rc; } diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 6dbd13bb7c1c..02f9505c99a8 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -161,13 +162,50 @@ int __zpci_load(u64 *data, u64 req, u64 offset) } EXPORT_SYMBOL_GPL(__zpci_load); -int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) +static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr, + unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); return __zpci_load(data, req, ZPCI_OFFSET(addr)); } + +static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status) +{ + register u64 addr asm("2") = ioaddr; + register u64 r3 asm("3") = len; + int cc = -ENXIO; + u64 __data; + + asm volatile ( + " .insn rre,0xb9d60000,%[data],%[ioaddr]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [data] "=d" (__data), "+d" (r3) + : [ioaddr] "d" (addr) + : "cc"); + *status = r3 >> 24 & 0xff; + *data = __data; + return cc; +} + +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_load_fh(data, addr, len); + + cc = __pcilg_mio(data, (__force u64) addr, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) addr); + + return (cc > 0) ? -EIO : cc; +} EXPORT_SYMBOL_GPL(zpci_load); /* PCI Store */ @@ -208,13 +246,48 @@ int __zpci_store(u64 data, u64 req, u64 offset) } EXPORT_SYMBOL_GPL(__zpci_store); -int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) +static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data, + unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); return __zpci_store(data, req, ZPCI_OFFSET(addr)); } + +static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status) +{ + register u64 addr asm("2") = ioaddr; + register u64 r3 asm("3") = len; + int cc = -ENXIO; + + asm volatile ( + " .insn rre,0xb9d40000,%[data],%[ioaddr]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), "+d" (r3) + : [data] "d" (data), [ioaddr] "d" (addr) + : "cc"); + *status = r3 >> 24 & 0xff; + return cc; +} + +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_store_fh(addr, data, len); + + cc = __pcistg_mio(data, (__force u64) addr, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) addr); + + return (cc > 0) ? -EIO : cc; +} EXPORT_SYMBOL_GPL(zpci_store); /* PCI Store Block */ @@ -253,8 +326,8 @@ int __zpci_store_block(const u64 *data, u64 req, u64 offset) } EXPORT_SYMBOL_GPL(__zpci_store_block); -int zpci_write_block(volatile void __iomem *dst, - const void *src, unsigned long len) +static inline int zpci_write_block_fh(volatile void __iomem *dst, + const void *src, unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(dst)]; u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); @@ -262,4 +335,52 @@ int zpci_write_block(volatile void __iomem *dst, return __zpci_store_block(src, req, offset); } + +static inline int __pcistb_mio(const u64 *data, u64 ioaddr, u64 len, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + " .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[data]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [len] "+d" (len) + : [ioaddr] "d" (ioaddr), [data] "Q" (*data) + : "cc"); + *status = len >> 24 & 0xff; + return cc; +} + +int zpci_write_block(volatile void __iomem *dst, + const void *src, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_write_block_fh(dst, src, len); + + cc = __pcistb_mio(src, (__force u64) dst, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) dst); + + return (cc > 0) ? -EIO : cc; +} EXPORT_SYMBOL_GPL(zpci_write_block); + +static inline void __pciwb_mio(void) +{ + unsigned long unused = 0; + + asm volatile (".insn rre,0xb9d50000,%[op],%[op]\n" + : [op] "+d" (unused)); +} + +void zpci_barrier(void) +{ + if (static_branch_likely(&have_mio)) + __pciwb_mio(); +} +EXPORT_SYMBOL_GPL(zpci_barrier); From 56271303808fb86375ec33183d56c21fdeb836ea Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 18 Apr 2019 21:39:06 +0200 Subject: [PATCH 77/98] s390/pci: add parameter to disable usage of MIO instructions Allow users to disable usage of MIO instructions by specifying pci=nomio at the kernel command line. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- Documentation/admin-guide/kernel-parameters.txt | 1 + arch/s390/pci/pci.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index abc1b6e305c4..055284c31e37 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3395,6 +3395,7 @@ this removes isolation between devices and may put more devices in an IOMMU group. force_floating [S390] Force usage of floating interrupts. + nomio [S390] Do not use MIO instructions. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index dff8f4526c8d..0ebb7c405a25 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -850,6 +850,7 @@ static void zpci_mem_exit(void) } static unsigned int s390_pci_probe __initdata = 1; +static unsigned int s390_pci_no_mio __initdata; unsigned int s390_pci_force_floating __initdata; static unsigned int s390_pci_initialized; @@ -859,6 +860,10 @@ char * __init pcibios_setup(char *str) s390_pci_probe = 0; return NULL; } + if (!strcmp(str, "nomio")) { + s390_pci_no_mio = 1; + return NULL; + } if (!strcmp(str, "force_floating")) { s390_pci_force_floating = 1; return NULL; @@ -881,7 +886,7 @@ static int __init pci_base_init(void) if (!test_facility(69) || !test_facility(71)) return 0; - if (test_facility(153)) + if (test_facility(153) && !s390_pci_no_mio) static_branch_enable(&have_mio); rc = zpci_debug_init(); From 833b441ec0f6f3c57cc2106ef628bb19d8fb0ee2 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 23 Apr 2019 13:14:28 +0200 Subject: [PATCH 78/98] s390: enable processes for mio instructions Allow for userspace to use PCI MIO instructions. Signed-off-by: Sebastian Ott Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pci_insn.h | 10 ++++++++++ arch/s390/kernel/early.c | 2 ++ 2 files changed, 12 insertions(+) diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index 61cf9531f68f..ff81ed19c506 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -143,4 +143,14 @@ static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) return __zpci_set_irq_ctrl(ctl, isc, &iib); } +#ifdef CONFIG_PCI +static inline void enable_mio_ctl(void) +{ + if (static_branch_likely(&have_mio)) + __ctl_set_bit(2, 5); +} +#else /* CONFIG_PCI */ +static inline void enable_mio_ctl(void) {} +#endif /* CONFIG_PCI */ + #endif diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index c196abda36c7..ab09ada0e930 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "entry.h" /* @@ -235,6 +236,7 @@ static __init void detect_machine_facilities(void) clock_comparator_max = -1ULL >> 1; __ctl_set_bit(0, 53); } + enable_mio_ctl(); } static inline void save_vector_registers(void) From 805bc0bc238f7209fca5e39c152b0d3c12046ac9 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Sun, 3 Feb 2019 21:35:45 +0100 Subject: [PATCH 79/98] s390/kernel: build a relocatable kernel This patch adds support for building a relocatable kernel with -fPIE. The kernel will be relocated to 0 early in the boot process. Signed-off-by: Gerald Schaefer Reviewed-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 13 ++++++ arch/s390/Makefile | 4 ++ arch/s390/boot/Makefile | 1 + arch/s390/boot/compressed/decompressor.h | 3 ++ arch/s390/boot/machine_kexec_reloc.c | 2 + arch/s390/boot/startup.c | 27 ++++++++++++ arch/s390/include/asm/kexec.h | 2 + arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/machine_kexec_file.c | 44 ++------------------ arch/s390/kernel/machine_kexec_reloc.c | 53 ++++++++++++++++++++++++ arch/s390/kernel/vmlinux.lds.S | 15 +++++++ 11 files changed, 124 insertions(+), 42 deletions(-) create mode 100644 arch/s390/boot/machine_kexec_reloc.c create mode 100644 arch/s390/kernel/machine_kexec_reloc.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 21e851b0a989..4c99e4f5f366 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -624,6 +624,19 @@ config EXPOLINE_FULL endchoice +config RELOCATABLE + bool "Build a relocatable kernel" + select MODULE_REL_CRCS if MODVERSIONS + default y + help + This builds a kernel image that retains relocation information + so it can be loaded at an arbitrary address. + The kernel is linked as a position-independent executable (PIE) + and contains dynamic relocations which are processed early in the + bootup process. + The relocations make the kernel image about 15% larger (compressed + 10%), but are discarded at runtime. + endmenu menu "Memory setup" diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 9c079a506325..54b8a12d64e8 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -16,6 +16,10 @@ KBUILD_AFLAGS_MODULE += -fPIC KBUILD_CFLAGS_MODULE += -fPIC KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 +ifeq ($(CONFIG_RELOCATABLE),y) +KBUILD_CFLAGS += -fPIE +LDFLAGS_vmlinux := -pie +endif aflags_dwarf := -Wa,-gdwarf-2 KBUILD_AFLAGS_DECOMPRESSOR := -m64 -D__ASSEMBLY__ KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index c1993c57300f..4df43e83363a 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -32,6 +32,7 @@ obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += ctype.o obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o +obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) subdir- := compressed diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h index 424cf524aac1..c15eb7114d83 100644 --- a/arch/s390/boot/compressed/decompressor.h +++ b/arch/s390/boot/compressed/decompressor.h @@ -19,6 +19,9 @@ struct vmlinux_info { unsigned long bootdata_size; unsigned long bootdata_preserved_off; unsigned long bootdata_preserved_size; + unsigned long dynsym_start; + unsigned long rela_dyn_start; + unsigned long rela_dyn_end; }; extern char _vmlinux_info[]; diff --git a/arch/s390/boot/machine_kexec_reloc.c b/arch/s390/boot/machine_kexec_reloc.c new file mode 100644 index 000000000000..b7a5d0f72097 --- /dev/null +++ b/arch/s390/boot/machine_kexec_reloc.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../kernel/machine_kexec_reloc.c" diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 90898976a941..b7d6a76cb5e9 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include +#include #include #include #include "compressed/decompressor.h" @@ -47,6 +49,29 @@ static void copy_bootdata(void) memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); } +static void handle_relocs(unsigned long offset) +{ + Elf64_Rela *rela_start, *rela_end, *rela; + int r_type, r_sym, rc; + Elf64_Addr loc, val; + Elf64_Sym *dynsym; + + rela_start = (Elf64_Rela *) vmlinux.rela_dyn_start; + rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end; + dynsym = (Elf64_Sym *) vmlinux.dynsym_start; + for (rela = rela_start; rela < rela_end; rela++) { + loc = rela->r_offset + offset; + val = rela->r_addend + offset; + r_sym = ELF64_R_SYM(rela->r_info); + if (r_sym) + val += dynsym[r_sym].st_value; + r_type = ELF64_R_TYPE(rela->r_info); + rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0); + if (rc) + error("Unknown relocation type"); + } +} + void startup_kernel(void) { unsigned long safe_addr; @@ -67,5 +92,7 @@ void startup_kernel(void) memmove((void *)vmlinux.default_lma, img, vmlinux.image_size); } copy_bootdata(); + if (IS_ENABLED(CONFIG_RELOCATABLE)) + handle_relocs(0); vmlinux.entry(); } diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 305d3465574f..ea398a05f643 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -71,6 +71,8 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len); void *kexec_file_add_components(struct kimage *image, int (*add_kernel)(struct kimage *image, struct s390_load_data *data)); +int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr); extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index d28acd7ba81e..19425605a83d 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -51,7 +51,7 @@ obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o early_nobss.o obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o -obj-y += nospec-branch.o ipl_vmparm.o +obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o extra-y += head64.o vmlinux.lds diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 48cab9600ed9..42c23a5c8229 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -290,7 +290,7 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *symtab) { Elf_Rela *relas; - int i; + int i, r_type; relas = (void *)pi->ehdr + relsec->sh_offset; @@ -324,46 +324,8 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, addr = section->sh_addr + relas[i].r_offset; - switch (ELF64_R_TYPE(relas[i].r_info)) { - case R_390_8: /* Direct 8 bit. */ - *(u8 *)loc = val; - break; - case R_390_12: /* Direct 12 bit. */ - *(u16 *)loc &= 0xf000; - *(u16 *)loc |= val & 0xfff; - break; - case R_390_16: /* Direct 16 bit. */ - *(u16 *)loc = val; - break; - case R_390_20: /* Direct 20 bit. */ - *(u32 *)loc &= 0xf00000ff; - *(u32 *)loc |= (val & 0xfff) << 16; /* DL */ - *(u32 *)loc |= (val & 0xff000) >> 4; /* DH */ - break; - case R_390_32: /* Direct 32 bit. */ - *(u32 *)loc = val; - break; - case R_390_64: /* Direct 64 bit. */ - *(u64 *)loc = val; - break; - case R_390_PC16: /* PC relative 16 bit. */ - *(u16 *)loc = (val - addr); - break; - case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ - *(u16 *)loc = (val - addr) >> 1; - break; - case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ - *(u32 *)loc = (val - addr) >> 1; - break; - case R_390_PC32: /* PC relative 32 bit. */ - *(u32 *)loc = (val - addr); - break; - case R_390_PC64: /* PC relative 64 bit. */ - *(u64 *)loc = (val - addr); - break; - default: - break; - } + r_type = ELF64_R_TYPE(relas[i].r_info); + arch_kexec_do_relocs(r_type, loc, val, addr); } return 0; } diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c new file mode 100644 index 000000000000..1dded39239f8 --- /dev/null +++ b/arch/s390/kernel/machine_kexec_reloc.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr) +{ + switch (r_type) { + case R_390_NONE: + break; + case R_390_8: /* Direct 8 bit. */ + *(u8 *)loc = val; + break; + case R_390_12: /* Direct 12 bit. */ + *(u16 *)loc &= 0xf000; + *(u16 *)loc |= val & 0xfff; + break; + case R_390_16: /* Direct 16 bit. */ + *(u16 *)loc = val; + break; + case R_390_20: /* Direct 20 bit. */ + *(u32 *)loc &= 0xf00000ff; + *(u32 *)loc |= (val & 0xfff) << 16; /* DL */ + *(u32 *)loc |= (val & 0xff000) >> 4; /* DH */ + break; + case R_390_32: /* Direct 32 bit. */ + *(u32 *)loc = val; + break; + case R_390_64: /* Direct 64 bit. */ + *(u64 *)loc = val; + break; + case R_390_PC16: /* PC relative 16 bit. */ + *(u16 *)loc = (val - addr); + break; + case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ + *(u16 *)loc = (val - addr) >> 1; + break; + case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ + *(u32 *)loc = (val - addr) >> 1; + break; + case R_390_PC32: /* PC relative 32 bit. */ + *(u32 *)loc = (val - addr); + break; + case R_390_PC64: /* PC relative 64 bit. */ + *(u64 *)loc = (val - addr); + break; + case R_390_RELATIVE: + *(unsigned long *) loc = val; + break; + default: + return 1; + } + return 0; +} diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 6ef9c62bb01b..49d55327de0b 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -144,6 +144,18 @@ SECTIONS INIT_DATA_SECTION(0x100) PERCPU_SECTION(0x100) + + .dynsym ALIGN(8) : { + __dynsym_start = .; + *(.dynsym) + __dynsym_end = .; + } + .rela.dyn ALIGN(8) : { + __rela_dyn_start = .; + *(.rela*) + __rela_dyn_end = .; + } + . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ @@ -165,6 +177,9 @@ SECTIONS QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ QUAD(__boot_data_preserved_end - __boot_data_preserved_start) /* bootdata_preserved_size */ + QUAD(__dynsym_start) /* dynsym_start */ + QUAD(__rela_dyn_start) /* rela_dyn_start */ + QUAD(__rela_dyn_end) /* rela_dyn_end */ } :NONE /* Debugging sections. */ From ff4a742dde3c4b80a91cdd754fed3bc576df28c9 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Sun, 3 Feb 2019 21:36:13 +0100 Subject: [PATCH 80/98] s390/kernel: convert SYSCALL and PGM_CHECK handlers to .quad With a relocatable kernel that could reside at any place in memory, the storage size for the SYSCALL and PGM_CHECK handlers needs to be increased from .long to .quad. Signed-off-by: Gerald Schaefer Reviewed-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/syscall.h | 9 ++------- arch/s390/kernel/entry.S | 18 +++++++++--------- arch/s390/kernel/pgm_check.S | 2 +- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 96f9a9151fde..8f77bb4293f9 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -14,13 +14,8 @@ #include #include -/* - * The syscall table always contains 32 bit pointers since we know that the - * address of the function to be called is (way) below 4GB. So the "int" - * type here is what we want [need] for both 32 bit and 64 bit systems. - */ -extern const unsigned int sys_call_table[]; -extern const unsigned int sys_call_table_emu[]; +extern const unsigned long sys_call_table[]; +extern const unsigned long sys_call_table_emu[]; static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 583d65ef5007..baa67e434b46 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -358,19 +358,19 @@ ENTRY(system_call) # load address of system call table lg %r10,__THREAD_sysc_table(%r13,%r12) llgh %r8,__PT_INT_CODE+2(%r11) - slag %r8,%r8,2 # shift and test for svc 0 + slag %r8,%r8,3 # shift and test for svc 0 jnz .Lsysc_nr_ok # svc 0: system call number in %r1 llgfr %r1,%r1 # clear high word in r1 cghi %r1,NR_syscalls jnl .Lsysc_nr_ok sth %r1,__PT_INT_CODE+2(%r11) - slag %r8,%r1,2 + slag %r8,%r1,3 .Lsysc_nr_ok: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stg %r2,__PT_ORIG_GPR2(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) - lgf %r9,0(%r8,%r10) # get system call add. + lg %r9,0(%r8,%r10) # get system call add. TSTMSK __TI_flags(%r12),_TIF_TRACE jnz .Lsysc_tracesys BASR_EX %r14,%r9 # call sys_xxxx @@ -556,8 +556,8 @@ ENTRY(system_call) lghi %r0,NR_syscalls clgr %r0,%r2 jnh .Lsysc_tracenogo - sllg %r8,%r2,2 - lgf %r9,0(%r8,%r10) + sllg %r8,%r2,3 + lg %r9,0(%r8,%r10) .Lsysc_tracego: lmg %r3,%r7,__PT_R3(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) @@ -665,9 +665,9 @@ ENTRY(pgm_check_handler) larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) nill %r10,0x007f - sll %r10,2 + sll %r10,3 je .Lpgm_return - lgf %r9,0(%r10,%r1) # load address of handler routine + lg %r9,0(%r10,%r1) # load address of handler routine lgr %r2,%r11 # pass pointer to pt_regs BASR_EX %r14,%r9 # branch to interrupt-handler .Lpgm_return: @@ -1512,7 +1512,7 @@ cleanup_critical: .quad .Lsie_skip - .Lsie_entry #endif .section .rodata, "a" -#define SYSCALL(esame,emu) .long __s390x_ ## esame +#define SYSCALL(esame,emu) .quad __s390x_ ## esame .globl sys_call_table sys_call_table: #include "asm/syscall_table.h" @@ -1520,7 +1520,7 @@ sys_call_table: #ifdef CONFIG_COMPAT -#define SYSCALL(esame,emu) .long __s390_ ## emu +#define SYSCALL(esame,emu) .quad __s390_ ## emu .globl sys_call_table_emu sys_call_table_emu: #include "asm/syscall_table.h" diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S index 3e62aae34ea3..59dee9d3bebf 100644 --- a/arch/s390/kernel/pgm_check.S +++ b/arch/s390/kernel/pgm_check.S @@ -7,7 +7,7 @@ #include -#define PGM_CHECK(handler) .long handler +#define PGM_CHECK(handler) .quad handler #define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler) /* From fd3d2742d558d33560ec5dfee3001e561b5c0822 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Sun, 3 Feb 2019 21:36:46 +0100 Subject: [PATCH 81/98] s390/kprobes: use static buffer for insn_page With a relocatable kernel that could reside at any place in memory, the current logic for allocating a kprobes insn_page does not work. The GFP_DMA allocated buffer might be more than 2 GB away from the kernel. Use a static buffer for the insn_page instead. Signed-off-by: Gerald Schaefer Reviewed-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/kprobes.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 7c0a095e9c5f..263426dcf97d 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -27,29 +27,30 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; -DEFINE_INSN_CACHE_OPS(dmainsn); +DEFINE_INSN_CACHE_OPS(s390_insn); -static void *alloc_dmainsn_page(void) +static int insn_page_in_use; +static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static void *alloc_s390_insn_page(void) { - void *page; - - page = (void *) __get_free_page(GFP_KERNEL | GFP_DMA); - if (page) - set_memory_x((unsigned long) page, 1); - return page; + if (xchg(&insn_page_in_use, 1) == 1) + return NULL; + set_memory_x((unsigned long) &insn_page, 1); + return &insn_page; } -static void free_dmainsn_page(void *page) +static void free_s390_insn_page(void *page) { set_memory_nx((unsigned long) page, 1); - free_page((unsigned long)page); + xchg(&insn_page_in_use, 0); } -struct kprobe_insn_cache kprobe_dmainsn_slots = { - .mutex = __MUTEX_INITIALIZER(kprobe_dmainsn_slots.mutex), - .alloc = alloc_dmainsn_page, - .free = free_dmainsn_page, - .pages = LIST_HEAD_INIT(kprobe_dmainsn_slots.pages), +struct kprobe_insn_cache kprobe_s390_insn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_s390_insn_slots.mutex), + .alloc = alloc_s390_insn_page, + .free = free_s390_insn_page, + .pages = LIST_HEAD_INIT(kprobe_s390_insn_slots.pages), .insn_size = MAX_INSN_SIZE, }; @@ -102,7 +103,7 @@ static int s390_get_insn_slot(struct kprobe *p) */ p->ainsn.insn = NULL; if (is_kernel_addr(p->addr)) - p->ainsn.insn = get_dmainsn_slot(); + p->ainsn.insn = get_s390_insn_slot(); else if (is_module_addr(p->addr)) p->ainsn.insn = get_insn_slot(); return p->ainsn.insn ? 0 : -ENOMEM; @@ -114,7 +115,7 @@ static void s390_free_insn_slot(struct kprobe *p) if (!p->ainsn.insn) return; if (is_kernel_addr(p->addr)) - free_dmainsn_slot(p->ainsn.insn, 0); + free_s390_insn_slot(p->ainsn.insn, 0); else free_insn_slot(p->ainsn.insn, 0); p->ainsn.insn = NULL; From 087c4d7423989b110c3312592db05acc009a5d58 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Mon, 8 Apr 2019 12:49:58 +0200 Subject: [PATCH 82/98] s390/sclp: do not use static sccbs The sccbs for init/read/sdias/early have to be located below 2 GB, and they are currently defined as a static buffer. With a relocatable kernel that could reside at any place in memory, this will no longer guarantee the location below 2 GB, so use a dynamic GFP_DMA allocation instead. The sclp_early_sccb buffer needs special handling, as it can be used very early, and by both the decompressor and also the decompressed kernel. Therefore, a fixed 4 KB buffer is introduced at 0x11000, the former PARMAREA_END. The new PARMAREA_END is now 0x12000, and it is renamed to HEAD_END, as it is rather the end of head.S and not the end of the parmarea. Signed-off-by: Gerald Schaefer Reviewed-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/boot/head.S | 5 +- arch/s390/include/asm/setup.h | 5 +- arch/s390/kernel/machine_kexec_file.c | 4 +- arch/s390/kernel/setup.c | 2 +- drivers/s390/char/sclp.c | 14 ++++-- drivers/s390/char/sclp.h | 2 +- drivers/s390/char/sclp_early.c | 2 +- drivers/s390/char/sclp_early_core.c | 20 ++++---- drivers/s390/char/sclp_sdias.c | 72 +++++++++++++++------------ 9 files changed, 70 insertions(+), 56 deletions(-) diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index d585c4dbdac7..c6b4b4c5dd58 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -336,4 +336,7 @@ ENTRY(startup_kdump) .byte "root=/dev/ram0 ro" .byte 0 - .org 0x11000 + .org EARLY_SCCB_OFFSET + .fill 4096 + + .org HEAD_END diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index b603cc09c895..d202756d6291 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -12,7 +12,10 @@ #define EP_OFFSET 0x10008 #define EP_STRING "S390EP" #define PARMAREA 0x10400 -#define PARMAREA_END 0x11000 +#define EARLY_SCCB_OFFSET 0x11000 +#define HEAD_END 0x12000 + +#define EARLY_SCCB_SIZE PAGE_SIZE /* * Machine features detected in early.c diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 42c23a5c8229..fbdd3ea73667 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -337,10 +337,8 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, * load memory in head.S will be accessed, e.g. to register the next * command line. If the next kernel were smaller the current kernel * will panic at load. - * - * 0x11000 = sizeof(head.S) */ - if (buf_len < 0x11000) + if (buf_len < HEAD_END) return -ENOEXEC; return kexec_image_probe_default(image, buf, buf_len); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ffc87520aca9..94efb1eb34b6 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -829,7 +829,7 @@ static void __init reserve_kernel(void) { unsigned long start_pfn = PFN_UP(__pa(_end)); - memblock_reserve(0, PARMAREA_END); + memblock_reserve(0, HEAD_END); memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - (unsigned long)_stext); } diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c index e9aa71cdfc44..d2ab3f07c008 100644 --- a/drivers/s390/char/sclp.c +++ b/drivers/s390/char/sclp.c @@ -45,8 +45,8 @@ static struct list_head sclp_req_queue; /* Data for read and and init requests. */ static struct sclp_req sclp_read_req; static struct sclp_req sclp_init_req; -static char sclp_read_sccb[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); -static char sclp_init_sccb[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); +static void *sclp_read_sccb; +static struct init_sccb *sclp_init_sccb; /* Suspend request */ static DECLARE_COMPLETION(sclp_request_queue_flushed); @@ -753,9 +753,8 @@ EXPORT_SYMBOL(sclp_remove_processed); static inline void __sclp_make_init_req(sccb_mask_t receive_mask, sccb_mask_t send_mask) { - struct init_sccb *sccb; + struct init_sccb *sccb = sclp_init_sccb; - sccb = (struct init_sccb *) sclp_init_sccb; clear_page(sccb); memset(&sclp_init_req, 0, sizeof(struct sclp_req)); sclp_init_req.command = SCLP_CMDW_WRITE_EVENT_MASK; @@ -782,7 +781,7 @@ static int sclp_init_mask(int calculate) { unsigned long flags; - struct init_sccb *sccb = (struct init_sccb *) sclp_init_sccb; + struct init_sccb *sccb = sclp_init_sccb; sccb_mask_t receive_mask; sccb_mask_t send_mask; int retry; @@ -1175,6 +1174,9 @@ sclp_init(void) if (sclp_init_state != sclp_init_state_uninitialized) goto fail_unlock; sclp_init_state = sclp_init_state_initializing; + sclp_read_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA); + sclp_init_sccb = (void *) __get_free_page(GFP_ATOMIC | GFP_DMA); + BUG_ON(!sclp_read_sccb || !sclp_init_sccb); /* Set up variables */ INIT_LIST_HEAD(&sclp_req_queue); INIT_LIST_HEAD(&sclp_reg_list); @@ -1207,6 +1209,8 @@ fail_unregister_reboot_notifier: unregister_reboot_notifier(&sclp_reboot_notifier); fail_init_state_uninitialized: sclp_init_state = sclp_init_state_uninitialized; + free_page((unsigned long) sclp_read_sccb); + free_page((unsigned long) sclp_init_sccb); fail_unlock: spin_unlock_irqrestore(&sclp_lock, flags); return rc; diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 043f32ba3749..28b433960831 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -321,7 +321,7 @@ extern int sclp_console_drop; extern unsigned long sclp_console_full; extern bool sclp_mask_compat_mode; -extern char sclp_early_sccb[PAGE_SIZE]; +extern char *sclp_early_sccb; void sclp_early_wait_irq(void); int sclp_early_cmd(sclp_cmdw_t cmd, void *sccb); diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index fdad2a980129..6c90aa725f23 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -147,7 +147,7 @@ static void __init sclp_early_console_detect(struct init_sccb *sccb) void __init sclp_early_detect(void) { - void *sccb = &sclp_early_sccb; + void *sccb = sclp_early_sccb; sclp_early_facilities_detect(sccb); sclp_early_init_core_info(sccb); diff --git a/drivers/s390/char/sclp_early_core.c b/drivers/s390/char/sclp_early_core.c index 387c114ded3f..7737470f8498 100644 --- a/drivers/s390/char/sclp_early_core.c +++ b/drivers/s390/char/sclp_early_core.c @@ -16,7 +16,7 @@ static struct read_info_sccb __bootdata(sclp_info_sccb); static int __bootdata(sclp_info_sccb_valid); -char sclp_early_sccb[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); +char *sclp_early_sccb = (char *) EARLY_SCCB_OFFSET; int sclp_init_state __section(.data) = sclp_init_state_uninitialized; /* * Used to keep track of the size of the event masks. Qemu until version 2.11 @@ -91,8 +91,8 @@ static void sclp_early_print_lm(const char *str, unsigned int len) struct mto *mto; struct go *go; - sccb = (struct write_sccb *) &sclp_early_sccb; - end = (unsigned char *) sccb + sizeof(sclp_early_sccb) - 1; + sccb = (struct write_sccb *) sclp_early_sccb; + end = (unsigned char *) sccb + EARLY_SCCB_SIZE - 1; memset(sccb, 0, sizeof(*sccb)); ptr = (unsigned char *) &sccb->msg.mdb.mto; offset = 0; @@ -139,9 +139,9 @@ static void sclp_early_print_vt220(const char *str, unsigned int len) { struct vt220_sccb *sccb; - sccb = (struct vt220_sccb *) &sclp_early_sccb; - if (sizeof(*sccb) + len >= sizeof(sclp_early_sccb)) - len = sizeof(sclp_early_sccb) - sizeof(*sccb); + sccb = (struct vt220_sccb *) sclp_early_sccb; + if (sizeof(*sccb) + len >= EARLY_SCCB_SIZE) + len = EARLY_SCCB_SIZE - sizeof(*sccb); memset(sccb, 0, sizeof(*sccb)); memcpy(&sccb->msg.data, str, len); sccb->header.length = sizeof(*sccb) + len; @@ -199,7 +199,7 @@ static int sclp_early_setup(int disable, int *have_linemode, int *have_vt220) BUILD_BUG_ON(sizeof(struct init_sccb) > PAGE_SIZE); *have_linemode = *have_vt220 = 0; - sccb = (struct init_sccb *) &sclp_early_sccb; + sccb = (struct init_sccb *) sclp_early_sccb; receive_mask = disable ? 0 : EVTYP_OPCMD_MASK; send_mask = disable ? 0 : EVTYP_VT220MSG_MASK | EVTYP_MSG_MASK; rc = sclp_early_set_event_mask(sccb, receive_mask, send_mask); @@ -304,7 +304,7 @@ int __init sclp_early_get_hsa_size(unsigned long *hsa_size) void __weak __init add_mem_detect_block(u64 start, u64 end) {} int __init sclp_early_read_storage_info(void) { - struct read_storage_sccb *sccb = (struct read_storage_sccb *)&sclp_early_sccb; + struct read_storage_sccb *sccb = (struct read_storage_sccb *)sclp_early_sccb; int rc, id, max_id = 0; unsigned long rn, rzm; sclp_cmdw_t command; @@ -320,8 +320,8 @@ int __init sclp_early_read_storage_info(void) rzm <<= 20; for (id = 0; id <= max_id; id++) { - memset(sclp_early_sccb, 0, sizeof(sclp_early_sccb)); - sccb->header.length = sizeof(sclp_early_sccb); + memset(sclp_early_sccb, 0, EARLY_SCCB_SIZE); + sccb->header.length = EARLY_SCCB_SIZE; command = SCLP_CMDW_READ_STORAGE_INFO | (id << 8); rc = sclp_early_cmd(command, sccb); if (rc) diff --git a/drivers/s390/char/sclp_sdias.c b/drivers/s390/char/sclp_sdias.c index 8e0b69a2f11a..13f97fd73aca 100644 --- a/drivers/s390/char/sclp_sdias.c +++ b/drivers/s390/char/sclp_sdias.c @@ -29,7 +29,7 @@ static struct sclp_register sclp_sdias_register = { .send_mask = EVTYP_SDIAS_MASK, }; -static struct sdias_sccb sccb __attribute__((aligned(4096))); +static struct sdias_sccb *sclp_sdias_sccb; static struct sdias_evbuf sdias_evbuf; static DECLARE_COMPLETION(evbuf_accepted); @@ -58,6 +58,7 @@ static void sdias_callback(struct sclp_req *request, void *data) static int sdias_sclp_send(struct sclp_req *req) { + struct sdias_sccb *sccb = sclp_sdias_sccb; int retries; int rc; @@ -78,16 +79,16 @@ static int sdias_sclp_send(struct sclp_req *req) continue; } /* if not accepted, retry */ - if (!(sccb.evbuf.hdr.flags & 0x80)) { + if (!(sccb->evbuf.hdr.flags & 0x80)) { TRACE("sclp request failed: flags=%x\n", - sccb.evbuf.hdr.flags); + sccb->evbuf.hdr.flags); continue; } /* * for the sync interface the response is in the initial sccb */ if (!sclp_sdias_register.receiver_fn) { - memcpy(&sdias_evbuf, &sccb.evbuf, sizeof(sdias_evbuf)); + memcpy(&sdias_evbuf, &sccb->evbuf, sizeof(sdias_evbuf)); TRACE("sync request done\n"); return 0; } @@ -104,23 +105,24 @@ static int sdias_sclp_send(struct sclp_req *req) */ int sclp_sdias_blk_count(void) { + struct sdias_sccb *sccb = sclp_sdias_sccb; struct sclp_req request; int rc; mutex_lock(&sdias_mutex); - memset(&sccb, 0, sizeof(sccb)); + memset(sccb, 0, sizeof(*sccb)); memset(&request, 0, sizeof(request)); - sccb.hdr.length = sizeof(sccb); - sccb.evbuf.hdr.length = sizeof(struct sdias_evbuf); - sccb.evbuf.hdr.type = EVTYP_SDIAS; - sccb.evbuf.event_qual = SDIAS_EQ_SIZE; - sccb.evbuf.data_id = SDIAS_DI_FCP_DUMP; - sccb.evbuf.event_id = 4712; - sccb.evbuf.dbs = 1; + sccb->hdr.length = sizeof(*sccb); + sccb->evbuf.hdr.length = sizeof(struct sdias_evbuf); + sccb->evbuf.hdr.type = EVTYP_SDIAS; + sccb->evbuf.event_qual = SDIAS_EQ_SIZE; + sccb->evbuf.data_id = SDIAS_DI_FCP_DUMP; + sccb->evbuf.event_id = 4712; + sccb->evbuf.dbs = 1; - request.sccb = &sccb; + request.sccb = sccb; request.command = SCLP_CMDW_WRITE_EVENT_DATA; request.status = SCLP_REQ_FILLED; request.callback = sdias_callback; @@ -130,8 +132,8 @@ int sclp_sdias_blk_count(void) pr_err("sclp_send failed for get_nr_blocks\n"); goto out; } - if (sccb.hdr.response_code != 0x0020) { - TRACE("send failed: %x\n", sccb.hdr.response_code); + if (sccb->hdr.response_code != 0x0020) { + TRACE("send failed: %x\n", sccb->hdr.response_code); rc = -EIO; goto out; } @@ -163,30 +165,31 @@ out: */ int sclp_sdias_copy(void *dest, int start_blk, int nr_blks) { + struct sdias_sccb *sccb = sclp_sdias_sccb; struct sclp_req request; int rc; mutex_lock(&sdias_mutex); - memset(&sccb, 0, sizeof(sccb)); + memset(sccb, 0, sizeof(*sccb)); memset(&request, 0, sizeof(request)); - sccb.hdr.length = sizeof(sccb); - sccb.evbuf.hdr.length = sizeof(struct sdias_evbuf); - sccb.evbuf.hdr.type = EVTYP_SDIAS; - sccb.evbuf.hdr.flags = 0; - sccb.evbuf.event_qual = SDIAS_EQ_STORE_DATA; - sccb.evbuf.data_id = SDIAS_DI_FCP_DUMP; - sccb.evbuf.event_id = 4712; - sccb.evbuf.asa_size = SDIAS_ASA_SIZE_64; - sccb.evbuf.event_status = 0; - sccb.evbuf.blk_cnt = nr_blks; - sccb.evbuf.asa = (unsigned long)dest; - sccb.evbuf.fbn = start_blk; - sccb.evbuf.lbn = 0; - sccb.evbuf.dbs = 1; + sccb->hdr.length = sizeof(*sccb); + sccb->evbuf.hdr.length = sizeof(struct sdias_evbuf); + sccb->evbuf.hdr.type = EVTYP_SDIAS; + sccb->evbuf.hdr.flags = 0; + sccb->evbuf.event_qual = SDIAS_EQ_STORE_DATA; + sccb->evbuf.data_id = SDIAS_DI_FCP_DUMP; + sccb->evbuf.event_id = 4712; + sccb->evbuf.asa_size = SDIAS_ASA_SIZE_64; + sccb->evbuf.event_status = 0; + sccb->evbuf.blk_cnt = nr_blks; + sccb->evbuf.asa = (unsigned long)dest; + sccb->evbuf.fbn = start_blk; + sccb->evbuf.lbn = 0; + sccb->evbuf.dbs = 1; - request.sccb = &sccb; + request.sccb = sccb; request.command = SCLP_CMDW_WRITE_EVENT_DATA; request.status = SCLP_REQ_FILLED; request.callback = sdias_callback; @@ -196,8 +199,8 @@ int sclp_sdias_copy(void *dest, int start_blk, int nr_blks) pr_err("sclp_send failed: %x\n", rc); goto out; } - if (sccb.hdr.response_code != 0x0020) { - TRACE("copy failed: %x\n", sccb.hdr.response_code); + if (sccb->hdr.response_code != 0x0020) { + TRACE("copy failed: %x\n", sccb->hdr.response_code); rc = -EIO; goto out; } @@ -256,6 +259,8 @@ int __init sclp_sdias_init(void) { if (ipl_info.type != IPL_TYPE_FCP_DUMP) return 0; + sclp_sdias_sccb = (void *) __get_free_page(GFP_KERNEL | GFP_DMA); + BUG_ON(!sclp_sdias_sccb); sdias_dbf = debug_register("dump_sdias", 4, 1, 4 * sizeof(long)); debug_register_view(sdias_dbf, &debug_sprintf_view); debug_set_level(sdias_dbf, 6); @@ -264,6 +269,7 @@ int __init sclp_sdias_init(void) if (sclp_sdias_init_async() == 0) goto out; TRACE("init failed\n"); + free_page((unsigned long) sclp_sdias_sccb); return -ENODEV; out: TRACE("init done\n"); From a80313ff91abda67641dc33bed97f6bcc5e9f6a4 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Sun, 3 Feb 2019 21:37:20 +0100 Subject: [PATCH 83/98] s390/kernel: introduce .dma sections With a relocatable kernel that could reside at any place in memory, code and data that has to stay below 2 GB needs special handling. This patch introduces .dma sections for such text, data and ex_table. The sections will be part of the decompressor kernel, so they will not be relocated and stay below 2 GB. Their location is passed over to the decompressed / relocated kernel via the .boot.preserved.data section. The duald and aste for control register setup also need to stay below 2 GB, so move the setup code from arch/s390/kernel/head64.S to arch/s390/boot/head.S. The duct and linkage_stack could reside above 2 GB, but their content has to be preserved for the decompresed kernel, so they are also moved into the .dma section. The start and end address of the .dma sections is added to vmcoreinfo, for crash support, to help debugging in case the kernel crashed there. Signed-off-by: Gerald Schaefer Reviewed-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/boot/Makefile | 2 +- arch/s390/boot/compressed/vmlinux.lds.S | 21 +++ arch/s390/boot/head.S | 32 ++++- arch/s390/boot/startup.c | 39 ++++++ arch/s390/boot/text_dma.S | 167 ++++++++++++++++++++++++ arch/s390/hypfs/hypfs_diag0c.c | 18 +-- arch/s390/include/asm/diag.h | 13 ++ arch/s390/include/asm/extable.h | 5 + arch/s390/include/asm/ipl.h | 1 - arch/s390/include/asm/linkage.h | 7 + arch/s390/include/asm/sections.h | 3 + arch/s390/kernel/base.S | 68 ---------- arch/s390/kernel/diag.c | 65 ++------- arch/s390/kernel/early.c | 2 +- arch/s390/kernel/head64.S | 26 ---- arch/s390/kernel/ipl.c | 2 +- arch/s390/kernel/kprobes.c | 2 +- arch/s390/kernel/machine_kexec.c | 2 + arch/s390/kernel/setup.c | 9 ++ arch/s390/kernel/smp.c | 2 +- arch/s390/kernel/swsusp.S | 15 +-- arch/s390/kernel/traps.c | 3 +- arch/s390/mm/fault.c | 14 +- arch/s390/mm/vmem.c | 2 + 24 files changed, 333 insertions(+), 187 deletions(-) create mode 100644 arch/s390/boot/text_dma.S diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 4df43e83363a..88932c25ad26 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -30,7 +30,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o -obj-y += ctype.o +obj-y += ctype.o text_dma.o obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/compressed/vmlinux.lds.S index 1c8ef393c656..112b8d9f1e4c 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/compressed/vmlinux.lds.S @@ -33,6 +33,27 @@ SECTIONS *(.data.*) _edata = . ; } + /* + * .dma section for code, data, ex_table that need to stay below 2 GB, + * even when the kernel is relocate: above 2 GB. + */ + _sdma = .; + .dma.text : { + . = ALIGN(PAGE_SIZE); + _stext_dma = .; + *(.dma.text) + . = ALIGN(PAGE_SIZE); + _etext_dma = .; + } + . = ALIGN(16); + .dma.ex_table : { + _start_dma_ex_table = .; + KEEP(*(.dma.ex_table)) + _stop_dma_ex_table = .; + } + .dma.data : { *(.dma.data) } + _edma = .; + BOOT_DATA BOOT_DATA_PRESERVED diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index c6b4b4c5dd58..028aab03a9e7 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -305,7 +305,7 @@ ENTRY(startup_kdump) xc 0x300(256),0x300 xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 - lctlg %c0,%c15,0x200(%r0) # initialize control registers + lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers stcke __LC_BOOT_CLOCK mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) @@ -319,6 +319,36 @@ ENTRY(startup_kdump) .align 8 6: .long 0x7fffffff,0xffffffff +.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space + .quad 0 # cr1: primary space segment table + .quad .Lduct # cr2: dispatchable unit control table + .quad 0 # cr3: instruction authorization + .quad 0xffff # cr4: instruction authorization + .quad .Lduct # cr5: primary-aste origin + .quad 0 # cr6: I/O interrupts + .quad 0 # cr7: secondary space segment table + .quad 0 # cr8: access registers translation + .quad 0 # cr9: tracing off + .quad 0 # cr10: tracing off + .quad 0 # cr11: tracing off + .quad 0 # cr12: tracing off + .quad 0 # cr13: home space segment table + .quad 0xc0000000 # cr14: machine check handling off + .quad .Llinkage_stack # cr15: linkage stack operations + + .section .dma.data,"aw",@progbits +.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 + .long 0,0,0,0,0,0,0,0 +.Llinkage_stack: + .long 0,0,0x89000000,0,0,0,0x8a000000,0 + .align 64 +.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 + .align 128 +.Lduald:.rept 8 + .long 0x80000000,0,0,0 # invalid access-list entries + .endr + .previous + #include "head_kdump.S" # diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index b7d6a76cb5e9..e3f339d248ce 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include +#include #include #include "compressed/decompressor.h" #include "boot.h" @@ -11,6 +13,43 @@ extern char __boot_data_start[], __boot_data_end[]; extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; +/* + * Some code and data needs to stay below 2 GB, even when the kernel would be + * relocated above 2 GB, because it has to use 31 bit addresses. + * Such code and data is part of the .dma section, and its location is passed + * over to the decompressed / relocated kernel via the .boot.preserved.data + * section. + */ +extern char _sdma[], _edma[]; +extern char _stext_dma[], _etext_dma[]; +extern struct exception_table_entry _start_dma_ex_table[]; +extern struct exception_table_entry _stop_dma_ex_table[]; +unsigned long __bootdata_preserved(__sdma) = __pa(&_sdma); +unsigned long __bootdata_preserved(__edma) = __pa(&_edma); +unsigned long __bootdata_preserved(__stext_dma) = __pa(&_stext_dma); +unsigned long __bootdata_preserved(__etext_dma) = __pa(&_etext_dma); +struct exception_table_entry * + __bootdata_preserved(__start_dma_ex_table) = _start_dma_ex_table; +struct exception_table_entry * + __bootdata_preserved(__stop_dma_ex_table) = _stop_dma_ex_table; + +int _diag210_dma(struct diag210 *addr); +int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode); +int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode); +void _diag0c_dma(struct hypfs_diag0c_entry *entry); +void _diag308_reset_dma(void); +struct diag_ops __bootdata_preserved(diag_dma_ops) = { + .diag210 = _diag210_dma, + .diag26c = _diag26c_dma, + .diag14 = _diag14_dma, + .diag0c = _diag0c_dma, + .diag308_reset = _diag308_reset_dma +}; +static struct diag210 _diag210_tmp_dma __section(".dma.data"); +struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma; +void _swsusp_reset_dma(void); +unsigned long __bootdata_preserved(__swsusp_reset_dma) = __pa(_swsusp_reset_dma); + void error(char *x) { sclp_early_printk("\n\n"); diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S new file mode 100644 index 000000000000..2414360ff22d --- /dev/null +++ b/arch/s390/boot/text_dma.S @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code that needs to run below 2 GB. + * + * Copyright IBM Corp. 2019 + */ + +#include +#include +#include + + .section .dma.text,"ax" +/* + * Simplified version of expoline thunk. The normal thunks can not be used here, + * because they might be more than 2 GB away, and not reachable by the relative + * branch. No comdat, exrl, etc. optimizations used here, because it only + * affects a few functions that are not performance-relevant. + */ + .macro BR_EX_DMA_r14 + larl %r1,0f + ex 0,0(%r1) + j . +0: br %r14 + .endm + +/* + * int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode) + */ +ENTRY(_diag14_dma) + lgr %r1,%r2 + lgr %r2,%r3 + lgr %r3,%r4 + lhi %r5,-EIO + sam31 + diag %r1,%r2,0x14 +.Ldiag14_ex: + ipm %r5 + srl %r5,28 +.Ldiag14_fault: + sam64 + lgfr %r2,%r5 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag14_ex, .Ldiag14_fault) + +/* + * int _diag210_dma(struct diag210 *addr) + */ +ENTRY(_diag210_dma) + lgr %r1,%r2 + lhi %r2,-1 + sam31 + diag %r1,%r0,0x210 +.Ldiag210_ex: + ipm %r2 + srl %r2,28 +.Ldiag210_fault: + sam64 + lgfr %r2,%r2 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag210_ex, .Ldiag210_fault) + +/* + * int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode) + */ +ENTRY(_diag26c_dma) + lghi %r5,-EOPNOTSUPP + sam31 + diag %r2,%r4,0x26c +.Ldiag26c_ex: + sam64 + lgfr %r2,%r5 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag26c_ex, .Ldiag26c_ex) + +/* + * void _diag0c_dma(struct hypfs_diag0c_entry *entry) + */ +ENTRY(_diag0c_dma) + sam31 + diag %r2,%r2,0x0c + sam64 + BR_EX_DMA_r14 + +/* + * void _swsusp_reset_dma(void) + */ +ENTRY(_swsusp_reset_dma) + larl %r1,restart_entry + larl %r2,.Lrestart_diag308_psw + og %r1,0(%r2) + stg %r1,0(%r0) + lghi %r0,0 + diag %r0,%r0,0x308 +restart_entry: + lhi %r1,1 + sigp %r1,%r0,SIGP_SET_ARCHITECTURE + sam64 + BR_EX_DMA_r14 + +/* + * void _diag308_reset_dma(void) + * + * Calls diag 308 subcode 1 and continues execution + */ +ENTRY(_diag308_reset_dma) + larl %r4,.Lctlregs # Save control registers + stctg %c0,%c15,0(%r4) + lg %r2,0(%r4) # Disable lowcore protection + nilh %r2,0xefff + larl %r4,.Lctlreg0 + stg %r2,0(%r4) + lctlg %c0,%c0,0(%r4) + larl %r4,.Lfpctl # Floating point control register + stfpc 0(%r4) + larl %r4,.Lprefix # Save prefix register + stpx 0(%r4) + larl %r4,.Lprefix_zero # Set prefix register to 0 + spx 0(%r4) + larl %r4,.Lcontinue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,restart_part2 # Setup restart PSW at absolute 0 + larl %r3,.Lrestart_diag308_psw + og %r4,0(%r3) # Save PSW + lghi %r3,0 + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + lghi %r0,0 + diag %r0,%r1,0x308 +restart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl %r4,.Lctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,.Lfpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,.Lprefix # Restore prefix register + spx 0(%r4) + larl %r4,.Lcontinue_psw # Restore PSW flags + lpswe 0(%r4) +.Lcontinue: + BR_EX_DMA_r14 + + .section .dma.data,"aw",@progbits +.align 8 +.Lrestart_diag308_psw: + .long 0x00080000,0x80000000 + +.align 8 +.Lcontinue_psw: + .quad 0,.Lcontinue + +.align 8 +.Lctlreg0: + .quad 0 +.Lctlregs: + .rept 16 + .quad 0 + .endr +.Lfpctl: + .long 0 +.Lprefix: + .long 0 +.Lprefix_zero: + .long 0 diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c index 72e3140fafb5..3235e4d82f2d 100644 --- a/arch/s390/hypfs/hypfs_diag0c.c +++ b/arch/s390/hypfs/hypfs_diag0c.c @@ -15,27 +15,13 @@ #define DBFS_D0C_HDR_VERSION 0 -/* - * Execute diagnose 0c in 31 bit mode - */ -static void diag0c(struct hypfs_diag0c_entry *entry) -{ - diag_stat_inc(DIAG_STAT_X00C); - asm volatile ( - " sam31\n" - " diag %0,%0,0x0c\n" - " sam64\n" - : /* no output register */ - : "a" (entry) - : "memory"); -} - /* * Get hypfs_diag0c_entry from CPU vector and store diag0c data */ static void diag0c_fn(void *data) { - diag0c(((void **) data)[smp_processor_id()]); + diag_stat_inc(DIAG_STAT_X00C); + diag_dma_ops.diag0c(((void **) data)[smp_processor_id()]); } /* diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 19562be22b7e..0036eab14391 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -308,4 +308,17 @@ union diag318_info { int diag204(unsigned long subcode, unsigned long size, void *addr); int diag224(void *ptr); int diag26c(void *req, void *resp, enum diag26c_sc subcode); + +struct hypfs_diag0c_entry; + +struct diag_ops { + int (*diag210)(struct diag210 *addr); + int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); + int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode); + void (*diag0c)(struct hypfs_diag0c_entry *entry); + void (*diag308_reset)(void); +}; + +extern struct diag_ops diag_dma_ops; +extern struct diag210 *__diag210_tmp_dma; #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index 80a4e5a9cb46..ae27f756b409 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -19,6 +19,11 @@ struct exception_table_entry int insn, fixup; }; +extern struct exception_table_entry *__start_dma_ex_table; +extern struct exception_table_entry *__stop_dma_ex_table; + +const struct exception_table_entry *s390_search_extables(unsigned long addr); + static inline unsigned long extable_fixup(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 420d39ebdc46..084e71b7272a 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -127,7 +127,6 @@ enum diag308_rc { }; extern int diag308(unsigned long subcode, void *addr); -extern void diag308_reset(void); extern void store_status(void (*fn)(void *), void *data); extern void lgr_info_log(void); diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index 1b95da3fdd64..7f22262b0e46 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -28,5 +28,12 @@ .long (_target) - . ; \ .previous +#define EX_TABLE_DMA(_fault, _target) \ + .section .dma.ex_table, "a" ; \ + .align 4 ; \ + .long (_fault) - . ; \ + .long (_target) - . ; \ + .previous + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 29e55739b516..af670fa4b12a 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -23,4 +23,7 @@ */ #define __bootdata_preserved(var) __section(.boot.preserved.data.var) var +extern unsigned long __sdma, __edma; +extern unsigned long __stext_dma, __etext_dma; + #endif diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S index f268fca67e82..d6ee5978e273 100644 --- a/arch/s390/kernel/base.S +++ b/arch/s390/kernel/base.S @@ -79,71 +79,3 @@ disabled_wait_psw: s390_base_pgm_handler_fn: .quad 0 .previous - -# -# Calls diag 308 subcode 1 and continues execution -# -ENTRY(diag308_reset) - larl %r4,.Lctlregs # Save control registers - stctg %c0,%c15,0(%r4) - lg %r2,0(%r4) # Disable lowcore protection - nilh %r2,0xefff - larl %r4,.Lctlreg0 - stg %r2,0(%r4) - lctlg %c0,%c0,0(%r4) - larl %r4,.Lfpctl # Floating point control register - stfpc 0(%r4) - larl %r4,.Lprefix # Save prefix register - stpx 0(%r4) - larl %r4,.Lprefix_zero # Set prefix register to 0 - spx 0(%r4) - larl %r4,.Lcontinue_psw # Save PSW flags - epsw %r2,%r3 - stm %r2,%r3,0(%r4) - larl %r4,.Lrestart_psw # Setup restart PSW at absolute 0 - lghi %r3,0 - lg %r4,0(%r4) # Save PSW - sturg %r4,%r3 # Use sturg, because of large pages - lghi %r1,1 - lghi %r0,0 - diag %r0,%r1,0x308 -.Lrestart_part2: - lhi %r0,0 # Load r0 with zero - lhi %r1,2 # Use mode 2 = ESAME (dump) - sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode - sam64 # Switch to 64 bit addressing mode - larl %r4,.Lctlregs # Restore control registers - lctlg %c0,%c15,0(%r4) - larl %r4,.Lfpctl # Restore floating point ctl register - lfpc 0(%r4) - larl %r4,.Lprefix # Restore prefix register - spx 0(%r4) - larl %r4,.Lcontinue_psw # Restore PSW flags - lpswe 0(%r4) -.Lcontinue: - BR_EX %r14 -.align 16 -.Lrestart_psw: - .long 0x00080000,0x80000000 + .Lrestart_part2 - - .section .data..nosave,"aw",@progbits -.align 8 -.Lcontinue_psw: - .quad 0,.Lcontinue - .previous - - .section .bss -.align 8 -.Lctlreg0: - .quad 0 -.Lctlregs: - .rept 16 - .quad 0 - .endr -.Lfpctl: - .long 0 -.Lprefix: - .long 0 -.Lprefix_zero: - .long 0 - .previous diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 7edaa733a77f..e9dac9a24d3f 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -13,6 +13,7 @@ #include #include #include +#include struct diag_stat { unsigned int counter[NR_DIAG_STAT]; @@ -49,6 +50,9 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; +struct diag_ops __bootdata_preserved(diag_dma_ops); +struct diag210 *__bootdata_preserved(__diag210_tmp_dma); + static int show_diag_stat(struct seq_file *m, void *v) { struct diag_stat *stat; @@ -139,30 +143,10 @@ EXPORT_SYMBOL(diag_stat_inc_norecursion); /* * Diagnose 14: Input spool file manipulation */ -static inline int __diag14(unsigned long rx, unsigned long ry1, - unsigned long subcode) -{ - register unsigned long _ry1 asm("2") = ry1; - register unsigned long _ry2 asm("3") = subcode; - int rc = 0; - - asm volatile( - " sam31\n" - " diag %2,2,0x14\n" - " sam64\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc), "+d" (_ry2) - : "d" (rx), "d" (_ry1) - : "cc"); - - return rc; -} - int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) { diag_stat_inc(DIAG_STAT_X014); - return __diag14(rx, ry1, subcode); + return diag_dma_ops.diag14(rx, ry1, subcode); } EXPORT_SYMBOL(diag14); @@ -195,30 +179,17 @@ EXPORT_SYMBOL(diag204); */ int diag210(struct diag210 *addr) { - /* - * diag 210 needs its data below the 2GB border, so we - * use a static data area to be sure - */ - static struct diag210 diag210_tmp; static DEFINE_SPINLOCK(diag210_lock); unsigned long flags; int ccode; spin_lock_irqsave(&diag210_lock, flags); - diag210_tmp = *addr; + *__diag210_tmp_dma = *addr; diag_stat_inc(DIAG_STAT_X210); - asm volatile( - " lhi %0,-1\n" - " sam31\n" - " diag %1,0,0x210\n" - "0: ipm %0\n" - " srl %0,28\n" - "1: sam64\n" - EX_TABLE(0b, 1b) - : "=&d" (ccode) : "a" (&diag210_tmp) : "cc", "memory"); + ccode = diag_dma_ops.diag210(__diag210_tmp_dma); - *addr = diag210_tmp; + *addr = *__diag210_tmp_dma; spin_unlock_irqrestore(&diag210_lock, flags); return ccode; @@ -243,27 +214,9 @@ EXPORT_SYMBOL(diag224); /* * Diagnose 26C: Access Certain System Information */ -static inline int __diag26c(void *req, void *resp, enum diag26c_sc subcode) -{ - register unsigned long _req asm("2") = (addr_t) req; - register unsigned long _resp asm("3") = (addr_t) resp; - register unsigned long _subcode asm("4") = subcode; - register unsigned long _rc asm("5") = -EOPNOTSUPP; - - asm volatile( - " sam31\n" - " diag %[rx],%[ry],0x26c\n" - "0: sam64\n" - EX_TABLE(0b,0b) - : "+d" (_rc) - : [rx] "d" (_req), "d" (_resp), [ry] "d" (_subcode) - : "cc", "memory"); - return _rc; -} - int diag26c(void *req, void *resp, enum diag26c_sc subcode) { diag_stat_inc(DIAG_STAT_X26C); - return __diag26c(req, resp, subcode); + return diag_dma_ops.diag26c(req, resp, subcode); } EXPORT_SYMBOL(diag26c); diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index ab09ada0e930..33f704c16bd3 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -139,7 +139,7 @@ static void early_pgm_check_handler(void) unsigned long addr; addr = S390_lowcore.program_old_psw.addr; - fixup = search_exception_tables(addr); + fixup = s390_search_extables(addr); if (!fixup) disabled_wait(0); /* Disable low address protection before storing into lowcore. */ diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 56491e636eab..5aea1a527443 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -26,7 +26,6 @@ ENTRY(startup_continue) 0: larl %r1,tod_clock_base mvc 0(16,%r1),__LC_BOOT_CLOCK larl %r13,.LPG1 # get base - lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers larl %r0,boot_vdso_data stg %r0,__LC_VDSO_PER_CPU # @@ -61,22 +60,6 @@ ENTRY(startup_continue) .align 16 .LPG1: -.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space - .quad 0 # cr1: primary space segment table - .quad .Lduct # cr2: dispatchable unit control table - .quad 0 # cr3: instruction authorization - .quad 0xffff # cr4: instruction authorization - .quad .Lduct # cr5: primary-aste origin - .quad 0 # cr6: I/O interrupts - .quad 0 # cr7: secondary space segment table - .quad 0 # cr8: access registers translation - .quad 0 # cr9: tracing off - .quad 0 # cr10: tracing off - .quad 0 # cr11: tracing off - .quad 0 # cr12: tracing off - .quad 0 # cr13: home space segment table - .quad 0xc0000000 # cr14: machine check handling off - .quad .Llinkage_stack # cr15: linkage stack operations .Lpcmsk:.quad 0x0000000180000000 .L4malign:.quad 0xffffffffffc00000 .Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8 @@ -84,14 +67,5 @@ ENTRY(startup_continue) .Lparmaddr: .quad PARMAREA .align 64 -.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 - .long 0,0,0,0,0,0,0,0 -.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 - .align 128 -.Lduald:.rept 8 - .long 0x80000000,0,0,0 # invalid access-list entries - .endr -.Llinkage_stack: - .long 0,0,0x89000000,0,0,0,0x8a000000,0 .Ldw: .quad 0x0002000180000000,0x0000000000000000 .Laregs:.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index e123c0df83f1..aa8fe768640e 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -1720,7 +1720,7 @@ void s390_reset_system(void) /* Disable lowcore protection */ __ctl_clear_bit(0, 28); - diag308_reset(); + diag_dma_ops.diag308_reset(); } #ifdef CONFIG_KEXEC_FILE diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 263426dcf97d..6f1388391620 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -573,7 +573,7 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. */ - entry = search_exception_tables(regs->psw.addr); + entry = s390_search_extables(regs->psw.addr); if (entry) { regs->psw.addr = extable_fixup(entry); return 1; diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index cb582649aba6..4b998d639c32 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -253,6 +253,8 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); + vmcoreinfo_append_str("SDMA=%lx\n", __sdma); + vmcoreinfo_append_str("EDMA=%lx\n", __edma); } void machine_shutdown(void) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 94efb1eb34b6..4ccaf5ed96ee 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -101,6 +101,14 @@ unsigned long __bootdata(memory_end); unsigned long __bootdata(max_physmem_end); struct mem_detect_info __bootdata(mem_detect); +struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table); +struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table); +unsigned long __bootdata_preserved(__swsusp_reset_dma); +unsigned long __bootdata_preserved(__stext_dma); +unsigned long __bootdata_preserved(__etext_dma); +unsigned long __bootdata_preserved(__sdma); +unsigned long __bootdata_preserved(__edma); + unsigned long VMALLOC_START; EXPORT_SYMBOL(VMALLOC_START); @@ -832,6 +840,7 @@ static void __init reserve_kernel(void) memblock_reserve(0, HEAD_END); memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - (unsigned long)_stext); + memblock_reserve(__sdma, __edma - __sdma); } static void __init setup_memory(void) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index bd197baf1dc3..88634fb0cc50 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -689,7 +689,7 @@ void __init smp_save_dump_cpus(void) smp_save_cpu_regs(sa, addr, is_boot_cpu, page); } memblock_free(page, PAGE_SIZE); - diag308_reset(); + diag_dma_ops.diag308_reset(); pcpu_set_smt(0); } #endif /* CONFIG_CRASH_DUMP */ diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index 993100c31d65..f5219ce11cc5 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -154,20 +154,13 @@ ENTRY(swsusp_arch_resume) ptlb /* flush tlb */ /* Reset System */ - larl %r1,restart_entry - larl %r2,.Lrestart_diag308_psw - og %r1,0(%r2) - stg %r1,0(%r0) larl %r1,.Lnew_pgm_check_psw epsw %r2,%r3 stm %r2,%r3,0(%r1) mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1) - lghi %r0,0 - diag %r0,%r0,0x308 -restart_entry: - lhi %r1,1 - sigp %r1,%r0,SIGP_SET_ARCHITECTURE - sam64 + larl %r1,__swsusp_reset_dma + lg %r1,0(%r1) + BASR_EX %r14,%r1 #ifdef CONFIG_SMP larl %r1,smp_cpu_mt_shift icm %r1,15,0(%r1) @@ -275,8 +268,6 @@ restore_registers: .Lpanic_string: .asciz "Resume not possible because suspend CPU is no longer available\n" .align 8 -.Lrestart_diag308_psw: - .long 0x00080000,0x80000000 .Lrestart_suspend_psw: .quad 0x0000000180000000,restart_suspend .Lnew_pgm_check_psw: diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 8003b38c1688..82e81a9f7112 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -49,7 +49,7 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str) report_user_fault(regs, si_signo, 0); } else { const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->psw.addr); + fixup = s390_search_extables(regs->psw.addr); if (fixup) regs->psw.addr = extable_fixup(fixup); else { @@ -263,5 +263,6 @@ NOKPROBE_SYMBOL(kernel_stack_overflow); void __init trap_init(void) { + sort_extable(__start_dma_ex_table, __stop_dma_ex_table); local_mcck_enable(); } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 11613362c4e7..c220399ae196 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -247,12 +247,24 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code) current); } +const struct exception_table_entry *s390_search_extables(unsigned long addr) +{ + const struct exception_table_entry *fixup; + + fixup = search_extable(__start_dma_ex_table, + __stop_dma_ex_table - __start_dma_ex_table, + addr); + if (!fixup) + fixup = search_exception_tables(addr); + return fixup; +} + static noinline void do_no_context(struct pt_regs *regs) { const struct exception_table_entry *fixup; /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr); + fixup = s390_search_extables(regs->psw.addr); if (fixup) { regs->psw.addr = extable_fixup(fixup); return; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 0472e27febdf..b403fa14847d 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -413,6 +413,8 @@ void __init vmem_map_init(void) __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); + __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT, + SET_MEMORY_RO | SET_MEMORY_X); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } From b2d24b97b2a9691351920e700bfda4368c177232 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Sun, 3 Feb 2019 21:37:20 +0100 Subject: [PATCH 84/98] s390/kernel: add support for kernel address space layout randomization (KASLR) This patch adds support for relocating the kernel to a random address. The random kernel offset is obtained from cpacf, using either TRNG, PRNO, or KMC_PRNG, depending on supported MSA level. KERNELOFFSET is added to vmcoreinfo, for crash --kaslr support. Signed-off-by: Gerald Schaefer Reviewed-by: Philipp Rudo Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 10 +++ arch/s390/boot/Makefile | 1 + arch/s390/boot/boot.h | 3 + arch/s390/boot/ipl_parm.c | 15 +++- arch/s390/boot/kaslr.c | 144 +++++++++++++++++++++++++++++++ arch/s390/boot/startup.c | 31 ++++++- arch/s390/include/asm/setup.h | 6 ++ arch/s390/kernel/machine_kexec.c | 1 + arch/s390/kernel/setup.c | 1 + 9 files changed, 207 insertions(+), 5 deletions(-) create mode 100644 arch/s390/boot/kaslr.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4c99e4f5f366..8c15392ee985 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -637,6 +637,16 @@ config RELOCATABLE The relocations make the kernel image about 15% larger (compressed 10%), but are discarded at runtime. +config RANDOMIZE_BASE + bool "Randomize the address of the kernel image (KASLR)" + depends on RELOCATABLE + default y + help + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the address at which the kernel image is loaded, + as a security feature that deters exploit attempts relying on + knowledge of the location of kernel internals. + endmenu menu "Memory setup" diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 88932c25ad26..cb40317dc3f7 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -33,6 +33,7 @@ obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += ctype.o text_dma.o obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) subdir- := compressed diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index ca395fcff15e..ad57c2205a71 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -9,6 +9,9 @@ void setup_boot_command_line(void); void parse_boot_command_line(void); void setup_memory_end(void); void print_missing_facilities(void); +unsigned long get_random_base(unsigned long safe_addr); + +extern int kaslr_enabled; unsigned long read_ipl_report(unsigned long safe_offset); diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 96777092fb14..b49bd97d33af 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -18,6 +18,8 @@ unsigned long __bootdata(memory_end); int __bootdata(memory_end_set); int __bootdata(noexec_disabled); +int kaslr_enabled __section(.data); + static inline int __diag308(unsigned long subcode, void *addr) { register unsigned long _addr asm("0") = (unsigned long)addr; @@ -214,6 +216,7 @@ void parse_boot_command_line(void) char *args; int rc; + kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); args = strcpy(command_line_buf, early_command_line); while (*args) { args = next_arg(args, ¶m, &val); @@ -231,15 +234,21 @@ void parse_boot_command_line(void) if (!strcmp(param, "facilities")) modify_fac_list(val); + + if (!strcmp(param, "nokaslr")) + kaslr_enabled = 0; } } void setup_memory_end(void) { #ifdef CONFIG_CRASH_DUMP - if (!OLDMEM_BASE && ipl_block_valid && - ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && - ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) { + if (OLDMEM_BASE) { + kaslr_enabled = 0; + } else if (ipl_block_valid && + ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && + ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) { + kaslr_enabled = 0; if (!sclp_early_get_hsa_size(&memory_end) && memory_end) memory_end_set = 1; } diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c new file mode 100644 index 000000000000..3bdd8132e56b --- /dev/null +++ b/arch/s390/boot/kaslr.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2019 + */ +#include +#include +#include +#include +#include "compressed/decompressor.h" + +#define PRNG_MODE_TDES 1 +#define PRNG_MODE_SHA512 2 +#define PRNG_MODE_TRNG 3 + +struct prno_parm { + u32 res; + u32 reseed_counter; + u64 stream_bytes; + u8 V[112]; + u8 C[112]; +}; + +struct prng_parm { + u8 parm_block[32]; + u32 reseed_counter; + u64 byte_counter; +}; + +static int check_prng(void) +{ + if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) { + sclp_early_printk("KASLR disabled: CPU has no PRNG\n"); + return 0; + } + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + return PRNG_MODE_TRNG; + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) + return PRNG_MODE_SHA512; + else + return PRNG_MODE_TDES; +} + +static unsigned long get_random(unsigned long limit) +{ + struct prng_parm prng = { + /* initial parameter block for tdes mode, copied from libica */ + .parm_block = { + 0x0F, 0x2B, 0x8E, 0x63, 0x8C, 0x8E, 0xD2, 0x52, + 0x64, 0xB7, 0xA0, 0x7B, 0x75, 0x28, 0xB8, 0xF4, + 0x75, 0x5F, 0xD2, 0xA6, 0x8D, 0x97, 0x11, 0xFF, + 0x49, 0xD8, 0x23, 0xF3, 0x7E, 0x21, 0xEC, 0xA0 + }, + }; + unsigned long seed, random; + struct prno_parm prno; + __u64 entropy[4]; + int mode, i; + + mode = check_prng(); + seed = get_tod_clock_fast(); + switch (mode) { + case PRNG_MODE_TRNG: + cpacf_trng(NULL, 0, (u8 *) &random, sizeof(random)); + break; + case PRNG_MODE_SHA512: + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, &prno, NULL, 0, + (u8 *) &seed, sizeof(seed)); + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, &prno, (u8 *) &random, + sizeof(random), NULL, 0); + break; + case PRNG_MODE_TDES: + /* add entropy */ + *(unsigned long *) prng.parm_block ^= seed; + for (i = 0; i < 16; i++) { + cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, + (char *) entropy, (char *) entropy, + sizeof(entropy)); + memcpy(prng.parm_block, entropy, sizeof(entropy)); + } + random = seed; + cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, (u8 *) &random, + (u8 *) &random, sizeof(random)); + break; + default: + random = 0; + } + return random % limit; +} + +unsigned long get_random_base(unsigned long safe_addr) +{ + unsigned long base, start, end, kernel_size; + unsigned long block_sum, offset; + int i; + + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE) { + if (safe_addr < INITRD_START + INITRD_SIZE) + safe_addr = INITRD_START + INITRD_SIZE; + } + safe_addr = ALIGN(safe_addr, THREAD_SIZE); + + kernel_size = vmlinux.image_size + vmlinux.bss_size; + block_sum = 0; + for_each_mem_detect_block(i, &start, &end) { + if (memory_end_set) { + if (start >= memory_end) + break; + if (end > memory_end) + end = memory_end; + } + if (end - start < kernel_size) + continue; + block_sum += end - start - kernel_size; + } + if (!block_sum) { + sclp_early_printk("KASLR disabled: not enough memory\n"); + return 0; + } + + base = get_random(block_sum); + if (base == 0) + return 0; + if (base < safe_addr) + base = safe_addr; + block_sum = offset = 0; + for_each_mem_detect_block(i, &start, &end) { + if (memory_end_set) { + if (start >= memory_end) + break; + if (end > memory_end) + end = memory_end; + } + if (end - start < kernel_size) + continue; + block_sum += end - start - kernel_size; + if (base <= block_sum) { + base = start + base - offset; + base = ALIGN_DOWN(base, THREAD_SIZE); + break; + } + offset = block_sum; + } + return base; +} diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index e3f339d248ce..4401e992bda1 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -12,6 +12,7 @@ extern char __boot_data_start[], __boot_data_end[]; extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; +unsigned long __bootdata_preserved(__kaslr_offset); /* * Some code and data needs to stay below 2 GB, even when the kernel would be @@ -113,6 +114,7 @@ static void handle_relocs(unsigned long offset) void startup_kernel(void) { + unsigned long random_lma; unsigned long safe_addr; void *img; @@ -126,12 +128,37 @@ void startup_kernel(void) parse_boot_command_line(); setup_memory_end(); detect_memory(); + + random_lma = __kaslr_offset = 0; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { + random_lma = get_random_base(safe_addr); + if (random_lma) { + __kaslr_offset = random_lma - vmlinux.default_lma; + img = (void *)vmlinux.default_lma; + vmlinux.default_lma += __kaslr_offset; + vmlinux.entry += __kaslr_offset; + vmlinux.bootdata_off += __kaslr_offset; + vmlinux.bootdata_preserved_off += __kaslr_offset; + vmlinux.rela_dyn_start += __kaslr_offset; + vmlinux.rela_dyn_end += __kaslr_offset; + vmlinux.dynsym_start += __kaslr_offset; + } + } + if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { img = decompress_kernel(); memmove((void *)vmlinux.default_lma, img, vmlinux.image_size); - } + } else if (__kaslr_offset) + memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + copy_bootdata(); if (IS_ENABLED(CONFIG_RELOCATABLE)) - handle_relocs(0); + handle_relocs(__kaslr_offset); + + if (__kaslr_offset) { + /* Clear non-relocated kernel */ + if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) + memset(img, 0, vmlinux.image_size); + } vmlinux.entry(); } diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index d202756d6291..925889d360c1 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -147,6 +147,12 @@ extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); extern void (*_machine_power_off)(void); +extern unsigned long __kaslr_offset; +static inline unsigned long kaslr_offset(void) +{ + return __kaslr_offset; +} + #else /* __ASSEMBLY__ */ #define IPL_DEVICE (IPL_DEVICE_OFFSET) diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 4b998d639c32..e2ba7b7f574e 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -255,6 +255,7 @@ void arch_crash_save_vmcoreinfo(void) mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); vmcoreinfo_append_str("SDMA=%lx\n", __sdma); vmcoreinfo_append_str("EDMA=%lx\n", __edma); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); } void machine_shutdown(void) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 4ccaf5ed96ee..64e4bc9dd130 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -108,6 +108,7 @@ unsigned long __bootdata_preserved(__stext_dma); unsigned long __bootdata_preserved(__etext_dma); unsigned long __bootdata_preserved(__sdma); unsigned long __bootdata_preserved(__edma); +unsigned long __bootdata_preserved(__kaslr_offset); unsigned long VMALLOC_START; EXPORT_SYMBOL(VMALLOC_START); From 7a5da02de8d6eafba99556f8c98e5313edebb449 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 18 Apr 2019 16:24:50 +0200 Subject: [PATCH 85/98] locking/lockdep: check for freed initmem in static_obj() The following warning occurred on s390: WARNING: CPU: 0 PID: 804 at kernel/locking/lockdep.c:1025 lockdep_register_key+0x30/0x150 This is because the check in static_obj() assumes that all memory within [_stext, _end] belongs to static objects, which at least for s390 isn't true. The init section is also part of this range, and freeing it allows the buddy allocator to allocate memory from it. We have virt == phys for the kernel on s390, so that such allocations would then have addresses within the range [_stext, _end]. To fix this, introduce arch_is_kernel_initmem_freed(), similar to arch_is_kernel_text/data(), and add it to the checks in static_obj(). This will always return 0 on architectures that do not define arch_is_kernel_initmem_freed. On s390, it will return 1 if initmem has been freed and the address is in the range [__init_begin, __init_end]. Signed-off-by: Gerald Schaefer Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/sections.h | 12 ++++++++++++ arch/s390/mm/init.c | 3 +++ include/asm-generic/sections.h | 14 ++++++++++++++ kernel/locking/lockdep.c | 3 +++ 4 files changed, 32 insertions(+) diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index af670fa4b12a..42de04ad9c07 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -2,8 +2,20 @@ #ifndef _S390_SECTIONS_H #define _S390_SECTIONS_H +#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed + #include +extern bool initmem_freed; + +static inline int arch_is_kernel_initmem_freed(unsigned long addr) +{ + if (!initmem_freed) + return 0; + return addr >= (unsigned long)__init_begin && + addr < (unsigned long)__init_end; +} + /* * .boot.data section contains variables "shared" between the decompressor and * the decompressed kernel. The decompressor will store values in them, and diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3e82f66d5c61..7cf48eefec8f 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -49,6 +49,8 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); +bool initmem_freed; + static void __init setup_zero_pages(void) { unsigned int order; @@ -148,6 +150,7 @@ void __init mem_init(void) void free_initmem(void) { + initmem_freed = true; __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RW | SET_MEMORY_NX); diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index d79abca81a52..d1779d442aa5 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -77,6 +77,20 @@ static inline int arch_is_kernel_data(unsigned long addr) } #endif +/* + * Check if an address is part of freed initmem. This is needed on architectures + * with virt == phys kernel mapping, for code that wants to check if an address + * is part of a static object within [_stext, _end]. After initmem is freed, + * memory can be allocated from it, and such allocations would then have + * addresses within the range [_stext, _end]. + */ +#ifndef arch_is_kernel_initmem_freed +static inline int arch_is_kernel_initmem_freed(unsigned long addr) +{ + return 0; +} +#endif + /** * memory_contains - checks if an object is contained within a memory region * @begin: virtual address of the beginning of the memory region diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 34cdcbedda49..22a99530983e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -649,6 +649,9 @@ static int static_obj(const void *obj) end = (unsigned long) &_end, addr = (unsigned long) obj; + if (arch_is_kernel_initmem_freed(addr)) + return 0; + /* * static variable? */ From 26a374ae7af8d7003ad28a962fba0141e68af5da Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 17 Jan 2019 10:02:22 +0100 Subject: [PATCH 86/98] s390: add missing ENDPROC statements to assembler functions The assembler code in arch/s390 misses proper ENDPROC statements to properly end functions in a few places. Add them. Signed-off-by: Martin Schwidefsky --- arch/s390/boot/text_dma.S | 6 ++++++ arch/s390/crypto/crc32be-vx.S | 1 + arch/s390/crypto/crc32le-vx.S | 6 ++++-- arch/s390/kernel/base.S | 3 +++ arch/s390/kernel/entry.S | 24 ++++++++++++++++++++++-- arch/s390/kernel/mcount.S | 8 ++++++-- arch/s390/kernel/reipl.S | 1 + arch/s390/kernel/relocate_kernel.S | 1 + arch/s390/kernel/swsusp.S | 2 ++ arch/s390/lib/mem.S | 1 + 10 files changed, 47 insertions(+), 6 deletions(-) diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S index 2414360ff22d..ea93314f4497 100644 --- a/arch/s390/boot/text_dma.S +++ b/arch/s390/boot/text_dma.S @@ -41,6 +41,7 @@ ENTRY(_diag14_dma) lgfr %r2,%r5 BR_EX_DMA_r14 EX_TABLE_DMA(.Ldiag14_ex, .Ldiag14_fault) +ENDPROC(_diag14_dma) /* * int _diag210_dma(struct diag210 *addr) @@ -58,6 +59,7 @@ ENTRY(_diag210_dma) lgfr %r2,%r2 BR_EX_DMA_r14 EX_TABLE_DMA(.Ldiag210_ex, .Ldiag210_fault) +ENDPROC(_diag210_dma) /* * int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode) @@ -71,6 +73,7 @@ ENTRY(_diag26c_dma) lgfr %r2,%r5 BR_EX_DMA_r14 EX_TABLE_DMA(.Ldiag26c_ex, .Ldiag26c_ex) +ENDPROC(_diag26c_dma) /* * void _diag0c_dma(struct hypfs_diag0c_entry *entry) @@ -80,6 +83,7 @@ ENTRY(_diag0c_dma) diag %r2,%r2,0x0c sam64 BR_EX_DMA_r14 +ENDPROC(_diag0c_dma) /* * void _swsusp_reset_dma(void) @@ -96,6 +100,7 @@ restart_entry: sigp %r1,%r0,SIGP_SET_ARCHITECTURE sam64 BR_EX_DMA_r14 +ENDPROC(_swsusp_reset_dma) /* * void _diag308_reset_dma(void) @@ -142,6 +147,7 @@ restart_part2: lpswe 0(%r4) .Lcontinue: BR_EX_DMA_r14 +ENDPROC(_diag308_reset_dma) .section .dma.data,"aw",@progbits .align 8 diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S index 2bf01ba44107..0099044e2c86 100644 --- a/arch/s390/crypto/crc32be-vx.S +++ b/arch/s390/crypto/crc32be-vx.S @@ -207,5 +207,6 @@ ENTRY(crc32_be_vgfm_16) .Ldone: VLGVF %r2,%v2,3 BR_EX %r14 +ENDPROC(crc32_be_vgfm_16) .previous diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S index 7d6f568bd3ad..71caf0f4ec08 100644 --- a/arch/s390/crypto/crc32le-vx.S +++ b/arch/s390/crypto/crc32le-vx.S @@ -105,13 +105,14 @@ ENTRY(crc32_le_vgfm_16) larl %r5,.Lconstants_CRC_32_LE j crc32_le_vgfm_generic +ENDPROC(crc32_le_vgfm_16) ENTRY(crc32c_le_vgfm_16) larl %r5,.Lconstants_CRC_32C_LE j crc32_le_vgfm_generic +ENDPROC(crc32c_le_vgfm_16) - -crc32_le_vgfm_generic: +ENTRY(crc32_le_vgfm_generic) /* Load CRC-32 constants */ VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5 @@ -267,5 +268,6 @@ crc32_le_vgfm_generic: .Ldone: VLGVF %r2,%v2,2 BR_EX %r14 +ENDPROC(crc32_le_vgfm_generic) .previous diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S index d6ee5978e273..2f39ea57f358 100644 --- a/arch/s390/kernel/base.S +++ b/arch/s390/kernel/base.S @@ -28,6 +28,7 @@ ENTRY(s390_base_mcck_handler) 1: la %r1,4095 lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1) lpswe __LC_MCK_OLD_PSW +ENDPROC(s390_base_mcck_handler) .section .bss .align 8 @@ -48,6 +49,7 @@ ENTRY(s390_base_ext_handler) 1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit lpswe __LC_EXT_OLD_PSW +ENDPROC(s390_base_ext_handler) .section .bss .align 8 @@ -68,6 +70,7 @@ ENTRY(s390_base_pgm_handler) lmg %r0,%r15,__LC_SAVE_AREA_SYNC lpswe __LC_PGM_OLD_PSW 1: lpswe disabled_wait_psw-0b(%r13) +ENDPROC(s390_base_pgm_handler) .align 8 disabled_wait_psw: diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index baa67e434b46..3f4d272577d3 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -224,6 +224,7 @@ ENTRY(__bpon) .globl __bpon BPON BR_EX %r14 +ENDPROC(__bpon) /* * Scheduler resume function, called by switch_to @@ -248,6 +249,7 @@ ENTRY(__switch_to) lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 BR_EX %r14 +ENDPROC(__switch_to) .L__critical_start: @@ -324,6 +326,7 @@ sie_exit: EX_TABLE(.Lrewind_pad4,.Lsie_fault) EX_TABLE(.Lrewind_pad2,.Lsie_fault) EX_TABLE(sie_exit,.Lsie_fault) +ENDPROC(sie64a) EXPORT_SYMBOL(sie64a) EXPORT_SYMBOL(sie_exit) #endif @@ -570,6 +573,7 @@ ENTRY(system_call) lgr %r2,%r11 # pass pointer to pt_regs larl %r14,.Lsysc_return jg do_syscall_trace_exit +ENDPROC(system_call) # # a new process exits the kernel with ret_from_fork @@ -584,10 +588,16 @@ ENTRY(ret_from_fork) jne .Lsysc_tracenogo # it's a kernel thread lmg %r9,%r10,__PT_R9(%r11) # load gprs + la %r2,0(%r10) + BASR_EX %r14,%r9 + j .Lsysc_tracenogo +ENDPROC(ret_from_fork) + ENTRY(kernel_thread_starter) la %r2,0(%r10) BASR_EX %r14,%r9 j .Lsysc_tracenogo +ENDPROC(kernel_thread_starter) /* * Program check handler routine @@ -698,6 +708,7 @@ ENTRY(pgm_check_handler) stg %r14,__LC_RETURN_PSW+8 lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs +ENDPROC(pgm_check_handler) /* * IO interrupt handler routine @@ -926,6 +937,7 @@ ENTRY(io_int_handler) ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts TRACE_IRQS_OFF j .Lio_return +ENDPROC(io_int_handler) /* * External interrupt handler routine @@ -965,6 +977,7 @@ ENTRY(ext_int_handler) lghi %r3,EXT_INTERRUPT brasl %r14,do_IRQ j .Lio_return +ENDPROC(ext_int_handler) /* * Load idle PSW. The second "half" of this function is in .Lcleanup_idle. @@ -989,6 +1002,7 @@ ENTRY(psw_idle) lpswe __SF_EMPTY(%r15) BR_EX %r14 .Lpsw_idle_end: +ENDPROC(psw_idle) /* * Store floating-point controls and floating-point or vector register @@ -1031,6 +1045,7 @@ ENTRY(save_fpu_regs) .Lsave_fpu_regs_exit: BR_EX %r14 .Lsave_fpu_regs_end: +ENDPROC(save_fpu_regs) EXPORT_SYMBOL(save_fpu_regs) /* @@ -1077,6 +1092,7 @@ load_fpu_regs: .Lload_fpu_regs_exit: BR_EX %r14 .Lload_fpu_regs_end: +ENDPROC(load_fpu_regs) .L__critical_end: @@ -1206,6 +1222,7 @@ ENTRY(mcck_int_handler) lg %r15,__LC_NODAT_STACK la %r11,STACK_FRAME_OVERHEAD(%r15) j .Lmcck_skip +ENDPROC(mcck_int_handler) # # PSW restart interrupt handler @@ -1232,6 +1249,7 @@ ENTRY(restart_int_handler) 2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu brc 2,2b 3: j 3b +ENDPROC(restart_int_handler) .section .kprobes.text, "ax" @@ -1241,7 +1259,7 @@ ENTRY(restart_int_handler) * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. */ -stack_overflow: +ENTRY(stack_overflow) lg %r15,__LC_NODAT_STACK # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -1251,9 +1269,10 @@ stack_overflow: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs jg kernel_stack_overflow +ENDPROC(stack_overflow) #endif -cleanup_critical: +ENTRY(cleanup_critical) #if IS_ENABLED(CONFIG_KVM) clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap jl 0f @@ -1289,6 +1308,7 @@ cleanup_critical: clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end jl .Lcleanup_load_fpu_regs 0: BR_EX %r14,%r11 +ENDPROC(cleanup_critical) .align 8 .Lcleanup_table: diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index e93fbf02490c..09ae6da0aaa5 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -20,6 +20,7 @@ ENTRY(ftrace_stub) BR_EX %r14 +ENDPROC(ftrace_stub) #define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) #define STACK_PTREGS (STACK_FRAME_OVERHEAD) @@ -28,7 +29,7 @@ ENTRY(ftrace_stub) ENTRY(_mcount) BR_EX %r14 - +ENDPROC(_mcount) EXPORT_SYMBOL(_mcount) ENTRY(ftrace_caller) @@ -61,7 +62,8 @@ ENTRY(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. -ENTRY(ftrace_graph_caller) + .globl ftrace_graph_caller +ftrace_graph_caller: j ftrace_graph_caller_end lg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) lg %r3,(STACK_PTREGS_PSW+8)(%r15) @@ -73,6 +75,7 @@ ftrace_graph_caller_end: lg %r1,(STACK_PTREGS_PSW+8)(%r15) lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 +ENDPROC(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -86,5 +89,6 @@ ENTRY(return_to_handler) lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 +ENDPROC(return_to_handler) #endif diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 7f14adf512c6..4a22163962eb 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -73,6 +73,7 @@ ENTRY(store_status) lgr %r9,%r2 lgr %r2,%r3 BR_EX %r9 +ENDPROC(store_status) .section .bss .align 8 diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index 1b56f087ce2c..fe396673e8a6 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -66,6 +66,7 @@ ENTRY(relocate_kernel) mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 .diag: diag %r0,%r0,0x308 +ENDPROC(relocate_kernel) .align 8 load_psw: diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index f5219ce11cc5..19a3c427801a 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -108,6 +108,7 @@ ENTRY(swsusp_arch_suspend) lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 BR_EX %r14 +ENDPROC(swsusp_arch_suspend) /* * Restore saved memory image to correct place and restore register context. @@ -260,6 +261,7 @@ restore_registers: lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 BR_EX %r14 +ENDPROC(swsusp_arch_resume) .section .data..nosave,"aw",@progbits .align 8 diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index 53008da05190..dc0874f2e203 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -178,6 +178,7 @@ ENTRY(__memset\bits) BR_EX %r14 .L__memset_mvc\bits: mvc \bytes(1,%r1),0(%r1) +ENDPROC(__memset\bits) .endm __MEMSET 16,2,sth From 40a3abf751dd0ff9ef758fd5c14530f557cd432d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 1 Feb 2019 16:33:37 +0100 Subject: [PATCH 87/98] s390/nospec: rename assembler generated expoline thunks The assembler version of the expoline thunk use the naming __s390x_indirect_jump_rxuse_ry while the compiler generates names like __s390_indirect_jump_rx_use_ry. Make the naming more consistent. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/nospec-insn.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 123dac3717b3..0033dcd663b1 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -32,23 +32,23 @@ _LC_BR_R1 = __LC_BR_R1 .endm .macro __THUNK_PROLOG_BR r1,r2 - __THUNK_PROLOG_NAME __s390x_indirect_jump_r\r2\()use_r\r1 + __THUNK_PROLOG_NAME __s390_indirect_jump_r\r2\()use_r\r1 .endm .macro __THUNK_PROLOG_BC d0,r1,r2 - __THUNK_PROLOG_NAME __s390x_indirect_branch_\d0\()_\r2\()use_\r1 + __THUNK_PROLOG_NAME __s390_indirect_branch_\d0\()_\r2\()use_\r1 .endm .macro __THUNK_BR r1,r2 - jg __s390x_indirect_jump_r\r2\()use_r\r1 + jg __s390_indirect_jump_r\r2\()use_r\r1 .endm .macro __THUNK_BC d0,r1,r2 - jg __s390x_indirect_branch_\d0\()_\r2\()use_\r1 + jg __s390_indirect_branch_\d0\()_\r2\()use_\r1 .endm .macro __THUNK_BRASL r1,r2,r3 - brasl \r1,__s390x_indirect_jump_r\r3\()use_r\r2 + brasl \r1,__s390_indirect_jump_r\r3\()use_r\r2 .endm .macro __DECODE_RR expand,reg,ruse From bf72630130c29e2ba42e3db8d502de6bdfaa7d9a Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 12 Apr 2019 08:57:34 +0200 Subject: [PATCH 88/98] s390: use proper expoline sections for .dma code The text_dma.S code uses its own macro to generate an inline version of an expoline. To make it easier to identify all expolines in the kernel use a thunk and a branch to the thunk just like the rest of the kernel code does it. The name of the text_dma.S expoline thunk is __dma__s390_indirect_jump_r14 and the section is named .dma.text.__s390_indirect_jump_r14. This will be needed for the objtool support. Signed-off-by: Martin Schwidefsky --- arch/s390/boot/text_dma.S | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S index ea93314f4497..9715715c4c28 100644 --- a/arch/s390/boot/text_dma.S +++ b/arch/s390/boot/text_dma.S @@ -9,6 +9,16 @@ #include #include +#ifdef CC_USING_EXPOLINE + .pushsection .dma.text.__s390_indirect_jump_r14,"axG" +__dma__s390_indirect_jump_r14: + larl %r1,0f + ex 0,0(%r1) + j . +0: br %r14 + .popsection +#endif + .section .dma.text,"ax" /* * Simplified version of expoline thunk. The normal thunks can not be used here, @@ -17,10 +27,11 @@ * affects a few functions that are not performance-relevant. */ .macro BR_EX_DMA_r14 - larl %r1,0f - ex 0,0(%r1) - j . -0: br %r14 +#ifdef CC_USING_EXPOLINE + jg __dma__s390_indirect_jump_r14 +#else + br %r14 +#endif .endm /* From e21f8baf8d9a452a09a9cf7c8ab4720ff9a891ef Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 17 Jan 2019 09:29:15 +0100 Subject: [PATCH 89/98] s390/bug: add entry size to the __bug_table section Change the __EMIT_BUG inline assembly to emit mergeable __bug_table entries with type @progbits and specify the size of each entry. The entry size is encoded sh_entsize field of the section definition, it allows to identify which struct bug_entry to use to decode the entries. This will be needed for the objtool support. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/bug.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 429f43a8a8e8..713fc9735ffb 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -15,7 +15,7 @@ ".section .rodata.str,\"aMS\",@progbits,1\n" \ "2: .asciz \""__FILE__"\"\n" \ ".previous\n" \ - ".section __bug_table,\"aw\"\n" \ + ".section __bug_table,\"awM\",@progbits,%2\n" \ "3: .long 1b-3b,2b-3b\n" \ " .short %0,%1\n" \ " .org 3b+%2\n" \ @@ -27,17 +27,17 @@ #else /* CONFIG_DEBUG_BUGVERBOSE */ -#define __EMIT_BUG(x) do { \ - asm volatile( \ - "0: j 0b+2\n" \ - "1:\n" \ - ".section __bug_table,\"aw\"\n" \ - "2: .long 1b-2b\n" \ - " .short %0\n" \ - " .org 2b+%1\n" \ - ".previous\n" \ - : : "i" (x), \ - "i" (sizeof(struct bug_entry))); \ +#define __EMIT_BUG(x) do { \ + asm volatile( \ + "0: j 0b+2\n" \ + "1:\n" \ + ".section __bug_table,\"awM\",@progbits,%1\n" \ + "2: .long 1b-2b\n" \ + " .short %0\n" \ + " .org 2b+%1\n" \ + ".previous\n" \ + : : "i" (x), \ + "i" (sizeof(struct bug_entry))); \ } while (0) #endif /* CONFIG_DEBUG_BUGVERBOSE */ From 1c705ad5efae9c712e763a47fbcc95b87b7347d2 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 3 Jan 2019 16:49:35 +0100 Subject: [PATCH 90/98] s390/opcodes: add missing instructions to the disassembler Signed-off-by: Martin Schwidefsky --- arch/s390/tools/opcodes.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt index 1cbed82cd17b..64638b764d1c 100644 --- a/arch/s390/tools/opcodes.txt +++ b/arch/s390/tools/opcodes.txt @@ -1,3 +1,5 @@ +0000 illegal E +0002 brkpt E 0101 pr E 0102 upt E 0104 ptff E @@ -257,6 +259,7 @@ b258 bsg RRE_RR b25a bsa RRE_RR b25d clst RRE_RR b25e srst RRE_RR +b25f chsc RRE_R0 b263 cmpsc RRE_RR b274 siga S_RD b276 xsch S_00 @@ -277,6 +280,9 @@ b29d lfpc S_RD b2a5 tre RRE_RR b2a6 cu21 RRF_U0RR b2a7 cu12 RRF_U0RR +b2ad nqap RRE_RR +b2ae dqap RRE_RR +b2af pqap RRE_RR b2b0 stfle S_RD b2b1 stfl S_RD b2b2 lpswe S_RD @@ -290,6 +296,7 @@ b2e5 epctr RRE_RR b2e8 ppa RRF_U0RR b2ec etnd RRE_R0 b2ed ecpga RRE_RR +b2f0 iucv RRE_RR b2f8 tend S_00 b2fa niai IE_UU b2fc tabort S_RD @@ -559,12 +566,15 @@ b998 alcr RRE_RR b999 slbr RRE_RR b99a epair RRE_R0 b99b esair RRE_R0 +b99c eqbs RRF_U0RR b99d esea RRE_R0 b99e pti RRE_RR b99f ssair RRE_R0 +b9a0 clp RRF_U0RR b9a1 tpei RRE_RR b9a2 ptf RRE_R0 b9aa lptea RRF_RURR2 +b9ab essa RRF_U0RR b9ac irbm RRE_RR b9ae rrbm RRE_RR b9af pfmf RRE_RR @@ -1039,6 +1049,7 @@ eb7a agsi SIY_IRD eb7e algsi SIY_IRD eb80 icmh RSY_RURD eb81 icmy RSY_RURD +eb8a sqbs RSY_RDRU eb8e mvclu RSY_RRRD eb8f clclu RSY_RRRD eb90 stmy RSY_RRRD From 78c98f9074135d3dab4e39544e0a537f92388fce Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 28 Jan 2019 08:33:08 +0100 Subject: [PATCH 91/98] s390/unwind: introduce stack unwind API Rework the dump_trace() stack unwinder interface to support different unwinding algorithms. The new interface looks like this: struct unwind_state state; unwind_for_each_frame(&state, task, regs, start_stack) do_something(state.sp, state.ip, state.reliable); The unwind_bc.c file contains the implementation for the classic back-chain unwinder. One positive side effect of the new code is it now handles ftraced functions gracefully. It prints the real name of the return function instead of 'return_to_handler'. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/processor.h | 72 ------------- arch/s390/include/asm/stacktrace.h | 114 ++++++++++++++++++++ arch/s390/include/asm/unwind.h | 101 +++++++++++++++++ arch/s390/kernel/Makefile | 3 +- arch/s390/kernel/asm-offsets.c | 1 + arch/s390/kernel/dumpstack.c | 167 +++++++++++++++++------------ arch/s390/kernel/irq.c | 1 + arch/s390/kernel/machine_kexec.c | 1 + arch/s390/kernel/perf_event.c | 16 +-- arch/s390/kernel/process.c | 1 + arch/s390/kernel/setup.c | 1 + arch/s390/kernel/smp.c | 1 + arch/s390/kernel/stacktrace.c | 69 ++++++------ arch/s390/kernel/unwind_bc.c | 155 ++++++++++++++++++++++++++ arch/s390/mm/maccess.c | 1 + arch/s390/oprofile/init.c | 22 ++-- 16 files changed, 521 insertions(+), 205 deletions(-) create mode 100644 arch/s390/include/asm/stacktrace.h create mode 100644 arch/s390/include/asm/unwind.h create mode 100644 arch/s390/kernel/unwind_bc.c diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 8aa85e39f50b..9f2ff4a54aff 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -156,25 +156,6 @@ struct thread_struct { typedef struct thread_struct thread_struct; -/* - * Stack layout of a C stack frame. - */ -#ifndef __PACK_STACK -struct stack_frame { - unsigned long back_chain; - unsigned long empty1[5]; - unsigned long gprs[10]; - unsigned int empty2[8]; -}; -#else -struct stack_frame { - unsigned long empty1[5]; - unsigned int empty2[8]; - unsigned long gprs[10]; - unsigned long back_chain; -}; -#endif - #define ARCH_MIN_TASKALIGN 8 #define INIT_THREAD { \ @@ -206,11 +187,7 @@ struct mm_struct; struct seq_file; struct pt_regs; -typedef int (*dump_trace_func_t)(void *data, unsigned long address, int reliable); -void dump_trace(dump_trace_func_t func, void *data, - struct task_struct *task, unsigned long sp); void show_registers(struct pt_regs *regs); - void show_cacheinfo(struct seq_file *m); /* Free all resources held by a thread. */ @@ -244,55 +221,6 @@ static __no_kasan_or_inline unsigned short stap(void) return cpu_address; } -#define CALL_ARGS_0() \ - register unsigned long r2 asm("2") -#define CALL_ARGS_1(arg1) \ - register unsigned long r2 asm("2") = (unsigned long)(arg1) -#define CALL_ARGS_2(arg1, arg2) \ - CALL_ARGS_1(arg1); \ - register unsigned long r3 asm("3") = (unsigned long)(arg2) -#define CALL_ARGS_3(arg1, arg2, arg3) \ - CALL_ARGS_2(arg1, arg2); \ - register unsigned long r4 asm("4") = (unsigned long)(arg3) -#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ - CALL_ARGS_3(arg1, arg2, arg3); \ - register unsigned long r4 asm("5") = (unsigned long)(arg4) -#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ - CALL_ARGS_4(arg1, arg2, arg3, arg4); \ - register unsigned long r4 asm("6") = (unsigned long)(arg5) - -#define CALL_FMT_0 "=&d" (r2) : -#define CALL_FMT_1 "+&d" (r2) : -#define CALL_FMT_2 CALL_FMT_1 "d" (r3), -#define CALL_FMT_3 CALL_FMT_2 "d" (r4), -#define CALL_FMT_4 CALL_FMT_3 "d" (r5), -#define CALL_FMT_5 CALL_FMT_4 "d" (r6), - -#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" -#define CALL_CLOBBER_4 CALL_CLOBBER_5 -#define CALL_CLOBBER_3 CALL_CLOBBER_4, "5" -#define CALL_CLOBBER_2 CALL_CLOBBER_3, "4" -#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" -#define CALL_CLOBBER_0 CALL_CLOBBER_1 - -#define CALL_ON_STACK(fn, stack, nr, args...) \ -({ \ - CALL_ARGS_##nr(args); \ - unsigned long prev; \ - \ - asm volatile( \ - " la %[_prev],0(15)\n" \ - " la 15,0(%[_stack])\n" \ - " stg %[_prev],%[_bc](15)\n" \ - " brasl 14,%[_fn]\n" \ - " la 15,0(%[_prev])\n" \ - : [_prev] "=&a" (prev), CALL_FMT_##nr \ - [_stack] "a" (stack), \ - [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ - [_fn] "X" (fn) : CALL_CLOBBER_##nr); \ - r2; \ -}) - /* * Give up the time slice of the virtual PU. */ diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h new file mode 100644 index 000000000000..49634bfbecdd --- /dev/null +++ b/arch/s390/include/asm/stacktrace.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_STACKTRACE_H +#define _ASM_S390_STACKTRACE_H + +#include +#include +#include + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_NODAT, + STACK_TYPE_RESTART, +}; + +struct stack_info { + enum stack_type type; + unsigned long begin, end; +}; + +const char *stack_type_name(enum stack_type type); +int get_stack_info(unsigned long sp, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +static inline bool on_stack(struct stack_info *info, + unsigned long addr, size_t len) +{ + if (info->type == STACK_TYPE_UNKNOWN) + return false; + if (addr + len < addr) + return false; + return addr >= info->begin && addr + len < info->end; +} + +static inline unsigned long get_stack_pointer(struct task_struct *task, + struct pt_regs *regs) +{ + if (regs) + return (unsigned long) kernel_stack_pointer(regs); + if (task == current) + return current_stack_pointer(); + return (unsigned long) task->thread.ksp; +} + +/* + * Stack layout of a C stack frame. + */ +#ifndef __PACK_STACK +struct stack_frame { + unsigned long back_chain; + unsigned long empty1[5]; + unsigned long gprs[10]; + unsigned int empty2[8]; +}; +#else +struct stack_frame { + unsigned long empty1[5]; + unsigned int empty2[8]; + unsigned long gprs[10]; + unsigned long back_chain; +}; +#endif + +#define CALL_ARGS_0() \ + register unsigned long r2 asm("2") +#define CALL_ARGS_1(arg1) \ + register unsigned long r2 asm("2") = (unsigned long)(arg1) +#define CALL_ARGS_2(arg1, arg2) \ + CALL_ARGS_1(arg1); \ + register unsigned long r3 asm("3") = (unsigned long)(arg2) +#define CALL_ARGS_3(arg1, arg2, arg3) \ + CALL_ARGS_2(arg1, arg2); \ + register unsigned long r4 asm("4") = (unsigned long)(arg3) +#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ + CALL_ARGS_3(arg1, arg2, arg3); \ + register unsigned long r4 asm("5") = (unsigned long)(arg4) +#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ + CALL_ARGS_4(arg1, arg2, arg3, arg4); \ + register unsigned long r4 asm("6") = (unsigned long)(arg5) + +#define CALL_FMT_0 "=&d" (r2) : +#define CALL_FMT_1 "+&d" (r2) : +#define CALL_FMT_2 CALL_FMT_1 "d" (r3), +#define CALL_FMT_3 CALL_FMT_2 "d" (r4), +#define CALL_FMT_4 CALL_FMT_3 "d" (r5), +#define CALL_FMT_5 CALL_FMT_4 "d" (r6), + +#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" +#define CALL_CLOBBER_4 CALL_CLOBBER_5 +#define CALL_CLOBBER_3 CALL_CLOBBER_4, "5" +#define CALL_CLOBBER_2 CALL_CLOBBER_3, "4" +#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" +#define CALL_CLOBBER_0 CALL_CLOBBER_1 + +#define CALL_ON_STACK(fn, stack, nr, args...) \ +({ \ + CALL_ARGS_##nr(args); \ + unsigned long prev; \ + \ + asm volatile( \ + " la %[_prev],0(15)\n" \ + " la 15,0(%[_stack])\n" \ + " stg %[_prev],%[_bc](15)\n" \ + " brasl 14,%[_fn]\n" \ + " la 15,0(%[_prev])\n" \ + : [_prev] "=&a" (prev), CALL_FMT_##nr \ + [_stack] "a" (stack), \ + [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ + [_fn] "X" (fn) : CALL_CLOBBER_##nr); \ + r2; \ +}) + +#endif /* _ASM_S390_STACKTRACE_H */ diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h new file mode 100644 index 000000000000..6eb2ef105d87 --- /dev/null +++ b/arch/s390/include/asm/unwind.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_UNWIND_H +#define _ASM_S390_UNWIND_H + +#include +#include +#include +#include + +/* + * To use the stack unwinder it has to be initialized with unwind_start. + * There four combinations for task and regs: + * 1) task==NULL, regs==NULL: the unwind starts for the task that is currently + * running, sp/ip picked up from the CPU registers + * 2) task==NULL, regs!=NULL: the unwind starts from the sp/ip found in + * the struct pt_regs of an interrupt frame for the current task + * 3) task!=NULL, regs==NULL: the unwind starts for an inactive task with + * the sp picked up from task->thread.ksp and the ip picked up from the + * return address stored by __switch_to + * 4) task!=NULL, regs!=NULL: the sp/ip are picked up from the interrupt + * frame 'regs' of a inactive task + * If 'first_frame' is not zero unwind_start skips unwind frames until it + * reaches the specified stack pointer. + * The end of the unwinding is indicated with unwind_done, this can be true + * right after unwind_start, e.g. with first_frame!=0 that can not be found. + * unwind_next_frame skips to the next frame. + * Once the unwind is completed unwind_error() can be used to check if there + * has been a situation where the unwinder could not correctly understand + * the tasks call chain. + */ + +struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; + struct task_struct *task; + struct pt_regs *regs; + unsigned long sp, ip; + int graph_idx; + bool reliable; + bool error; +}; + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long first_frame); +bool unwind_next_frame(struct unwind_state *state); +unsigned long unwind_get_return_address(struct unwind_state *state); + +static inline bool unwind_done(struct unwind_state *state) +{ + return state->stack_info.type == STACK_TYPE_UNKNOWN; +} + +static inline bool unwind_error(struct unwind_state *state) +{ + return state->error; +} + +static inline void unwind_start(struct unwind_state *state, + struct task_struct *task, + struct pt_regs *regs, + unsigned long sp) +{ + sp = sp ? : get_stack_pointer(task, regs); + __unwind_start(state, task, regs, sp); +} + +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + return unwind_done(state) ? NULL : state->regs; +} + +#define unwind_for_each_frame(state, task, regs, first_frame) \ + for (unwind_start(state, task, regs, first_frame); \ + !unwind_done(state); \ + unwind_next_frame(state)) + +static inline void unwind_init(void) {} +static inline void unwind_module_init(struct module *mod, void *orc_ip, + size_t orc_ip_size, void *orc, + size_t orc_size) {} + +#ifdef CONFIG_KASAN +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) +#else +#define READ_ONCE_TASK_STACK(task, x) READ_ONCE(x) +#endif + +#endif /* _ASM_S390_UNWIND_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 19425605a83d..b0478d01a0c5 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -39,6 +39,7 @@ CFLAGS_smp.o := -Wno-nonnull # CFLAGS_stacktrace.o += -fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls +CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls # # Pass UTS_MACHINE for user_regset definition @@ -51,7 +52,7 @@ obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o early_nobss.o obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o -obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o +obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o extra-y += head64.o vmlinux.lds diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 164bec175628..41ac4ad21311 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -16,6 +16,7 @@ #include #include #include +#include int main(void) { diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index cb7f55bbe06e..9e87b68be21c 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -21,95 +21,124 @@ #include #include #include +#include -/* - * For dump_trace we have tree different stack to consider: - * - the panic stack which is used if the kernel stack has overflown - * - the asynchronous interrupt stack (cpu related) - * - the synchronous kernel stack (process related) - * The stack trace can start at any of the three stacks and can potentially - * touch all of them. The order is: panic stack, async stack, sync stack. - */ -static unsigned long __no_sanitize_address -__dump_trace(dump_trace_func_t func, void *data, unsigned long sp, - unsigned long low, unsigned long high) +const char *stack_type_name(enum stack_type type) { - struct stack_frame *sf; - struct pt_regs *regs; - - while (1) { - if (sp < low || sp > high - sizeof(*sf)) - return sp; - sf = (struct stack_frame *) sp; - if (func(data, sf->gprs[8], 0)) - return sp; - /* Follow the backchain. */ - while (1) { - low = sp; - sp = sf->back_chain; - if (!sp) - break; - if (sp <= low || sp > high - sizeof(*sf)) - return sp; - sf = (struct stack_frame *) sp; - if (func(data, sf->gprs[8], 1)) - return sp; - } - /* Zero backchain detected, check for interrupt frame. */ - sp = (unsigned long) (sf + 1); - if (sp <= low || sp > high - sizeof(*regs)) - return sp; - regs = (struct pt_regs *) sp; - if (!user_mode(regs)) { - if (func(data, regs->psw.addr, 1)) - return sp; - } - low = sp; - sp = regs->gprs[15]; + switch (type) { + case STACK_TYPE_TASK: + return "task"; + case STACK_TYPE_IRQ: + return "irq"; + case STACK_TYPE_NODAT: + return "nodat"; + case STACK_TYPE_RESTART: + return "restart"; + default: + return "unknown"; } } -void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, - unsigned long sp) +static inline bool in_stack(unsigned long sp, struct stack_info *info, + enum stack_type type, unsigned long low, + unsigned long high) { - unsigned long frame_size; + if (sp < low || sp >= high) + return false; + info->type = type; + info->begin = low; + info->end = high; + return true; +} + +static bool in_task_stack(unsigned long sp, struct task_struct *task, + struct stack_info *info) +{ + unsigned long stack; + + stack = (unsigned long) task_stack_page(task); + return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE); +} + +static bool in_irq_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long frame_size, top; frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); -#ifdef CONFIG_CHECK_STACK - sp = __dump_trace(func, data, sp, - S390_lowcore.nodat_stack + frame_size - THREAD_SIZE, - S390_lowcore.nodat_stack + frame_size); -#endif - sp = __dump_trace(func, data, sp, - S390_lowcore.async_stack + frame_size - THREAD_SIZE, - S390_lowcore.async_stack + frame_size); - task = task ?: current; - __dump_trace(func, data, sp, - (unsigned long)task_stack_page(task), - (unsigned long)task_stack_page(task) + THREAD_SIZE); + top = S390_lowcore.async_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top); } -EXPORT_SYMBOL_GPL(dump_trace); -static int show_address(void *data, unsigned long address, int reliable) +static bool in_nodat_stack(unsigned long sp, struct stack_info *info) { - if (reliable) - printk(" [<%016lx>] %pSR \n", address, (void *)address); - else - printk("([<%016lx>] %pSR)\n", address, (void *)address); + unsigned long frame_size, top; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + top = S390_lowcore.nodat_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); +} + +static bool in_restart_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long frame_size, top; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + top = S390_lowcore.restart_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top); +} + +int get_stack_info(unsigned long sp, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!sp) + goto unknown; + + task = task ? : current; + + /* Check per-task stack */ + if (in_task_stack(sp, task, info)) + goto recursion_check; + + if (task != current) + goto unknown; + + /* Check per-cpu stacks */ + if (!in_irq_stack(sp, info) && + !in_nodat_stack(sp, info) && + !in_restart_stack(sp, info)) + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING + "WARNING: stack recursion on stack type %d\n", + info->type); + goto unknown; + } + *visit_mask |= 1UL << info->type; return 0; +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } void show_stack(struct task_struct *task, unsigned long *stack) { - unsigned long sp = (unsigned long) stack; + struct unwind_state state; - if (!sp) - sp = task ? task->thread.ksp : current_stack_pointer(); printk("Call Trace:\n"); - dump_trace(show_address, NULL, task, sp); if (!task) task = current; - debug_show_held_locks(task); + unwind_for_each_frame(&state, task, NULL, (unsigned long) stack) + printk(state.reliable ? " [<%016lx>] %pSR \n" : + "([<%016lx>] %pSR)\n", + state.ip, (void *) state.ip); + debug_show_held_locks(task ? : current); } static void show_last_breaking_event(struct pt_regs *regs) diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 150964f91183..8371855042dc 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index e2ba7b7f574e..2f3a742a71a5 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 0d770e513abf..fcb6c2e92b07 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -21,6 +21,7 @@ #include #include #include +#include const char *perf_pmu_name(void) { @@ -219,20 +220,13 @@ static int __init service_level_perf_register(void) } arch_initcall(service_level_perf_register); -static int __perf_callchain_kernel(void *data, unsigned long address, int reliable) -{ - struct perf_callchain_entry_ctx *entry = data; - - perf_callchain_store(entry, address); - return 0; -} - void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (user_mode(regs)) - return; - dump_trace(__perf_callchain_kernel, entry, NULL, regs->gprs[15]); + struct unwind_state state; + + unwind_for_each_frame(&state, current, regs, 0) + perf_callchain_store(entry, state.ip); } /* Perf definitions for PMU event attributes in sysfs */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6e758bb6cd29..63873aa6693f 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include "entry.h" diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 64e4bc9dd130..f8544d517430 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 88634fb0cc50..35fafa2b91a8 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include "entry.h" diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 460dcfba7d4e..89f9f63dca18 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -11,40 +11,21 @@ #include #include #include - -static int __save_address(void *data, unsigned long address, int nosched) -{ - struct stack_trace *trace = data; - - if (nosched && in_sched_functions(address)) - return 0; - if (trace->skip > 0) { - trace->skip--; - return 0; - } - if (trace->nr_entries < trace->max_entries) { - trace->entries[trace->nr_entries++] = address; - return 0; - } - return 1; -} - -static int save_address(void *data, unsigned long address, int reliable) -{ - return __save_address(data, address, 0); -} - -static int save_address_nosched(void *data, unsigned long address, int reliable) -{ - return __save_address(data, address, 1); -} +#include +#include void save_stack_trace(struct stack_trace *trace) { - unsigned long sp; + struct unwind_state state; - sp = current_stack_pointer(); - dump_trace(save_address, trace, NULL, sp); + unwind_for_each_frame(&state, current, NULL, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } @@ -52,12 +33,18 @@ EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - unsigned long sp; + struct unwind_state state; - sp = tsk->thread.ksp; - if (tsk == current) - sp = current_stack_pointer(); - dump_trace(save_address_nosched, trace, tsk, sp); + unwind_for_each_frame(&state, tsk, NULL, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (in_sched_functions(state.ip)) + continue; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } @@ -65,10 +52,16 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - unsigned long sp; + struct unwind_state state; - sp = kernel_stack_pointer(regs); - dump_trace(save_address, trace, NULL, sp); + unwind_for_each_frame(&state, current, regs, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c new file mode 100644 index 000000000000..cf5a630f3aa9 --- /dev/null +++ b/arch/s390/kernel/unwind_bc.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool outside_of_stack(struct unwind_state *state, unsigned long sp) +{ + return (sp <= state->sp) || + (sp + sizeof(struct stack_frame) > state->stack_info.end); +} + +static bool update_stack_info(struct unwind_state *state, unsigned long sp) +{ + struct stack_info *info = &state->stack_info; + unsigned long *mask = &state->stack_mask; + + /* New stack pointer leaves the current stack */ + if (get_stack_info(sp, state->task, info, mask) != 0 || + !on_stack(info, sp, sizeof(struct stack_frame))) + /* 'sp' does not point to a valid stack */ + return false; + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + struct stack_frame *sf; + struct pt_regs *regs; + unsigned long sp, ip; + bool reliable; + + regs = state->regs; + if (unlikely(regs)) { + sp = READ_ONCE_TASK_STACK(state->task, regs->gprs[15]); + if (unlikely(outside_of_stack(state, sp))) { + if (!update_stack_info(state, sp)) + goto out_err; + } + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = false; + regs = NULL; + } else { + sf = (struct stack_frame *) state->sp; + sp = READ_ONCE_TASK_STACK(state->task, sf->back_chain); + if (likely(sp)) { + /* Non-zero back-chain points to the previous frame */ + if (unlikely(outside_of_stack(state, sp))) { + if (!update_stack_info(state, sp)) + goto out_err; + } + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = true; + } else { + /* No back-chain, look for a pt_regs structure */ + sp = state->sp + STACK_FRAME_OVERHEAD; + if (!on_stack(info, sp, sizeof(struct pt_regs))) + goto out_stop; + regs = (struct pt_regs *) sp; + if (user_mode(regs)) + goto out_stop; + ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + reliable = true; + } + } + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Decode any ftrace redirection */ + if (ip == (unsigned long) return_to_handler) + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + ip, NULL); +#endif + + /* Update unwind state */ + state->sp = sp; + state->ip = ip; + state->regs = regs; + state->reliable = reliable; + return true; + +out_err: + state->error = true; +out_stop: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long sp) +{ + struct stack_info *info = &state->stack_info; + unsigned long *mask = &state->stack_mask; + struct stack_frame *sf; + unsigned long ip; + bool reliable; + + memset(state, 0, sizeof(*state)); + state->task = task; + state->regs = regs; + + /* Don't even attempt to start from user mode regs: */ + if (regs && user_mode(regs)) { + info->type = STACK_TYPE_UNKNOWN; + return; + } + + /* Get current stack pointer and initialize stack info */ + if (get_stack_info(sp, task, info, mask) != 0 || + !on_stack(info, sp, sizeof(struct stack_frame))) { + /* Something is wrong with the stack pointer */ + info->type = STACK_TYPE_UNKNOWN; + state->error = true; + return; + } + + /* Get the instruction pointer from pt_regs or the stack frame */ + if (regs) { + ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + reliable = true; + } else { + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = false; + } + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Decode any ftrace redirection */ + if (ip == (unsigned long) return_to_handler) + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + ip, NULL); +#endif + + /* Update unwind state */ + state->sp = sp; + state->ip = ip; + state->reliable = reliable; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 97b3ee53852b..818deeb1ebc3 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -16,6 +16,7 @@ #include #include #include +#include static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c index 43d9525c36fc..7441857df51b 100644 --- a/arch/s390/oprofile/init.c +++ b/arch/s390/oprofile/init.c @@ -13,23 +13,17 @@ #include #include #include - -static int __s390_backtrace(void *data, unsigned long address, int reliable) -{ - unsigned int *depth = data; - - if (*depth == 0) - return 1; - (*depth)--; - oprofile_add_trace(address); - return 0; -} +#include static void s390_backtrace(struct pt_regs *regs, unsigned int depth) { - if (user_mode(regs)) - return; - dump_trace(__s390_backtrace, &depth, NULL, regs->gprs[15]); + struct unwind_state state; + + unwind_for_each_frame(&state, current, regs, 0) { + if (depth-- == 0) + break; + oprofile_add_trace(state.ip); + } } int __init oprofile_arch_init(struct oprofile_operations *ops) From ec7bf4789d95a0053bac0dfa36fbefd8cc584eea Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 18 Feb 2019 16:51:28 +0100 Subject: [PATCH 92/98] s390/ftrace: use HAVE_FUNCTION_GRAPH_RET_ADDR_PTR Make the call chain more reliable by tagging the ftrace stack entries with the stack pointer that is associated with the return address. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ftrace.h | 2 ++ arch/s390/kernel/entry.h | 2 +- arch/s390/kernel/ftrace.c | 9 +++++---- arch/s390/kernel/mcount.S | 4 ++-- arch/s390/kernel/unwind_bc.c | 2 +- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 6e0ed0338785..68d362f8d6c1 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -11,6 +11,8 @@ #define MCOUNT_RETURN_FIXUP 18 #endif +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifndef __ASSEMBLY__ #ifdef CONFIG_CC_IS_CLANG diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index c3816ae108b0..20420c2b8a14 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -65,7 +65,7 @@ int setup_profiling_timer(unsigned int multiplier); void __init time_init(void); int pfn_is_nosave(unsigned long); void s390_early_resume(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; struct fadvise64_64_args; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 39b13d71a8fe..1bb85f60c0dd 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -201,17 +201,18 @@ device_initcall(ftrace_plt_init); * Hook the return address and push it in the stack of return addresses * in current thread info. */ -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) +unsigned long prepare_ftrace_return(unsigned long ra, unsigned long sp, + unsigned long ip) { if (unlikely(ftrace_graph_is_dead())) goto out; if (unlikely(atomic_read(¤t->tracing_graph_pause))) goto out; ip -= MCOUNT_INSN_SIZE; - if (!function_graph_enter(parent, ip, 0, NULL)) - parent = (unsigned long) return_to_handler; + if (!function_graph_enter(ra, ip, 0, (void *) sp)) + ra = (unsigned long) return_to_handler; out: - return parent; + return ra; } NOKPROBE_SYMBOL(prepare_ftrace_return); diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 09ae6da0aaa5..9e1660a6b9db 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -65,8 +65,8 @@ ENTRY(ftrace_caller) .globl ftrace_graph_caller ftrace_graph_caller: j ftrace_graph_caller_end - lg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) - lg %r3,(STACK_PTREGS_PSW+8)(%r15) + lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) + lg %r4,(STACK_PTREGS_PSW+8)(%r15) brasl %r14,prepare_ftrace_return stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) ftrace_graph_caller_end: diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c index cf5a630f3aa9..57fd4e902f1f 100644 --- a/arch/s390/kernel/unwind_bc.c +++ b/arch/s390/kernel/unwind_bc.c @@ -84,7 +84,7 @@ bool unwind_next_frame(struct unwind_state *state) /* Decode any ftrace redirection */ if (ip == (unsigned long) return_to_handler) ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, - ip, NULL); + ip, (void *) sp); #endif /* Update unwind state */ From 98587c2d894c34c9af5cd84ca169e1cd493aa692 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 30 Apr 2019 12:33:45 +0200 Subject: [PATCH 93/98] s390: simplify disabled_wait The disabled_wait() function uses its argument as the PSW address when it stops the CPU with a wait PSW that is disabled for interrupts. The different callers sometimes use a specific number like 0xdeadbeef to indicate a specific failure, the early boot code uses 0 and some other calls sites use __builtin_return_address(0). At the time a dump is created the current PSW and the registers of a CPU are written to lowcore to make them avaiable to the dump analysis tool. For a CPU stopped with disabled_wait the PSW and the registers do not really make sense together, the PSW address does not point to the function the registers belong to. Simplify disabled_wait() by using _THIS_IP_ for the PSW address and drop the argument to the function. Signed-off-by: Martin Schwidefsky --- arch/s390/boot/als.c | 2 +- arch/s390/boot/startup.c | 2 +- arch/s390/include/asm/processor.h | 4 ++-- arch/s390/kernel/early.c | 4 ++-- arch/s390/kernel/early_nobss.c | 2 +- arch/s390/kernel/ipl.c | 4 ++-- arch/s390/kernel/machine_kexec.c | 4 ++-- arch/s390/kernel/nmi.c | 2 +- kernel/panic.c | 7 +------ 9 files changed, 13 insertions(+), 18 deletions(-) diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c index f902215e9cd9..ff6801d401c4 100644 --- a/arch/s390/boot/als.c +++ b/arch/s390/boot/als.c @@ -99,7 +99,7 @@ static void facility_mismatch(void) print_machine_type(); print_missing_facilities(); sclp_early_printk("See Principles of Operations for facility bits\n"); - disabled_wait(0x8badcccc); + disabled_wait(); } void verify_facilities(void) diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 4401e992bda1..7b0d05414618 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -57,7 +57,7 @@ void error(char *x) sclp_early_printk(x); sclp_early_printk("\n\n -- System halted"); - disabled_wait(0xdeadbeef); + disabled_wait(); } #ifdef CONFIG_KERNEL_UNCOMPRESSED diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 9f2ff4a54aff..b0fcbc37b637 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -315,12 +315,12 @@ void enabled_wait(void); /* * Function to drop a processor into disabled wait state */ -static inline void __noreturn disabled_wait(unsigned long code) +static inline void __noreturn disabled_wait(void) { psw_t psw; psw.mask = PSW_MASK_BASE | PSW_MASK_WAIT | PSW_MASK_BA | PSW_MASK_EA; - psw.addr = code; + psw.addr = _THIS_IP_; __load_psw(psw); while (1); } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 33f704c16bd3..629f173f60cd 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -141,7 +141,7 @@ static void early_pgm_check_handler(void) addr = S390_lowcore.program_old_psw.addr; fixup = s390_search_extables(addr); if (!fixup) - disabled_wait(0); + disabled_wait(); /* Disable low address protection before storing into lowcore. */ __ctl_store(cr0, 0, 0); cr0_new = cr0 & ~(1UL << 28); @@ -298,7 +298,7 @@ static void __init check_image_bootable(void) sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); sclp_early_printk("This image does not contain all parts necessary for starting up. Use\n"); sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n"); - disabled_wait(0xbadb007); + disabled_wait(); } void __init startup_init(void) diff --git a/arch/s390/kernel/early_nobss.c b/arch/s390/kernel/early_nobss.c index 8d73f7fae16e..52a3ef959341 100644 --- a/arch/s390/kernel/early_nobss.c +++ b/arch/s390/kernel/early_nobss.c @@ -25,7 +25,7 @@ static void __init reset_tod_clock(void) return; /* TOD clock not running. Set the clock to Unix Epoch. */ if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) - disabled_wait(0); + disabled_wait(); memset(tod_clock_base, 0, 16); *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index aa8fe768640e..d836af3ccc38 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -920,7 +920,7 @@ static void __reipl_run(void *unused) case IPL_TYPE_FCP_DUMP: break; } - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } static void reipl_run(struct shutdown_trigger *trigger) @@ -1375,7 +1375,7 @@ static void stop_run(struct shutdown_trigger *trigger) { if (strcmp(trigger->name, ON_PANIC_STR) == 0 || strcmp(trigger->name, ON_RESTART_STR) == 0) - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); smp_stop_cpu(); } diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index 2f3a742a71a5..8a1ae140c5e2 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -96,7 +96,7 @@ static void __do_machine_kdump(void *image) start_kdump(1); /* Die if start_kdump returns */ - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } /* @@ -284,7 +284,7 @@ static void __do_machine_kexec(void *data) (*data_mover)(&image->head, image->start); /* Die if kexec returns */ - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } /* diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 8c867b43c8eb..0a487fae763e 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -125,7 +125,7 @@ void nmi_free_per_cpu(struct lowcore *lc) static notrace void s390_handle_damage(void) { smp_emergency_stop(); - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); while (1); } NOKPROBE_SYMBOL(s390_handle_damage); diff --git a/kernel/panic.c b/kernel/panic.c index 0ae0d7332f12..c1fcaad337b7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -318,12 +318,7 @@ void panic(const char *fmt, ...) } #endif #if defined(CONFIG_S390) - { - unsigned long caller; - - caller = (unsigned long)__builtin_return_address(0); - disabled_wait(caller); - } + disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); local_irq_enable(); From c263a4e990b7296b074e33aa077239a0a28a818e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Apr 2019 10:35:51 +0200 Subject: [PATCH 94/98] s390: only build for new CPUs with clang llvm does does not understand -march=z9-109 and older target specifiers, so disable the respective Kconfig settings and the logic to make the boot code work on old systems when building with clang. Part of the early boot code is normally compiled with -march=z900 for maximum compatibility. This also has to get changed with clang to the oldest supported ISA, which is -march=z10 here. Signed-off-by: Arnd Bergmann Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 6 ++++++ arch/s390/boot/Makefile | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8c15392ee985..380d13df4411 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -241,6 +241,7 @@ choice config MARCH_Z900 bool "IBM zSeries model z800 and z900" + depends on !CC_IS_CLANG select HAVE_MARCH_Z900_FEATURES help Select this to enable optimizations for model z800/z900 (2064 and @@ -249,6 +250,7 @@ config MARCH_Z900 config MARCH_Z990 bool "IBM zSeries model z890 and z990" + depends on !CC_IS_CLANG select HAVE_MARCH_Z990_FEATURES help Select this to enable optimizations for model z890/z990 (2084 and @@ -257,6 +259,7 @@ config MARCH_Z990 config MARCH_Z9_109 bool "IBM System z9" + depends on !CC_IS_CLANG select HAVE_MARCH_Z9_109_FEATURES help Select this to enable optimizations for IBM System z9 (2094 and @@ -348,12 +351,15 @@ config TUNE_DEFAULT config TUNE_Z900 bool "IBM zSeries model z800 and z900" + depends on !CC_IS_CLANG config TUNE_Z990 bool "IBM zSeries model z890 and z990" + depends on !CC_IS_CLANG config TUNE_Z9_109 bool "IBM System z9" + depends on !CC_IS_CLANG config TUNE_Z10 bool "IBM System z10" diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index cb40317dc3f7..c51496bbac19 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -12,18 +12,24 @@ KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR) KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR) # -# Use -march=z900 for als.c to be able to print an error +# Use minimum architecture for als.c to be able to print an error # message if the kernel is started on a machine which is too old # -ifneq ($(CC_FLAGS_MARCH),-march=z900) +ifndef CONFIG_CC_IS_CLANG +CC_FLAGS_MARCH_MINIMUM := -march=z900 +else +CC_FLAGS_MARCH_MINIMUM := -march=z10 +endif + +ifneq ($(CC_FLAGS_MARCH),$(CC_FLAGS_MARCH_MINIMUM)) AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) -AFLAGS_head.o += -march=z900 +AFLAGS_head.o += $(CC_FLAGS_MARCH_MINIMUM) AFLAGS_REMOVE_mem.o += $(CC_FLAGS_MARCH) -AFLAGS_mem.o += -march=z900 +AFLAGS_mem.o += $(CC_FLAGS_MARCH_MINIMUM) CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) -CFLAGS_als.o += -march=z900 +CFLAGS_als.o += $(CC_FLAGS_MARCH_MINIMUM) CFLAGS_REMOVE_sclp_early_core.o += $(CC_FLAGS_MARCH) -CFLAGS_sclp_early_core.o += -march=z900 +CFLAGS_sclp_early_core.o += $(CC_FLAGS_MARCH_MINIMUM) endif CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char From 96fb54a180894dc3c6821aa3297a3eb8b27f03c5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Apr 2019 10:35:52 +0200 Subject: [PATCH 95/98] s390: boot, purgatory: pass $(CLANG_FLAGS) where needed The purgatory and boot Makefiles do not inherit the original cflags, so clang falls back to the default target architecture when building it, typically this would be x86 when cross-compiling. Add $(CLANG_FLAGS) everywhere so we pass the correct --target=s390x-linux option when cross-compiling. Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Makefile | 4 ++-- arch/s390/purgatory/Makefile | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 54b8a12d64e8..df1d6a150f30 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -21,9 +21,9 @@ KBUILD_CFLAGS += -fPIE LDFLAGS_vmlinux := -pie endif aflags_dwarf := -Wa,-gdwarf-2 -KBUILD_AFLAGS_DECOMPRESSOR := -m64 -D__ASSEMBLY__ +KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) -KBUILD_CFLAGS_DECOMPRESSOR := -m64 -O2 +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index b20934d91986..dc1ae4ff79d7 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -20,6 +20,7 @@ KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) From 964d06b4ed212d85ff589a9438baeb6c4fff3272 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Apr 2019 10:35:53 +0200 Subject: [PATCH 96/98] s390: drop CONFIG_VIRT_TO_BUS VIRT_TO_BUS is only used for legacy device PCI and ISA drivers using virt_to_bus() instead of the streaming DMA mapping API, and the remaining drivers generally don't work on 64-bit architectures. Two of these drivers also cause a build warning on s390, so instead of trying to fix that, let's just disable the option as we do on most architectures now. Signed-off-by: Arnd Bergmann Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 380d13df4411..0ad0d204404a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -189,7 +189,6 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select ARCH_HAS_SCALED_CPUTIME - select VIRT_TO_BUS select HAVE_NMI From 4ae987894c06056e29264e5a1051e459a4454a61 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Apr 2019 10:35:54 +0200 Subject: [PATCH 97/98] s390: fix clang -Wpointer-sign warnigns in boot code The arch/s390/boot directory is built with its own set of compiler options that does not include -Wno-pointer-sign like the rest of the kernel does, this causes a lot of harmless but correct warnings when building with clang. For the atomics, we can add type casts to avoid the warnings, for everything else the easiest way is to slightly adapt the types to be more consistent. Signed-off-by: Arnd Bergmann Reviewed-by: Nick Desaulniers Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/boot/ipl_parm.c | 2 +- arch/s390/include/asm/bitops.h | 12 ++++++------ arch/s390/include/asm/ebcdic.h | 2 +- arch/s390/include/asm/lowcore.h | 2 +- drivers/s390/char/sclp.h | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index b49bd97d33af..3c49bde8aa5e 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -56,7 +56,7 @@ void store_ipl_parmblock(void) ipl_block_valid = 1; } -static size_t scpdata_length(const char *buf, size_t count) +static size_t scpdata_length(const u8 *buf, size_t count) { while (count) { if (buf[count - 1] != '\0' && buf[count - 1] != ' ') diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index d1f8a4d94cca..9900d655014c 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -73,7 +73,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __atomic64_or(mask, addr); + __atomic64_or(mask, (long *)addr); } static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) @@ -94,7 +94,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - __atomic64_and(mask, addr); + __atomic64_and(mask, (long *)addr); } static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) @@ -115,7 +115,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __atomic64_xor(mask, addr); + __atomic64_xor(mask, (long *)addr); } static inline int @@ -125,7 +125,7 @@ test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __atomic64_or_barrier(mask, addr); + old = __atomic64_or_barrier(mask, (long *)addr); return (old & mask) != 0; } @@ -136,7 +136,7 @@ test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - old = __atomic64_and_barrier(mask, addr); + old = __atomic64_and_barrier(mask, (long *)addr); return (old & ~mask) != 0; } @@ -147,7 +147,7 @@ test_and_change_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __atomic64_xor_barrier(mask, addr); + old = __atomic64_xor_barrier(mask, (long *)addr); return (old & mask) != 0; } diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h index 29441beb92e6..efb50fc6866c 100644 --- a/arch/s390/include/asm/ebcdic.h +++ b/arch/s390/include/asm/ebcdic.h @@ -20,7 +20,7 @@ extern __u8 _ebc_tolower[256]; /* EBCDIC -> lowercase */ extern __u8 _ebc_toupper[256]; /* EBCDIC -> uppercase */ static inline void -codepage_convert(const __u8 *codepage, volatile __u8 * addr, unsigned long nr) +codepage_convert(const __u8 *codepage, volatile char *addr, unsigned long nr) { if (nr-- <= 0) return; diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 5b9f10b1e55d..237ee0c4169f 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -129,7 +129,7 @@ struct lowcore { /* SMP info area */ __u32 cpu_nr; /* 0x03a0 */ __u32 softirq_pending; /* 0x03a4 */ - __u32 preempt_count; /* 0x03a8 */ + __s32 preempt_count; /* 0x03a8 */ __u32 spinlock_lockval; /* 0x03ac */ __u32 spinlock_index; /* 0x03b0 */ __u32 fpu_flags; /* 0x03b4 */ diff --git a/drivers/s390/char/sclp.h b/drivers/s390/char/sclp.h index 28b433960831..196333013e54 100644 --- a/drivers/s390/char/sclp.h +++ b/drivers/s390/char/sclp.h @@ -367,14 +367,14 @@ sclp_ascebc(unsigned char ch) /* translate string from EBCDIC to ASCII */ static inline void -sclp_ebcasc_str(unsigned char *str, int nr) +sclp_ebcasc_str(char *str, int nr) { (MACHINE_IS_VM) ? EBCASC(str, nr) : EBCASC_500(str, nr); } /* translate string from ASCII to EBCDIC */ static inline void -sclp_ascebc_str(unsigned char *str, int nr) +sclp_ascebc_str(char *str, int nr) { (MACHINE_IS_VM) ? ASCEBC(str, nr) : ASCEBC_500(str, nr); } From ce968f6012f632bbe071839d229db77c45fc38d1 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 23 Apr 2019 14:00:56 -0700 Subject: [PATCH 98/98] s390/vdso: drop unnecessary cc-ldoption Towards the goal of removing cc-ldoption, it seems that --hash-style= was added to binutils 2.17.50.0.2 in 2006. The minimal required version of binutils for the kernel according to Documentation/process/changes.rst is 2.20. Link: https://gcc.gnu.org/ml/gcc/2007-01/msg01141.html Cc: clang-built-linux@googlegroups.com Suggested-by: Masahiro Yamada Signed-off-by: Nick Desaulniers Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/vdso32/Makefile | 2 +- arch/s390/kernel/vdso64/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index e76309fbbcb3..aee9ffbccb54 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -19,7 +19,7 @@ KBUILD_AFLAGS_31 += -m31 -s KBUILD_CFLAGS_31 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_31 += -m31 -fPIC -shared -fno-common -fno-builtin KBUILD_CFLAGS_31 += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) + -Wl,--hash-style=both $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_31) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_31) diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index f849ac61c5da..bec19e7e6e1c 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -19,7 +19,7 @@ KBUILD_AFLAGS_64 += -m64 -s KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) + -Wl,--hash-style=both $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64)