From 2e95c4d672ce6c931951c87b5dbfa4106fd3cae6 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Wed, 17 Oct 2018 21:15:51 +0800 Subject: [PATCH 01/16] nds32: Remove the redundant assignment For early version, the value of r2 register was used to display a character on UART when error occurred. Remove these r2 assignments because we no longer show the character. Signed-off-by: Zong Li Signed-off-by: Greentime Hu --- arch/nds32/kernel/head.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/nds32/kernel/head.S b/arch/nds32/kernel/head.S index c5fdae174ced..1ce653d35d93 100644 --- a/arch/nds32/kernel/head.S +++ b/arch/nds32/kernel/head.S @@ -130,14 +130,12 @@ _image_size_check: * * direct mapping is not supported now. */ - li $r2, 't' beqz $r6, __error ! MMU_CFG.TBW = 0 is direct mappin addi $r0, $r0, #0x2 ! MMU_CFG.TBS value -> meaning sll $r0, $r6, $r0 ! entries = k-way * n-set mul $r6, $r0, $r5 ! max size = entries * page size /* check kernel image size */ la $r3, (_end - PAGE_OFFSET) - li $r2, 's' bgt $r3, $r6, __error li $r2, #(PHYS_OFFSET + TLB_DATA_kernel_text_attr) From 8730c178b4208ec52f4c2b381d9a2b685a791b1c Mon Sep 17 00:00:00 2001 From: Zong Li Date: Wed, 17 Oct 2018 21:15:52 +0800 Subject: [PATCH 02/16] nds32: Fill all TLB entries with kernel image mapping We use earlycon replace with early_printk and doesn't use early_io_map() to create UART mapping. It is not necessary to reserve the one way in TLB for now. It didn't make sense if use direct-mapped and reserve one way at the same time. It allow the direct-mapped now. Signed-off-by: Zong Li Signed-off-by: Greentime Hu --- arch/nds32/kernel/head.S | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/nds32/kernel/head.S b/arch/nds32/kernel/head.S index 1ce653d35d93..2c8aac6201be 100644 --- a/arch/nds32/kernel/head.S +++ b/arch/nds32/kernel/head.S @@ -123,14 +123,7 @@ _image_size_check: andi $r0, $r0, MMU_CFG_mskTBS srli $r6, $r6, MMU_CFG_offTBW srli $r0, $r0, MMU_CFG_offTBS - /* - * we just map the kernel to the maximum way - 1 of tlb - * reserver one way for UART VA mapping - * it will cause page fault if UART mapping cover the kernel mapping - * - * direct mapping is not supported now. - */ - beqz $r6, __error ! MMU_CFG.TBW = 0 is direct mappin + addi $r6, $r6, #0x1 ! MMU_CFG.TBW value -> meaning addi $r0, $r0, #0x2 ! MMU_CFG.TBS value -> meaning sll $r0, $r6, $r0 ! entries = k-way * n-set mul $r6, $r0, $r5 ! max size = entries * page size From 4c3d6174e0e17599549f636ec48ddf78627a17fe Mon Sep 17 00:00:00 2001 From: Nickhu Date: Thu, 18 Oct 2018 16:37:55 +0800 Subject: [PATCH 03/16] nds32: Fix gcc 8.0 compiler option incompatible. When the kernel configs of ftrace and frame pointer options are choosed, the compiler option of kernel will incompatible. Error message: nds32le-linux-gcc: error: -pg and -fomit-frame-pointer are incompatible Signed-off-by: Nickhu Signed-off-by: Zong Li Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/mm/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/nds32/mm/Makefile b/arch/nds32/mm/Makefile index 6b6855852223..7c5c15ad854a 100644 --- a/arch/nds32/mm/Makefile +++ b/arch/nds32/mm/Makefile @@ -4,4 +4,8 @@ obj-y := extable.o tlb.o \ obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o obj-$(CONFIG_HIGHMEM) += highmem.o -CFLAGS_proc-n13.o += -fomit-frame-pointer + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_proc.o = $(CC_FLAGS_FTRACE) +endif +CFLAGS_proc.o += -fomit-frame-pointer From 9aaafac8cffa1c1edb66e19a63841b7c86be07ca Mon Sep 17 00:00:00 2001 From: Nickhu Date: Thu, 25 Oct 2018 10:24:14 +0800 Subject: [PATCH 04/16] nds32: Fix bug in bitfield.h There two bitfield bug for perfomance counter in bitfield.h: PFM_CTL_offSEL1 21 --> 16 PFM_CTL_offSEL2 27 --> 22 This commit fix it. Signed-off-by: Nickhu Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/include/asm/bitfield.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/nds32/include/asm/bitfield.h b/arch/nds32/include/asm/bitfield.h index 8e84fc385b94..19b2841219ad 100644 --- a/arch/nds32/include/asm/bitfield.h +++ b/arch/nds32/include/asm/bitfield.h @@ -692,8 +692,8 @@ #define PFM_CTL_offKU1 13 /* Enable user mode event counting for PFMC1 */ #define PFM_CTL_offKU2 14 /* Enable user mode event counting for PFMC2 */ #define PFM_CTL_offSEL0 15 /* The event selection for PFMC0 */ -#define PFM_CTL_offSEL1 21 /* The event selection for PFMC1 */ -#define PFM_CTL_offSEL2 27 /* The event selection for PFMC2 */ +#define PFM_CTL_offSEL1 16 /* The event selection for PFMC1 */ +#define PFM_CTL_offSEL2 22 /* The event selection for PFMC2 */ /* bit 28:31 reserved */ #define PFM_CTL_mskEN0 ( 0x01 << PFM_CTL_offEN0 ) From ebd09753b5707cc083c52e1d0ec7f45dccdb73bf Mon Sep 17 00:00:00 2001 From: Nickhu Date: Thu, 25 Oct 2018 10:24:15 +0800 Subject: [PATCH 05/16] nds32: Perf porting This is the commit that porting the perf for nds32. 1.Raw event: The raw events start with 'r'. Usage: perf stat -e rXYZ ./app X: the index of performance counter. YZ: the index(convert to hexdecimal) of events Example: 'perf stat -e r101 ./app' means the counter 1 will count the instruction event. The index of counter and events can be found in "Andes System Privilege Architecture Version 3 Manual". Or you can perform the 'perf list' to find the symbolic name of raw events. 2.Perf mmap2: Fix unexpected perf mmap2() page fault When the mmap2() called by perf application, you will encounter such condition:"failed to write." With return value -EFAULT This is due to the page fault caused by "reading" buffer from the mapped legal address region to write to the descriptor. The page_fault handler will get a VM_FAULT_SIGBUS return value, which should not happens here.(Due to this is a read request.) You can refer to kernel/events/core.c:perf_mmap_fault(...) If "(vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))" is evaluated as true, you will get VM_FAULT_SIGBUS as return value. However, this is not an write request. The flags which indicated why the page fault happens is wrong. Furthermore, NDS32 SPAv3 is not able to detect it is read or write. It only know either it is instruction fetch or data access. Therefore, by removing the wrong flag assignment(actually, the hardware is not able to show the reason), we can fix this bug. 3.Perf multiple events map to same counter. When there are multiple events map to the same counter, the counter counts inaccurately. This is because each counter only counts one event in the same time. So when there are multiple events map to same counter, they have to take turns in each context. There are two solution: 1. Print the error message when multiple events map to the same counter. But print the error message would let the program hang in loop. The ltp (linux test program) would be failed when the program hang in loop. 2. Don't print the error message, the ltp would pass. But the user need to have the knowledge that don't count the events which map to the same counter, or the user will get the inaccurate results. We choose method 2 for the solution Signed-off-by: Nickhu Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/Kconfig | 1 + arch/nds32/boot/dts/ae3xx.dts | 5 + arch/nds32/include/asm/Kbuild | 1 + arch/nds32/include/asm/perf_event.h | 16 + arch/nds32/include/asm/pmu.h | 386 ++++++ arch/nds32/include/asm/stacktrace.h | 39 + arch/nds32/kernel/Makefile | 3 +- arch/nds32/kernel/perf_event_cpu.c | 1223 +++++++++++++++++ arch/nds32/mm/fault.c | 13 +- tools/include/asm/barrier.h | 2 + tools/perf/arch/nds32/Build | 1 + tools/perf/arch/nds32/util/Build | 1 + tools/perf/arch/nds32/util/header.c | 29 + tools/perf/pmu-events/arch/nds32/mapfile.csv | 15 + .../pmu-events/arch/nds32/n13/atcpmu.json | 290 ++++ 15 files changed, 2019 insertions(+), 6 deletions(-) create mode 100644 arch/nds32/include/asm/perf_event.h create mode 100644 arch/nds32/include/asm/pmu.h create mode 100644 arch/nds32/include/asm/stacktrace.h create mode 100644 arch/nds32/kernel/perf_event_cpu.c create mode 100644 tools/perf/arch/nds32/Build create mode 100644 tools/perf/arch/nds32/util/Build create mode 100644 tools/perf/arch/nds32/util/header.c create mode 100644 tools/perf/pmu-events/arch/nds32/mapfile.csv create mode 100644 tools/perf/pmu-events/arch/nds32/n13/atcpmu.json diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 7a04adacb2f0..97786a865023 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -30,6 +30,7 @@ config NDS32 select HAVE_ARCH_TRACEHOOK select HAVE_DEBUG_KMEMLEAK select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_PERF_EVENTS select IRQ_DOMAIN select LOCKDEP_SUPPORT select MODULES_USE_ELF_RELA diff --git a/arch/nds32/boot/dts/ae3xx.dts b/arch/nds32/boot/dts/ae3xx.dts index bb39749a6673..16a9f54a805e 100644 --- a/arch/nds32/boot/dts/ae3xx.dts +++ b/arch/nds32/boot/dts/ae3xx.dts @@ -82,4 +82,9 @@ interrupts = <18>; }; }; + + pmu { + compatible = "andestech,nds32v3-pmu"; + interrupts= <13>; + }; }; diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index dbc4e5422550..f81b633d5379 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -36,6 +36,7 @@ generic-y += kprobes.h generic-y += kvm_para.h generic-y += limits.h generic-y += local.h +generic-y += local64.h generic-y += mm-arch-hooks.h generic-y += mman.h generic-y += parport.h diff --git a/arch/nds32/include/asm/perf_event.h b/arch/nds32/include/asm/perf_event.h new file mode 100644 index 000000000000..fcdff02acc14 --- /dev/null +++ b/arch/nds32/include/asm/perf_event.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2008-2018 Andes Technology Corporation */ + +#ifndef __ASM_PERF_EVENT_H +#define __ASM_PERF_EVENT_H + +/* + * This file is request by Perf, + * please refer to tools/perf/design.txt for more details + */ +struct pt_regs; +unsigned long perf_instruction_pointer(struct pt_regs *regs); +unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs) perf_misc_flags(regs) + +#endif diff --git a/arch/nds32/include/asm/pmu.h b/arch/nds32/include/asm/pmu.h new file mode 100644 index 000000000000..e1ac0b0b8bcf --- /dev/null +++ b/arch/nds32/include/asm/pmu.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2008-2018 Andes Technology Corporation */ + +#ifndef __ASM_PMU_H +#define __ASM_PMU_H + +#include +#include +#include +#include + +/* Has special meaning for perf core implementation */ +#define HW_OP_UNSUPPORTED 0x0 +#define C(_x) PERF_COUNT_HW_CACHE_##_x +#define CACHE_OP_UNSUPPORTED 0x0 + +/* Enough for both software and hardware defined events */ +#define SOFTWARE_EVENT_MASK 0xFF + +#define PFM_OFFSET_MAGIC_0 2 /* DO NOT START FROM 0 */ +#define PFM_OFFSET_MAGIC_1 (PFM_OFFSET_MAGIC_0 + 36) +#define PFM_OFFSET_MAGIC_2 (PFM_OFFSET_MAGIC_1 + 36) + +enum { PFMC0, PFMC1, PFMC2, MAX_COUNTERS }; + +u32 PFM_CTL_OVF[3] = { PFM_CTL_mskOVF0, PFM_CTL_mskOVF1, + PFM_CTL_mskOVF2 }; +u32 PFM_CTL_EN[3] = { PFM_CTL_mskEN0, PFM_CTL_mskEN1, + PFM_CTL_mskEN2 }; +u32 PFM_CTL_OFFSEL[3] = { PFM_CTL_offSEL0, PFM_CTL_offSEL1, + PFM_CTL_offSEL2 }; +u32 PFM_CTL_IE[3] = { PFM_CTL_mskIE0, PFM_CTL_mskIE1, PFM_CTL_mskIE2 }; +u32 PFM_CTL_KS[3] = { PFM_CTL_mskKS0, PFM_CTL_mskKS1, PFM_CTL_mskKS2 }; +u32 PFM_CTL_KU[3] = { PFM_CTL_mskKU0, PFM_CTL_mskKU1, PFM_CTL_mskKU2 }; +u32 PFM_CTL_SEL[3] = { PFM_CTL_mskSEL0, PFM_CTL_mskSEL1, PFM_CTL_mskSEL2 }; +/* + * Perf Events' indices + */ +#define NDS32_IDX_CYCLE_COUNTER 0 +#define NDS32_IDX_COUNTER0 1 +#define NDS32_IDX_COUNTER1 2 + +/* The events for a given PMU register set. */ +struct pmu_hw_events { + /* + * The events that are active on the PMU for the given index. + */ + struct perf_event *events[MAX_COUNTERS]; + + /* + * A 1 bit for an index indicates that the counter is being used for + * an event. A 0 means that the counter can be used. + */ + unsigned long used_mask[BITS_TO_LONGS(MAX_COUNTERS)]; + + /* + * Hardware lock to serialize accesses to PMU registers. Needed for the + * read/modify/write sequences. + */ + raw_spinlock_t pmu_lock; +}; + +struct nds32_pmu { + struct pmu pmu; + cpumask_t active_irqs; + char *name; + irqreturn_t (*handle_irq)(int irq_num, void *dev); + void (*enable)(struct perf_event *event); + void (*disable)(struct perf_event *event); + int (*get_event_idx)(struct pmu_hw_events *hw_events, + struct perf_event *event); + int (*set_event_filter)(struct hw_perf_event *evt, + struct perf_event_attr *attr); + u32 (*read_counter)(struct perf_event *event); + void (*write_counter)(struct perf_event *event, u32 val); + void (*start)(struct nds32_pmu *nds32_pmu); + void (*stop)(struct nds32_pmu *nds32_pmu); + void (*reset)(void *data); + int (*request_irq)(struct nds32_pmu *nds32_pmu, irq_handler_t handler); + void (*free_irq)(struct nds32_pmu *nds32_pmu); + int (*map_event)(struct perf_event *event); + int num_events; + atomic_t active_events; + u64 max_period; + struct platform_device *plat_device; + struct pmu_hw_events *(*get_hw_events)(void); +}; + +#define to_nds32_pmu(p) (container_of(p, struct nds32_pmu, pmu)) + +int nds32_pmu_register(struct nds32_pmu *nds32_pmu, int type); + +u64 nds32_pmu_event_update(struct perf_event *event); + +int nds32_pmu_event_set_period(struct perf_event *event); + +/* + * Common NDS32 SPAv3 event types + * + * Note: An implementation may not be able to count all of these events + * but the encodings are considered to be `reserved' in the case that + * they are not available. + * + * SEL_TOTAL_CYCLES will add an offset is due to ZERO is defined as + * NOT_SUPPORTED EVENT mapping in generic perf code. + * You will need to deal it in the event writing implementation. + */ +enum spav3_counter_0_perf_types { + SPAV3_0_SEL_BASE = -1 + PFM_OFFSET_MAGIC_0, /* counting symbol */ + SPAV3_0_SEL_TOTAL_CYCLES = 0 + PFM_OFFSET_MAGIC_0, + SPAV3_0_SEL_COMPLETED_INSTRUCTION = 1 + PFM_OFFSET_MAGIC_0, + SPAV3_0_SEL_LAST /* counting symbol */ +}; + +enum spav3_counter_1_perf_types { + SPAV3_1_SEL_BASE = -1 + PFM_OFFSET_MAGIC_1, /* counting symbol */ + SPAV3_1_SEL_TOTAL_CYCLES = 0 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_COMPLETED_INSTRUCTION = 1 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_CONDITIONAL_BRANCH = 2 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_TAKEN_CONDITIONAL_BRANCH = 3 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_PREFETCH_INSTRUCTION = 4 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_RET_INST = 5 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_JR_INST = 6 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_JAL_JRAL_INST = 7 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_NOP_INST = 8 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_SCW_INST = 9 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_ISB_DSB_INST = 10 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_CCTL_INST = 11 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_TAKEN_INTERRUPTS = 12 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LOADS_COMPLETED = 13 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_UITLB_ACCESS = 14 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_UDTLB_ACCESS = 15 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_MTLB_ACCESS = 16 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_CODE_CACHE_ACCESS = 17 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DATA_DEPENDENCY_STALL_CYCLES = 18 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DATA_CACHE_MISS_STALL_CYCLES = 19 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DATA_CACHE_ACCESS = 20 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DATA_CACHE_MISS = 21 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LOAD_DATA_CACHE_ACCESS = 22 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_STORE_DATA_CACHE_ACCESS = 23 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_ILM_ACCESS = 24 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LSU_BIU_CYCLES = 25 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_HPTWK_BIU_CYCLES = 26 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_DMA_BIU_CYCLES = 27 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_CODE_CACHE_FILL_BIU_CYCLES = 28 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LEGAL_UNALIGN_DCACHE_ACCESS = 29 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_PUSH25 = 30 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_SYSCALLS_INST = 31 + PFM_OFFSET_MAGIC_1, + SPAV3_1_SEL_LAST /* counting symbol */ +}; + +enum spav3_counter_2_perf_types { + SPAV3_2_SEL_BASE = -1 + PFM_OFFSET_MAGIC_2, /* counting symbol */ + SPAV3_2_SEL_TOTAL_CYCLES = 0 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_COMPLETED_INSTRUCTION = 1 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_CONDITIONAL_BRANCH_MISPREDICT = 2 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_TAKEN_CONDITIONAL_BRANCH_MISPREDICT = + 3 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_PREFETCH_INSTRUCTION_CACHE_HIT = 4 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_RET_MISPREDICT = 5 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_IMMEDIATE_J_INST = 6 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_MULTIPLY_INST = 7 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_16_BIT_INST = 8 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_FAILED_SCW_INST = 9 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_LD_AFTER_ST_CONFLICT_REPLAYS = 10 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_TAKEN_EXCEPTIONS = 12 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_STORES_COMPLETED = 13 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_UITLB_MISS = 14 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_UDTLB_MISS = 15 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_MTLB_MISS = 16 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_CODE_CACHE_MISS = 17 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_EMPTY_INST_QUEUE_STALL_CYCLES = 18 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_DATA_WRITE_BACK = 19 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_DATA_CACHE_MISS = 21 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_LOAD_DATA_CACHE_MISS = 22 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_STORE_DATA_CACHE_MISS = 23 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_DLM_ACCESS = 24 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_LSU_BIU_REQUEST = 25 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_HPTWK_BIU_REQUEST = 26 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_DMA_BIU_REQUEST = 27 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_CODE_CACHE_FILL_BIU_REQUEST = 28 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_EXTERNAL_EVENTS = 29 + PFM_OFFSET_MAGIC_2, + SPAV3_1_SEL_POP25 = 30 + PFM_OFFSET_MAGIC_2, + SPAV3_2_SEL_LAST /* counting symbol */ +}; + +/* Get converted event counter index */ +static inline int get_converted_event_idx(unsigned long event) +{ + int idx; + + if ((event) > SPAV3_0_SEL_BASE && event < SPAV3_0_SEL_LAST) { + idx = 0; + } else if ((event) > SPAV3_1_SEL_BASE && event < SPAV3_1_SEL_LAST) { + idx = 1; + } else if ((event) > SPAV3_2_SEL_BASE && event < SPAV3_2_SEL_LAST) { + idx = 2; + } else { + pr_err("GET_CONVERTED_EVENT_IDX PFM counter range error\n"); + return -EPERM; + } + + return idx; +} + +/* Get converted hardware event number */ +static inline u32 get_converted_evet_hw_num(u32 event) +{ + if (event > SPAV3_0_SEL_BASE && event < SPAV3_0_SEL_LAST) + event -= PFM_OFFSET_MAGIC_0; + else if (event > SPAV3_1_SEL_BASE && event < SPAV3_1_SEL_LAST) + event -= PFM_OFFSET_MAGIC_1; + else if (event > SPAV3_2_SEL_BASE && event < SPAV3_2_SEL_LAST) + event -= PFM_OFFSET_MAGIC_2; + else if (event != 0) + pr_err("GET_CONVERTED_EVENT_HW_NUM PFM counter range error\n"); + + return event; +} + +/* + * NDS32 HW events mapping + * + * The hardware events that we support. We do support cache operations but + * we have harvard caches and no way to combine instruction and data + * accesses/misses in hardware. + */ +static const unsigned int nds32_pfm_perf_map[PERF_COUNT_HW_MAX] = { + [PERF_COUNT_HW_CPU_CYCLES] = SPAV3_0_SEL_TOTAL_CYCLES, + [PERF_COUNT_HW_INSTRUCTIONS] = SPAV3_1_SEL_COMPLETED_INSTRUCTION, + [PERF_COUNT_HW_CACHE_REFERENCES] = SPAV3_1_SEL_DATA_CACHE_ACCESS, + [PERF_COUNT_HW_CACHE_MISSES] = SPAV3_2_SEL_DATA_CACHE_MISS, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_BRANCH_MISSES] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_BUS_CYCLES] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = HW_OP_UNSUPPORTED, + [PERF_COUNT_HW_REF_CPU_CYCLES] = HW_OP_UNSUPPORTED +}; + +static const unsigned int nds32_pfm_perf_cache_map[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_LOAD_DATA_CACHE_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_LOAD_DATA_CACHE_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_STORE_DATA_CACHE_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_STORE_DATA_CACHE_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, + [C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_CODE_CACHE_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_CODE_CACHE_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_CODE_CACHE_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_CODE_CACHE_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + /* TODO: L2CC */ + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + }, + /* NDS32 PMU does not support TLB read/write hit/miss, + * However, it can count access/miss, which mixed with read and write. + * Therefore, only READ counter will use it. + * We do as possible as we can. + */ + [C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_UDTLB_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_UDTLB_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, + [C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + SPAV3_1_SEL_UITLB_ACCESS, + [C(RESULT_MISS)] = + SPAV3_2_SEL_UITLB_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, + [C(BPU)] = { /* What is BPU? */ + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, + [C(NODE)] = { /* What is NODE? */ + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = CACHE_OP_UNSUPPORTED, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = + CACHE_OP_UNSUPPORTED, + [C(RESULT_MISS)] = + CACHE_OP_UNSUPPORTED, + }, + }, +}; + +int nds32_pmu_map_event(struct perf_event *event, + const unsigned int (*event_map)[PERF_COUNT_HW_MAX], + const unsigned int (*cache_map)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask); + +#endif /* __ASM_PMU_H */ diff --git a/arch/nds32/include/asm/stacktrace.h b/arch/nds32/include/asm/stacktrace.h new file mode 100644 index 000000000000..6bf7c777bda4 --- /dev/null +++ b/arch/nds32/include/asm/stacktrace.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2008-2018 Andes Technology Corporation */ + +#ifndef __ASM_STACKTRACE_H +#define __ASM_STACKTRACE_H + +/* Kernel callchain */ +struct stackframe { + unsigned long fp; + unsigned long sp; + unsigned long lp; +}; + +/* + * struct frame_tail: User callchain + * IMPORTANT: + * This struct is used for call-stack walking, + * the order and types matters. + * Do not use array, it only stores sizeof(pointer) + * + * The details can refer to arch/arm/kernel/perf_event.c + */ +struct frame_tail { + unsigned long stack_fp; + unsigned long stack_lp; +}; + +/* For User callchain with optimize for size */ +struct frame_tail_opt_size { + unsigned long stack_r6; + unsigned long stack_fp; + unsigned long stack_gp; + unsigned long stack_lp; +}; + +extern void +get_real_ret_addr(unsigned long *addr, struct task_struct *tsk, int *graph); + +#endif /* __ASM_STACKTRACE_H */ diff --git a/arch/nds32/kernel/Makefile b/arch/nds32/kernel/Makefile index 27cded39fa66..f52bd2744f50 100644 --- a/arch/nds32/kernel/Makefile +++ b/arch/nds32/kernel/Makefile @@ -4,7 +4,6 @@ CPPFLAGS_vmlinux.lds := -DTEXTADDR=$(TEXTADDR) AFLAGS_head.o := -DTEXTADDR=$(TEXTADDR) - # Object file lists. obj-y := ex-entry.o ex-exit.o ex-scall.o irq.o \ @@ -16,10 +15,10 @@ obj-$(CONFIG_MODULES) += nds32_ksyms.o module.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_OF) += devtree.o obj-$(CONFIG_CACHE_L2) += atl2c.o +obj-$(CONFIG_PERF_EVENTS) += perf_event_cpu.o extra-y := head.o vmlinux.lds - obj-y += vdso/ obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c new file mode 100644 index 000000000000..a6e723d0fdbc --- /dev/null +++ b/arch/nds32/kernel/perf_event_cpu.c @@ -0,0 +1,1223 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2008-2017 Andes Technology Corporation + * + * Reference ARMv7: Jean Pihet + * 2010 (c) MontaVista Software, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Set at runtime when we know what CPU type we are. */ +static struct nds32_pmu *cpu_pmu; + +static DEFINE_PER_CPU(struct pmu_hw_events, cpu_hw_events); +static void nds32_pmu_start(struct nds32_pmu *cpu_pmu); +static void nds32_pmu_stop(struct nds32_pmu *cpu_pmu); +static struct platform_device_id cpu_pmu_plat_device_ids[] = { + {.name = "nds32-pfm"}, + {}, +}; + +static int nds32_pmu_map_cache_event(const unsigned int (*cache_map) + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX], u64 config) +{ + unsigned int cache_type, cache_op, cache_result, ret; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ret = (int)(*cache_map)[cache_type][cache_op][cache_result]; + + if (ret == CACHE_OP_UNSUPPORTED) + return -ENOENT; + + return ret; +} + +static int +nds32_pmu_map_hw_event(const unsigned int (*event_map)[PERF_COUNT_HW_MAX], + u64 config) +{ + int mapping; + + if (config >= PERF_COUNT_HW_MAX) + return -ENOENT; + + mapping = (*event_map)[config]; + return mapping == HW_OP_UNSUPPORTED ? -ENOENT : mapping; +} + +static int nds32_pmu_map_raw_event(u32 raw_event_mask, u64 config) +{ + int ev_type = (int)(config & raw_event_mask); + int idx = config >> 8; + + switch (idx) { + case 0: + ev_type = PFM_OFFSET_MAGIC_0 + ev_type; + if (ev_type >= SPAV3_0_SEL_LAST || ev_type <= SPAV3_0_SEL_BASE) + return -ENOENT; + break; + case 1: + ev_type = PFM_OFFSET_MAGIC_1 + ev_type; + if (ev_type >= SPAV3_1_SEL_LAST || ev_type <= SPAV3_1_SEL_BASE) + return -ENOENT; + break; + case 2: + ev_type = PFM_OFFSET_MAGIC_2 + ev_type; + if (ev_type >= SPAV3_2_SEL_LAST || ev_type <= SPAV3_2_SEL_BASE) + return -ENOENT; + break; + default: + return -ENOENT; + } + + return ev_type; +} + +int +nds32_pmu_map_event(struct perf_event *event, + const unsigned int (*event_map)[PERF_COUNT_HW_MAX], + const unsigned int (*cache_map) + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask) +{ + u64 config = event->attr.config; + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + return nds32_pmu_map_hw_event(event_map, config); + case PERF_TYPE_HW_CACHE: + return nds32_pmu_map_cache_event(cache_map, config); + case PERF_TYPE_RAW: + return nds32_pmu_map_raw_event(raw_event_mask, config); + } + + return -ENOENT; +} + +static int nds32_spav3_map_event(struct perf_event *event) +{ + return nds32_pmu_map_event(event, &nds32_pfm_perf_map, + &nds32_pfm_perf_cache_map, SOFTWARE_EVENT_MASK); +} + +static inline u32 nds32_pfm_getreset_flags(void) +{ + /* Read overflow status */ + u32 val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 old_val = val; + + /* Write overflow bit to clear status, and others keep it 0 */ + u32 ov_flag = PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]; + + __nds32__mtsr(val | ov_flag, NDS32_SR_PFM_CTL); + + return old_val; +} + +static inline int nds32_pfm_has_overflowed(u32 pfm) +{ + u32 ov_flag = PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]; + + return pfm & ov_flag; +} + +static inline int nds32_pfm_counter_has_overflowed(u32 pfm, int idx) +{ + u32 mask = 0; + + switch (idx) { + case 0: + mask = PFM_CTL_OVF[0]; + break; + case 1: + mask = PFM_CTL_OVF[1]; + break; + case 2: + mask = PFM_CTL_OVF[2]; + break; + default: + pr_err("%s index wrong\n", __func__); + break; + } + return pfm & mask; +} + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the event disabled in hw: + */ +int nds32_pmu_event_set_period(struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0; + + /* The period may have been changed by PERF_EVENT_IOC_PERIOD */ + if (unlikely(period != hwc->last_period)) + left = period - (hwc->last_period - left); + + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (left > (s64)nds32_pmu->max_period) + left = nds32_pmu->max_period; + + /* + * The hw event starts counting from this event offset, + * mark it to be able to extract future "deltas": + */ + local64_set(&hwc->prev_count, (u64)(-left)); + + nds32_pmu->write_counter(event, (u64)(-left) & nds32_pmu->max_period); + + perf_event_update_userpage(event); + + return ret; +} + +static irqreturn_t nds32_pmu_handle_irq(int irq_num, void *dev) +{ + u32 pfm; + struct perf_sample_data data; + struct nds32_pmu *cpu_pmu = (struct nds32_pmu *)dev; + struct pmu_hw_events *cpuc = cpu_pmu->get_hw_events(); + struct pt_regs *regs; + int idx; + /* + * Get and reset the IRQ flags + */ + pfm = nds32_pfm_getreset_flags(); + + /* + * Did an overflow occur? + */ + if (!nds32_pfm_has_overflowed(pfm)) + return IRQ_NONE; + + /* + * Handle the counter(s) overflow(s) + */ + regs = get_irq_regs(); + + nds32_pmu_stop(cpu_pmu); + for (idx = 0; idx < cpu_pmu->num_events; ++idx) { + struct perf_event *event = cpuc->events[idx]; + struct hw_perf_event *hwc; + + /* Ignore if we don't have an event. */ + if (!event) + continue; + + /* + * We have a single interrupt for all counters. Check that + * each counter has overflowed before we process it. + */ + if (!nds32_pfm_counter_has_overflowed(pfm, idx)) + continue; + + hwc = &event->hw; + nds32_pmu_event_update(event); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!nds32_pmu_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + cpu_pmu->disable(event); + } + nds32_pmu_start(cpu_pmu); + /* + * Handle the pending perf events. + * + * Note: this call *must* be run with interrupts disabled. For + * platforms that can have the PMU interrupts raised as an NMI, this + * will not work. + */ + irq_work_run(); + + return IRQ_HANDLED; +} + +static inline int nds32_pfm_counter_valid(struct nds32_pmu *cpu_pmu, int idx) +{ + return ((idx >= 0) && (idx < cpu_pmu->num_events)); +} + +static inline int nds32_pfm_disable_counter(int idx) +{ + unsigned int val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 mask = 0; + + mask = PFM_CTL_EN[idx]; + val &= ~mask; + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + return idx; +} + +/* + * Add an event filter to a given event. + */ +static int nds32_pmu_set_event_filter(struct hw_perf_event *event, + struct perf_event_attr *attr) +{ + unsigned long config_base = 0; + int idx = event->idx; + unsigned long no_kernel_tracing = 0; + unsigned long no_user_tracing = 0; + /* If index is -1, do not do anything */ + if (idx == -1) + return 0; + + no_kernel_tracing = PFM_CTL_KS[idx]; + no_user_tracing = PFM_CTL_KU[idx]; + /* + * Default: enable both kernel and user mode tracing. + */ + if (attr->exclude_user) + config_base |= no_user_tracing; + + if (attr->exclude_kernel) + config_base |= no_kernel_tracing; + + /* + * Install the filter into config_base as this is used to + * construct the event type. + */ + event->config_base |= config_base; + return 0; +} + +static inline void nds32_pfm_write_evtsel(int idx, u32 evnum) +{ + u32 offset = 0; + u32 ori_val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 ev_mask = 0; + u32 no_kernel_mask = 0; + u32 no_user_mask = 0; + u32 val; + + offset = PFM_CTL_OFFSEL[idx]; + /* Clear previous mode selection, and write new one */ + no_kernel_mask = PFM_CTL_KS[idx]; + no_user_mask = PFM_CTL_KU[idx]; + ori_val &= ~no_kernel_mask; + ori_val &= ~no_user_mask; + if (evnum & no_kernel_mask) + ori_val |= no_kernel_mask; + + if (evnum & no_user_mask) + ori_val |= no_user_mask; + + /* Clear previous event selection */ + ev_mask = PFM_CTL_SEL[idx]; + ori_val &= ~ev_mask; + evnum &= SOFTWARE_EVENT_MASK; + + /* undo the linear mapping */ + evnum = get_converted_evet_hw_num(evnum); + val = ori_val | (evnum << offset); + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); +} + +static inline int nds32_pfm_enable_counter(int idx) +{ + unsigned int val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 mask = 0; + + mask = PFM_CTL_EN[idx]; + val |= mask; + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + return idx; +} + +static inline int nds32_pfm_enable_intens(int idx) +{ + unsigned int val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 mask = 0; + + mask = PFM_CTL_IE[idx]; + val |= mask; + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + return idx; +} + +static inline int nds32_pfm_disable_intens(int idx) +{ + unsigned int val = __nds32__mfsr(NDS32_SR_PFM_CTL); + u32 mask = 0; + + mask = PFM_CTL_IE[idx]; + val &= ~mask; + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + return idx; +} + +static int event_requires_mode_exclusion(struct perf_event_attr *attr) +{ + /* Other modes NDS32 does not support */ + return attr->exclude_user || attr->exclude_kernel; +} + +static void nds32_pmu_enable_event(struct perf_event *event) +{ + unsigned long flags; + unsigned int evnum = 0; + struct hw_perf_event *hwc = &event->hw; + struct nds32_pmu *cpu_pmu = to_nds32_pmu(event->pmu); + struct pmu_hw_events *events = cpu_pmu->get_hw_events(); + int idx = hwc->idx; + + if (!nds32_pfm_counter_valid(cpu_pmu, idx)) { + pr_err("CPU enabling wrong pfm counter IRQ enable\n"); + return; + } + + /* + * Enable counter and interrupt, and set the counter to count + * the event that we're interested in. + */ + raw_spin_lock_irqsave(&events->pmu_lock, flags); + + /* + * Disable counter + */ + nds32_pfm_disable_counter(idx); + + /* + * Check whether we need to exclude the counter from certain modes. + */ + if ((!cpu_pmu->set_event_filter || + cpu_pmu->set_event_filter(hwc, &event->attr)) && + event_requires_mode_exclusion(&event->attr)) { + pr_notice + ("NDS32 performance counters do not support mode exclusion\n"); + hwc->config_base = 0; + } + /* Write event */ + evnum = hwc->config_base; + nds32_pfm_write_evtsel(idx, evnum); + + /* + * Enable interrupt for this counter + */ + nds32_pfm_enable_intens(idx); + + /* + * Enable counter + */ + nds32_pfm_enable_counter(idx); + + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static void nds32_pmu_disable_event(struct perf_event *event) +{ + unsigned long flags; + struct hw_perf_event *hwc = &event->hw; + struct nds32_pmu *cpu_pmu = to_nds32_pmu(event->pmu); + struct pmu_hw_events *events = cpu_pmu->get_hw_events(); + int idx = hwc->idx; + + if (!nds32_pfm_counter_valid(cpu_pmu, idx)) { + pr_err("CPU disabling wrong pfm counter IRQ enable %d\n", idx); + return; + } + + /* + * Disable counter and interrupt + */ + raw_spin_lock_irqsave(&events->pmu_lock, flags); + + /* + * Disable counter + */ + nds32_pfm_disable_counter(idx); + + /* + * Disable interrupt for this counter + */ + nds32_pfm_disable_intens(idx); + + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static inline u32 nds32_pmu_read_counter(struct perf_event *event) +{ + struct nds32_pmu *cpu_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + u32 count = 0; + + if (!nds32_pfm_counter_valid(cpu_pmu, idx)) { + pr_err("CPU reading wrong counter %d\n", idx); + } else { + switch (idx) { + case PFMC0: + count = __nds32__mfsr(NDS32_SR_PFMC0); + break; + case PFMC1: + count = __nds32__mfsr(NDS32_SR_PFMC1); + break; + case PFMC2: + count = __nds32__mfsr(NDS32_SR_PFMC2); + break; + default: + pr_err + ("%s: CPU has no performance counters %d\n", + __func__, idx); + } + } + return count; +} + +static inline void nds32_pmu_write_counter(struct perf_event *event, u32 value) +{ + struct nds32_pmu *cpu_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (!nds32_pfm_counter_valid(cpu_pmu, idx)) { + pr_err("CPU writing wrong counter %d\n", idx); + } else { + switch (idx) { + case PFMC0: + __nds32__mtsr_isb(value, NDS32_SR_PFMC0); + break; + case PFMC1: + __nds32__mtsr_isb(value, NDS32_SR_PFMC1); + break; + case PFMC2: + __nds32__mtsr_isb(value, NDS32_SR_PFMC2); + break; + default: + pr_err + ("%s: CPU has no performance counters %d\n", + __func__, idx); + } + } +} + +static int nds32_pmu_get_event_idx(struct pmu_hw_events *cpuc, + struct perf_event *event) +{ + int idx; + struct hw_perf_event *hwc = &event->hw; + /* + * Current implementation maps cycles, instruction count and cache-miss + * to specific counter. + * However, multiple of the 3 counters are able to count these events. + * + * + * SOFTWARE_EVENT_MASK mask for getting event num , + * This is defined by Jia-Rung, you can change the polocies. + * However, do not exceed 8 bits. This is hardware specific. + * The last number is SPAv3_2_SEL_LAST. + */ + unsigned long evtype = hwc->config_base & SOFTWARE_EVENT_MASK; + + idx = get_converted_event_idx(evtype); + /* + * Try to get the counter for correpsonding event + */ + if (evtype == SPAV3_0_SEL_TOTAL_CYCLES) { + if (!test_and_set_bit(idx, cpuc->used_mask)) + return idx; + if (!test_and_set_bit(NDS32_IDX_COUNTER0, cpuc->used_mask)) + return NDS32_IDX_COUNTER0; + if (!test_and_set_bit(NDS32_IDX_COUNTER1, cpuc->used_mask)) + return NDS32_IDX_COUNTER1; + } else if (evtype == SPAV3_1_SEL_COMPLETED_INSTRUCTION) { + if (!test_and_set_bit(idx, cpuc->used_mask)) + return idx; + else if (!test_and_set_bit(NDS32_IDX_COUNTER1, cpuc->used_mask)) + return NDS32_IDX_COUNTER1; + else if (!test_and_set_bit + (NDS32_IDX_CYCLE_COUNTER, cpuc->used_mask)) + return NDS32_IDX_CYCLE_COUNTER; + } else { + if (!test_and_set_bit(idx, cpuc->used_mask)) + return idx; + } + return -EAGAIN; +} + +static void nds32_pmu_start(struct nds32_pmu *cpu_pmu) +{ + unsigned long flags; + unsigned int val; + struct pmu_hw_events *events = cpu_pmu->get_hw_events(); + + raw_spin_lock_irqsave(&events->pmu_lock, flags); + + /* Enable all counters , NDS PFM has 3 counters */ + val = __nds32__mfsr(NDS32_SR_PFM_CTL); + val |= (PFM_CTL_EN[0] | PFM_CTL_EN[1] | PFM_CTL_EN[2]); + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static void nds32_pmu_stop(struct nds32_pmu *cpu_pmu) +{ + unsigned long flags; + unsigned int val; + struct pmu_hw_events *events = cpu_pmu->get_hw_events(); + + raw_spin_lock_irqsave(&events->pmu_lock, flags); + + /* Disable all counters , NDS PFM has 3 counters */ + val = __nds32__mfsr(NDS32_SR_PFM_CTL); + val &= ~(PFM_CTL_EN[0] | PFM_CTL_EN[1] | PFM_CTL_EN[2]); + val &= ~(PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr_isb(val, NDS32_SR_PFM_CTL); + + raw_spin_unlock_irqrestore(&events->pmu_lock, flags); +} + +static void nds32_pmu_reset(void *info) +{ + u32 val = 0; + + val |= (PFM_CTL_OVF[0] | PFM_CTL_OVF[1] | PFM_CTL_OVF[2]); + __nds32__mtsr(val, NDS32_SR_PFM_CTL); + __nds32__mtsr(0, NDS32_SR_PFM_CTL); + __nds32__mtsr(0, NDS32_SR_PFMC0); + __nds32__mtsr(0, NDS32_SR_PFMC1); + __nds32__mtsr(0, NDS32_SR_PFMC2); +} + +static void nds32_pmu_init(struct nds32_pmu *cpu_pmu) +{ + cpu_pmu->handle_irq = nds32_pmu_handle_irq; + cpu_pmu->enable = nds32_pmu_enable_event; + cpu_pmu->disable = nds32_pmu_disable_event; + cpu_pmu->read_counter = nds32_pmu_read_counter; + cpu_pmu->write_counter = nds32_pmu_write_counter; + cpu_pmu->get_event_idx = nds32_pmu_get_event_idx; + cpu_pmu->start = nds32_pmu_start; + cpu_pmu->stop = nds32_pmu_stop; + cpu_pmu->reset = nds32_pmu_reset; + cpu_pmu->max_period = 0xFFFFFFFF; /* Maximum counts */ +}; + +static u32 nds32_read_num_pfm_events(void) +{ + /* NDS32 SPAv3 PMU support 3 counter */ + return 3; +} + +static int device_pmu_init(struct nds32_pmu *cpu_pmu) +{ + nds32_pmu_init(cpu_pmu); + /* + * This name should be devive-specific name, whatever you like :) + * I think "PMU" will be a good generic name. + */ + cpu_pmu->name = "nds32v3-pmu"; + cpu_pmu->map_event = nds32_spav3_map_event; + cpu_pmu->num_events = nds32_read_num_pfm_events(); + cpu_pmu->set_event_filter = nds32_pmu_set_event_filter; + return 0; +} + +/* + * CPU PMU identification and probing. + */ +static int probe_current_pmu(struct nds32_pmu *pmu) +{ + int ret; + + get_cpu(); + ret = -ENODEV; + /* + * If ther are various CPU types with its own PMU, initialize with + * + * the corresponding one + */ + device_pmu_init(pmu); + put_cpu(); + return ret; +} + +static void nds32_pmu_enable(struct pmu *pmu) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(pmu); + struct pmu_hw_events *hw_events = nds32_pmu->get_hw_events(); + int enabled = bitmap_weight(hw_events->used_mask, + nds32_pmu->num_events); + + if (enabled) + nds32_pmu->start(nds32_pmu); +} + +static void nds32_pmu_disable(struct pmu *pmu) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(pmu); + + nds32_pmu->stop(nds32_pmu); +} + +static void nds32_pmu_release_hardware(struct nds32_pmu *nds32_pmu) +{ + nds32_pmu->free_irq(nds32_pmu); + pm_runtime_put_sync(&nds32_pmu->plat_device->dev); +} + +static irqreturn_t nds32_pmu_dispatch_irq(int irq, void *dev) +{ + struct nds32_pmu *nds32_pmu = (struct nds32_pmu *)dev; + int ret; + u64 start_clock, finish_clock; + + start_clock = local_clock(); + ret = nds32_pmu->handle_irq(irq, dev); + finish_clock = local_clock(); + + perf_sample_event_took(finish_clock - start_clock); + return ret; +} + +static int nds32_pmu_reserve_hardware(struct nds32_pmu *nds32_pmu) +{ + int err; + struct platform_device *pmu_device = nds32_pmu->plat_device; + + if (!pmu_device) + return -ENODEV; + + pm_runtime_get_sync(&pmu_device->dev); + err = nds32_pmu->request_irq(nds32_pmu, nds32_pmu_dispatch_irq); + if (err) { + nds32_pmu_release_hardware(nds32_pmu); + return err; + } + + return 0; +} + +static int +validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events, + struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + + if (is_software_event(event)) + return 1; + + if (event->pmu != pmu) + return 0; + + if (event->state < PERF_EVENT_STATE_OFF) + return 1; + + if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) + return 1; + + return nds32_pmu->get_event_idx(hw_events, event) >= 0; +} + +static int validate_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct pmu_hw_events fake_pmu; + DECLARE_BITMAP(fake_used_mask, MAX_COUNTERS); + /* + * Initialize the fake PMU. We only need to populate the + * used_mask for the purposes of validation. + */ + memset(fake_used_mask, 0, sizeof(fake_used_mask)); + + if (!validate_event(event->pmu, &fake_pmu, leader)) + return -EINVAL; + + for_each_sibling_event(sibling, leader) { + if (!validate_event(event->pmu, &fake_pmu, sibling)) + return -EINVAL; + } + + if (!validate_event(event->pmu, &fake_pmu, event)) + return -EINVAL; + + return 0; +} + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int mapping; + + mapping = nds32_pmu->map_event(event); + + if (mapping < 0) { + pr_debug("event %x:%llx not supported\n", event->attr.type, + event->attr.config); + return mapping; + } + + /* + * We don't assign an index until we actually place the event onto + * hardware. Use -1 to signify that we haven't decided where to put it + * yet. For SMP systems, each core has it's own PMU so we can't do any + * clever allocation or constraints checking at this point. + */ + hwc->idx = -1; + hwc->config_base = 0; + hwc->config = 0; + hwc->event_base = 0; + + /* + * Check whether we need to exclude the counter from certain modes. + */ + if ((!nds32_pmu->set_event_filter || + nds32_pmu->set_event_filter(hwc, &event->attr)) && + event_requires_mode_exclusion(&event->attr)) { + pr_debug + ("NDS performance counters do not support mode exclusion\n"); + return -EOPNOTSUPP; + } + + /* + * Store the event encoding into the config_base field. + */ + hwc->config_base |= (unsigned long)mapping; + + if (!hwc->sample_period) { + /* + * For non-sampling runs, limit the sample_period to half + * of the counter width. That way, the new counter value + * is far less likely to overtake the previous one unless + * you have some serious IRQ latency issues. + */ + hwc->sample_period = nds32_pmu->max_period >> 1; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } + + if (event->group_leader != event) { + if (validate_group(event) != 0) + return -EINVAL; + } + + return 0; +} + +static int nds32_pmu_event_init(struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + int err = 0; + atomic_t *active_events = &nds32_pmu->active_events; + + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + if (nds32_pmu->map_event(event) == -ENOENT) + return -ENOENT; + + if (!atomic_inc_not_zero(active_events)) { + if (atomic_read(active_events) == 0) { + /* Register irq handler */ + err = nds32_pmu_reserve_hardware(nds32_pmu); + } + + if (!err) + atomic_inc(active_events); + } + + if (err) + return err; + + err = __hw_perf_event_init(event); + + return err; +} + +static void nds32_start(struct perf_event *event, int flags) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + /* + * NDS pmu always has to reprogram the period, so ignore + * PERF_EF_RELOAD, see the comment below. + */ + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; + /* Set the period for the event. */ + nds32_pmu_event_set_period(event); + + nds32_pmu->enable(event); +} + +static int nds32_pmu_add(struct perf_event *event, int flags) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct pmu_hw_events *hw_events = nds32_pmu->get_hw_events(); + struct hw_perf_event *hwc = &event->hw; + int idx; + int err = 0; + + perf_pmu_disable(event->pmu); + + /* If we don't have a space for the counter then finish early. */ + idx = nds32_pmu->get_event_idx(hw_events, event); + if (idx < 0) { + err = idx; + goto out; + } + + /* + * If there is an event in the counter we are going to use then make + * sure it is disabled. + */ + event->hw.idx = idx; + nds32_pmu->disable(event); + hw_events->events[idx] = event; + + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + if (flags & PERF_EF_START) + nds32_start(event, PERF_EF_RELOAD); + + /* Propagate our changes to the userspace mapping. */ + perf_event_update_userpage(event); + +out: + perf_pmu_enable(event->pmu); + return err; +} + +u64 nds32_pmu_event_update(struct perf_event *event) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 delta, prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = nds32_pmu->read_counter(event); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + goto again; + } + /* + * Whether overflow or not, "unsigned substraction" + * will always get their delta + */ + delta = (new_raw_count - prev_raw_count) & nds32_pmu->max_period; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static void nds32_stop(struct perf_event *event, int flags) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + /* + * NDS pmu always has to update the counter, so ignore + * PERF_EF_UPDATE, see comments in nds32_start(). + */ + if (!(hwc->state & PERF_HES_STOPPED)) { + nds32_pmu->disable(event); + nds32_pmu_event_update(event); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + } +} + +static void nds32_pmu_del(struct perf_event *event, int flags) +{ + struct nds32_pmu *nds32_pmu = to_nds32_pmu(event->pmu); + struct pmu_hw_events *hw_events = nds32_pmu->get_hw_events(); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + nds32_stop(event, PERF_EF_UPDATE); + hw_events->events[idx] = NULL; + clear_bit(idx, hw_events->used_mask); + + perf_event_update_userpage(event); +} + +static void nds32_pmu_read(struct perf_event *event) +{ + nds32_pmu_event_update(event); +} + +/* Please refer to SPAv3 for more hardware specific details */ +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *nds32_arch_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group nds32_pmu_format_group = { + .name = "format", + .attrs = nds32_arch_formats_attr, +}; + +static ssize_t nds32_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return 0; +} + +static DEVICE_ATTR(cpus, 0444, nds32_pmu_cpumask_show, NULL); + +static struct attribute *nds32_pmu_common_attrs[] = { + &dev_attr_cpus.attr, + NULL, +}; + +static struct attribute_group nds32_pmu_common_group = { + .attrs = nds32_pmu_common_attrs, +}; + +static const struct attribute_group *nds32_pmu_attr_groups[] = { + &nds32_pmu_format_group, + &nds32_pmu_common_group, + NULL, +}; + +static void nds32_init(struct nds32_pmu *nds32_pmu) +{ + atomic_set(&nds32_pmu->active_events, 0); + + nds32_pmu->pmu = (struct pmu) { + .pmu_enable = nds32_pmu_enable, + .pmu_disable = nds32_pmu_disable, + .attr_groups = nds32_pmu_attr_groups, + .event_init = nds32_pmu_event_init, + .add = nds32_pmu_add, + .del = nds32_pmu_del, + .start = nds32_start, + .stop = nds32_stop, + .read = nds32_pmu_read, + }; +} + +int nds32_pmu_register(struct nds32_pmu *nds32_pmu, int type) +{ + nds32_init(nds32_pmu); + pm_runtime_enable(&nds32_pmu->plat_device->dev); + pr_info("enabled with %s PMU driver, %d counters available\n", + nds32_pmu->name, nds32_pmu->num_events); + return perf_pmu_register(&nds32_pmu->pmu, nds32_pmu->name, type); +} + +static struct pmu_hw_events *cpu_pmu_get_cpu_events(void) +{ + return this_cpu_ptr(&cpu_hw_events); +} + +static int cpu_pmu_request_irq(struct nds32_pmu *cpu_pmu, irq_handler_t handler) +{ + int err, irq, irqs; + struct platform_device *pmu_device = cpu_pmu->plat_device; + + if (!pmu_device) + return -ENODEV; + + irqs = min(pmu_device->num_resources, num_possible_cpus()); + if (irqs < 1) { + pr_err("no irqs for PMUs defined\n"); + return -ENODEV; + } + + irq = platform_get_irq(pmu_device, 0); + err = request_irq(irq, handler, IRQF_NOBALANCING, "nds32-pfm", + cpu_pmu); + if (err) { + pr_err("unable to request IRQ%d for NDS PMU counters\n", + irq); + return err; + } + return 0; +} + +static void cpu_pmu_free_irq(struct nds32_pmu *cpu_pmu) +{ + int irq; + struct platform_device *pmu_device = cpu_pmu->plat_device; + + irq = platform_get_irq(pmu_device, 0); + if (irq >= 0) + free_irq(irq, cpu_pmu); +} + +static void cpu_pmu_init(struct nds32_pmu *cpu_pmu) +{ + int cpu; + struct pmu_hw_events *events = &per_cpu(cpu_hw_events, cpu); + + raw_spin_lock_init(&events->pmu_lock); + + cpu_pmu->get_hw_events = cpu_pmu_get_cpu_events; + cpu_pmu->request_irq = cpu_pmu_request_irq; + cpu_pmu->free_irq = cpu_pmu_free_irq; + + /* Ensure the PMU has sane values out of reset. */ + if (cpu_pmu->reset) + on_each_cpu(cpu_pmu->reset, cpu_pmu, 1); +} + +const static struct of_device_id cpu_pmu_of_device_ids[] = { + {.compatible = "andestech,nds32v3-pmu", + .data = device_pmu_init}, + {}, +}; + +static int cpu_pmu_device_probe(struct platform_device *pdev) +{ + const struct of_device_id *of_id; + int (*init_fn)(struct nds32_pmu *nds32_pmu); + struct device_node *node = pdev->dev.of_node; + struct nds32_pmu *pmu; + int ret = -ENODEV; + + if (cpu_pmu) { + pr_notice("[perf] attempt to register multiple PMU devices!\n"); + return -ENOSPC; + } + + pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); + if (!pmu) + return -ENOMEM; + + of_id = of_match_node(cpu_pmu_of_device_ids, pdev->dev.of_node); + if (node && of_id) { + init_fn = of_id->data; + ret = init_fn(pmu); + } else { + ret = probe_current_pmu(pmu); + } + + if (ret) { + pr_notice("[perf] failed to probe PMU!\n"); + goto out_free; + } + + cpu_pmu = pmu; + cpu_pmu->plat_device = pdev; + cpu_pmu_init(cpu_pmu); + ret = nds32_pmu_register(cpu_pmu, PERF_TYPE_RAW); + + if (!ret) + return 0; + +out_free: + pr_notice("[perf] failed to register PMU devices!\n"); + kfree(pmu); + return ret; +} + +static struct platform_driver cpu_pmu_driver = { + .driver = { + .name = "nds32-pfm", + .of_match_table = cpu_pmu_of_device_ids, + }, + .probe = cpu_pmu_device_probe, + .id_table = cpu_pmu_plat_device_ids, +}; + +static int __init register_pmu_driver(void) +{ + int err = 0; + + err = platform_driver_register(&cpu_pmu_driver); + if (err) + pr_notice("[perf] PMU initialization failed\n"); + else + pr_notice("[perf] PMU initialization done\n"); + + return err; +} + +device_initcall(register_pmu_driver); + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + /* However, NDS32 does not support virtualization */ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + return perf_guest_cbs->get_guest_ip(); + + return instruction_pointer(regs); +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + int misc = 0; + + /* However, NDS32 does not support virtualization */ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_cbs->is_user_mode()) + misc |= PERF_RECORD_MISC_GUEST_USER; + else + misc |= PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; + } + + return misc; +} diff --git a/arch/nds32/mm/fault.c b/arch/nds32/mm/fault.c index b740534b152c..68d5f2a27f38 100644 --- a/arch/nds32/mm/fault.c +++ b/arch/nds32/mm/fault.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -169,8 +170,6 @@ good_area: mask = VM_EXEC; else { mask = VM_READ | VM_WRITE; - if (vma->vm_flags & VM_WRITE) - flags |= FAULT_FLAG_WRITE; } } else if (entry == ENTRY_TLB_MISC) { switch (error_code & ITYPE_mskETYPE) { @@ -231,11 +230,17 @@ good_area: * attempt. If we go through a retry, it is extremely likely that the * page will be found in page cache at that point. */ + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, + 1, regs, addr); + } else { tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, + 1, regs, addr); + } if (fault & VM_FAULT_RETRY) { flags &= ~FAULT_FLAG_ALLOW_RETRY; flags |= FAULT_FLAG_TRIED; diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h index 8d378c57cb01..dc696c151e1c 100644 --- a/tools/include/asm/barrier.h +++ b/tools/include/asm/barrier.h @@ -24,6 +24,8 @@ #include "../../arch/ia64/include/asm/barrier.h" #elif defined(__xtensa__) #include "../../arch/xtensa/include/asm/barrier.h" +#elif defined(__nds32__) +#include "../../arch/nds32/include/asm/barrier.h" #else #include #endif diff --git a/tools/perf/arch/nds32/Build b/tools/perf/arch/nds32/Build new file mode 100644 index 000000000000..54afe4a467e7 --- /dev/null +++ b/tools/perf/arch/nds32/Build @@ -0,0 +1 @@ +libperf-y += util/ diff --git a/tools/perf/arch/nds32/util/Build b/tools/perf/arch/nds32/util/Build new file mode 100644 index 000000000000..ca623bbf993c --- /dev/null +++ b/tools/perf/arch/nds32/util/Build @@ -0,0 +1 @@ +libperf-y += header.o diff --git a/tools/perf/arch/nds32/util/header.c b/tools/perf/arch/nds32/util/header.c new file mode 100644 index 000000000000..ef9dbdbe7968 --- /dev/null +++ b/tools/perf/arch/nds32/util/header.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2017 Andes Technology Corporation + +#include +#include +#include +#include "header.h" + +#define STR_LEN 1024 + +char *get_cpuid_str(struct perf_pmu *pmu) +{ + /* In nds32, we only have one cpu */ + char *buf = NULL; + struct cpu_map *cpus; + const char *sysfs = sysfs__mountpoint(); + + if (!sysfs || !pmu || !pmu->cpus) + return NULL; + + buf = malloc(STR_LEN); + if (!buf) + return NULL; + + cpus = cpu_map__get(pmu->cpus); + sprintf(buf, "0x%x", cpus->nr - 1); + cpu_map__put(cpus); + return buf; +} diff --git a/tools/perf/pmu-events/arch/nds32/mapfile.csv b/tools/perf/pmu-events/arch/nds32/mapfile.csv new file mode 100644 index 000000000000..efb395f26883 --- /dev/null +++ b/tools/perf/pmu-events/arch/nds32/mapfile.csv @@ -0,0 +1,15 @@ +# Format: +# MIDR,Version,JSON/file/pathname,Type +# +# where +# MIDR Processor version +# Variant[23:20] and Revision [3:0] should be zero. +# Version could be used to track version of of JSON file +# but currently unused. +# JSON/file/pathname is the path to JSON file, relative +# to tools/perf/pmu-events/arch/arm64/. +# Type is core, uncore etc +# +# +#Family-model,Version,Filename,EventType +0x0,v3,n13,core diff --git a/tools/perf/pmu-events/arch/nds32/n13/atcpmu.json b/tools/perf/pmu-events/arch/nds32/n13/atcpmu.json new file mode 100644 index 000000000000..5347350c360c --- /dev/null +++ b/tools/perf/pmu-events/arch/nds32/n13/atcpmu.json @@ -0,0 +1,290 @@ +[ + { + "PublicDescription": "Conditional branch", + "EventCode": "0x102", + "EventName": "cond_br", + "BriefDescription": "V3 Conditional branch" + }, + { + "PublicDescription": "Taken conditional branches", + "EventCode": "0x103", + "EventName": "taken_cond_br", + "BriefDescription": "V3 Taken Conditional branch" + }, + { + "PublicDescription": "Prefetch Instruction", + "EventCode": "0x104", + "EventName": "prefetch_inst", + "BriefDescription": "V3 Prefetch Instruction" + }, + { + "PublicDescription": "RET Inst", + "EventCode": "0x105", + "EventName": "ret_inst", + "BriefDescription": "V3 RET Inst" + }, + { + "PublicDescription": "JR(non-RET) instructions", + "EventCode": "0x106", + "EventName": "jr_inst", + "BriefDescription": "V3 JR(non-RET) instructions" + }, + { + "PublicDescription": "JAL/JRAL instructions", + "EventCode": "0x107", + "EventName": "jal_jral_inst", + "BriefDescription": "V3 JAL/JRAL instructions" + }, + { + "PublicDescription": "NOP instructions", + "EventCode": "0x108", + "EventName": "nop_inst", + "BriefDescription": "V3 NOP instructions" + }, + { + "PublicDescription": "SCW instructions", + "EventCode": "0x109", + "EventName": "scw_inst", + "BriefDescription": "V3 SCW instructions" + }, + { + "PublicDescription": "ISB/DSB instructions", + "EventCode": "0x10a", + "EventName": "isb_dsb_inst", + "BriefDescription": "V3 ISB/DSB instructions" + }, + { + "PublicDescription": "CCTL instructions", + "EventCode": "0x10b", + "EventName": "cctl_inst", + "BriefDescription": "V3 CCTL instructions" + }, + { + "PublicDescription": "Taken Interrupts", + "EventCode": "0x10c", + "EventName": "taken_interrupts", + "BriefDescription": "V3 Taken Interrupts" + }, + { + "PublicDescription": "Loads Completed", + "EventCode": "0x10d", + "EventName": "load_completed", + "BriefDescription": "V3 Loads Completed" + }, + { + "PublicDescription": "uITLB accesses", + "EventCode": "0x10e", + "EventName": "uitlb_access", + "BriefDescription": "V3 uITLB accesses" + }, + { + "PublicDescription": "uDTLB accesses", + "EventCode": "0x10f", + "EventName": "udtlb_access", + "BriefDescription": "V3 uDTLB accesses" + }, + { + "PublicDescription": "MTLB accesses", + "EventCode": "0x110", + "EventName": "mtlb_access", + "BriefDescription": "V3 MTLB accesses" + }, + { + "PublicDescription": "DATA_DEPENDENCY_STALL_CYCLES", + "EventCode": "0x112", + "EventName": "data_dependency_stall", + "BriefDescription": "V3 DATA_DEPENDENCY_STALL_CYCLES" + }, + { + "PublicDescription": "DATA_CACHE_MISS_STALL_CYCLES", + "EventCode": "0x113", + "EventName": "dcache_miss_stall", + "BriefDescription": "V3 DATA_CACHE_MISS_STALL_CYCLES" + }, + { + "PublicDescription": "ILM access", + "EventCode": "0x118", + "EventName": "ilm_access", + "BriefDescription": "V3 ILM accesses" + }, + { + "PublicDescription": "LSU BIU CYCLES", + "EventCode": "0x119", + "EventName": "lsu_biu_cycles", + "BriefDescription": "V3 LSU BIU CYCLES" + }, + { + "PublicDescription": "HPTWK BIU CYCLES", + "EventCode": "0x11a", + "EventName": "hptwk_biu_cycles", + "BriefDescription": "V3 HPTWK BIU CYCLES" + }, + { + "PublicDescription": "DMA BIU CYCLES", + "EventCode": "0x11b", + "EventName": "dma_biu_cycles", + "BriefDescription": "V3 DMA BIU CYCLES" + }, + { + "PublicDescription": "CODE CACHE FILL BIU CYCLES", + "EventCode": "0x11c", + "EventName": "icache_fill_biu_cycles", + "BriefDescription": "V3 CODE CACHE FILL BIU CYCLES" + }, + { + "PublicDescription": "LEAGAL UNALIGN DCACHE ACCESS", + "EventCode": "0x11d", + "EventName": "legal_unalined_dcache_access", + "BriefDescription": "V3 LEAGAL UNALIGN DCACHE ACCESS" + }, + { + "PublicDescription": "PUSH25 instructions", + "EventCode": "0x11e", + "EventName": "push25_inst", + "BriefDescription": "V3 PUSH25 instructions" + }, + { + "PublicDescription": "SYSCALL instructions", + "EventCode": "0x11f", + "EventName": "syscall_inst", + "BriefDescription": "V3 SYSCALL instructions" + }, + { + "PublicDescription": "conditional branch miss", + "EventCode": "0x202", + "EventName": "cond_br_miss", + "BriefDescription": "V3 conditional branch miss" + }, + { + "PublicDescription": "taken conditional branch miss", + "EventCode": "0x203", + "EventName": "taken_cond_br_miss", + "BriefDescription": "V3 taken conditional branch miss" + }, + { + "PublicDescription": "Prefetch Instructions with cache hit", + "EventCode": "0x204", + "EventName": "prefetch_icache_hit", + "BriefDescription": "V3 Prefetch Instructions with cache hit" + }, + { + "PublicDescription": "RET mispredict", + "EventCode": "0x205", + "EventName": "ret_mispredict", + "BriefDescription": "V3 RET mispredict" + }, + { + "PublicDescription": "Immediate J instructions", + "EventCode": "0x206", + "EventName": "imm_j_inst", + "BriefDescription": "V3 Immediate J instructions" + }, + { + "PublicDescription": "Multiply instructions", + "EventCode": "0x207", + "EventName": "mul_inst", + "BriefDescription": "V3 Multiply instructions" + }, + { + "PublicDescription": "16 bits instructions", + "EventCode": "0x208", + "EventName": "sixteen_bits_inst", + "BriefDescription": "V3 16 bits instructions" + }, + { + "PublicDescription": "Failed SCW instructions", + "EventCode": "0x209", + "EventName": "fail_scw_inst", + "BriefDescription": "V3 Failed SCW instructions" + }, + { + "PublicDescription": "ld-after-st conflict replays", + "EventCode": "0x20a", + "EventName": "ld_af_st_conflict", + "BriefDescription": "V3 ld-after-st conflict replays" + }, + { + "PublicDescription": "Exception taken", + "EventCode": "0x20c", + "EventName": "exception_taken", + "BriefDescription": "V3 Exception taken" + }, + { + "PublicDescription": "Stores completed", + "EventCode": "0x20d", + "EventName": "store_completed", + "BriefDescription": "V3 Stores completed" + }, + { + "PublicDescription": "uITLB miss", + "EventCode": "0x20e", + "EventName": "uitlb_miss", + "BriefDescription": "V3 uITLB miss" + }, + { + "PublicDescription": "uDTLB miss", + "EventCode": "0x20f", + "EventName": "udtlb_miss", + "BriefDescription": "V3 uDTLB miss" + }, + { + "PublicDescription": "MTLB miss", + "EventCode": "0x210", + "EventName": "mtlb_miss", + "BriefDescription": "V3 MTLB miss" + }, + { + "PublicDescription": "Empty instructions queue stall cycles", + "EventCode": "0x212", + "EventName": "empty_inst_q_stall", + "BriefDescription": "V3 Empty instructions queue stall cycles" + }, + { + "PublicDescription": "Data write back", + "EventCode": "0x213", + "EventName": "data_wb", + "BriefDescription": "V3 Data write back" + }, + { + "PublicDescription": "DLM access", + "EventCode": "0x218", + "EventName": "dlm_access", + "BriefDescription": "V3 DLM access" + }, + { + "PublicDescription": "LSU BIU request", + "EventCode": "0x219", + "EventName": "lsu_biu_req", + "BriefDescription": "V3 LSU BIU request" + }, + { + "PublicDescription": "HPTWK BIU request", + "EventCode": "0x21a", + "EventName": "hptwk_biu_req", + "BriefDescription": "V3 HPTWK BIU request" + }, + { + "PublicDescription": "DMA BIU request", + "EventCode": "0x21b", + "EventName": "dma_biu_req", + "BriefDescription": "V3 DMA BIU request" + }, + { + "PublicDescription": "Icache fill BIU request", + "EventCode": "0x21c", + "EventName": "icache_fill_biu_req", + "BriefDescription": "V3 Icache fill BIU request" + }, + { + "PublicDescription": "External events", + "EventCode": "0x21d", + "EventName": "external_events", + "BriefDescription": "V3 External events" + }, + { + "PublicDescription": "POP25 instructions", + "EventCode": "0x21e", + "EventName": "pop25_inst", + "BriefDescription": "V3 POP25 instructions" + }, +] From c8b34461705e16b94d34c96c7784009b28b3da03 Mon Sep 17 00:00:00 2001 From: Nickhu Date: Thu, 25 Oct 2018 10:24:16 +0800 Subject: [PATCH 06/16] nds32: Add perf call-graph support. The perf call-graph option can trace the callchain between functions. This commit add the perf callchain for nds32. There are kerenl callchain and user callchain. The kerenl callchain can trace the function in kernel space. There are two type for user callchain. One for the 'optimize for size' config is set, and another one for the config is not set. The difference between two types is that the index of frame-pointer in user stack is not the same. For example: With optimize for size: User Stack: --------- | lp | --------- | gp | --------- | fp | Without optimize for size: User Stack: 1. non-leaf function: --------- | lp | --------- | fp | 2. leaf function: --------- | fp | Signed-off-by: Nickhu Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/kernel/perf_event_cpu.c | 299 +++++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c index a6e723d0fdbc..5e00ce54d0ff 100644 --- a/arch/nds32/kernel/perf_event_cpu.c +++ b/arch/nds32/kernel/perf_event_cpu.c @@ -1193,6 +1193,305 @@ static int __init register_pmu_driver(void) device_initcall(register_pmu_driver); +/* + * References: arch/nds32/kernel/traps.c:__dump() + * You will need to know the NDS ABI first. + */ +static int unwind_frame_kernel(struct stackframe *frame) +{ + int graph = 0; +#ifdef CONFIG_FRAME_POINTER + /* 0x3 means misalignment */ + if (!kstack_end((void *)frame->fp) && + !((unsigned long)frame->fp & 0x3) && + ((unsigned long)frame->fp >= TASK_SIZE)) { + /* + * The array index is based on the ABI, the below graph + * illustrate the reasons. + * Function call procedure: "smw" and "lmw" will always + * update SP and FP for you automatically. + * + * Stack Relative Address + * | | 0 + * ---- + * |LP| <-- SP(before smw) <-- FP(after smw) -1 + * ---- + * |FP| -2 + * ---- + * | | <-- SP(after smw) -3 + */ + frame->lp = ((unsigned long *)frame->fp)[-1]; + frame->fp = ((unsigned long *)frame->fp)[FP_OFFSET]; + /* make sure CONFIG_FUNCTION_GRAPH_TRACER is turned on */ + if (__kernel_text_address(frame->lp)) + frame->lp = ftrace_graph_ret_addr + (NULL, &graph, frame->lp, NULL); + + return 0; + } else { + return -EPERM; + } +#else + /* + * You can refer to arch/nds32/kernel/traps.c:__dump() + * Treat "sp" as "fp", but the "sp" is one frame ahead of "fp". + * And, the "sp" is not always correct. + * + * Stack Relative Address + * | | 0 + * ---- + * |LP| <-- SP(before smw) -1 + * ---- + * | | <-- SP(after smw) -2 + * ---- + */ + if (!kstack_end((void *)frame->sp)) { + frame->lp = ((unsigned long *)frame->sp)[1]; + /* TODO: How to deal with the value in first + * "sp" is not correct? + */ + if (__kernel_text_address(frame->lp)) + frame->lp = ftrace_graph_ret_addr + (tsk, &graph, frame->lp, NULL); + + frame->sp = ((unsigned long *)frame->sp) + 1; + + return 0; + } else { + return -EPERM; + } +#endif +} + +static void notrace +walk_stackframe(struct stackframe *frame, + int (*fn_record)(struct stackframe *, void *), + void *data) +{ + while (1) { + int ret; + + if (fn_record(frame, data)) + break; + + ret = unwind_frame_kernel(frame); + if (ret < 0) + break; + } +} + +/* + * Gets called by walk_stackframe() for every stackframe. This will be called + * whist unwinding the stackframe and is like a subroutine return so we use + * the PC. + */ +static int callchain_trace(struct stackframe *fr, void *data) +{ + struct perf_callchain_entry_ctx *entry = data; + + perf_callchain_store(entry, fr->lp); + return 0; +} + +/* + * Get the return address for a single stackframe and return a pointer to the + * next frame tail. + */ +static unsigned long +user_backtrace(struct perf_callchain_entry_ctx *entry, unsigned long fp) +{ + struct frame_tail buftail; + unsigned long lp = 0; + unsigned long *user_frame_tail = + (unsigned long *)(fp - (unsigned long)sizeof(buftail)); + + /* Check accessibility of one struct frame_tail beyond */ + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail))) + return 0; + if (__copy_from_user_inatomic + (&buftail, user_frame_tail, sizeof(buftail))) + return 0; + + /* + * Refer to unwind_frame_kernel() for more illurstration + */ + lp = buftail.stack_lp; /* ((unsigned long *)fp)[-1] */ + fp = buftail.stack_fp; /* ((unsigned long *)fp)[FP_OFFSET] */ + perf_callchain_store(entry, lp); + return fp; +} + +static unsigned long +user_backtrace_opt_size(struct perf_callchain_entry_ctx *entry, + unsigned long fp) +{ + struct frame_tail_opt_size buftail; + unsigned long lp = 0; + + unsigned long *user_frame_tail = + (unsigned long *)(fp - (unsigned long)sizeof(buftail)); + + /* Check accessibility of one struct frame_tail beyond */ + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(buftail))) + return 0; + if (__copy_from_user_inatomic + (&buftail, user_frame_tail, sizeof(buftail))) + return 0; + + /* + * Refer to unwind_frame_kernel() for more illurstration + */ + lp = buftail.stack_lp; /* ((unsigned long *)fp)[-1] */ + fp = buftail.stack_fp; /* ((unsigned long *)fp)[FP_OFFSET] */ + + perf_callchain_store(entry, lp); + return fp; +} + +/* + * This will be called when the target is in user mode + * This function will only be called when we use + * "PERF_SAMPLE_CALLCHAIN" in + * kernel/events/core.c:perf_prepare_sample() + * + * How to trigger perf_callchain_[user/kernel] : + * $ perf record -e cpu-clock --call-graph fp ./program + * $ perf report --call-graph + */ +unsigned long leaf_fp; +void +perf_callchain_user(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + unsigned long fp = 0; + unsigned long gp = 0; + unsigned long lp = 0; + unsigned long sp = 0; + unsigned long *user_frame_tail; + + leaf_fp = 0; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->ipc); + fp = regs->fp; + gp = regs->gp; + lp = regs->lp; + sp = regs->sp; + if (entry->nr < PERF_MAX_STACK_DEPTH && + (unsigned long)fp && !((unsigned long)fp & 0x7) && fp > sp) { + user_frame_tail = + (unsigned long *)(fp - (unsigned long)sizeof(fp)); + + if (!access_ok(VERIFY_READ, user_frame_tail, sizeof(fp))) + return; + + if (__copy_from_user_inatomic + (&leaf_fp, user_frame_tail, sizeof(fp))) + return; + + if (leaf_fp == lp) { + /* + * Maybe this is non leaf function + * with optimize for size, + * or maybe this is the function + * with optimize for size + */ + struct frame_tail buftail; + + user_frame_tail = + (unsigned long *)(fp - + (unsigned long)sizeof(buftail)); + + if (!access_ok + (VERIFY_READ, user_frame_tail, sizeof(buftail))) + return; + + if (__copy_from_user_inatomic + (&buftail, user_frame_tail, sizeof(buftail))) + return; + + if (buftail.stack_fp == gp) { + /* non leaf function with optimize + * for size condition + */ + struct frame_tail_opt_size buftail_opt_size; + + user_frame_tail = + (unsigned long *)(fp - (unsigned long) + sizeof(buftail_opt_size)); + + if (!access_ok(VERIFY_READ, user_frame_tail, + sizeof(buftail_opt_size))) + return; + + if (__copy_from_user_inatomic + (&buftail_opt_size, user_frame_tail, + sizeof(buftail_opt_size))) + return; + + perf_callchain_store(entry, lp); + fp = buftail_opt_size.stack_fp; + + while ((entry->nr < PERF_MAX_STACK_DEPTH) && + (unsigned long)fp && + !((unsigned long)fp & 0x7) && + fp > sp) { + sp = fp; + fp = user_backtrace_opt_size(entry, fp); + } + + } else { + /* this is the function + * without optimize for size + */ + fp = buftail.stack_fp; + perf_callchain_store(entry, lp); + while ((entry->nr < PERF_MAX_STACK_DEPTH) && + (unsigned long)fp && + !((unsigned long)fp & 0x7) && + fp > sp) { + sp = fp; + fp = user_backtrace(entry, fp); + } + } + } else { + /* this is leaf function */ + fp = leaf_fp; + perf_callchain_store(entry, lp); + + /* previous function callcahin */ + while ((entry->nr < PERF_MAX_STACK_DEPTH) && + (unsigned long)fp && + !((unsigned long)fp & 0x7) && fp > sp) { + sp = fp; + fp = user_backtrace(entry, fp); + } + } + return; + } +} + +/* This will be called when the target is in kernel mode */ +void +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, + struct pt_regs *regs) +{ + struct stackframe fr; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* We don't support guest os callchain now */ + return; + } + fr.fp = regs->fp; + fr.lp = regs->lp; + fr.sp = regs->sp; + walk_stackframe(&fr, callchain_trace, entry); +} + unsigned long perf_instruction_pointer(struct pt_regs *regs) { /* However, NDS32 does not support virtualization */ From cf26edd840dc65b122a0f5e22d2d81ad05eccb2d Mon Sep 17 00:00:00 2001 From: Nickhu Date: Thu, 25 Oct 2018 10:24:17 +0800 Subject: [PATCH 07/16] nds32: Add document for NDS32 PMU. The document for how to add NDS32 PMU in devicetree. Signed-off-by: Nickhu Reviewed-by: Rob Herring Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- .../devicetree/bindings/perf/nds32v3-pmu.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 Documentation/devicetree/bindings/perf/nds32v3-pmu.txt diff --git a/Documentation/devicetree/bindings/perf/nds32v3-pmu.txt b/Documentation/devicetree/bindings/perf/nds32v3-pmu.txt new file mode 100644 index 000000000000..1bd15785b4ae --- /dev/null +++ b/Documentation/devicetree/bindings/perf/nds32v3-pmu.txt @@ -0,0 +1,17 @@ +* NDS32 Performance Monitor Units + +NDS32 core have a PMU for counting cpu and cache events like cache misses. +The NDS32 PMU representation in the device tree should be done as under: + +Required properties: + +- compatible : + "andestech,nds32v3-pmu" + +- interrupts : The interrupt number for NDS32 PMU is 13. + +Example: +pmu{ + compatible = "andestech,nds32v3-pmu"; + interrupts = <13>; +} From 7938e6315c9af3d4a40185b537733bbce842305a Mon Sep 17 00:00:00 2001 From: Nick Hu Date: Wed, 24 Oct 2018 18:14:32 +0800 Subject: [PATCH 08/16] nds32: Power management for nds32 There are three sleep states in nds32: suspend to idle, suspend to standby, suspend to ram In suspend to ram, we use the 'standby' instruction to emulate power management device to hang the system util wakeup source send wakeup events to break the loop. First, we push the general purpose registers and system registers to stack. Second, we translate stack pointer to physical address and store to memory to save the stack pointer. Third, after write back and invalid the cache we hang in 'standby' intruction. When wakeup source trigger wake up events, the loop will be break and resume the system. Signed-off-by: Nick Hu Acked-by: Pavel Machek Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/Kconfig | 10 +++ arch/nds32/include/asm/suspend.h | 11 +++ arch/nds32/kernel/Makefile | 2 +- arch/nds32/kernel/pm.c | 79 +++++++++++++++++++ arch/nds32/kernel/sleep.S | 129 +++++++++++++++++++++++++++++++ drivers/irqchip/irq-ativic32.c | 31 ++++++++ 6 files changed, 261 insertions(+), 1 deletion(-) create mode 100644 arch/nds32/include/asm/suspend.h create mode 100644 arch/nds32/kernel/pm.c create mode 100644 arch/nds32/kernel/sleep.S diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 97786a865023..5a11772a514d 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -93,3 +93,13 @@ endmenu menu "Kernel Features" source "kernel/Kconfig.hz" endmenu + +menu "Power management options" +config SYS_SUPPORTS_APM_EMULATION + bool + +config ARCH_SUSPEND_POSSIBLE + def_bool y + +source "kernel/power/Kconfig" +endmenu diff --git a/arch/nds32/include/asm/suspend.h b/arch/nds32/include/asm/suspend.h new file mode 100644 index 000000000000..6ed2418af1ac --- /dev/null +++ b/arch/nds32/include/asm/suspend.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2008-2017 Andes Technology Corporation + +#ifndef __ASM_NDS32_SUSPEND_H +#define __ASM_NDS32_SUSPEND_H + +extern void suspend2ram(void); +extern void cpu_resume(void); +extern unsigned long wake_mask; + +#endif diff --git a/arch/nds32/kernel/Makefile b/arch/nds32/kernel/Makefile index f52bd2744f50..8d62f2ecb1ab 100644 --- a/arch/nds32/kernel/Makefile +++ b/arch/nds32/kernel/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_OF) += devtree.o obj-$(CONFIG_CACHE_L2) += atl2c.o obj-$(CONFIG_PERF_EVENTS) += perf_event_cpu.o - +obj-$(CONFIG_PM) += pm.o sleep.o extra-y := head.o vmlinux.lds obj-y += vdso/ diff --git a/arch/nds32/kernel/pm.c b/arch/nds32/kernel/pm.c new file mode 100644 index 000000000000..6989560abf4e --- /dev/null +++ b/arch/nds32/kernel/pm.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2008-2017 Andes Technology Corporation + +#include +#include +#include +#include +#include +#include +#include + +unsigned int resume_addr; +unsigned int *phy_addr_sp_tmp; + +static void nds32_suspend2ram(void) +{ + pgd_t *pgdv; + pud_t *pudv; + pmd_t *pmdv; + pte_t *ptev; + + pgdv = (pgd_t *)__va((__nds32__mfsr(NDS32_SR_L1_PPTB) & + L1_PPTB_mskBASE)) + pgd_index((unsigned int)cpu_resume); + + pudv = pud_offset(pgdv, (unsigned int)cpu_resume); + pmdv = pmd_offset(pudv, (unsigned int)cpu_resume); + ptev = pte_offset_map(pmdv, (unsigned int)cpu_resume); + + resume_addr = ((*ptev) & TLB_DATA_mskPPN) + | ((unsigned int)cpu_resume & 0x00000fff); + + suspend2ram(); +} + +static void nds32_suspend_cpu(void) +{ + while (!(__nds32__mfsr(NDS32_SR_INT_PEND) & wake_mask)) + __asm__ volatile ("standby no_wake_grant\n\t"); +} + +static int nds32_pm_valid(suspend_state_t state) +{ + switch (state) { + case PM_SUSPEND_ON: + case PM_SUSPEND_STANDBY: + case PM_SUSPEND_MEM: + return 1; + default: + return 0; + } +} + +static int nds32_pm_enter(suspend_state_t state) +{ + pr_debug("%s:state:%d\n", __func__, state); + switch (state) { + case PM_SUSPEND_STANDBY: + nds32_suspend_cpu(); + return 0; + case PM_SUSPEND_MEM: + nds32_suspend2ram(); + return 0; + default: + return -EINVAL; + } +} + +static const struct platform_suspend_ops nds32_pm_ops = { + .valid = nds32_pm_valid, + .enter = nds32_pm_enter, +}; + +static int __init nds32_pm_init(void) +{ + pr_debug("Enter %s\n", __func__); + suspend_set_ops(&nds32_pm_ops); + return 0; +} +late_initcall(nds32_pm_init); diff --git a/arch/nds32/kernel/sleep.S b/arch/nds32/kernel/sleep.S new file mode 100644 index 000000000000..60c64bfbc901 --- /dev/null +++ b/arch/nds32/kernel/sleep.S @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2017 Andes Technology Corporation */ + +#include + +.data +.global sp_tmp +sp_tmp: +.long + +.text +.globl suspend2ram +.globl cpu_resume + +suspend2ram: + pushm $r0, $r31 +#if defined(CONFIG_HWZOL) + mfusr $r0, $lc + mfusr $r1, $le + mfusr $r2, $lb +#endif + mfsr $r3, $mr0 + mfsr $r4, $mr1 + mfsr $r5, $mr4 + mfsr $r6, $mr6 + mfsr $r7, $mr7 + mfsr $r8, $mr8 + mfsr $r9, $ir0 + mfsr $r10, $ir1 + mfsr $r11, $ir2 + mfsr $r12, $ir3 + mfsr $r13, $ir9 + mfsr $r14, $ir10 + mfsr $r15, $ir12 + mfsr $r16, $ir13 + mfsr $r17, $ir14 + mfsr $r18, $ir15 + pushm $r0, $r19 + + tlbop FlushAll + isb + + // transfer $sp from va to pa + sethi $r0, hi20(PAGE_OFFSET) + ori $r0, $r0, lo12(PAGE_OFFSET) + movi $r2, PHYS_OFFSET + sub $r1, $sp, $r0 + add $r2, $r1, $r2 + + // store pa($sp) to sp_tmp + sethi $r1, hi20(sp_tmp) + swi $r2, [$r1 + lo12(sp_tmp)] + + pushm $r16, $r25 + pushm $r29, $r30 +#ifdef CONFIG_CACHE_L2 + jal dcache_wb_all_level +#else + jal cpu_dcache_wb_all +#endif + popm $r29, $r30 + popm $r16, $r25 + + // get wake_mask and loop in standby + la $r1, wake_mask + lwi $r1, [$r1] +self_loop: + standby wake_grant + mfsr $r2, $ir15 + and $r2, $r1, $r2 + beqz $r2, self_loop + + // set ipc to resume address + la $r1, resume_addr + lwi $r1, [$r1] + mtsr $r1, $ipc + isb + + // reset psw, turn off the address translation + li $r2, 0x7000a + mtsr $r2, $ipsw + isb + + iret +cpu_resume: + // translate the address of sp_tmp variable to pa + la $r1, sp_tmp + sethi $r0, hi20(PAGE_OFFSET) + ori $r0, $r0, lo12(PAGE_OFFSET) + movi $r2, PHYS_OFFSET + sub $r1, $r1, $r0 + add $r1, $r1, $r2 + + // access the sp_tmp to get stack pointer + lwi $sp, [$r1] + + popm $r0, $r19 +#if defined(CONFIG_HWZOL) + mtusr $r0, $lb + mtusr $r1, $lc + mtusr $r2, $le +#endif + mtsr $r3, $mr0 + mtsr $r4, $mr1 + mtsr $r5, $mr4 + mtsr $r6, $mr6 + mtsr $r7, $mr7 + mtsr $r8, $mr8 + // set original psw to ipsw + mtsr $r9, $ir1 + + mtsr $r11, $ir2 + mtsr $r12, $ir3 + + // set ipc to RR + la $r13, RR + mtsr $r13, $ir9 + + mtsr $r14, $ir10 + mtsr $r15, $ir12 + mtsr $r16, $ir13 + mtsr $r17, $ir14 + mtsr $r18, $ir15 + popm $r0, $r31 + + isb + iret +RR: + ret diff --git a/drivers/irqchip/irq-ativic32.c b/drivers/irqchip/irq-ativic32.c index f69a8588521c..85cf6e0e0e52 100644 --- a/drivers/irqchip/irq-ativic32.c +++ b/drivers/irqchip/irq-ativic32.c @@ -10,6 +10,8 @@ #include #include +unsigned long wake_mask; + static void ativic32_ack_irq(struct irq_data *data) { __nds32__mtsr_dsb(BIT(data->hwirq), NDS32_SR_INT_PEND2); @@ -27,11 +29,40 @@ static void ativic32_unmask_irq(struct irq_data *data) __nds32__mtsr_dsb(int_mask2 | (BIT(data->hwirq)), NDS32_SR_INT_MASK2); } +static int nointc_set_wake(struct irq_data *data, unsigned int on) +{ + unsigned long int_mask = __nds32__mfsr(NDS32_SR_INT_MASK); + static unsigned long irq_orig_bit; + u32 bit = 1 << data->hwirq; + + if (on) { + if (int_mask & bit) + __assign_bit(data->hwirq, &irq_orig_bit, true); + else + __assign_bit(data->hwirq, &irq_orig_bit, false); + + __assign_bit(data->hwirq, &int_mask, true); + __assign_bit(data->hwirq, &wake_mask, true); + + } else { + if (!(irq_orig_bit & bit)) + __assign_bit(data->hwirq, &int_mask, false); + + __assign_bit(data->hwirq, &wake_mask, false); + __assign_bit(data->hwirq, &irq_orig_bit, false); + } + + __nds32__mtsr_dsb(int_mask, NDS32_SR_INT_MASK); + + return 0; +} + static struct irq_chip ativic32_chip = { .name = "ativic32", .irq_ack = ativic32_ack_irq, .irq_mask = ativic32_mask_irq, .irq_unmask = ativic32_unmask_irq, + .irq_set_wake = nointc_set_wake, }; static unsigned int __initdata nivic_map[6] = { 6, 2, 10, 16, 24, 32 }; From 4f014a41b4efd72cbefb5525372dfcd65162eb4e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 30 Oct 2018 02:22:42 +0000 Subject: [PATCH 09/16] nds32: Remove duplicated include from pm.c Remove duplicated include. Signed-off-by: YueHaibing Acked-by: Pavel Machek Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/kernel/pm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/nds32/kernel/pm.c b/arch/nds32/kernel/pm.c index 6989560abf4e..ffa8040d8be7 100644 --- a/arch/nds32/kernel/pm.c +++ b/arch/nds32/kernel/pm.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include From e46bf83c1864a20f9dd17d597ec9be18ed05add8 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 22 Nov 2018 11:14:34 +0800 Subject: [PATCH 10/16] nds32: nds32 FPU port This patch set contains basic components for supporting the nds32 FPU, such as exception handlers and context switch for FPU registers. By default, the lazy FPU scheme is supported and the user can configure it via CONFIG_LZAY_FPU. Signed-off-by: Vincent Chen Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/Kconfig | 1 + arch/nds32/Kconfig.cpu | 21 +++ arch/nds32/Makefile | 4 + arch/nds32/include/asm/bitfield.h | 15 ++ arch/nds32/include/asm/fpu.h | 114 +++++++++++ arch/nds32/include/asm/processor.h | 7 + arch/nds32/include/uapi/asm/sigcontext.h | 5 + arch/nds32/kernel/Makefile | 4 + arch/nds32/kernel/ex-entry.S | 24 ++- arch/nds32/kernel/ex-exit.S | 13 +- arch/nds32/kernel/ex-scall.S | 8 +- arch/nds32/kernel/fpu.c | 231 +++++++++++++++++++++++ arch/nds32/kernel/process.c | 64 ++++++- arch/nds32/kernel/setup.c | 12 +- arch/nds32/kernel/signal.c | 62 +++++- arch/nds32/kernel/sleep.S | 4 +- arch/nds32/kernel/traps.c | 16 ++ 17 files changed, 589 insertions(+), 16 deletions(-) create mode 100644 arch/nds32/include/asm/fpu.h create mode 100644 arch/nds32/kernel/fpu.c diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index 5a11772a514d..41cffe3de0c3 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -29,6 +29,7 @@ config NDS32 select HANDLE_DOMAIN_IRQ select HAVE_ARCH_TRACEHOOK select HAVE_DEBUG_KMEMLEAK + select HAVE_EXIT_THREAD select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_PERF_EVENTS select IRQ_DOMAIN diff --git a/arch/nds32/Kconfig.cpu b/arch/nds32/Kconfig.cpu index b8c8984d1456..bb06a1b7eef0 100644 --- a/arch/nds32/Kconfig.cpu +++ b/arch/nds32/Kconfig.cpu @@ -7,6 +7,27 @@ config CPU_LITTLE_ENDIAN bool "Little endian" default y +config FPU + bool "FPU support" + default n + help + If FPU ISA is used in user space, this configuration shall be Y to + enable required support in kerenl such as fpu context switch and + fpu exception handler. + + If no FPU ISA is used in user space, say N. + +config LAZY_FPU + bool "lazy FPU support" + depends on FPU + default y + help + Say Y here to enable the lazy FPU scheme. The lazy FPU scheme can + enhance system performance by reducing the context switch + frequency of the FPU register. + + For nomal case, say Y. + config HWZOL bool "hardware zero overhead loop support" depends on CPU_D10 || CPU_D15 diff --git a/arch/nds32/Makefile b/arch/nds32/Makefile index 9f525ed70049..6dc03206e3c9 100644 --- a/arch/nds32/Makefile +++ b/arch/nds32/Makefile @@ -5,10 +5,14 @@ KBUILD_DEFCONFIG := defconfig comma = , + ifdef CONFIG_FUNCTION_TRACER arch-y += -malways-save-lp -mno-relax endif +# Avoid generating FPU instructions +arch-y += -mno-ext-fpu-sp -mno-ext-fpu-dp -mfloat-abi=soft + KBUILD_CFLAGS += $(call cc-option, -mno-sched-prolog-epilog) KBUILD_CFLAGS += -mcmodel=large diff --git a/arch/nds32/include/asm/bitfield.h b/arch/nds32/include/asm/bitfield.h index 19b2841219ad..c1619730192a 100644 --- a/arch/nds32/include/asm/bitfield.h +++ b/arch/nds32/include/asm/bitfield.h @@ -251,6 +251,11 @@ #define ITYPE_mskSTYPE ( 0xF << ITYPE_offSTYPE ) #define ITYPE_mskCPID ( 0x3 << ITYPE_offCPID ) +/* Additional definitions of ITYPE register for FPU */ +#define FPU_DISABLE_EXCEPTION (0x1 << ITYPE_offSTYPE) +#define FPU_EXCEPTION (0x2 << ITYPE_offSTYPE) +#define FPU_CPID 0 /* FPU Co-Processor ID is 0 */ + #define NDS32_VECTOR_mskNONEXCEPTION 0x78 #define NDS32_VECTOR_offEXCEPTION 8 #define NDS32_VECTOR_offINTERRUPT 9 @@ -926,6 +931,7 @@ #define FPCSR_mskDNIT ( 0x1 << FPCSR_offDNIT ) #define FPCSR_mskRIT ( 0x1 << FPCSR_offRIT ) #define FPCSR_mskALL (FPCSR_mskIVO | FPCSR_mskDBZ | FPCSR_mskOVF | FPCSR_mskUDF | FPCSR_mskIEX) +#define FPCSR_mskALLE_NO_UDFE (FPCSR_mskIVOE | FPCSR_mskDBZE | FPCSR_mskOVFE | FPCSR_mskIEXE) #define FPCSR_mskALLE (FPCSR_mskIVOE | FPCSR_mskDBZE | FPCSR_mskOVFE | FPCSR_mskUDFE | FPCSR_mskIEXE) #define FPCSR_mskALLT (FPCSR_mskIVOT | FPCSR_mskDBZT | FPCSR_mskOVFT | FPCSR_mskUDFT | FPCSR_mskIEXT |FPCSR_mskDNIT | FPCSR_mskRIT) @@ -946,6 +952,15 @@ #define FPCFG_mskIMVER ( 0x1F << FPCFG_offIMVER ) #define FPCFG_mskAVER ( 0x1F << FPCFG_offAVER ) +/* 8 Single precision or 4 double precision registers are available */ +#define SP8_DP4_reg 0 +/* 16 Single precision or 8 double precision registers are available */ +#define SP16_DP8_reg 1 +/* 32 Single precision or 16 double precision registers are available */ +#define SP32_DP16_reg 2 +/* 32 Single precision or 32 double precision registers are available */ +#define SP32_DP32_reg 3 + /****************************************************************************** * fucpr: FUCOP_CTL (FPU and Coprocessor Enable Control Register) *****************************************************************************/ diff --git a/arch/nds32/include/asm/fpu.h b/arch/nds32/include/asm/fpu.h new file mode 100644 index 000000000000..f7a7f6b2ea8f --- /dev/null +++ b/arch/nds32/include/asm/fpu.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ + +#ifndef __ASM_NDS32_FPU_H +#define __ASM_NDS32_FPU_H + +#if IS_ENABLED(CONFIG_FPU) +#ifndef __ASSEMBLY__ +#include +#include +#include + +extern bool has_fpu; + +extern void save_fpu(struct task_struct *__tsk); +extern void load_fpu(const struct fpu_struct *fpregs); +extern bool do_fpu_exception(unsigned int subtype, struct pt_regs *regs); + +#define test_tsk_fpu(regs) (regs->fucop_ctl & FUCOP_CTL_mskCP0EN) + +/* + * Initially load the FPU with signalling NANS. This bit pattern + * has the property that no matter whether considered as single or as + * double precision, it still represents a signalling NAN. + */ + +#define sNAN64 0xFFFFFFFFFFFFFFFFULL +#define sNAN32 0xFFFFFFFFUL + +#define FPCSR_INIT 0x0UL + +extern const struct fpu_struct init_fpuregs; + +static inline void disable_ptreg_fpu(struct pt_regs *regs) +{ + regs->fucop_ctl &= ~FUCOP_CTL_mskCP0EN; +} + +static inline void enable_ptreg_fpu(struct pt_regs *regs) +{ + regs->fucop_ctl |= FUCOP_CTL_mskCP0EN; +} + +static inline void enable_fpu(void) +{ + unsigned long fucop_ctl; + + fucop_ctl = __nds32__mfsr(NDS32_SR_FUCOP_CTL) | FUCOP_CTL_mskCP0EN; + __nds32__mtsr(fucop_ctl, NDS32_SR_FUCOP_CTL); + __nds32__isb(); +} + +static inline void disable_fpu(void) +{ + unsigned long fucop_ctl; + + fucop_ctl = __nds32__mfsr(NDS32_SR_FUCOP_CTL) & ~FUCOP_CTL_mskCP0EN; + __nds32__mtsr(fucop_ctl, NDS32_SR_FUCOP_CTL); + __nds32__isb(); +} + +static inline void lose_fpu(void) +{ + preempt_disable(); +#if IS_ENABLED(CONFIG_LAZY_FPU) + if (last_task_used_math == current) { + last_task_used_math = NULL; +#else + if (test_tsk_fpu(task_pt_regs(current))) { +#endif + save_fpu(current); + } + disable_ptreg_fpu(task_pt_regs(current)); + preempt_enable(); +} + +static inline void own_fpu(void) +{ + preempt_disable(); +#if IS_ENABLED(CONFIG_LAZY_FPU) + if (last_task_used_math != current) { + if (last_task_used_math != NULL) + save_fpu(last_task_used_math); + load_fpu(¤t->thread.fpu); + last_task_used_math = current; + } +#else + if (!test_tsk_fpu(task_pt_regs(current))) { + load_fpu(¤t->thread.fpu); + } +#endif + enable_ptreg_fpu(task_pt_regs(current)); + preempt_enable(); +} + +#if !IS_ENABLED(CONFIG_LAZY_FPU) +static inline void unlazy_fpu(struct task_struct *tsk) +{ + preempt_disable(); + if (test_tsk_fpu(task_pt_regs(tsk))) + save_fpu(tsk); + preempt_enable(); +} +#endif /* !CONFIG_LAZY_FPU */ +static inline void clear_fpu(struct pt_regs *regs) +{ + preempt_disable(); + if (test_tsk_fpu(regs)) + disable_ptreg_fpu(regs); + preempt_enable(); +} +#endif /* CONFIG_FPU */ +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_NDS32_FPU_H */ diff --git a/arch/nds32/include/asm/processor.h b/arch/nds32/include/asm/processor.h index c2660f566bac..72024f8bc129 100644 --- a/arch/nds32/include/asm/processor.h +++ b/arch/nds32/include/asm/processor.h @@ -35,6 +35,8 @@ struct thread_struct { unsigned long address; unsigned long trap_no; unsigned long error_code; + + struct fpu_struct fpu; }; #define INIT_THREAD { } @@ -72,6 +74,11 @@ struct task_struct; /* Free all resources held by a thread. */ #define release_thread(thread) do { } while(0) +#if IS_ENABLED(CONFIG_FPU) +#if !IS_ENABLED(CONFIG_UNLAZU_FPU) +extern struct task_struct *last_task_used_math; +#endif +#endif /* Prepare to copy thread state - unlazy all lazy status */ #define prepare_to_copy(tsk) do { } while (0) diff --git a/arch/nds32/include/uapi/asm/sigcontext.h b/arch/nds32/include/uapi/asm/sigcontext.h index 00567b237b0c..1257a78e3ae1 100644 --- a/arch/nds32/include/uapi/asm/sigcontext.h +++ b/arch/nds32/include/uapi/asm/sigcontext.h @@ -9,6 +9,10 @@ * before the signal handler was invoked. Note: only add new entries * to the end of the structure. */ +struct fpu_struct { + unsigned long long fd_regs[32]; + unsigned long fpcsr; +}; struct zol_struct { unsigned long nds32_lc; /* $LC */ @@ -54,6 +58,7 @@ struct sigcontext { unsigned long fault_address; unsigned long used_math_flag; /* FPU Registers */ + struct fpu_struct fpu; struct zol_struct zol; }; diff --git a/arch/nds32/kernel/Makefile b/arch/nds32/kernel/Makefile index 8d62f2ecb1ab..a1a1d61509e5 100644 --- a/arch/nds32/kernel/Makefile +++ b/arch/nds32/kernel/Makefile @@ -13,12 +13,16 @@ obj-y := ex-entry.o ex-exit.o ex-scall.o irq.o \ obj-$(CONFIG_MODULES) += nds32_ksyms.o module.o obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_OF) += devtree.o obj-$(CONFIG_CACHE_L2) += atl2c.o obj-$(CONFIG_PERF_EVENTS) += perf_event_cpu.o obj-$(CONFIG_PM) += pm.o sleep.o extra-y := head.o vmlinux.lds +CFLAGS_fpu.o += -mext-fpu-sp -mext-fpu-dp + + obj-y += vdso/ obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o diff --git a/arch/nds32/kernel/ex-entry.S b/arch/nds32/kernel/ex-entry.S index 21a144071566..107d98a1d1b8 100644 --- a/arch/nds32/kernel/ex-entry.S +++ b/arch/nds32/kernel/ex-entry.S @@ -7,6 +7,7 @@ #include #include #include +#include #ifdef CONFIG_HWZOL .macro push_zol @@ -15,12 +16,31 @@ mfusr $r16, $LC .endm #endif + .macro skip_save_fucop_ctl +#if defined(CONFIG_FPU) +skip_fucop_ctl: + smw.adm $p0, [$sp], $p0, #0x1 + j fucop_ctl_done +#endif + .endm .macro save_user_regs - +#if defined(CONFIG_FPU) + sethi $p0, hi20(has_fpu) + lbsi $p0, [$p0+lo12(has_fpu)] + beqz $p0, skip_fucop_ctl + mfsr $p0, $FUCOP_CTL + smw.adm $p0, [$sp], $p0, #0x1 + bclr $p0, $p0, #FUCOP_CTL_offCP0EN + mtsr $p0, $FUCOP_CTL +fucop_ctl_done: + /* move $SP to the bottom of pt_regs */ + addi $sp, $sp, -FUCOP_CTL_OFFSET +#else smw.adm $sp, [$sp], $sp, #0x1 /* move $SP to the bottom of pt_regs */ addi $sp, $sp, -OSP_OFFSET +#endif /* push $r0 ~ $r25 */ smw.bim $r0, [$sp], $r25 @@ -79,6 +99,7 @@ exception_handlers: .long eh_syscall !Syscall .long asm_do_IRQ !IRQ + skip_save_fucop_ctl common_exception_handler: save_user_regs mfsr $p0, $ITYPE @@ -103,7 +124,6 @@ common_exception_handler: mtsr $r21, $PSW dsb jr $p1 - /* syscall */ 1: addi $p1, $p0, #-NDS32_VECTOR_offEXCEPTION diff --git a/arch/nds32/kernel/ex-exit.S b/arch/nds32/kernel/ex-exit.S index f00af92f7e22..97ba15cd4180 100644 --- a/arch/nds32/kernel/ex-exit.S +++ b/arch/nds32/kernel/ex-exit.S @@ -8,6 +8,7 @@ #include #include #include +#include @@ -22,10 +23,18 @@ .macro restore_user_regs_first setgie.d isb - +#if defined(CONFIG_FPU) + addi $sp, $sp, OSP_OFFSET + lmw.adm $r12, [$sp], $r25, #0x0 + sethi $p0, hi20(has_fpu) + lbsi $p0, [$p0+lo12(has_fpu)] + beqz $p0, 2f + mtsr $r25, $FUCOP_CTL +2: +#else addi $sp, $sp, FUCOP_CTL_OFFSET - lmw.adm $r12, [$sp], $r24, #0x0 +#endif mtsr $r12, $SP_USR mtsr $r13, $IPC #ifdef CONFIG_HWZOL diff --git a/arch/nds32/kernel/ex-scall.S b/arch/nds32/kernel/ex-scall.S index 36aa87ecdabd..270050f1b7b1 100644 --- a/arch/nds32/kernel/ex-scall.S +++ b/arch/nds32/kernel/ex-scall.S @@ -19,11 +19,13 @@ ENTRY(__switch_to) la $p0, __entry_task sw $r1, [$p0] - move $p1, $r0 - addi $p1, $p1, #THREAD_CPU_CONTEXT + addi $p1, $r0, #THREAD_CPU_CONTEXT smw.bi $r6, [$p1], $r14, #0xb ! push r6~r14, fp, lp, sp move $r25, $r1 - addi $r1, $r1, #THREAD_CPU_CONTEXT +#if defined(CONFIG_FPU) + call _switch_fpu +#endif + addi $r1, $r25, #THREAD_CPU_CONTEXT lmw.bi $r6, [$r1], $r14, #0xb ! pop r6~r14, fp, lp, sp ret diff --git a/arch/nds32/kernel/fpu.c b/arch/nds32/kernel/fpu.c new file mode 100644 index 000000000000..e55a1e190e97 --- /dev/null +++ b/arch/nds32/kernel/fpu.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include +#include +#include +#include +#include +#include +#include +#include + +const struct fpu_struct init_fpuregs = { + .fd_regs = {[0 ... 31] = sNAN64}, + .fpcsr = FPCSR_INIT +}; + +void save_fpu(struct task_struct *tsk) +{ + unsigned int fpcfg, fpcsr; + + enable_fpu(); + fpcfg = ((__nds32__fmfcfg() & FPCFG_mskFREG) >> FPCFG_offFREG); + switch (fpcfg) { + case SP32_DP32_reg: + asm volatile ("fsdi $fd31, [%0+0xf8]\n\t" + "fsdi $fd30, [%0+0xf0]\n\t" + "fsdi $fd29, [%0+0xe8]\n\t" + "fsdi $fd28, [%0+0xe0]\n\t" + "fsdi $fd27, [%0+0xd8]\n\t" + "fsdi $fd26, [%0+0xd0]\n\t" + "fsdi $fd25, [%0+0xc8]\n\t" + "fsdi $fd24, [%0+0xc0]\n\t" + "fsdi $fd23, [%0+0xb8]\n\t" + "fsdi $fd22, [%0+0xb0]\n\t" + "fsdi $fd21, [%0+0xa8]\n\t" + "fsdi $fd20, [%0+0xa0]\n\t" + "fsdi $fd19, [%0+0x98]\n\t" + "fsdi $fd18, [%0+0x90]\n\t" + "fsdi $fd17, [%0+0x88]\n\t" + "fsdi $fd16, [%0+0x80]\n\t" + : /* no output */ + : "r" (&tsk->thread.fpu) + : "memory"); + /* fall through */ + case SP32_DP16_reg: + asm volatile ("fsdi $fd15, [%0+0x78]\n\t" + "fsdi $fd14, [%0+0x70]\n\t" + "fsdi $fd13, [%0+0x68]\n\t" + "fsdi $fd12, [%0+0x60]\n\t" + "fsdi $fd11, [%0+0x58]\n\t" + "fsdi $fd10, [%0+0x50]\n\t" + "fsdi $fd9, [%0+0x48]\n\t" + "fsdi $fd8, [%0+0x40]\n\t" + : /* no output */ + : "r" (&tsk->thread.fpu) + : "memory"); + /* fall through */ + case SP16_DP8_reg: + asm volatile ("fsdi $fd7, [%0+0x38]\n\t" + "fsdi $fd6, [%0+0x30]\n\t" + "fsdi $fd5, [%0+0x28]\n\t" + "fsdi $fd4, [%0+0x20]\n\t" + : /* no output */ + : "r" (&tsk->thread.fpu) + : "memory"); + /* fall through */ + case SP8_DP4_reg: + asm volatile ("fsdi $fd3, [%1+0x18]\n\t" + "fsdi $fd2, [%1+0x10]\n\t" + "fsdi $fd1, [%1+0x8]\n\t" + "fsdi $fd0, [%1+0x0]\n\t" + "fmfcsr %0\n\t" + "swi %0, [%1+0x100]\n\t" + : "=&r" (fpcsr) + : "r"(&tsk->thread.fpu) + : "memory"); + } + disable_fpu(); +} + +void load_fpu(const struct fpu_struct *fpregs) +{ + unsigned int fpcfg, fpcsr; + + enable_fpu(); + fpcfg = ((__nds32__fmfcfg() & FPCFG_mskFREG) >> FPCFG_offFREG); + switch (fpcfg) { + case SP32_DP32_reg: + asm volatile ("fldi $fd31, [%0+0xf8]\n\t" + "fldi $fd30, [%0+0xf0]\n\t" + "fldi $fd29, [%0+0xe8]\n\t" + "fldi $fd28, [%0+0xe0]\n\t" + "fldi $fd27, [%0+0xd8]\n\t" + "fldi $fd26, [%0+0xd0]\n\t" + "fldi $fd25, [%0+0xc8]\n\t" + "fldi $fd24, [%0+0xc0]\n\t" + "fldi $fd23, [%0+0xb8]\n\t" + "fldi $fd22, [%0+0xb0]\n\t" + "fldi $fd21, [%0+0xa8]\n\t" + "fldi $fd20, [%0+0xa0]\n\t" + "fldi $fd19, [%0+0x98]\n\t" + "fldi $fd18, [%0+0x90]\n\t" + "fldi $fd17, [%0+0x88]\n\t" + "fldi $fd16, [%0+0x80]\n\t" + : /* no output */ + : "r" (fpregs)); + /* fall through */ + case SP32_DP16_reg: + asm volatile ("fldi $fd15, [%0+0x78]\n\t" + "fldi $fd14, [%0+0x70]\n\t" + "fldi $fd13, [%0+0x68]\n\t" + "fldi $fd12, [%0+0x60]\n\t" + "fldi $fd11, [%0+0x58]\n\t" + "fldi $fd10, [%0+0x50]\n\t" + "fldi $fd9, [%0+0x48]\n\t" + "fldi $fd8, [%0+0x40]\n\t" + : /* no output */ + : "r" (fpregs)); + /* fall through */ + case SP16_DP8_reg: + asm volatile ("fldi $fd7, [%0+0x38]\n\t" + "fldi $fd6, [%0+0x30]\n\t" + "fldi $fd5, [%0+0x28]\n\t" + "fldi $fd4, [%0+0x20]\n\t" + : /* no output */ + : "r" (fpregs)); + /* fall through */ + case SP8_DP4_reg: + asm volatile ("fldi $fd3, [%1+0x18]\n\t" + "fldi $fd2, [%1+0x10]\n\t" + "fldi $fd1, [%1+0x8]\n\t" + "fldi $fd0, [%1+0x0]\n\t" + "lwi %0, [%1+0x100]\n\t" + "fmtcsr %0\n\t":"=&r" (fpcsr) + : "r"(fpregs)); + } + disable_fpu(); +} +void store_fpu_for_suspend(void) +{ +#ifdef CONFIG_LAZY_FPU + if (last_task_used_math != NULL) + save_fpu(last_task_used_math); + last_task_used_math = NULL; +#else + if (!used_math()) + return; + unlazy_fpu(current); +#endif + clear_fpu(task_pt_regs(current)); +} +inline void do_fpu_context_switch(struct pt_regs *regs) +{ + /* Enable to use FPU. */ + + if (!user_mode(regs)) { + pr_err("BUG: FPU is used in kernel mode.\n"); + BUG(); + return; + } + + enable_ptreg_fpu(regs); +#ifdef CONFIG_LAZY_FPU //Lazy FPU is used + if (last_task_used_math == current) + return; + if (last_task_used_math != NULL) + /* Other processes fpu state, save away */ + save_fpu(last_task_used_math); + last_task_used_math = current; +#endif + if (used_math()) { + load_fpu(¤t->thread.fpu); + } else { + /* First time FPU user. */ + load_fpu(&init_fpuregs); + set_used_math(); + } + +} + +inline void fill_sigfpe_signo(unsigned int fpcsr, int *signo) +{ + if (fpcsr & FPCSR_mskOVFT) + *signo = FPE_FLTOVF; + else if (fpcsr & FPCSR_mskUDFT) + *signo = FPE_FLTUND; + else if (fpcsr & FPCSR_mskIVOT) + *signo = FPE_FLTINV; + else if (fpcsr & FPCSR_mskDBZT) + *signo = FPE_FLTDIV; + else if (fpcsr & FPCSR_mskIEXT) + *signo = FPE_FLTRES; +} + +inline void handle_fpu_exception(struct pt_regs *regs) +{ + unsigned int fpcsr; + int si_code = 0, si_signo = SIGFPE; + + lose_fpu(); + fpcsr = current->thread.fpu.fpcsr; + + if (fpcsr & FPCSR_mskRIT) { + if (!user_mode(regs)) + do_exit(SIGILL); + si_signo = SIGILL; + show_regs(regs); + si_code = ILL_COPROC; + } else + fill_sigfpe_signo(fpcsr, &si_code); + force_sig_fault(si_signo, si_code, + (void __user *)instruction_pointer(regs), current); +} + +bool do_fpu_exception(unsigned int subtype, struct pt_regs *regs) +{ + int done = true; + /* Coprocessor disabled exception */ + if (subtype == FPU_DISABLE_EXCEPTION) { + preempt_disable(); + do_fpu_context_switch(regs); + preempt_enable(); + } + /* Coprocessor exception such as underflow and overflow */ + else if (subtype == FPU_EXCEPTION) + handle_fpu_exception(regs); + else + done = false; + return done; +} diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c index 65fda986e55f..ab7ab46234b1 100644 --- a/arch/nds32/kernel/process.c +++ b/arch/nds32/kernel/process.c @@ -9,15 +9,16 @@ #include #include #include +#include #include #include -extern void setup_mm_for_reboot(char mode); -#ifdef CONFIG_PROC_FS -struct proc_dir_entry *proc_dir_cpu; -EXPORT_SYMBOL(proc_dir_cpu); +#if IS_ENABLED(CONFIG_LAZY_FPU) +struct task_struct *last_task_used_math; #endif +extern void setup_mm_for_reboot(char mode); + extern inline void arch_reset(char mode) { if (mode == 's') { @@ -125,15 +126,31 @@ void show_regs(struct pt_regs *regs) EXPORT_SYMBOL(show_regs); +void exit_thread(struct task_struct *tsk) +{ +#if defined(CONFIG_FPU) && defined(CONFIG_LAZY_FPU) + if (last_task_used_math == tsk) + last_task_used_math = NULL; +#endif +} + void flush_thread(void) { +#if defined(CONFIG_FPU) + clear_fpu(task_pt_regs(current)); + clear_used_math(); +# ifdef CONFIG_LAZY_FPU + if (last_task_used_math == current) + last_task_used_math = NULL; +# endif +#endif } DEFINE_PER_CPU(struct task_struct *, __entry_task); asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); int copy_thread(unsigned long clone_flags, unsigned long stack_start, - unsigned long stk_sz, struct task_struct *p) + unsigned long stk_sz, struct task_struct *p) { struct pt_regs *childregs = task_pt_regs(p); @@ -159,6 +176,22 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; +#if IS_ENABLED(CONFIG_FPU) + if (used_math()) { +# if !IS_ENABLED(CONFIG_LAZY_FPU) + unlazy_fpu(current); +# else + preempt_disable(); + if (last_task_used_math == current) + save_fpu(current); + preempt_enable(); +# endif + p->thread.fpu = current->thread.fpu; + clear_fpu(task_pt_regs(p)); + set_stopped_child_used_math(p); + } +#endif + #ifdef CONFIG_HWZOL childregs->lb = 0; childregs->le = 0; @@ -168,12 +201,33 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, return 0; } +#if IS_ENABLED(CONFIG_FPU) +struct task_struct *_switch_fpu(struct task_struct *prev, struct task_struct *next) +{ +#if !IS_ENABLED(CONFIG_LAZY_FPU) + unlazy_fpu(prev); +#endif + if (!(next->flags & PF_KTHREAD)) + clear_fpu(task_pt_regs(next)); + return prev; +} +#endif + /* * fill in the fpe structure for a core dump... */ int dump_fpu(struct pt_regs *regs, elf_fpregset_t * fpu) { int fpvalid = 0; +#if IS_ENABLED(CONFIG_FPU) + struct task_struct *tsk = current; + + fpvalid = tsk_used_math(tsk); + if (fpvalid) { + lose_fpu(); + memcpy(fpu, &tsk->thread.fpu, sizeof(*fpu)); + } +#endif return fpvalid; } diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c index eacc79024879..d7f5657bc638 100644 --- a/arch/nds32/kernel/setup.c +++ b/arch/nds32/kernel/setup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #define HWCAP_MFUSR_PC 0x000001 @@ -40,6 +41,7 @@ #define HWCAP_DX_REGS 0x100000 unsigned long cpu_id, cpu_rev, cpu_cfgid; +bool has_fpu = false; char cpu_series; char *endianness = NULL; @@ -136,6 +138,11 @@ static void __init dump_cpu_info(int cpu) (aliasing_num - 1) << PAGE_SHIFT; } #endif +#ifdef CONFIG_FPU + /* Disable fpu and enable when it is used. */ + if (has_fpu) + disable_fpu(); +#endif } static void __init setup_cpuinfo(void) @@ -180,9 +187,10 @@ static void __init setup_cpuinfo(void) if (cpu_cfgid & 0x0004) elf_hwcap |= HWCAP_EXT2; - if (cpu_cfgid & 0x0008) + if (cpu_cfgid & 0x0008) { elf_hwcap |= HWCAP_FPU; - + has_fpu = true; + } if (cpu_cfgid & 0x0010) elf_hwcap |= HWCAP_STRING; diff --git a/arch/nds32/kernel/signal.c b/arch/nds32/kernel/signal.c index 5d01f6e33cb8..5b5be082cfa4 100644 --- a/arch/nds32/kernel/signal.c +++ b/arch/nds32/kernel/signal.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,60 @@ struct rt_sigframe { struct siginfo info; struct ucontext uc; }; +#if IS_ENABLED(CONFIG_FPU) +static inline int restore_sigcontext_fpu(struct pt_regs *regs, + struct sigcontext __user *sc) +{ + struct task_struct *tsk = current; + unsigned long used_math_flag; + int ret = 0; + + clear_used_math(); + __get_user_error(used_math_flag, &sc->used_math_flag, ret); + + if (!used_math_flag) + return 0; + set_used_math(); + +#if IS_ENABLED(CONFIG_LAZY_FPU) + preempt_disable(); + if (current == last_task_used_math) { + last_task_used_math = NULL; + disable_ptreg_fpu(regs); + } + preempt_enable(); +#else + clear_fpu(regs); +#endif + + return __copy_from_user(&tsk->thread.fpu, &sc->fpu, + sizeof(struct fpu_struct)); +} + +static inline int setup_sigcontext_fpu(struct pt_regs *regs, + struct sigcontext __user *sc) +{ + struct task_struct *tsk = current; + int ret = 0; + + __put_user_error(used_math(), &sc->used_math_flag, ret); + + if (!used_math()) + return ret; + + preempt_disable(); +#if IS_ENABLED(CONFIG_LAZY_FPU) + if (last_task_used_math == tsk) + save_fpu(last_task_used_math); +#else + unlazy_fpu(tsk); +#endif + ret = __copy_to_user(&sc->fpu, &tsk->thread.fpu, + sizeof(struct fpu_struct)); + preempt_enable(); + return ret; +} +#endif static int restore_sigframe(struct pt_regs *regs, struct rt_sigframe __user * sf) @@ -69,7 +124,9 @@ static int restore_sigframe(struct pt_regs *regs, __get_user_error(regs->le, &sf->uc.uc_mcontext.zol.nds32_le, err); __get_user_error(regs->lb, &sf->uc.uc_mcontext.zol.nds32_lb, err); #endif - +#if IS_ENABLED(CONFIG_FPU) + err |= restore_sigcontext_fpu(regs, &sf->uc.uc_mcontext); +#endif /* * Avoid sys_rt_sigreturn() restarting. */ @@ -153,6 +210,9 @@ setup_sigframe(struct rt_sigframe __user * sf, struct pt_regs *regs, __put_user_error(regs->le, &sf->uc.uc_mcontext.zol.nds32_le, err); __put_user_error(regs->lb, &sf->uc.uc_mcontext.zol.nds32_lb, err); #endif +#if IS_ENABLED(CONFIG_FPU) + err |= setup_sigcontext_fpu(regs, &sf->uc.uc_mcontext); +#endif __put_user_error(current->thread.trap_no, &sf->uc.uc_mcontext.trap_no, err); diff --git a/arch/nds32/kernel/sleep.S b/arch/nds32/kernel/sleep.S index 60c64bfbc901..ca4e61f3656f 100644 --- a/arch/nds32/kernel/sleep.S +++ b/arch/nds32/kernel/sleep.S @@ -36,7 +36,9 @@ suspend2ram: mfsr $r17, $ir14 mfsr $r18, $ir15 pushm $r0, $r19 - +#if defined(CONFIG_FPU) + jal store_fpu_for_suspend +#endif tlbop FlushAll isb diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index 1496aab48998..5aa7c17da27a 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -357,6 +358,21 @@ void do_dispatch_general(unsigned long entry, unsigned long addr, } else if (type == ETYPE_RESERVED_INSTRUCTION) { /* Reserved instruction */ do_revinsn(regs); + } else if (type == ETYPE_COPROCESSOR) { + /* Coprocessor */ +#if IS_ENABLED(CONFIG_FPU) + unsigned int fucop_exist = __nds32__mfsr(NDS32_SR_FUCOP_EXIST); + unsigned int cpid = ((itype & ITYPE_mskCPID) >> ITYPE_offCPID); + + if ((cpid == FPU_CPID) && + (fucop_exist & FUCOP_EXIST_mskCP0ISFPU)) { + unsigned int subtype = (itype & ITYPE_mskSTYPE); + + if (true == do_fpu_exception(subtype, regs)) + return; + } +#endif + unhandled_exceptions(entry, addr, type, regs); } else if (type == ETYPE_TRAP && swid == SWID_RAISE_INTERRUPT_LEVEL) { /* trap, used on v3 EDM target debugging workaround */ /* From 1ac832509f2ea1b566f0c06f98f308f58b03d098 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 22 Nov 2018 11:14:35 +0800 Subject: [PATCH 11/16] nds32: Support FP emulation The Andes FPU coprocessor does not support denormalized number handling. According to the specification, FPU generates a denorm input exception that requires the kernel to deal with this instrution operation when it encounters denormalized operands. Hence an nds32 FPU ISA emulator in the kernel is required to meet requirement. Signed-off-by: Vincent Chen Signed-off-by: Nickhu Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/Makefile | 1 + arch/nds32/include/asm/fpu.h | 1 + arch/nds32/include/asm/fpuemu.h | 32 +++ arch/nds32/include/asm/nds32_fpu_inst.h | 109 ++++++++ arch/nds32/include/asm/sfp-machine.h | 158 +++++++++++ arch/nds32/kernel/fpu.c | 31 ++- arch/nds32/math-emu/Makefile | 7 + arch/nds32/math-emu/faddd.c | 24 ++ arch/nds32/math-emu/fadds.c | 24 ++ arch/nds32/math-emu/fcmpd.c | 24 ++ arch/nds32/math-emu/fcmps.c | 24 ++ arch/nds32/math-emu/fd2s.c | 22 ++ arch/nds32/math-emu/fdivd.c | 27 ++ arch/nds32/math-emu/fdivs.c | 26 ++ arch/nds32/math-emu/fmuld.c | 23 ++ arch/nds32/math-emu/fmuls.c | 23 ++ arch/nds32/math-emu/fnegd.c | 21 ++ arch/nds32/math-emu/fnegs.c | 21 ++ arch/nds32/math-emu/fpuemu.c | 352 ++++++++++++++++++++++++ arch/nds32/math-emu/fs2d.c | 23 ++ arch/nds32/math-emu/fsqrtd.c | 21 ++ arch/nds32/math-emu/fsqrts.c | 21 ++ arch/nds32/math-emu/fsubd.c | 27 ++ arch/nds32/math-emu/fsubs.c | 27 ++ 24 files changed, 1064 insertions(+), 5 deletions(-) create mode 100644 arch/nds32/include/asm/fpuemu.h create mode 100644 arch/nds32/include/asm/nds32_fpu_inst.h create mode 100644 arch/nds32/include/asm/sfp-machine.h create mode 100644 arch/nds32/math-emu/Makefile create mode 100644 arch/nds32/math-emu/faddd.c create mode 100644 arch/nds32/math-emu/fadds.c create mode 100644 arch/nds32/math-emu/fcmpd.c create mode 100644 arch/nds32/math-emu/fcmps.c create mode 100644 arch/nds32/math-emu/fd2s.c create mode 100644 arch/nds32/math-emu/fdivd.c create mode 100644 arch/nds32/math-emu/fdivs.c create mode 100644 arch/nds32/math-emu/fmuld.c create mode 100644 arch/nds32/math-emu/fmuls.c create mode 100644 arch/nds32/math-emu/fnegd.c create mode 100644 arch/nds32/math-emu/fnegs.c create mode 100644 arch/nds32/math-emu/fpuemu.c create mode 100644 arch/nds32/math-emu/fs2d.c create mode 100644 arch/nds32/math-emu/fsqrtd.c create mode 100644 arch/nds32/math-emu/fsqrts.c create mode 100644 arch/nds32/math-emu/fsubd.c create mode 100644 arch/nds32/math-emu/fsubs.c diff --git a/arch/nds32/Makefile b/arch/nds32/Makefile index 6dc03206e3c9..0a935c136ec2 100644 --- a/arch/nds32/Makefile +++ b/arch/nds32/Makefile @@ -30,6 +30,7 @@ export TEXTADDR # If we have a machine-specific directory, then include it in the build. core-y += arch/nds32/kernel/ arch/nds32/mm/ +core-$(CONFIG_FPU) += arch/nds32/math-emu/ libs-y += arch/nds32/lib/ ifneq '$(CONFIG_NDS32_BUILTIN_DTB)' '""' diff --git a/arch/nds32/include/asm/fpu.h b/arch/nds32/include/asm/fpu.h index f7a7f6b2ea8f..9b1107b58e23 100644 --- a/arch/nds32/include/asm/fpu.h +++ b/arch/nds32/include/asm/fpu.h @@ -15,6 +15,7 @@ extern bool has_fpu; extern void save_fpu(struct task_struct *__tsk); extern void load_fpu(const struct fpu_struct *fpregs); extern bool do_fpu_exception(unsigned int subtype, struct pt_regs *regs); +extern int do_fpuemu(struct pt_regs *regs, struct fpu_struct *fpu); #define test_tsk_fpu(regs) (regs->fucop_ctl & FUCOP_CTL_mskCP0EN) diff --git a/arch/nds32/include/asm/fpuemu.h b/arch/nds32/include/asm/fpuemu.h new file mode 100644 index 000000000000..c4bd0c7faa75 --- /dev/null +++ b/arch/nds32/include/asm/fpuemu.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ + +#ifndef __ARCH_NDS32_FPUEMU_H +#define __ARCH_NDS32_FPUEMU_H + +/* + * single precision + */ + +void fadds(void *ft, void *fa, void *fb); +void fsubs(void *ft, void *fa, void *fb); +void fmuls(void *ft, void *fa, void *fb); +void fdivs(void *ft, void *fa, void *fb); +void fs2d(void *ft, void *fa); +void fsqrts(void *ft, void *fa); +void fnegs(void *ft, void *fa); +int fcmps(void *ft, void *fa, void *fb, int cop); + +/* + * double precision + */ +void faddd(void *ft, void *fa, void *fb); +void fsubd(void *ft, void *fa, void *fb); +void fmuld(void *ft, void *fa, void *fb); +void fdivd(void *ft, void *fa, void *fb); +void fsqrtd(void *ft, void *fa); +void fd2s(void *ft, void *fa); +void fnegd(void *ft, void *fa); +int fcmpd(void *ft, void *fa, void *fb, int cop); + +#endif /* __ARCH_NDS32_FPUEMU_H */ diff --git a/arch/nds32/include/asm/nds32_fpu_inst.h b/arch/nds32/include/asm/nds32_fpu_inst.h new file mode 100644 index 000000000000..1e4b86a90a48 --- /dev/null +++ b/arch/nds32/include/asm/nds32_fpu_inst.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ + +#ifndef __NDS32_FPU_INST_H +#define __NDS32_FPU_INST_H + +#define cop0_op 0x35 + +/* + * COP0 field of opcodes. + */ +#define fs1_op 0x0 +#define fs2_op 0x4 +#define fd1_op 0x8 +#define fd2_op 0xc + +/* + * FS1 opcode. + */ +enum fs1 { + fadds_op, fsubs_op, fcpynss_op, fcpyss_op, + fmadds_op, fmsubs_op, fcmovns_op, fcmovzs_op, + fnmadds_op, fnmsubs_op, + fmuls_op = 0xc, fdivs_op, + fs1_f2op_op = 0xf +}; + +/* + * FS1/F2OP opcode. + */ +enum fs1_f2 { + fs2d_op, fsqrts_op, + fui2s_op = 0x8, fsi2s_op = 0xc, + fs2ui_op = 0x10, fs2ui_z_op = 0x14, + fs2si_op = 0x18, fs2si_z_op = 0x1c +}; + +/* + * FS2 opcode. + */ +enum fs2 { + fcmpeqs_op, fcmpeqs_e_op, fcmplts_op, fcmplts_e_op, + fcmples_op, fcmples_e_op, fcmpuns_op, fcmpuns_e_op +}; + +/* + * FD1 opcode. + */ +enum fd1 { + faddd_op, fsubd_op, fcpynsd_op, fcpysd_op, + fmaddd_op, fmsubd_op, fcmovnd_op, fcmovzd_op, + fnmaddd_op, fnmsubd_op, + fmuld_op = 0xc, fdivd_op, fd1_f2op_op = 0xf +}; + +/* + * FD1/F2OP opcode. + */ +enum fd1_f2 { + fd2s_op, fsqrtd_op, + fui2d_op = 0x8, fsi2d_op = 0xc, + fd2ui_op = 0x10, fd2ui_z_op = 0x14, + fd2si_op = 0x18, fd2si_z_op = 0x1c +}; + +/* + * FD2 opcode. + */ +enum fd2 { + fcmpeqd_op, fcmpeqd_e_op, fcmpltd_op, fcmpltd_e_op, + fcmpled_op, fcmpled_e_op, fcmpund_op, fcmpund_e_op +}; + +#define NDS32Insn(x) x + +#define I_OPCODE_off 25 +#define NDS32Insn_OPCODE(x) (NDS32Insn(x) >> I_OPCODE_off) + +#define I_OPCODE_offRt 20 +#define I_OPCODE_mskRt (0x1fUL << I_OPCODE_offRt) +#define NDS32Insn_OPCODE_Rt(x) \ + ((NDS32Insn(x) & I_OPCODE_mskRt) >> I_OPCODE_offRt) + +#define I_OPCODE_offRa 15 +#define I_OPCODE_mskRa (0x1fUL << I_OPCODE_offRa) +#define NDS32Insn_OPCODE_Ra(x) \ + ((NDS32Insn(x) & I_OPCODE_mskRa) >> I_OPCODE_offRa) + +#define I_OPCODE_offRb 10 +#define I_OPCODE_mskRb (0x1fUL << I_OPCODE_offRb) +#define NDS32Insn_OPCODE_Rb(x) \ + ((NDS32Insn(x) & I_OPCODE_mskRb) >> I_OPCODE_offRb) + +#define I_OPCODE_offbit1014 10 +#define I_OPCODE_mskbit1014 (0x1fUL << I_OPCODE_offbit1014) +#define NDS32Insn_OPCODE_BIT1014(x) \ + ((NDS32Insn(x) & I_OPCODE_mskbit1014) >> I_OPCODE_offbit1014) + +#define I_OPCODE_offbit69 6 +#define I_OPCODE_mskbit69 (0xfUL << I_OPCODE_offbit69) +#define NDS32Insn_OPCODE_BIT69(x) \ + ((NDS32Insn(x) & I_OPCODE_mskbit69) >> I_OPCODE_offbit69) + +#define I_OPCODE_offCOP0 0 +#define I_OPCODE_mskCOP0 (0x3fUL << I_OPCODE_offCOP0) +#define NDS32Insn_OPCODE_COP0(x) \ + ((NDS32Insn(x) & I_OPCODE_mskCOP0) >> I_OPCODE_offCOP0) + +#endif /* __NDS32_FPU_INST_H */ diff --git a/arch/nds32/include/asm/sfp-machine.h b/arch/nds32/include/asm/sfp-machine.h new file mode 100644 index 000000000000..b1a5caa332b5 --- /dev/null +++ b/arch/nds32/include/asm/sfp-machine.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ + +#include + +#define _FP_W_TYPE_SIZE 32 +#define _FP_W_TYPE unsigned long +#define _FP_WS_TYPE signed long +#define _FP_I_TYPE long + +#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2)) +#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1)) +#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2)) + +#define _FP_MUL_MEAT_S(R, X, Y) \ + _FP_MUL_MEAT_1_wide(_FP_WFRACBITS_S, R, X, Y, umul_ppmm) +#define _FP_MUL_MEAT_D(R, X, Y) \ + _FP_MUL_MEAT_2_wide(_FP_WFRACBITS_D, R, X, Y, umul_ppmm) +#define _FP_MUL_MEAT_Q(R, X, Y) \ + _FP_MUL_MEAT_4_wide(_FP_WFRACBITS_Q, R, X, Y, umul_ppmm) + +#define _FP_MUL_MEAT_DW_S(R, X, Y) \ + _FP_MUL_MEAT_DW_1_wide(_FP_WFRACBITS_S, R, X, Y, umul_ppmm) +#define _FP_MUL_MEAT_DW_D(R, X, Y) \ + _FP_MUL_MEAT_DW_2_wide(_FP_WFRACBITS_D, R, X, Y, umul_ppmm) + +#define _FP_DIV_MEAT_S(R, X, Y) _FP_DIV_MEAT_1_udiv_norm(S, R, X, Y) +#define _FP_DIV_MEAT_D(R, X, Y) _FP_DIV_MEAT_2_udiv(D, R, X, Y) + +#define _FP_NANFRAC_S ((_FP_QNANBIT_S << 1) - 1) +#define _FP_NANFRAC_D ((_FP_QNANBIT_D << 1) - 1), -1 +#define _FP_NANFRAC_Q ((_FP_QNANBIT_Q << 1) - 1), -1, -1, -1 +#define _FP_NANSIGN_S 0 +#define _FP_NANSIGN_D 0 +#define _FP_NANSIGN_Q 0 + +#define _FP_KEEPNANFRACP 1 +#define _FP_QNANNEGATEDP 0 + +#define _FP_CHOOSENAN(fs, wc, R, X, Y, OP) \ +do { \ + if ((_FP_FRAC_HIGH_RAW_##fs(X) & _FP_QNANBIT_##fs) \ + && !(_FP_FRAC_HIGH_RAW_##fs(Y) & _FP_QNANBIT_##fs)) { \ + R##_s = Y##_s; \ + _FP_FRAC_COPY_##wc(R, Y); \ + } else { \ + R##_s = X##_s; \ + _FP_FRAC_COPY_##wc(R, X); \ + } \ + R##_c = FP_CLS_NAN; \ +} while (0) + +#define __FPU_FPCSR (current->thread.fpu.fpcsr) + +/* Obtain the current rounding mode. */ +#define FP_ROUNDMODE \ +({ \ + __FPU_FPCSR & FPCSR_mskRM; \ +}) + +#define FP_RND_NEAREST 0 +#define FP_RND_PINF 1 +#define FP_RND_MINF 2 +#define FP_RND_ZERO 3 + +#define FP_EX_INVALID FPCSR_mskIVO +#define FP_EX_DIVZERO FPCSR_mskDBZ +#define FP_EX_OVERFLOW FPCSR_mskOVF +#define FP_EX_UNDERFLOW FPCSR_mskUDF +#define FP_EX_INEXACT FPCSR_mskIEX + +#define SF_CEQ 2 +#define SF_CLT 1 +#define SF_CGT 3 +#define SF_CUN 4 + +#include + +#ifdef __BIG_ENDIAN__ +#define __BYTE_ORDER __BIG_ENDIAN +#define __LITTLE_ENDIAN 0 +#else +#define __BYTE_ORDER __LITTLE_ENDIAN +#define __BIG_ENDIAN 0 +#endif + +#define abort() do { } while (0) +#define umul_ppmm(w1, w0, u, v) \ +do { \ + UWtype __x0, __x1, __x2, __x3; \ + UHWtype __ul, __vl, __uh, __vh; \ + \ + __ul = __ll_lowpart(u); \ + __uh = __ll_highpart(u); \ + __vl = __ll_lowpart(v); \ + __vh = __ll_highpart(v); \ + \ + __x0 = (UWtype) __ul * __vl; \ + __x1 = (UWtype) __ul * __vh; \ + __x2 = (UWtype) __uh * __vl; \ + __x3 = (UWtype) __uh * __vh; \ + \ + __x1 += __ll_highpart(__x0); \ + __x1 += __x2; \ + if (__x1 < __x2) \ + __x3 += __ll_B; \ + \ + (w1) = __x3 + __ll_highpart(__x1); \ + (w0) = __ll_lowpart(__x1) * __ll_B + __ll_lowpart(__x0); \ +} while (0) + +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ +do { \ + UWtype __x; \ + __x = (al) + (bl); \ + (sh) = (ah) + (bh) + (__x < (al)); \ + (sl) = __x; \ +} while (0) + +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ +do { \ + UWtype __x; \ + __x = (al) - (bl); \ + (sh) = (ah) - (bh) - (__x > (al)); \ + (sl) = __x; \ +} while (0) + +#define udiv_qrnnd(q, r, n1, n0, d) \ +do { \ + UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \ + __d1 = __ll_highpart(d); \ + __d0 = __ll_lowpart(d); \ + \ + __r1 = (n1) % __d1; \ + __q1 = (n1) / __d1; \ + __m = (UWtype) __q1 * __d0; \ + __r1 = __r1 * __ll_B | __ll_highpart(n0); \ + if (__r1 < __m) { \ + __q1--, __r1 += (d); \ + if (__r1 >= (d)) \ + if (__r1 < __m) \ + __q1--, __r1 += (d); \ + } \ + __r1 -= __m; \ + __r0 = __r1 % __d1; \ + __q0 = __r1 / __d1; \ + __m = (UWtype) __q0 * __d0; \ + __r0 = __r0 * __ll_B | __ll_lowpart(n0); \ + if (__r0 < __m) { \ + __q0--, __r0 += (d); \ + if (__r0 >= (d)) \ + if (__r0 < __m) \ + __q0--, __r0 += (d); \ + } \ + __r0 -= __m; \ + (q) = (UWtype) __q1 * __ll_B | __q0; \ + (r) = __r0; \ +} while (0) diff --git a/arch/nds32/kernel/fpu.c b/arch/nds32/kernel/fpu.c index e55a1e190e97..2942df6f93e6 100644 --- a/arch/nds32/kernel/fpu.c +++ b/arch/nds32/kernel/fpu.c @@ -183,10 +183,10 @@ inline void fill_sigfpe_signo(unsigned int fpcsr, int *signo) { if (fpcsr & FPCSR_mskOVFT) *signo = FPE_FLTOVF; - else if (fpcsr & FPCSR_mskUDFT) - *signo = FPE_FLTUND; else if (fpcsr & FPCSR_mskIVOT) *signo = FPE_FLTINV; + else if (fpcsr & FPCSR_mskUDFT) + *signo = FPE_FLTUND; else if (fpcsr & FPCSR_mskDBZT) *signo = FPE_FLTDIV; else if (fpcsr & FPCSR_mskIEXT) @@ -201,16 +201,37 @@ inline void handle_fpu_exception(struct pt_regs *regs) lose_fpu(); fpcsr = current->thread.fpu.fpcsr; - if (fpcsr & FPCSR_mskRIT) { + if (fpcsr & FPCSR_mskDNIT) { + si_signo = do_fpuemu(regs, ¤t->thread.fpu); + fpcsr = current->thread.fpu.fpcsr; + if (!si_signo) + goto done; + } else if (fpcsr & FPCSR_mskRIT) { if (!user_mode(regs)) do_exit(SIGILL); si_signo = SIGILL; + } + + + switch (si_signo) { + case SIGFPE: + fill_sigfpe_signo(fpcsr, &si_code); + break; + case SIGILL: show_regs(regs); si_code = ILL_COPROC; - } else - fill_sigfpe_signo(fpcsr, &si_code); + break; + case SIGBUS: + si_code = BUS_ADRERR; + break; + default: + break; + } + force_sig_fault(si_signo, si_code, (void __user *)instruction_pointer(regs), current); +done: + own_fpu(); } bool do_fpu_exception(unsigned int subtype, struct pt_regs *regs) diff --git a/arch/nds32/math-emu/Makefile b/arch/nds32/math-emu/Makefile new file mode 100644 index 000000000000..947fe0c3d52f --- /dev/null +++ b/arch/nds32/math-emu/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux/nds32 kernel FPU emulation. +# + +obj-y := fpuemu.o \ + fdivd.o fmuld.o fsubd.o faddd.o fs2d.o fsqrtd.o fcmpd.o fnegs.o \ + fdivs.o fmuls.o fsubs.o fadds.o fd2s.o fsqrts.o fcmps.o fnegd.o diff --git a/arch/nds32/math-emu/faddd.c b/arch/nds32/math-emu/faddd.c new file mode 100644 index 000000000000..f7fd4e3c3904 --- /dev/null +++ b/arch/nds32/math-emu/faddd.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void faddd(void *ft, void *fa, void *fb) +{ + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + FP_ADD_D(R, A, B); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; + +} diff --git a/arch/nds32/math-emu/fadds.c b/arch/nds32/math-emu/fadds.c new file mode 100644 index 000000000000..f5af6ca8cca5 --- /dev/null +++ b/arch/nds32/math-emu/fadds.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void fadds(void *ft, void *fa, void *fb) +{ + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + FP_ADD_S(R, A, B); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; + +} diff --git a/arch/nds32/math-emu/fcmpd.c b/arch/nds32/math-emu/fcmpd.c new file mode 100644 index 000000000000..0ea225abe880 --- /dev/null +++ b/arch/nds32/math-emu/fcmpd.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include +#include +#include +int fcmpd(void *ft, void *fa, void *fb, int cmpop) +{ + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_EX; + long cmp; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + FP_CMP_D(cmp, A, B, SF_CUN); + cmp += 2; + if (cmp == SF_CGT) + *(long *)ft = 0; + else + *(long *)ft = (cmp & cmpop) ? 1 : 0; + + return 0; +} diff --git a/arch/nds32/math-emu/fcmps.c b/arch/nds32/math-emu/fcmps.c new file mode 100644 index 000000000000..681480758213 --- /dev/null +++ b/arch/nds32/math-emu/fcmps.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include +#include +#include +int fcmps(void *ft, void *fa, void *fb, int cmpop) +{ + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_EX; + long cmp; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + FP_CMP_S(cmp, A, B, SF_CUN); + cmp += 2; + if (cmp == SF_CGT) + *(int *)ft = 0x0; + else + *(int *)ft = (cmp & cmpop) ? 0x1 : 0x0; + + return 0; +} diff --git a/arch/nds32/math-emu/fd2s.c b/arch/nds32/math-emu/fd2s.c new file mode 100644 index 000000000000..1328371e8170 --- /dev/null +++ b/arch/nds32/math-emu/fd2s.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +#include +void fd2s(void *ft, void *fa) +{ + FP_DECL_D(A); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + + FP_CONV(S, D, 1, 2, R, A); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fdivd.c b/arch/nds32/math-emu/fdivd.c new file mode 100644 index 000000000000..458e7e98b08e --- /dev/null +++ b/arch/nds32/math-emu/fdivd.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include +#include +#include +#include + +void fdivd(void *ft, void *fa, void *fb) +{ + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + if (B_c == FP_CLS_ZERO && A_c != FP_CLS_ZERO) + FP_SET_EXCEPTION(FP_EX_DIVZERO); + + FP_DIV_D(R, A, B); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fdivs.c b/arch/nds32/math-emu/fdivs.c new file mode 100644 index 000000000000..c7d202159ce2 --- /dev/null +++ b/arch/nds32/math-emu/fdivs.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void fdivs(void *ft, void *fa, void *fb) +{ + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + if (B_c == FP_CLS_ZERO && A_c != FP_CLS_ZERO) + FP_SET_EXCEPTION(FP_EX_DIVZERO); + + FP_DIV_S(R, A, B); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fmuld.c b/arch/nds32/math-emu/fmuld.c new file mode 100644 index 000000000000..f3c77a45ddc2 --- /dev/null +++ b/arch/nds32/math-emu/fmuld.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void fmuld(void *ft, void *fa, void *fb) +{ + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + FP_MUL_D(R, A, B); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fmuls.c b/arch/nds32/math-emu/fmuls.c new file mode 100644 index 000000000000..cf150df938f9 --- /dev/null +++ b/arch/nds32/math-emu/fmuls.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void fmuls(void *ft, void *fa, void *fb) +{ + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + FP_MUL_S(R, A, B); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fnegd.c b/arch/nds32/math-emu/fnegd.c new file mode 100644 index 000000000000..de7ea6a0873e --- /dev/null +++ b/arch/nds32/math-emu/fnegd.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void fnegd(void *ft, void *fa) +{ + FP_DECL_D(A); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + + FP_NEG_D(R, A); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fnegs.c b/arch/nds32/math-emu/fnegs.c new file mode 100644 index 000000000000..07270b326a77 --- /dev/null +++ b/arch/nds32/math-emu/fnegs.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void fnegs(void *ft, void *fa) +{ + FP_DECL_S(A); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + + FP_NEG_S(R, A); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fpuemu.c b/arch/nds32/math-emu/fpuemu.c new file mode 100644 index 000000000000..2a01333d6e5f --- /dev/null +++ b/arch/nds32/math-emu/fpuemu.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include +#include +#include +#include +#include + +#define DPFROMREG(dp, x) (dp = (void *)((unsigned long *)fpu_reg + 2*x)) +#ifdef __NDS32_EL__ +#define SPFROMREG(sp, x)\ + ((sp) = (void *)((unsigned long *)fpu_reg + (x^1))) +#else +#define SPFROMREG(sp, x) ((sp) = (void *)((unsigned long *)fpu_reg + x)) +#endif + +#define DEF3OP(name, p, f1, f2) \ +void fpemu_##name##p(void *ft, void *fa, void *fb) \ +{ \ + f1(fa, fa, fb); \ + f2(ft, ft, fa); \ +} + +#define DEF3OPNEG(name, p, f1, f2, f3) \ +void fpemu_##name##p(void *ft, void *fa, void *fb) \ +{ \ + f1(fa, fa, fb); \ + f2(ft, ft, fa); \ + f3(ft, ft); \ +} +DEF3OP(fmadd, s, fmuls, fadds); +DEF3OP(fmsub, s, fmuls, fsubs); +DEF3OP(fmadd, d, fmuld, faddd); +DEF3OP(fmsub, d, fmuld, fsubd); +DEF3OPNEG(fnmadd, s, fmuls, fadds, fnegs); +DEF3OPNEG(fnmsub, s, fmuls, fsubs, fnegs); +DEF3OPNEG(fnmadd, d, fmuld, faddd, fnegd); +DEF3OPNEG(fnmsub, d, fmuld, fsubd, fnegd); + +static const unsigned char cmptab[8] = { + SF_CEQ, + SF_CEQ, + SF_CLT, + SF_CLT, + SF_CLT | SF_CEQ, + SF_CLT | SF_CEQ, + SF_CUN, + SF_CUN +}; + +enum ARGTYPE { + S1S = 1, + S2S, + S1D, + CS, + D1D, + D2D, + D1S, + CD +}; +union func_t { + void (*t)(void *ft, void *fa, void *fb); + void (*b)(void *ft, void *fa); +}; +/* + * Emulate a single FPU arithmetic instruction. + */ +static int fpu_emu(struct fpu_struct *fpu_reg, unsigned long insn) +{ + int rfmt; /* resulting format */ + union func_t func; + int ftype = 0; + + switch (rfmt = NDS32Insn_OPCODE_COP0(insn)) { + case fs1_op:{ + switch (NDS32Insn_OPCODE_BIT69(insn)) { + case fadds_op: + func.t = fadds; + ftype = S2S; + break; + case fsubs_op: + func.t = fsubs; + ftype = S2S; + break; + case fmadds_op: + func.t = fpemu_fmadds; + ftype = S2S; + break; + case fmsubs_op: + func.t = fpemu_fmsubs; + ftype = S2S; + break; + case fnmadds_op: + func.t = fpemu_fnmadds; + ftype = S2S; + break; + case fnmsubs_op: + func.t = fpemu_fnmsubs; + ftype = S2S; + break; + case fmuls_op: + func.t = fmuls; + ftype = S2S; + break; + case fdivs_op: + func.t = fdivs; + ftype = S2S; + break; + case fs1_f2op_op: + switch (NDS32Insn_OPCODE_BIT1014(insn)) { + case fs2d_op: + func.b = fs2d; + ftype = S1D; + break; + case fsqrts_op: + func.b = fsqrts; + ftype = S1S; + break; + default: + return SIGILL; + } + break; + default: + return SIGILL; + } + break; + } + case fs2_op: + switch (NDS32Insn_OPCODE_BIT69(insn)) { + case fcmpeqs_op: + case fcmpeqs_e_op: + case fcmplts_op: + case fcmplts_e_op: + case fcmples_op: + case fcmples_e_op: + case fcmpuns_op: + case fcmpuns_e_op: + ftype = CS; + break; + default: + return SIGILL; + } + break; + case fd1_op:{ + switch (NDS32Insn_OPCODE_BIT69(insn)) { + case faddd_op: + func.t = faddd; + ftype = D2D; + break; + case fsubd_op: + func.t = fsubd; + ftype = D2D; + break; + case fmaddd_op: + func.t = fpemu_fmaddd; + ftype = D2D; + break; + case fmsubd_op: + func.t = fpemu_fmsubd; + ftype = D2D; + break; + case fnmaddd_op: + func.t = fpemu_fnmaddd; + ftype = D2D; + break; + case fnmsubd_op: + func.t = fpemu_fnmsubd; + ftype = D2D; + break; + case fmuld_op: + func.t = fmuld; + ftype = D2D; + break; + case fdivd_op: + func.t = fdivd; + ftype = D2D; + break; + case fd1_f2op_op: + switch (NDS32Insn_OPCODE_BIT1014(insn)) { + case fd2s_op: + func.b = fd2s; + ftype = D1S; + break; + case fsqrtd_op: + func.b = fsqrtd; + ftype = D1D; + break; + default: + return SIGILL; + } + break; + default: + return SIGILL; + + } + break; + } + + case fd2_op: + switch (NDS32Insn_OPCODE_BIT69(insn)) { + case fcmpeqd_op: + case fcmpeqd_e_op: + case fcmpltd_op: + case fcmpltd_e_op: + case fcmpled_op: + case fcmpled_e_op: + case fcmpund_op: + case fcmpund_e_op: + ftype = CD; + break; + default: + return SIGILL; + } + break; + + default: + return SIGILL; + } + + switch (ftype) { + case S1S:{ + void *ft, *fa; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + SPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + func.b(ft, fa); + break; + } + case S2S:{ + void *ft, *fa, *fb; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + SPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + SPFROMREG(fb, NDS32Insn_OPCODE_Rb(insn)); + func.t(ft, fa, fb); + break; + } + case S1D:{ + void *ft, *fa; + + DPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + SPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + func.b(ft, fa); + break; + } + case CS:{ + unsigned int cmpop = NDS32Insn_OPCODE_BIT69(insn); + void *ft, *fa, *fb; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + SPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + SPFROMREG(fb, NDS32Insn_OPCODE_Rb(insn)); + if (cmpop < 0x8) { + cmpop = cmptab[cmpop]; + fcmps(ft, fa, fb, cmpop); + } else + return SIGILL; + break; + } + case D1D:{ + void *ft, *fa; + + DPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + DPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + func.b(ft, fa); + break; + } + case D2D:{ + void *ft, *fa, *fb; + + DPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + DPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + DPFROMREG(fb, NDS32Insn_OPCODE_Rb(insn)); + func.t(ft, fa, fb); + break; + } + case D1S:{ + void *ft, *fa; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + DPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + func.b(ft, fa); + break; + } + case CD:{ + unsigned int cmpop = NDS32Insn_OPCODE_BIT69(insn); + void *ft, *fa, *fb; + + SPFROMREG(ft, NDS32Insn_OPCODE_Rt(insn)); + DPFROMREG(fa, NDS32Insn_OPCODE_Ra(insn)); + DPFROMREG(fb, NDS32Insn_OPCODE_Rb(insn)); + if (cmpop < 0x8) { + cmpop = cmptab[cmpop]; + fcmpd(ft, fa, fb, cmpop); + } else + return SIGILL; + break; + } + default: + return SIGILL; + } + + /* + * If an exception is required, generate a tidy SIGFPE exception. + */ + if ((fpu_reg->fpcsr << 5) & fpu_reg->fpcsr & FPCSR_mskALLE) + return SIGFPE; + return 0; +} + + +int do_fpuemu(struct pt_regs *regs, struct fpu_struct *fpu) +{ + unsigned long insn = 0, addr = regs->ipc; + unsigned long emulpc, contpc; + unsigned char *pc = (void *)&insn; + char c; + int i = 0, ret; + + for (i = 0; i < 4; i++) { + if (__get_user(c, (unsigned char *)addr++)) + return SIGBUS; + *pc++ = c; + } + + insn = be32_to_cpu(insn); + + emulpc = regs->ipc; + contpc = regs->ipc + 4; + + if (NDS32Insn_OPCODE(insn) != cop0_op) + return SIGILL; + switch (NDS32Insn_OPCODE_COP0(insn)) { + case fs1_op: + case fs2_op: + case fd1_op: + case fd2_op: + { + /* a real fpu computation instruction */ + ret = fpu_emu(fpu, insn); + if (!ret) + regs->ipc = contpc; + } + break; + + default: + return SIGILL; + } + + return ret; +} diff --git a/arch/nds32/math-emu/fs2d.c b/arch/nds32/math-emu/fs2d.c new file mode 100644 index 000000000000..0e8db9035631 --- /dev/null +++ b/arch/nds32/math-emu/fs2d.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include +#include +#include +#include +#include + +void fs2d(void *ft, void *fa) +{ + FP_DECL_S(A); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + + FP_CONV(D, S, 2, 1, R, A); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fsqrtd.c b/arch/nds32/math-emu/fsqrtd.c new file mode 100644 index 000000000000..c3a8dbd81d4e --- /dev/null +++ b/arch/nds32/math-emu/fsqrtd.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include +#include +#include +#include +void fsqrtd(void *ft, void *fa) +{ + FP_DECL_D(A); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + + FP_SQRT_D(R, A); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fsqrts.c b/arch/nds32/math-emu/fsqrts.c new file mode 100644 index 000000000000..4c6f94b27328 --- /dev/null +++ b/arch/nds32/math-emu/fsqrts.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation + +#include +#include +#include +#include +void fsqrts(void *ft, void *fa) +{ + FP_DECL_S(A); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + + FP_SQRT_S(R, A); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fsubd.c b/arch/nds32/math-emu/fsubd.c new file mode 100644 index 000000000000..81b6a0d02a1f --- /dev/null +++ b/arch/nds32/math-emu/fsubd.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void fsubd(void *ft, void *fa, void *fb) +{ + + FP_DECL_D(A); + FP_DECL_D(B); + FP_DECL_D(R); + FP_DECL_EX; + + FP_UNPACK_DP(A, fa); + FP_UNPACK_DP(B, fb); + + if (B_c != FP_CLS_NAN) + B_s ^= 1; + + FP_ADD_D(R, A, B); + + FP_PACK_DP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} diff --git a/arch/nds32/math-emu/fsubs.c b/arch/nds32/math-emu/fsubs.c new file mode 100644 index 000000000000..61ddd9708465 --- /dev/null +++ b/arch/nds32/math-emu/fsubs.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2005-2018 Andes Technology Corporation +#include + +#include +#include +#include +void fsubs(void *ft, void *fa, void *fb) +{ + + FP_DECL_S(A); + FP_DECL_S(B); + FP_DECL_S(R); + FP_DECL_EX; + + FP_UNPACK_SP(A, fa); + FP_UNPACK_SP(B, fb); + + if (B_c != FP_CLS_NAN) + B_s ^= 1; + + FP_ADD_S(R, A, B); + + FP_PACK_SP(ft, R); + + __FPU_FPCSR |= FP_CUR_EXCEPTIONS; +} From 44e92e0364adfd7b6759084e02a550d06336d896 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 22 Nov 2018 11:14:36 +0800 Subject: [PATCH 12/16] nds32: support denormalized result through FP emulator Currently, the nds32 FPU dose not support the arithmetic of denormalized number. When the nds32 FPU finds the result of the instruction is a denormlized number, the nds32 FPU considers it to be an underflow condition and rounds the result to an appropriate number. It may causes some loss of precision. This commit proposes a solution to re-execute the instruction by the FPU emulator to enhance the precision. To transfer calculations from user space to kernel space, this feature will enable the underflow exception trap by default. Enabling this feature may cause some side effects: 1. Performance loss due to extra FPU exception 2. Need another scheme to control real underflow trap A new parameter, UDF_trap, which is belong to FPU context is used to control underflow trap. User can configure this feature via CONFIG_SUPPORT_DENORMAL_ARITHMETIC Signed-off-by: Vincent Chen Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/Kconfig.cpu | 13 ++++++++++ arch/nds32/include/asm/elf.h | 11 ++++++++ arch/nds32/include/asm/fpu.h | 11 ++++++++ arch/nds32/include/asm/syscalls.h | 1 + arch/nds32/include/uapi/asm/auxvec.h | 7 ++++++ arch/nds32/include/uapi/asm/sigcontext.h | 9 +++++++ arch/nds32/include/uapi/asm/udftrap.h | 13 ++++++++++ arch/nds32/include/uapi/asm/unistd.h | 2 ++ arch/nds32/kernel/fpu.c | 25 +++++++++++++++--- arch/nds32/kernel/sys_nds32.c | 32 ++++++++++++++++++++++++ arch/nds32/math-emu/fpuemu.c | 5 ++++ 11 files changed, 125 insertions(+), 4 deletions(-) create mode 100644 arch/nds32/include/uapi/asm/udftrap.h diff --git a/arch/nds32/Kconfig.cpu b/arch/nds32/Kconfig.cpu index bb06a1b7eef0..6482ed877f97 100644 --- a/arch/nds32/Kconfig.cpu +++ b/arch/nds32/Kconfig.cpu @@ -28,6 +28,19 @@ config LAZY_FPU For nomal case, say Y. +config SUPPORT_DENORMAL_ARITHMETIC + bool "Denormal arithmetic support" + depends on FPU + default n + help + Say Y here to enable arithmetic of denormalized number. Enabling + this feature can enhance the precision for tininess number. + However, performance loss in float pointe calculations is + possibly significant due to additional FPU exception. + + If the calculated tolerance for tininess number is not critical, + say N to prevent performance loss. + config HWZOL bool "hardware zero overhead loop support" depends on CPU_D10 || CPU_D15 diff --git a/arch/nds32/include/asm/elf.h b/arch/nds32/include/asm/elf.h index f5f9cf7e0544..95f3ea253e4c 100644 --- a/arch/nds32/include/asm/elf.h +++ b/arch/nds32/include/asm/elf.h @@ -9,6 +9,7 @@ */ #include +#include typedef unsigned long elf_greg_t; typedef unsigned long elf_freg_t[3]; @@ -159,8 +160,18 @@ struct elf32_hdr; #endif + +#if IS_ENABLED(CONFIG_FPU) +#define FPU_AUX_ENT NEW_AUX_ENT(AT_FPUCW, FPCSR_INIT) +#else +#define FPU_AUX_ENT NEW_AUX_ENT(AT_IGNORE, 0) +#endif + #define ARCH_DLINFO \ do { \ + /* Optional FPU initialization */ \ + FPU_AUX_ENT; \ + \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (elf_addr_t)current->mm->context.vdso); \ } while (0) diff --git a/arch/nds32/include/asm/fpu.h b/arch/nds32/include/asm/fpu.h index 9b1107b58e23..019f1bcfc5ee 100644 --- a/arch/nds32/include/asm/fpu.h +++ b/arch/nds32/include/asm/fpu.h @@ -28,7 +28,18 @@ extern int do_fpuemu(struct pt_regs *regs, struct fpu_struct *fpu); #define sNAN64 0xFFFFFFFFFFFFFFFFULL #define sNAN32 0xFFFFFFFFUL +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) +/* + * Denormalized number is unsupported by nds32 FPU. Hence the operation + * is treated as underflow cases when the final result is a denormalized + * number. To enhance precision, underflow exception trap should be + * enabled by default and kerenl will re-execute it by fpu emulator + * when getting underflow exception. + */ +#define FPCSR_INIT FPCSR_mskUDFE +#else #define FPCSR_INIT 0x0UL +#endif extern const struct fpu_struct init_fpuregs; diff --git a/arch/nds32/include/asm/syscalls.h b/arch/nds32/include/asm/syscalls.h index 78778ecff60c..da32101b455d 100644 --- a/arch/nds32/include/asm/syscalls.h +++ b/arch/nds32/include/asm/syscalls.h @@ -7,6 +7,7 @@ asmlinkage long sys_cacheflush(unsigned long addr, unsigned long len, unsigned int op); asmlinkage long sys_fadvise64_64_wrapper(int fd, int advice, loff_t offset, loff_t len); asmlinkage long sys_rt_sigreturn_wrapper(void); +asmlinkage long sys_udftrap(int option); #include diff --git a/arch/nds32/include/uapi/asm/auxvec.h b/arch/nds32/include/uapi/asm/auxvec.h index 56043ce4972f..2d3213f5e595 100644 --- a/arch/nds32/include/uapi/asm/auxvec.h +++ b/arch/nds32/include/uapi/asm/auxvec.h @@ -4,6 +4,13 @@ #ifndef __ASM_AUXVEC_H #define __ASM_AUXVEC_H +/* + * This entry gives some information about the FPU initialization + * performed by the kernel. + */ +#define AT_FPUCW 18 /* Used FPU control word. */ + + /* VDSO location */ #define AT_SYSINFO_EHDR 33 diff --git a/arch/nds32/include/uapi/asm/sigcontext.h b/arch/nds32/include/uapi/asm/sigcontext.h index 1257a78e3ae1..58afc416473e 100644 --- a/arch/nds32/include/uapi/asm/sigcontext.h +++ b/arch/nds32/include/uapi/asm/sigcontext.h @@ -12,6 +12,15 @@ struct fpu_struct { unsigned long long fd_regs[32]; unsigned long fpcsr; + /* + * UDF_trap is used to recognize whether underflow trap is enabled + * or not. When UDF_trap == 1, this process will be traped and then + * get a SIGFPE signal when encountering an underflow exception. + * UDF_trap is only modified through setfputrap syscall. Therefore, + * UDF_trap needn't be saved or loaded to context in each context + * switch. + */ + unsigned long UDF_trap; }; struct zol_struct { diff --git a/arch/nds32/include/uapi/asm/udftrap.h b/arch/nds32/include/uapi/asm/udftrap.h new file mode 100644 index 000000000000..433f79d679c0 --- /dev/null +++ b/arch/nds32/include/uapi/asm/udftrap.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2005-2018 Andes Technology Corporation */ +#ifndef _ASM_SETFPUTRAP +#define _ASM_SETFPUTRAP + +/* + * Options for setfputrap system call + */ +#define DISABLE_UDFTRAP 0 /* disable underflow exception trap */ +#define ENABLE_UDFTRAP 1 /* enable undeflos exception trap */ +#define GET_UDFTRAP 2 /* only get undeflos exception trap status */ + +#endif /* _ASM_CACHECTL */ diff --git a/arch/nds32/include/uapi/asm/unistd.h b/arch/nds32/include/uapi/asm/unistd.h index 603e826e0449..c2c3a3e34083 100644 --- a/arch/nds32/include/uapi/asm/unistd.h +++ b/arch/nds32/include/uapi/asm/unistd.h @@ -9,4 +9,6 @@ /* Additional NDS32 specific syscalls. */ #define __NR_cacheflush (__NR_arch_specific_syscall) +#define __NR_udftrap (__NR_arch_specific_syscall + 1) __SYSCALL(__NR_cacheflush, sys_cacheflush) +__SYSCALL(__NR_udftrap, sys_udftrap) diff --git a/arch/nds32/kernel/fpu.c b/arch/nds32/kernel/fpu.c index 2942df6f93e6..fddd40c7a16f 100644 --- a/arch/nds32/kernel/fpu.c +++ b/arch/nds32/kernel/fpu.c @@ -12,7 +12,10 @@ const struct fpu_struct init_fpuregs = { .fd_regs = {[0 ... 31] = sNAN64}, - .fpcsr = FPCSR_INIT + .fpcsr = FPCSR_INIT, +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + .UDF_trap = 0 +#endif }; void save_fpu(struct task_struct *tsk) @@ -174,6 +177,9 @@ inline void do_fpu_context_switch(struct pt_regs *regs) } else { /* First time FPU user. */ load_fpu(&init_fpuregs); +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + current->thread.fpu.UDF_trap = init_fpuregs.UDF_trap; +#endif set_used_math(); } @@ -183,10 +189,12 @@ inline void fill_sigfpe_signo(unsigned int fpcsr, int *signo) { if (fpcsr & FPCSR_mskOVFT) *signo = FPE_FLTOVF; - else if (fpcsr & FPCSR_mskIVOT) - *signo = FPE_FLTINV; +#ifndef CONFIG_SUPPORT_DENORMAL_ARITHMETIC else if (fpcsr & FPCSR_mskUDFT) *signo = FPE_FLTUND; +#endif + else if (fpcsr & FPCSR_mskIVOT) + *signo = FPE_FLTINV; else if (fpcsr & FPCSR_mskDBZT) *signo = FPE_FLTDIV; else if (fpcsr & FPCSR_mskIEXT) @@ -197,11 +205,20 @@ inline void handle_fpu_exception(struct pt_regs *regs) { unsigned int fpcsr; int si_code = 0, si_signo = SIGFPE; +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + unsigned long redo_except = FPCSR_mskDNIT|FPCSR_mskUDFT; +#else + unsigned long redo_except = FPCSR_mskDNIT; +#endif lose_fpu(); fpcsr = current->thread.fpu.fpcsr; - if (fpcsr & FPCSR_mskDNIT) { + if (fpcsr & redo_except) { +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + if (fpcsr & FPCSR_mskUDFT) + current->thread.fpu.fpcsr &= ~FPCSR_mskIEX; +#endif si_signo = do_fpuemu(regs, ¤t->thread.fpu); fpcsr = current->thread.fpu.fpcsr; if (!si_signo) diff --git a/arch/nds32/kernel/sys_nds32.c b/arch/nds32/kernel/sys_nds32.c index 9de93ab4c52b..0835277636ce 100644 --- a/arch/nds32/kernel/sys_nds32.c +++ b/arch/nds32/kernel/sys_nds32.c @@ -6,6 +6,8 @@ #include #include +#include +#include SYSCALL_DEFINE6(mmap2, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, @@ -48,3 +50,33 @@ SYSCALL_DEFINE3(cacheflush, unsigned int, start, unsigned int, end, int, cache) return 0; } + +SYSCALL_DEFINE1(udftrap, int, option) +{ +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + int old_udftrap; + + if (!used_math()) { + load_fpu(&init_fpuregs); + current->thread.fpu.UDF_trap = init_fpuregs.UDF_trap; + set_used_math(); + } + + old_udftrap = current->thread.fpu.UDF_trap; + switch (option) { + case DISABLE_UDFTRAP: + current->thread.fpu.UDF_trap = 0; + break; + case ENABLE_UDFTRAP: + current->thread.fpu.UDF_trap = FPCSR_mskUDFE; + break; + case GET_UDFTRAP: + break; + default: + return -EINVAL; + } + return old_udftrap; +#else + return -ENOTSUPP; +#endif +} diff --git a/arch/nds32/math-emu/fpuemu.c b/arch/nds32/math-emu/fpuemu.c index 2a01333d6e5f..75cf1643fa78 100644 --- a/arch/nds32/math-emu/fpuemu.c +++ b/arch/nds32/math-emu/fpuemu.c @@ -304,7 +304,12 @@ static int fpu_emu(struct fpu_struct *fpu_reg, unsigned long insn) /* * If an exception is required, generate a tidy SIGFPE exception. */ +#if IS_ENABLED(CONFIG_SUPPORT_DENORMAL_ARITHMETIC) + if (((fpu_reg->fpcsr << 5) & fpu_reg->fpcsr & FPCSR_mskALLE_NO_UDFE) || + ((fpu_reg->fpcsr & FPCSR_mskUDF) && (fpu_reg->UDF_trap))) +#else if ((fpu_reg->fpcsr << 5) & fpu_reg->fpcsr & FPCSR_mskALLE) +#endif return SIGFPE; return 0; } From 7adb3e998f5bea3e1fd2f76c1cf80d76b8af6626 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 22 Nov 2018 11:14:37 +0800 Subject: [PATCH 13/16] math-emu/op-2.h: Use statement expressions to prevent negative constant shift This modification is quoted from glibc 'commit < sysdeps/unix/sysv/linux/sparc/sparc64/dl-procinfo.c: Moved to> (fe0b1e854ad32a69b260)' Signed-off-by: Vincent Chen Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- include/math-emu/op-2.h | 97 +++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 51 deletions(-) diff --git a/include/math-emu/op-2.h b/include/math-emu/op-2.h index 4f26ecc1411b..13a374f51a22 100644 --- a/include/math-emu/op-2.h +++ b/include/math-emu/op-2.h @@ -31,61 +31,56 @@ #define _FP_FRAC_HIGH_2(X) (X##_f1) #define _FP_FRAC_LOW_2(X) (X##_f0) #define _FP_FRAC_WORD_2(X,w) (X##_f##w) +#define _FP_FRAC_SLL_2(X, N) ( \ + (void) (((N) < _FP_W_TYPE_SIZE) \ + ? ({ \ + if (__builtin_constant_p(N) && (N) == 1) { \ + X##_f1 = X##_f1 + X##_f1 + \ + (((_FP_WS_TYPE) (X##_f0)) < 0); \ + X##_f0 += X##_f0; \ + } else { \ + X##_f1 = X##_f1 << (N) | X##_f0 >> \ + (_FP_W_TYPE_SIZE - (N)); \ + X##_f0 <<= (N); \ + } \ + 0; \ + }) \ + : ({ \ + X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \ + X##_f0 = 0; \ + }))) -#define _FP_FRAC_SLL_2(X,N) \ - do { \ - if ((N) < _FP_W_TYPE_SIZE) \ - { \ - if (__builtin_constant_p(N) && (N) == 1) \ - { \ - X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0); \ - X##_f0 += X##_f0; \ - } \ - else \ - { \ - X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \ - X##_f0 <<= (N); \ - } \ - } \ - else \ - { \ - X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \ - X##_f0 = 0; \ - } \ - } while (0) -#define _FP_FRAC_SRL_2(X,N) \ - do { \ - if ((N) < _FP_W_TYPE_SIZE) \ - { \ - X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \ - X##_f1 >>= (N); \ - } \ - else \ - { \ - X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \ - X##_f1 = 0; \ - } \ - } while (0) +#define _FP_FRAC_SRL_2(X, N) ( \ + (void) (((N) < _FP_W_TYPE_SIZE) \ + ? ({ \ + X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \ + X##_f1 >>= (N); \ + }) \ + : ({ \ + X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \ + X##_f1 = 0; \ + }))) + /* Right shift with sticky-lsb. */ -#define _FP_FRAC_SRS_2(X,N,sz) \ - do { \ - if ((N) < _FP_W_TYPE_SIZE) \ - { \ - X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \ - (__builtin_constant_p(N) && (N) == 1 \ - ? X##_f0 & 1 \ - : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \ - X##_f1 >>= (N); \ - } \ - else \ - { \ - X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) | \ - (((X##_f1 << (2*_FP_W_TYPE_SIZE - (N))) | X##_f0) != 0)); \ - X##_f1 = 0; \ - } \ - } while (0) +#define _FP_FRAC_SRS_2(X, N, sz) ( \ + (void) (((N) < _FP_W_TYPE_SIZE) \ + ? ({ \ + X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \ + | (__builtin_constant_p(N) && (N) == 1 \ + ? X##_f0 & 1 \ + : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \ + X##_f1 >>= (N); \ + }) \ + : ({ \ + X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) \ + | ((((N) == _FP_W_TYPE_SIZE \ + ? 0 \ + : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \ + | X##_f0) != 0)); \ + X##_f1 = 0; \ + }))) #define _FP_FRAC_ADDI_2(X,I) \ __FP_FRAC_ADDI_2(X##_f1, X##_f0, I) From 83312f1b7ae205dca647bf52bbe2d51303cdedfb Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 22 Nov 2018 11:14:38 +0800 Subject: [PATCH 14/16] math-emu/soft-fp.h: (_FP_ROUND_ZERO) cast 0 to void to fix warning _FP_ROUND_ZERO is defined as 0 and used as a statemente in macro _FP_ROUND. This generates "error: statement with no effect [-Werror=unused-value]" from gcc. Defining _FP_ROUND_ZERO as (void)0 to fix it. This modification is quoted from glibc 'commit (8ed1e7d5894000c155acbd06f)' Signed-off-by: Vincent Chen Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- include/math-emu/soft-fp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/math-emu/soft-fp.h b/include/math-emu/soft-fp.h index 3f284bc03180..5650c1628383 100644 --- a/include/math-emu/soft-fp.h +++ b/include/math-emu/soft-fp.h @@ -138,7 +138,7 @@ do { \ _FP_FRAC_ADDI_##wc(X, _FP_WORK_ROUND); \ } while (0) -#define _FP_ROUND_ZERO(wc, X) 0 +#define _FP_ROUND_ZERO(wc, X) (void)0 #define _FP_ROUND_PINF(wc, X) \ do { \ From a5234068e6dc18ae5300d678fbf3e129d9b93f78 Mon Sep 17 00:00:00 2001 From: Nylon Chen Date: Thu, 8 Nov 2018 19:28:05 +0800 Subject: [PATCH 15/16] nds32: Fix the items of hwcap_str ordering issue. The hwcap_str should be set in a correct order according to HWCAP_xx. We also add the missing "fpu_dp" to it. Signed-off-by: Nylon Chen Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/kernel/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c index d7f5657bc638..4b774ca433a9 100644 --- a/arch/nds32/kernel/setup.c +++ b/arch/nds32/kernel/setup.c @@ -72,8 +72,9 @@ static const char *hwcap_str[] = { "div", "mac", "l2c", - "dx_regs", + "fpu_dp", "v2", + "dx_regs", NULL, }; From e2f3f8b4a497d26bdcd55a53246ec2e613ae0fd4 Mon Sep 17 00:00:00 2001 From: Nylon Chen Date: Thu, 8 Nov 2018 19:28:15 +0800 Subject: [PATCH 16/16] nds32: support hardware prefetcher We add a config for user to enable or disable this feature. It can be used to control the hardware prefetch function. Signed-off-by: Nylon Chen Acked-by: Greentime Hu Signed-off-by: Greentime Hu --- arch/nds32/Kconfig.cpu | 7 +++++++ arch/nds32/include/asm/bitfield.h | 6 ++++++ arch/nds32/kernel/head.S | 2 +- arch/nds32/kernel/setup.c | 7 +++++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/arch/nds32/Kconfig.cpu b/arch/nds32/Kconfig.cpu index 6482ed877f97..f16edf0582b4 100644 --- a/arch/nds32/Kconfig.cpu +++ b/arch/nds32/Kconfig.cpu @@ -177,6 +177,13 @@ config CACHE_L2 Say Y here to enable L2 cache if your SoC are integrated with L2CC. If unsure, say N. +config HW_PRE + bool "Enable hardware prefetcher" + default y + help + Say Y here to enable hardware prefetcher feature. + Only when CPU_VER.REV >= 0x09 can support. + menu "Memory configuration" choice diff --git a/arch/nds32/include/asm/bitfield.h b/arch/nds32/include/asm/bitfield.h index c1619730192a..7414fcbbab4e 100644 --- a/arch/nds32/include/asm/bitfield.h +++ b/arch/nds32/include/asm/bitfield.h @@ -740,14 +740,20 @@ #define N13MISC_CTL_offRTP 1 /* Disable Return Target Predictor */ #define N13MISC_CTL_offPTEPF 2 /* Disable HPTWK L2 PTE pefetch */ #define N13MISC_CTL_offSP_SHADOW_EN 4 /* Enable shadow stack pointers */ +#define MISC_CTL_offHWPRE 11 /* Enable HardWare PREFETCH */ /* bit 6, 9:31 reserved */ #define N13MISC_CTL_makBTB ( 0x1 << N13MISC_CTL_offBTB ) #define N13MISC_CTL_makRTP ( 0x1 << N13MISC_CTL_offRTP ) #define N13MISC_CTL_makPTEPF ( 0x1 << N13MISC_CTL_offPTEPF ) #define N13MISC_CTL_makSP_SHADOW_EN ( 0x1 << N13MISC_CTL_offSP_SHADOW_EN ) +#define MISC_CTL_makHWPRE_EN ( 0x1 << MISC_CTL_offHWPRE ) +#ifdef CONFIG_HW_PRE +#define MISC_init (N13MISC_CTL_makBTB|N13MISC_CTL_makRTP|N13MISC_CTL_makSP_SHADOW_EN|MISC_CTL_makHWPRE_EN) +#else #define MISC_init (N13MISC_CTL_makBTB|N13MISC_CTL_makRTP|N13MISC_CTL_makSP_SHADOW_EN) +#endif /****************************************************************************** * PRUSR_ACC_CTL (Privileged Resource User Access Control Registers) diff --git a/arch/nds32/kernel/head.S b/arch/nds32/kernel/head.S index 2c8aac6201be..db64b78b1232 100644 --- a/arch/nds32/kernel/head.S +++ b/arch/nds32/kernel/head.S @@ -151,7 +151,7 @@ _tlb: #endif mtsr $r3, $TLB_MISC - mfsr $r0, $MISC_CTL ! Enable BTB and RTP and shadow sp + mfsr $r0, $MISC_CTL ! Enable BTB, RTP, shadow sp, and HW_PRE ori $r0, $r0, #MISC_init mtsr $r0, $MISC_CTL diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c index 4b774ca433a9..31d29d92478e 100644 --- a/arch/nds32/kernel/setup.c +++ b/arch/nds32/kernel/setup.c @@ -39,6 +39,7 @@ #define HWCAP_FPU_DP 0x040000 #define HWCAP_V2 0x080000 #define HWCAP_DX_REGS 0x100000 +#define HWCAP_HWPRE 0x200000 unsigned long cpu_id, cpu_rev, cpu_cfgid; bool has_fpu = false; @@ -75,6 +76,7 @@ static const char *hwcap_str[] = { "fpu_dp", "v2", "dx_regs", + "hw_pre", NULL, }; @@ -221,6 +223,11 @@ static void __init setup_cpuinfo(void) if (__nds32__mfsr(NDS32_SR_MSC_CFG) & MSC_CFG_mskL2C) elf_hwcap |= HWCAP_L2C; +#ifdef CONFIG_HW_PRE + if (__nds32__mfsr(NDS32_SR_MISC_CTL) & MISC_CTL_makHWPRE_EN) + elf_hwcap |= HWCAP_HWPRE; +#endif + tmp = __nds32__mfsr(NDS32_SR_CACHE_CTL); if (!IS_ENABLED(CONFIG_CPU_DCACHE_DISABLE)) tmp |= CACHE_CTL_mskDC_EN;