linux/arch/x86/include/asm/irq_vectors.h
Shaohua Li 70e4a36973 x86: Scale up the number of TLB invalidate vectors with NR_CPUs, up to 32
Make the maxium TLB invalidate vectors depend on NR_CPUS linearly,
with a maximum of 32 vectors.

We currently only have 8 vectors for TLB invalidation and that is clearly
inadequate. If we have a lot of CPUs, the CPUs need share the 8 vectors and
tlbstate_lock is used to protect them. flush_tlb_page() is
heavily used in page reclaim, which will cause a lot of lock
contention for tlbstate_lock.

Andi Kleen suggested increasing the vectors number to 32, which should be
good for current typical systems to reduce the tlbstate_lock contention.

My test system has 4 sockets and 64G memory, and 64 CPUs. My
workload creates 64 processes. Each process mmap reads a big
empty sparse file. The total size of the files are 2*total_mem,
so this will cause a lot of page reclaim.

Below is the result I get from perf call-graph profiling:

 without the patch:
 ------------------

    24.25%           usemem  [kernel]                                   [k] _raw_spin_lock
                     |
                     --- _raw_spin_lock
                        |
                        |--42.15%-- native_flush_tlb_others

 with the patch:
 ------------------

    14.96%           usemem  [kernel]                                   [k] _raw_spin_lock
                     |
                     --- _raw_spin_lock
                        |--13.89%-- native_flush_tlb_others

So this heavily reduces the tlbstate_lock contention.

Suggested-by: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1295232727.1949.709.camel@sli10-conroe>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-02-14 13:03:08 +01:00

186 lines
5.1 KiB
C

#ifndef _ASM_X86_IRQ_VECTORS_H
#define _ASM_X86_IRQ_VECTORS_H
#include <linux/threads.h>
/*
* Linux IRQ vector layout.
*
* There are 256 IDT entries (per CPU - each entry is 8 bytes) which can
* be defined by Linux. They are used as a jump table by the CPU when a
* given vector is triggered - by a CPU-external, CPU-internal or
* software-triggered event.
*
* Linux sets the kernel code address each entry jumps to early during
* bootup, and never changes them. This is the general layout of the
* IDT entries:
*
* Vectors 0 ... 31 : system traps and exceptions - hardcoded events
* Vectors 32 ... 127 : device interrupts
* Vector 128 : legacy int80 syscall interface
* Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
* Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
*
* 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
*
* This file enumerates the exact layout of them:
*/
#define NMI_VECTOR 0x02
#define MCE_VECTOR 0x12
/*
* IDT vectors usable for external interrupt sources start at 0x20.
* (0x80 is the syscall vector, 0x30-0x3f are for ISA)
*/
#define FIRST_EXTERNAL_VECTOR 0x20
/*
* We start allocating at 0x21 to spread out vectors evenly between
* priority levels. (0x80 is the syscall vector)
*/
#define VECTOR_OFFSET_START 1
/*
* Reserve the lowest usable vector (and hence lowest priority) 0x20 for
* triggering cleanup after irq migration. 0x21-0x2f will still be used
* for device interrupts.
*/
#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
#define IA32_SYSCALL_VECTOR 0x80
#ifdef CONFIG_X86_32
# define SYSCALL_VECTOR 0x80
#endif
/*
* Vectors 0x30-0x3f are used for ISA interrupts.
* round up to the next 16-vector boundary
*/
#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
/*
* Special IRQ vectors used by the SMP architecture, 0xf0-0xff
*
* some of the following vectors are 'rare', they are merged
* into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
* TLB, reschedule and local APIC vectors are performance-critical.
*/
#define SPURIOUS_APIC_VECTOR 0xff
/*
* Sanity check
*/
#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
# error SPURIOUS_APIC_VECTOR definition error
#endif
#define ERROR_APIC_VECTOR 0xfe
#define RESCHEDULE_VECTOR 0xfd
#define CALL_FUNCTION_VECTOR 0xfc
#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
#define THERMAL_APIC_VECTOR 0xfa
#define THRESHOLD_APIC_VECTOR 0xf9
#define REBOOT_VECTOR 0xf8
/*
* Generic system vector for platform specific use
*/
#define X86_PLATFORM_IPI_VECTOR 0xf7
/*
* IRQ work vector:
*/
#define IRQ_WORK_VECTOR 0xf6
#define UV_BAU_MESSAGE 0xf5
/*
* Self IPI vector for machine checks
*/
#define MCE_SELF_VECTOR 0xf4
/* Xen vector callback to receive events in a HVM domain */
#define XEN_HVM_EVTCHN_CALLBACK 0xf3
/*
* Local APIC timer IRQ vector is on a different priority level,
* to work around the 'lost local interrupt if more than 2 IRQ
* sources per level' errata.
*/
#define LOCAL_TIMER_VECTOR 0xef
/* up to 32 vectors used for spreading out TLB flushes: */
#if NR_CPUS <= 32
# define NUM_INVALIDATE_TLB_VECTORS NR_CPUS
#else
# define NUM_INVALIDATE_TLB_VECTORS 32
#endif
#define INVALIDATE_TLB_VECTOR_END 0xee
#define INVALIDATE_TLB_VECTOR_START \
(INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1)
#define NR_VECTORS 256
#define FPU_IRQ 13
#define FIRST_VM86_IRQ 3
#define LAST_VM86_IRQ 15
#ifndef __ASSEMBLY__
static inline int invalid_vm86_irq(int irq)
{
return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
}
#endif
/*
* Size the maximum number of interrupts.
*
* If the irq_desc[] array has a sparse layout, we can size things
* generously - it scales up linearly with the maximum number of CPUs,
* and the maximum number of IO-APICs, whichever is higher.
*
* In other cases we size more conservatively, to not create too large
* static arrays.
*/
#define NR_IRQS_LEGACY 16
#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
#ifdef CONFIG_X86_IO_APIC
# ifdef CONFIG_SPARSE_IRQ
# define CPU_VECTOR_LIMIT (64 * NR_CPUS)
# define NR_IRQS \
(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
(NR_VECTORS + CPU_VECTOR_LIMIT) : \
(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
# else
# define CPU_VECTOR_LIMIT (32 * NR_CPUS)
# define NR_IRQS \
(CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \
(NR_VECTORS + CPU_VECTOR_LIMIT) : \
(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
# endif
#else /* !CONFIG_X86_IO_APIC: */
# define NR_IRQS NR_IRQS_LEGACY
#endif
#endif /* _ASM_X86_IRQ_VECTORS_H */