linux/arch/i386/kernel/smpboot.c

1514 lines
36 KiB
C
Raw Normal View History

/*
* x86 SMP booting functions
*
* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
* (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
*
* Much of the core SMP work is based on previous work by Thomas Radke, to
* whom a great many thanks are extended.
*
* Thanks to Intel for making available several different Pentium,
* Pentium Pro and Pentium-II/Xeon MP machines.
* Original development of Linux SMP code supported by Caldera.
*
* This code is released under the GNU General Public License version 2 or
* later.
*
* Fixes
* Felix Koop : NR_CPUS used properly
* Jose Renau : Handle single CPU case.
* Alan Cox : By repeated request 8) - Total BogoMIPS report.
* Greg Wright : Fix for kernel stacks panic.
* Erich Boleyn : MP v1.4 and additional changes.
* Matthias Sattler : Changes for 2.1 kernel map.
* Michel Lespinasse : Changes for 2.1 kernel map.
* Michael Chastain : Change trampoline.S to gnu as.
* Alan Cox : Dumb bug: 'B' step PPro's are fine
* Ingo Molnar : Added APIC timers, based on code
* from Jose Renau
* Ingo Molnar : various cleanups and rewrites
* Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
* Maciej W. Rozycki : Bits for genuine 82489DX APICs
* Martin J. Bligh : Added support for multi-quad systems
* Dave Jones : Report invalid combinations of Athlon CPUs.
* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/smp_lock.h>
#include <linux/bootmem.h>
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/mc146818rtc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/arch_hooks.h>
#include <asm/nmi.h>
#include <mach_apic.h>
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
/* Set if we find a B stepping CPU */
static int __devinitdata smp_b_stepping;
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
#ifdef CONFIG_X86_HT
EXPORT_SYMBOL(smp_num_siblings);
#endif
/* Last level cache ID of each logical CPU */
int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
/* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_sibling_map);
/* representing HT and core siblings of each logical CPU */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_core_map);
/* bitmap of online cpus */
cpumask_t cpu_online_map __read_mostly;
EXPORT_SYMBOL(cpu_online_map);
cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map;
EXPORT_SYMBOL(cpu_callout_map);
cpumask_t cpu_possible_map;
EXPORT_SYMBOL(cpu_possible_map);
static cpumask_t smp_commenced_mask;
/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
* is no way to resync one AP against BP. TBD: for prescott and above, we
* should use IA64's algorithm
*/
static int __devinitdata tsc_sync_disabled;
/* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(cpu_data);
u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
{ [0 ... NR_CPUS-1] = 0xff };
EXPORT_SYMBOL(x86_cpu_to_apicid);
u8 apicid_2_node[MAX_APICID];
/*
* Trampoline 80x86 program as an array.
*/
extern unsigned char trampoline_data [];
extern unsigned char trampoline_end [];
static unsigned char *trampoline_base;
static int trampoline_exec;
static void map_cpu_to_logical_apicid(void);
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
/* State of each CPU. */
DEFINE_PER_CPU(int, cpu_state) = { 0 };
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
* has made sure it's suitably aligned.
*/
static unsigned long __devinit setup_trampoline(void)
{
memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
return virt_to_phys(trampoline_base);
}
/*
* We are called very early to get the low memory for the
* SMP bootup trampoline page.
*/
void __init smp_alloc_memory(void)
{
trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
/*
* Has to be in very low memory so we can execute
* real-mode AP code.
*/
if (__pa(trampoline_base) >= 0x9F000)
BUG();
/*
* Make the SMP trampoline executable:
*/
trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
}
/*
* The bootstrap kernel entry code has set these up. Save them for
* a given CPU
*/
static void __devinit smp_store_cpu_info(int id)
{
struct cpuinfo_x86 *c = cpu_data + id;
*c = boot_cpu_data;
if (id!=0)
identify_cpu(c);
/*
* Mask B, Pentium, but not Pentium MMX
*/
if (c->x86_vendor == X86_VENDOR_INTEL &&
c->x86 == 5 &&
c->x86_mask >= 1 && c->x86_mask <= 4 &&
c->x86_model <= 3)
/*
* Remember we have B step Pentia with bugs
*/
smp_b_stepping = 1;
/*
* Certain Athlons might work (for various values of 'work') in SMP
* but they are not certified as MP capable.
*/
if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
if (num_possible_cpus() == 1)
goto valid_k7;
/* Athlon 660/661 is valid. */
if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
goto valid_k7;
/* Duron 670 is valid */
if ((c->x86_model==7) && (c->x86_mask==0))
goto valid_k7;
/*
* Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
* It's worth noting that the A5 stepping (662) of some Athlon XP's
* have the MP bit set.
* See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
*/
if (((c->x86_model==6) && (c->x86_mask>=2)) ||
((c->x86_model==7) && (c->x86_mask>=1)) ||
(c->x86_model> 7))
if (cpu_has_mp)
goto valid_k7;
/* If we get here, it's not a certified SMP capable AMD system. */
add_taint(TAINT_UNSAFE_SMP);
}
valid_k7:
;
}
/*
* TSC synchronization.
*
* We first check whether all CPUs have their TSC's synchronized,
* then we print a warning if not, and always resync.
*/
static struct {
atomic_t start_flag;
atomic_t count_start;
atomic_t count_stop;
unsigned long long values[NR_CPUS];
} tsc __initdata = {
.start_flag = ATOMIC_INIT(0),
.count_start = ATOMIC_INIT(0),
.count_stop = ATOMIC_INIT(0),
};
#define NR_LOOPS 5
static void __init synchronize_tsc_bp(void)
{
int i;
unsigned long long t0;
unsigned long long sum, avg;
long long delta;
unsigned int one_usec;
int buggy = 0;
printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
/* convert from kcyc/sec to cyc/usec */
one_usec = cpu_khz / 1000;
atomic_set(&tsc.start_flag, 1);
wmb();
/*
* We loop a few times to get a primed instruction cache,
* then the last pass is more or less synchronized and
* the BP and APs set their cycle counters to zero all at
* once. This reduces the chance of having random offsets
* between the processors, and guarantees that the maximum
* delay between the cycle counters is never bigger than
* the latency of information-passing (cachelines) between
* two CPUs.
*/
for (i = 0; i < NR_LOOPS; i++) {
/*
* all APs synchronize but they loop on '== num_cpus'
*/
while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
cpu_relax();
atomic_set(&tsc.count_stop, 0);
wmb();
/*
* this lets the APs save their current TSC:
*/
atomic_inc(&tsc.count_start);
rdtscll(tsc.values[smp_processor_id()]);
/*
* We clear the TSC in the last loop:
*/
if (i == NR_LOOPS-1)
write_tsc(0, 0);
/*
* Wait for all APs to leave the synchronization point:
*/
while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
cpu_relax();
atomic_set(&tsc.count_start, 0);
wmb();
atomic_inc(&tsc.count_stop);
}
sum = 0;
for (i = 0; i < NR_CPUS; i++) {
if (cpu_isset(i, cpu_callout_map)) {
t0 = tsc.values[i];
sum += t0;
}
}
avg = sum;
do_div(avg, num_booting_cpus());
for (i = 0; i < NR_CPUS; i++) {
if (!cpu_isset(i, cpu_callout_map))
continue;
delta = tsc.values[i] - avg;
if (delta < 0)
delta = -delta;
/*
* We report bigger than 2 microseconds clock differences.
*/
if (delta > 2*one_usec) {
long long realdelta;
if (!buggy) {
buggy = 1;
printk("\n");
}
realdelta = delta;
do_div(realdelta, one_usec);
if (tsc.values[i] < avg)
realdelta = -realdelta;
if (realdelta)
printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
"skew, fixed it up.\n", i, realdelta);
}
}
if (!buggy)
printk("passed.\n");
}
static void __init synchronize_tsc_ap(void)
{
int i;
/*
* Not every cpu is online at the time
* this gets called, so we first wait for the BP to
* finish SMP initialization:
*/
while (!atomic_read(&tsc.start_flag))
cpu_relax();
for (i = 0; i < NR_LOOPS; i++) {
atomic_inc(&tsc.count_start);
while (atomic_read(&tsc.count_start) != num_booting_cpus())
cpu_relax();
rdtscll(tsc.values[smp_processor_id()]);
if (i == NR_LOOPS-1)
write_tsc(0, 0);
atomic_inc(&tsc.count_stop);
while (atomic_read(&tsc.count_stop) != num_booting_cpus())
cpu_relax();
}
}
#undef NR_LOOPS
extern void calibrate_delay(void);
static atomic_t init_deasserted;
static void __devinit smp_callin(void)
{
int cpuid, phys_id;
unsigned long timeout;
/*
* If waken up by an INIT in an 82489DX configuration
* we may get here before an INIT-deassert IPI reaches
* our local APIC. We have to wait for the IPI or we'll
* lock up on an APIC access.
*/
wait_for_init_deassert(&init_deasserted);
/*
* (This works even if the APIC is not enabled.)
*/
phys_id = GET_APIC_ID(apic_read(APIC_ID));
cpuid = smp_processor_id();
if (cpu_isset(cpuid, cpu_callin_map)) {
printk("huh, phys CPU#%d, CPU#%d already present??\n",
phys_id, cpuid);
BUG();
}
Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
/*
* STARTUP IPIs are fragile beasts as they might sometimes
* trigger some glue motherboard logic. Complete APIC bus
* silence for 1 second, this overestimates the time the
* boot CPU is spending to send the up to 2 STARTUP IPIs
* by a factor of two. This should be enough.
*/
/*
* Waiting 2s total for startup (udelay is not yet working)
*/
timeout = jiffies + 2*HZ;
while (time_before(jiffies, timeout)) {
/*
* Has the boot CPU finished it's STARTUP sequence?
*/
if (cpu_isset(cpuid, cpu_callout_map))
break;
rep_nop();
}
if (!time_before(jiffies, timeout)) {
printk("BUG: CPU%d started up but did not get a callout!\n",
cpuid);
BUG();
}
/*
* the boot CPU has finished the init stage and is spinning
* on callin_map until we finish. We are free to set up this
* CPU, first the APIC. (this is probably redundant on most
* boards)
*/
Dprintk("CALLIN, before setup_local_APIC().\n");
smp_callin_clear_local_apic();
setup_local_APIC();
map_cpu_to_logical_apicid();
/*
* Get our bogomips.
*/
calibrate_delay();
Dprintk("Stack at about %p\n",&cpuid);
/*
* Save our processor parameters
*/
smp_store_cpu_info(cpuid);
disable_APIC_timer();
/*
* Allow the master to continue.
*/
cpu_set(cpuid, cpu_callin_map);
/*
* Synchronize the TSC with the BP
*/
if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
synchronize_tsc_ap();
}
static int cpucount;
/* maps the cpu to the sched domain representing multi-core */
cpumask_t cpu_coregroup_map(int cpu)
{
struct cpuinfo_x86 *c = cpu_data + cpu;
/*
* For perf, we return last level cache shared map.
* And for power savings, we return cpu_core_map
*/
if (sched_mc_power_savings || sched_smt_power_savings)
return cpu_core_map[cpu];
else
return c->llc_shared_map;
}
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
static inline void
set_cpu_sibling_map(int cpu)
{
int i;
struct cpuinfo_x86 *c = cpu_data;
cpu_set(cpu, cpu_sibling_setup_map);
if (smp_num_siblings > 1) {
for_each_cpu_mask(i, cpu_sibling_setup_map) {
if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
c[cpu].cpu_core_id == c[i].cpu_core_id) {
cpu_set(i, cpu_sibling_map[cpu]);
cpu_set(cpu, cpu_sibling_map[i]);
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
cpu_set(i, c[cpu].llc_shared_map);
cpu_set(cpu, c[i].llc_shared_map);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
cpu_set(cpu, c[cpu].llc_shared_map);
if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
c[cpu].booted_cores = 1;
return;
}
for_each_cpu_mask(i, cpu_sibling_setup_map) {
if (cpu_llc_id[cpu] != BAD_APICID &&
cpu_llc_id[cpu] == cpu_llc_id[i]) {
cpu_set(i, c[cpu].llc_shared_map);
cpu_set(cpu, c[i].llc_shared_map);
}
if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
/*
* Does this new cpu bringup a new core?
*/
if (cpus_weight(cpu_sibling_map[cpu]) == 1) {
/*
* for each core in package, increment
* the booted_cores for this new cpu
*/
if (first_cpu(cpu_sibling_map[i]) == i)
c[cpu].booted_cores++;
/*
* increment the core count for all
* the other cpus in this package
*/
if (i != cpu)
c[i].booted_cores++;
} else if (i != cpu && !c[cpu].booted_cores)
c[cpu].booted_cores = c[i].booted_cores;
}
}
}
/*
* Activate a secondary processor.
*/
static void __devinit start_secondary(void *unused)
{
/*
* Dont put anything before smp_callin(), SMP
* booting is too fragile that we want to limit the
* things done here to the most necessary things.
*/
cpu_init();
preempt_disable();
smp_callin();
while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
rep_nop();
setup_secondary_APIC_clock();
if (nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
enable_NMI_through_LVT0(NULL);
enable_8259A_irq(0);
}
enable_APIC_timer();
/*
* low-memory mappings have been cleared, flush them from
* the local TLBs too.
*/
local_flush_tlb();
/* This must be done before setting cpu_online_map */
set_cpu_sibling_map(raw_smp_processor_id());
wmb();
/*
* We need to hold call_lock, so there is no inconsistency
* between the time smp_call_function() determines number of
* IPI receipients, and the time when the determination is made
* for which cpus receive the IPI. Holding this
* lock helps us to not include this cpu in a currently in progress
* smp_call_function().
*/
lock_ipi_call_lock();
cpu_set(smp_processor_id(), cpu_online_map);
unlock_ipi_call_lock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
wmb();
cpu_idle();
}
/*
* Everything has been set up for the secondary
* CPUs - they just need to reload everything
* from the task structure
* This function must not return.
*/
void __devinit initialize_secondary(void)
{
/*
* We don't actually need to load the full TSS,
* basically just the stack pointer and the eip.
*/
asm volatile(
"movl %0,%%esp\n\t"
"jmp *%1"
:
:"r" (current->thread.esp),"r" (current->thread.eip));
}
extern struct {
void * esp;
unsigned short ss;
} stack_start;
#ifdef CONFIG_NUMA
/* which logical CPUs are on which nodes */
cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
{ [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
EXPORT_SYMBOL(node_2_cpu_mask);
/* which node each logical CPU is on */
int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
EXPORT_SYMBOL(cpu_2_node);
/* set up a mapping between cpu and node. */
static inline void map_cpu_to_node(int cpu, int node)
{
printk("Mapping cpu %d to node %d\n", cpu, node);
cpu_set(cpu, node_2_cpu_mask[node]);
cpu_2_node[cpu] = node;
}
/* undo a mapping between cpu and node. */
static inline void unmap_cpu_to_node(int cpu)
{
int node;
printk("Unmapping cpu %d from all nodes\n", cpu);
for (node = 0; node < MAX_NUMNODES; node ++)
cpu_clear(cpu, node_2_cpu_mask[node]);
cpu_2_node[cpu] = 0;
}
#else /* !CONFIG_NUMA */
#define map_cpu_to_node(cpu, node) ({})
#define unmap_cpu_to_node(cpu) ({})
#endif /* CONFIG_NUMA */
u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
int apicid = logical_smp_processor_id();
int node = apicid_to_node(apicid);
if (!node_online(node))
node = first_online_node;
cpu_2_logical_apicid[cpu] = apicid;
map_cpu_to_node(cpu, node);
}
static void unmap_cpu_to_logical_apicid(int cpu)
{
cpu_2_logical_apicid[cpu] = BAD_APICID;
unmap_cpu_to_node(cpu);
}
#if APIC_DEBUG
static inline void __inquire_remote_apic(int apicid)
{
int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
char *names[] = { "ID", "VERSION", "SPIV" };
int timeout, status;
printk("Inquiring remote APIC #%d...\n", apicid);
for (i = 0; i < ARRAY_SIZE(regs); i++) {
printk("... APIC #%d %s: ", apicid, names[i]);
/*
* Wait for idle.
*/
apic_wait_icr_idle();
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
timeout = 0;
do {
udelay(100);
status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
} while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
switch (status) {
case APIC_ICR_RR_VALID:
status = apic_read(APIC_RRR);
printk("%08x\n", status);
break;
default:
printk("failed\n");
}
}
}
#endif
#ifdef WAKE_SECONDARY_VIA_NMI
/*
* Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
*/
static int __devinit
wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
{
unsigned long send_status = 0, accept_status = 0;
int timeout, maxlvt;
/* Target chip */
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
/* Boot on the stack */
/* Kick the second */
apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
Dprintk("Waiting for send to finish...\n");
timeout = 0;
do {
Dprintk("+");
udelay(100);
send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
} while (send_status && (timeout++ < 1000));
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
/*
* Due to the Pentium erratum 3AP.
*/
maxlvt = get_maxlvt();
if (maxlvt > 3) {
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
}
accept_status = (apic_read(APIC_ESR) & 0xEF);
Dprintk("NMI sent.\n");
if (send_status)
printk("APIC never delivered???\n");
if (accept_status)
printk("APIC delivery error (%lx).\n", accept_status);
return (send_status | accept_status);
}
#endif /* WAKE_SECONDARY_VIA_NMI */
#ifdef WAKE_SECONDARY_VIA_INIT
static int __devinit
wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
{
unsigned long send_status = 0, accept_status = 0;
int maxlvt, timeout, num_starts, j;
/*
* Be paranoid about clearing APIC errors.
*/
if (APIC_INTEGRATED(apic_version[phys_apicid])) {
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
Dprintk("Asserting INIT.\n");
/*
* Turn INIT on target chip
*/
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
/*
* Send IPI
*/
apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
| APIC_DM_INIT);
Dprintk("Waiting for send to finish...\n");
timeout = 0;
do {
Dprintk("+");
udelay(100);
send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
} while (send_status && (timeout++ < 1000));
mdelay(10);
Dprintk("Deasserting INIT.\n");
/* Target chip */
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
/* Send IPI */
apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
Dprintk("Waiting for send to finish...\n");
timeout = 0;
do {
Dprintk("+");
udelay(100);
send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
} while (send_status && (timeout++ < 1000));
atomic_set(&init_deasserted, 1);
/*
* Should we send STARTUP IPIs ?
*
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
if (APIC_INTEGRATED(apic_version[phys_apicid]))
num_starts = 2;
else
num_starts = 0;
/*
* Run STARTUP IPI loop.
*/
Dprintk("#startup loops: %d.\n", num_starts);
maxlvt = get_maxlvt();
for (j = 1; j <= num_starts; j++) {
Dprintk("Sending STARTUP #%d.\n",j);
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
Dprintk("After apic_write.\n");
/*
* STARTUP IPI
*/
/* Target chip */
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
/* Boot on the stack */
/* Kick the second */
apic_write_around(APIC_ICR, APIC_DM_STARTUP
| (start_eip >> 12));
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(300);
Dprintk("Startup point 1.\n");
Dprintk("Waiting for send to finish...\n");
timeout = 0;
do {
Dprintk("+");
udelay(100);
send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
} while (send_status && (timeout++ < 1000));
/*
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
/*
* Due to the Pentium erratum 3AP.
*/
if (maxlvt > 3) {
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
}
accept_status = (apic_read(APIC_ESR) & 0xEF);
if (send_status || accept_status)
break;
}
Dprintk("After Startup.\n");
if (send_status)
printk("APIC never delivered???\n");
if (accept_status)
printk("APIC delivery error (%lx).\n", accept_status);
return (send_status | accept_status);
}
#endif /* WAKE_SECONDARY_VIA_INIT */
extern cpumask_t cpu_initialized;
static inline int alloc_cpu_id(void)
{
cpumask_t tmp_map;
int cpu;
cpus_complement(tmp_map, cpu_present_map);
cpu = first_cpu(tmp_map);
if (cpu >= NR_CPUS)
return -ENODEV;
return cpu;
}
#ifdef CONFIG_HOTPLUG_CPU
static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
static inline struct task_struct * alloc_idle_task(int cpu)
{
struct task_struct *idle;
if ((idle = cpu_idle_tasks[cpu]) != NULL) {
/* initialize thread_struct. we really want to avoid destroy
* idle tread
*/
idle->thread.esp = (unsigned long)task_pt_regs(idle);
init_idle(idle, cpu);
return idle;
}
idle = fork_idle(cpu);
if (!IS_ERR(idle))
cpu_idle_tasks[cpu] = idle;
return idle;
}
#else
#define alloc_idle_task(cpu) fork_idle(cpu)
#endif
static int __devinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
* Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
*/
{
struct task_struct *idle;
unsigned long boot_error;
int timeout;
unsigned long start_eip;
unsigned short nmi_high = 0, nmi_low = 0;
++cpucount;
[PATCH] x86: SMP alternatives Implement SMP alternatives, i.e. switching at runtime between different code versions for UP and SMP. The code can patch both SMP->UP and UP->SMP. The UP->SMP case is useful for CPU hotplug. With CONFIG_CPU_HOTPLUG enabled the code switches to UP at boot time and when the number of CPUs goes down to 1, and switches to SMP when the number of CPUs goes up to 2. Without CONFIG_CPU_HOTPLUG or on non-SMP-capable systems the code is patched once at boot time (if needed) and the tables are released afterwards. The changes in detail: * The current alternatives bits are moved to a separate file, the SMP alternatives code is added there. * The patch adds some new elf sections to the kernel: .smp_altinstructions like .altinstructions, also contains a list of alt_instr structs. .smp_altinstr_replacement like .altinstr_replacement, but also has some space to save original instruction before replaving it. .smp_locks list of pointers to lock prefixes which can be nop'ed out on UP. The first two are used to replace more complex instruction sequences such as spinlocks and semaphores. It would be possible to deal with the lock prefixes with that as well, but by handling them as special case the table sizes become much smaller. * The sections are page-aligned and padded up to page size, so they can be free if they are not needed. * Splitted the code to release init pages to a separate function and use it to release the elf sections if they are unused. Signed-off-by: Gerd Hoffmann <kraxel@suse.de> Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-23 10:59:32 +00:00
alternatives_smp_switch(1);
/*
* We can't use kernel_thread since we must avoid to
* reschedule the child.
*/
idle = alloc_idle_task(cpu);
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
idle->thread.eip = (unsigned long) start_secondary;
/* start_eip had better be page-aligned! */
start_eip = setup_trampoline();
/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
/* Stack for startup_32 can be just as for start_secondary onwards */
stack_start.esp = (void *) idle->thread.esp;
irq_ctx_init(cpu);
x86_cpu_to_apicid[cpu] = apicid;
/*
* This grunge runs the startup process for
* the targeted processor.
*/
atomic_set(&init_deasserted, 0);
Dprintk("Setting warm reset code and vector.\n");
store_NMI_vector(&nmi_high, &nmi_low);
smpboot_setup_warm_reset_vector(start_eip);
/*
* Starting actual IPI sequence...
*/
boot_error = wakeup_secondary_cpu(apicid, start_eip);
if (!boot_error) {
/*
* allow APs to start initializing.
*/
Dprintk("Before Callout %d.\n", cpu);
cpu_set(cpu, cpu_callout_map);
Dprintk("After Callout %d.\n", cpu);
/*
* Wait 5s total for a response
*/
for (timeout = 0; timeout < 50000; timeout++) {
if (cpu_isset(cpu, cpu_callin_map))
break; /* It has booted */
udelay(100);
}
if (cpu_isset(cpu, cpu_callin_map)) {
/* number CPUs logically, starting from 1 (BSP is 0) */
Dprintk("OK.\n");
printk("CPU%d: ", cpu);
print_cpu_info(&cpu_data[cpu]);
Dprintk("CPU has booted.\n");
} else {
boot_error= 1;
if (*((volatile unsigned char *)trampoline_base)
== 0xA5)
/* trampoline started but...? */
printk("Stuck ??\n");
else
/* trampoline code not run */
printk("Not responding.\n");
inquire_remote_apic(apicid);
}
}
if (boot_error) {
/* Try to put things back the way they were before ... */
unmap_cpu_to_logical_apicid(cpu);
cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
cpucount--;
} else {
x86_cpu_to_apicid[cpu] = apicid;
cpu_set(cpu, cpu_present_map);
}
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
return boot_error;
}
#ifdef CONFIG_HOTPLUG_CPU
void cpu_exit_clear(void)
{
int cpu = raw_smp_processor_id();
idle_task_exit();
cpucount --;
cpu_uninit();
irq_ctx_exit(cpu);
cpu_clear(cpu, cpu_callout_map);
cpu_clear(cpu, cpu_callin_map);
cpu_clear(cpu, smp_commenced_mask);
unmap_cpu_to_logical_apicid(cpu);
}
struct warm_boot_cpu_info {
struct completion *complete;
int apicid;
int cpu;
};
static void __cpuinit do_warm_boot_cpu(void *p)
{
struct warm_boot_cpu_info *info = p;
do_boot_cpu(info->apicid, info->cpu);
complete(info->complete);
}
static int __cpuinit __smp_prepare_cpu(int cpu)
{
DECLARE_COMPLETION_ONSTACK(done);
struct warm_boot_cpu_info info;
struct work_struct task;
int apicid, ret;
struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
apicid = x86_cpu_to_apicid[cpu];
if (apicid == BAD_APICID) {
ret = -ENODEV;
goto exit;
}
/*
* the CPU isn't initialized at boot time, allocate gdt table here.
* cpu_init will initialize it
*/
if (!cpu_gdt_descr->address) {
cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
if (!cpu_gdt_descr->address)
printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
ret = -ENOMEM;
goto exit;
}
info.complete = &done;
info.apicid = apicid;
info.cpu = cpu;
INIT_WORK(&task, do_warm_boot_cpu, &info);
tsc_sync_disabled = 1;
/* init low mem mapping */
clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
KERNEL_PGD_PTRS);
flush_tlb_all();
schedule_work(&task);
wait_for_completion(&done);
tsc_sync_disabled = 0;
zap_low_mappings();
ret = 0;
exit:
return ret;
}
#endif
static void smp_tune_scheduling (void)
{
unsigned long cachesize; /* kB */
unsigned long bandwidth = 350; /* MB/s */
/*
* Rough estimation for SMP scheduling, this is the number of
* cycles it takes for a fully memory-limited process to flush
* the SMP-local cache.
*
* (For a P5 this pretty much means we will choose another idle
* CPU almost always at wakeup time (this is due to the small
* L1 cache), on PIIs it's around 50-100 usecs, depending on
* the cache size)
*/
if (!cpu_khz) {
/*
* this basically disables processor-affinity
* scheduling on SMP without a TSC.
*/
return;
} else {
cachesize = boot_cpu_data.x86_cache_size;
if (cachesize == -1) {
cachesize = 16; /* Pentiums, 2x8kB cache */
bandwidth = 100;
}
[PATCH] scheduler cache-hot-autodetect ) From: Ingo Molnar <mingo@elte.hu> This is the latest version of the scheduler cache-hot-auto-tune patch. The first problem was that detection time scaled with O(N^2), which is unacceptable on larger SMP and NUMA systems. To solve this: - I've added a 'domain distance' function, which is used to cache measurement results. Each distance is only measured once. This means that e.g. on NUMA distances of 0, 1 and 2 might be measured, on HT distances 0 and 1, and on SMP distance 0 is measured. The code walks the domain tree to determine the distance, so it automatically follows whatever hierarchy an architecture sets up. This cuts down on the boot time significantly and removes the O(N^2) limit. The only assumption is that migration costs can be expressed as a function of domain distance - this covers the overwhelming majority of existing systems, and is a good guess even for more assymetric systems. [ People hacking systems that have assymetries that break this assumption (e.g. different CPU speeds) should experiment a bit with the cpu_distance() function. Adding a ->migration_distance factor to the domain structure would be one possible solution - but lets first see the problem systems, if they exist at all. Lets not overdesign. ] Another problem was that only a single cache-size was used for measuring the cost of migration, and most architectures didnt set that variable up. Furthermore, a single cache-size does not fit NUMA hierarchies with L3 caches and does not fit HT setups, where different CPUs will often have different 'effective cache sizes'. To solve this problem: - Instead of relying on a single cache-size provided by the platform and sticking to it, the code now auto-detects the 'effective migration cost' between two measured CPUs, via iterating through a wide range of cachesizes. The code searches for the maximum migration cost, which occurs when the working set of the test-workload falls just below the 'effective cache size'. I.e. real-life optimized search is done for the maximum migration cost, between two real CPUs. This, amongst other things, has the positive effect hat if e.g. two CPUs share a L2/L3 cache, a different (and accurate) migration cost will be found than between two CPUs on the same system that dont share any caches. (The reliable measurement of migration costs is tricky - see the source for details.) Furthermore i've added various boot-time options to override/tune migration behavior. Firstly, there's a blanket override for autodetection: migration_cost=1000,2000,3000 will override the depth 0/1/2 values with 1msec/2msec/3msec values. Secondly, there's a global factor that can be used to increase (or decrease) the autodetected values: migration_factor=120 will increase the autodetected values by 20%. This option is useful to tune things in a workload-dependent way - e.g. if a workload is cache-insensitive then CPU utilization can be maximized by specifying migration_factor=0. I've tested the autodetection code quite extensively on x86, on 3 P3/Xeon/2MB, and the autodetected values look pretty good: Dual Celeron (128K L2 cache): --------------------- migration cost matrix (max_cache_size: 131072, cpu: 467 MHz): --------------------- [00] [01] [00]: - 1.7(1) [01]: 1.7(1) - --------------------- cacheflush times [2]: 0.0 (0) 1.7 (1784008) --------------------- Here the slow memory subsystem dominates system performance, and even though caches are small, the migration cost is 1.7 msecs. Dual HT P4 (512K L2 cache): --------------------- migration cost matrix (max_cache_size: 524288, cpu: 2379 MHz): --------------------- [00] [01] [02] [03] [00]: - 0.4(1) 0.0(0) 0.4(1) [01]: 0.4(1) - 0.4(1) 0.0(0) [02]: 0.0(0) 0.4(1) - 0.4(1) [03]: 0.4(1) 0.0(0) 0.4(1) - --------------------- cacheflush times [2]: 0.0 (33900) 0.4 (448514) --------------------- Here it can be seen that there is no migration cost between two HT siblings (CPU#0/2 and CPU#1/3 are separate physical CPUs). A fast memory system makes inter-physical-CPU migration pretty cheap: 0.4 msecs. 8-way P3/Xeon [2MB L2 cache]: --------------------- migration cost matrix (max_cache_size: 2097152, cpu: 700 MHz): --------------------- [00] [01] [02] [03] [04] [05] [06] [07] [00]: - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) [01]: 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) [02]: 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) [03]: 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) [04]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) [05]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) [06]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) [07]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - --------------------- cacheflush times [2]: 0.0 (0) 19.2 (19281756) --------------------- This one has huge caches and a relatively slow memory subsystem - so the migration cost is 19 msecs. Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Cc: <wilder@us.ibm.com> Signed-off-by: John Hawkes <hawkes@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-12 09:05:30 +00:00
max_cache_size = cachesize * 1024;
}
}
/*
* Cycle through the processors sending APIC IPIs to boot each.
*/
static int boot_cpu_logical_apicid;
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void *xquad_portio;
#ifdef CONFIG_X86_NUMAQ
EXPORT_SYMBOL(xquad_portio);
#endif
static void __init smp_boot_cpus(unsigned int max_cpus)
{
int apicid, cpu, bit, kicked;
unsigned long bogosum = 0;
/*
* Setup boot CPU information
*/
smp_store_cpu_info(0); /* Final full version of the data */
printk("CPU%d: ", 0);
print_cpu_info(&cpu_data[0]);
boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
boot_cpu_logical_apicid = logical_smp_processor_id();
x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
current_thread_info()->cpu = 0;
smp_tune_scheduling();
set_cpu_sibling_map(0);
/*
* If we couldn't find an SMP configuration at boot time,
* get out of here now!
*/
if (!smp_found_config && !acpi_lapic) {
printk(KERN_NOTICE "SMP motherboard not detected.\n");
smpboot_clear_io_apic_irqs();
phys_cpu_present_map = physid_mask_of_physid(0);
if (APIC_init_uniprocessor())
printk(KERN_NOTICE "Local APIC not detected."
" Using dummy APIC emulation.\n");
map_cpu_to_logical_apicid();
cpu_set(0, cpu_sibling_map[0]);
cpu_set(0, cpu_core_map[0]);
return;
}
/*
* Should not be necessary because the MP table should list the boot
* CPU too, but we do it for the sake of robustness anyway.
* Makes no sense to do this check in clustered apic mode, so skip it
*/
if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
boot_cpu_physical_apicid);
physid_set(hard_smp_processor_id(), phys_cpu_present_map);
}
/*
* If we couldn't find a local APIC, then get out of here now!
*/
if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
boot_cpu_physical_apicid);
printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
smpboot_clear_io_apic_irqs();
phys_cpu_present_map = physid_mask_of_physid(0);
cpu_set(0, cpu_sibling_map[0]);
cpu_set(0, cpu_core_map[0]);
return;
}
verify_local_APIC();
/*
* If SMP should be disabled, then really disable it!
*/
if (!max_cpus) {
smp_found_config = 0;
printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
smpboot_clear_io_apic_irqs();
phys_cpu_present_map = physid_mask_of_physid(0);
cpu_set(0, cpu_sibling_map[0]);
cpu_set(0, cpu_core_map[0]);
return;
}
connect_bsp_APIC();
setup_local_APIC();
map_cpu_to_logical_apicid();
setup_portio_remap();
/*
* Scan the CPU present map and fire up the other CPUs via do_boot_cpu
*
* In clustered apic mode, phys_cpu_present_map is a constructed thus:
* bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
* clustered apic ID.
*/
Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
kicked = 1;
for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
apicid = cpu_present_to_apicid(bit);
/*
* Don't even attempt to start the boot CPU!
*/
if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
continue;
if (!check_apicid_present(bit))
continue;
if (max_cpus <= cpucount+1)
continue;
if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
printk("CPU #%d not responding - cannot use it.\n",
apicid);
else
++kicked;
}
/*
* Cleanup possible dangling ends...
*/
smpboot_restore_warm_reset_vector();
/*
* Allow the user to impress friends.
*/
Dprintk("Before bogomips.\n");
for (cpu = 0; cpu < NR_CPUS; cpu++)
if (cpu_isset(cpu, cpu_callout_map))
bogosum += cpu_data[cpu].loops_per_jiffy;
printk(KERN_INFO
"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
cpucount+1,
bogosum/(500000/HZ),
(bogosum/(5000/HZ))%100);
Dprintk("Before bogocount - setting activated=1.\n");
if (smp_b_stepping)
printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
/*
* Don't taint if we are running SMP kernel on a single non-MP
* approved Athlon
*/
if (tainted & TAINT_UNSAFE_SMP) {
if (cpucount)
printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
else
tainted &= ~TAINT_UNSAFE_SMP;
}
Dprintk("Boot done.\n");
/*
* construct cpu_sibling_map[], so that we can tell sibling CPUs
* efficiently.
*/
for (cpu = 0; cpu < NR_CPUS; cpu++) {
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
}
cpu_set(0, cpu_sibling_map[0]);
cpu_set(0, cpu_core_map[0]);
smpboot_setup_io_apic();
setup_boot_APIC_clock();
/*
* Synchronize the TSC with the AP
*/
if (cpu_has_tsc && cpucount && cpu_khz)
synchronize_tsc_bp();
}
/* These are wrappers to interface to the new boot process. Someone
who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
smp_commenced_mask = cpumask_of_cpu(0);
cpu_callin_map = cpumask_of_cpu(0);
mb();
smp_boot_cpus(max_cpus);
}
void __devinit smp_prepare_boot_cpu(void)
{
cpu_set(smp_processor_id(), cpu_online_map);
cpu_set(smp_processor_id(), cpu_callout_map);
cpu_set(smp_processor_id(), cpu_present_map);
cpu_set(smp_processor_id(), cpu_possible_map);
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
}
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
#ifdef CONFIG_HOTPLUG_CPU
static void
remove_siblinginfo(int cpu)
{
int sibling;
struct cpuinfo_x86 *c = cpu_data;
for_each_cpu_mask(sibling, cpu_core_map[cpu]) {
cpu_clear(cpu, cpu_core_map[sibling]);
/*
* last thread sibling in this cpu core going down
*/
if (cpus_weight(cpu_sibling_map[cpu]) == 1)
c[sibling].booted_cores--;
}
for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
cpu_clear(cpu, cpu_sibling_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
c[cpu].phys_proc_id = 0;
c[cpu].cpu_core_id = 0;
cpu_clear(cpu, cpu_sibling_setup_map);
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
}
int __cpu_disable(void)
{
cpumask_t map = cpu_online_map;
int cpu = smp_processor_id();
/*
* Perhaps use cpufreq to drop frequency, but that could go
* into generic code.
*
* We won't take down the boot processor on i386 due to some
* interrupts only being able to be serviced by the BSP.
* Especially so if we're not using an IOAPIC -zwane
*/
if (cpu == 0)
return -EBUSY;
if (nmi_watchdog == NMI_LOCAL_APIC)
stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
/* Allow any queued timer interrupts to get serviced */
local_irq_enable();
mdelay(1);
local_irq_disable();
remove_siblinginfo(cpu);
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
cpu_clear(cpu, map);
fixup_irqs(map);
/* It's now safe to remove this processor from the online map */
cpu_clear(cpu, cpu_online_map);
return 0;
}
void __cpu_die(unsigned int cpu)
{
/* We don't do anything here: idle task is faking death itself. */
unsigned int i;
for (i = 0; i < 10; i++) {
/* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
printk ("CPU %d is now offline\n", cpu);
[PATCH] x86: SMP alternatives Implement SMP alternatives, i.e. switching at runtime between different code versions for UP and SMP. The code can patch both SMP->UP and UP->SMP. The UP->SMP case is useful for CPU hotplug. With CONFIG_CPU_HOTPLUG enabled the code switches to UP at boot time and when the number of CPUs goes down to 1, and switches to SMP when the number of CPUs goes up to 2. Without CONFIG_CPU_HOTPLUG or on non-SMP-capable systems the code is patched once at boot time (if needed) and the tables are released afterwards. The changes in detail: * The current alternatives bits are moved to a separate file, the SMP alternatives code is added there. * The patch adds some new elf sections to the kernel: .smp_altinstructions like .altinstructions, also contains a list of alt_instr structs. .smp_altinstr_replacement like .altinstr_replacement, but also has some space to save original instruction before replaving it. .smp_locks list of pointers to lock prefixes which can be nop'ed out on UP. The first two are used to replace more complex instruction sequences such as spinlocks and semaphores. It would be possible to deal with the lock prefixes with that as well, but by handling them as special case the table sizes become much smaller. * The sections are page-aligned and padded up to page size, so they can be free if they are not needed. * Splitted the code to release init pages to a separate function and use it to release the elf sections if they are unused. Signed-off-by: Gerd Hoffmann <kraxel@suse.de> Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-23 10:59:32 +00:00
if (1 == num_online_cpus())
alternatives_smp_switch(0);
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
return;
}
msleep(100);
}
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
#else /* ... !CONFIG_HOTPLUG_CPU */
int __cpu_disable(void)
{
return -ENOSYS;
}
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
void __cpu_die(unsigned int cpu)
{
/* We said "no" in __cpu_disable */
BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
int __devinit __cpu_up(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
int ret=0;
/*
* We do warm boot only on cpus that had booted earlier
* Otherwise cold boot is all handled from smp_boot_cpus().
* cpu_callin_map is set during AP kickstart process. Its reset
* when a cpu is taken offline from cpu_exit_clear().
*/
if (!cpu_isset(cpu, cpu_callin_map))
ret = __smp_prepare_cpu(cpu);
if (ret)
return -EIO;
#endif
/* In case one didn't come up */
if (!cpu_isset(cpu, cpu_callin_map)) {
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
local_irq_enable();
return -EIO;
}
local_irq_enable();
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
return 0;
}
void __init smp_cpus_done(unsigned int max_cpus)
{
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
zap_low_mappings();
#ifndef CONFIG_HOTPLUG_CPU
/*
* Disable executability of the SMP trampoline:
*/
set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
#endif
}
void __init smp_intr_init(void)
{
/*
* IRQ0 must be given a fixed assignment and initialized,
* because it's used before the IO-APIC is set up.
*/
set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
*/
set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
/* IPI for invalidation */
set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
}
/*
* If the BIOS enumerates physical processors before logical,
* maxcpus=N at enumeration-time can be used to disable HT.
*/
static int __init parse_maxcpus(char *arg)
{
extern unsigned int maxcpus;
maxcpus = simple_strtoul(arg, NULL, 0);
return 0;
}
early_param("maxcpus", parse_maxcpus);