From a3cf5e6b6f2548b036921da5ab6325dc8a76e207 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 3 Aug 2008 00:01:05 -0700 Subject: [PATCH 01/14] sparc64: Need to disable preemption around smp_tsb_sync(). Based upon a bug report by Mariusz Kozlowski It uses smp_call_function_masked() now, which has a preemption-disabled requirement. Signed-off-by: David S. Miller --- arch/sparc64/mm/tsb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c index 3547937b17a2..587f8efb2e05 100644 --- a/arch/sparc64/mm/tsb.c +++ b/arch/sparc64/mm/tsb.c @@ -1,9 +1,10 @@ /* arch/sparc64/mm/tsb.c * - * Copyright (C) 2006 David S. Miller + * Copyright (C) 2006, 2008 David S. Miller */ #include +#include #include #include #include @@ -415,7 +416,9 @@ retry_tsb_alloc: tsb_context_switch(mm); /* Now force other processors to do the same. */ + preempt_disable(); smp_tsb_sync(mm); + preempt_enable(); /* Now it is safe to free the old tsb. */ kmem_cache_free(tsb_caches[old_cache_index], old_tsb); From abd9e6982815ad7bd2c70dbf4cc0c08b48229d6e Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sun, 3 Aug 2008 00:04:13 -0700 Subject: [PATCH 02/14] arch/sparc64/kernel/signal.c: removed duplicated #include Removed duplicated #include in arch/sparc64/kernel/signal.c. Signed-off-by: Huang Weiyi Signed-off-by: David S. Miller --- arch/sparc64/kernel/signal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sparc64/kernel/signal.c b/arch/sparc64/kernel/signal.c index ca5a6ae3a6e2..ec82d76dc6f2 100644 --- a/arch/sparc64/kernel/signal.c +++ b/arch/sparc64/kernel/signal.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include From 5e0797e5b84408a13260a107e2f7a49ee6342ae4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 3 Aug 2008 22:52:41 -0700 Subject: [PATCH 03/14] sparc64: Use function pointer for cross-call sending. Initialize it using the smp_setup_processor_id() hook. Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 340842e51ce1..3c6970ad774f 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -756,6 +756,8 @@ dump_cpu_list_and_out: printk("]\n"); } +static void (*xcall_deliver)(u64, u64, u64, cpumask_t); + /* Send cross call to all processors mentioned in MASK * except self. */ @@ -767,12 +769,7 @@ static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 d cpus_and(mask, mask, cpu_online_map); cpu_clear(this_cpu, mask); - if (tlb_type == spitfire) - spitfire_xcall_deliver(data0, data1, data2, mask); - else if (tlb_type == cheetah || tlb_type == cheetah_plus) - cheetah_xcall_deliver(data0, data1, data2, mask); - else - hypervisor_xcall_deliver(data0, data1, data2, mask); + xcall_deliver(data0, data1, data2, mask); /* NOTE: Caller runs local copy on master. */ put_cpu(); @@ -1202,6 +1199,16 @@ void __devinit smp_prepare_boot_cpu(void) { } +void __init smp_setup_processor_id(void) +{ + if (tlb_type == spitfire) + xcall_deliver = spitfire_xcall_deliver; + else if (tlb_type == cheetah || tlb_type == cheetah_plus) + xcall_deliver = cheetah_xcall_deliver; + else + xcall_deliver = hypervisor_xcall_deliver; +} + void __devinit smp_fill_in_sib_core_maps(void) { unsigned int i; From 622824dbb536f7bdc241eefc3e1ae31c463b4eb8 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Sun, 3 Aug 2008 23:07:18 -0700 Subject: [PATCH 04/14] sparc64: Use xcall_deliver() consistently. There remained some spots still vectoring to the appropriate *_xcall_deliver() function manually. Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 40 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 3c6970ad774f..063668feab1e 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -890,29 +890,24 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) __local_flush_dcache_page(page); } else if (cpu_online(cpu)) { void *pg_addr = page_address(page); - u64 data0; + u64 data0 = 0; if (tlb_type == spitfire) { - data0 = - ((u64)&xcall_flush_dcache_page_spitfire); + data0 = ((u64)&xcall_flush_dcache_page_spitfire); if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); - spitfire_xcall_deliver(data0, - __pa(pg_addr), - (u64) pg_addr, - mask); } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { #ifdef DCACHE_ALIASING_POSSIBLE - data0 = - ((u64)&xcall_flush_dcache_page_cheetah); - cheetah_xcall_deliver(data0, - __pa(pg_addr), - 0, mask); + data0 = ((u64)&xcall_flush_dcache_page_cheetah); #endif } + if (data0) { + xcall_deliver(data0, __pa(pg_addr), + (u64) pg_addr, mask); #ifdef CONFIG_DEBUG_DCFLUSH - atomic_inc(&dcpage_flushes_xcall); + atomic_inc(&dcpage_flushes_xcall); #endif + } } put_cpu(); @@ -920,10 +915,10 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) void flush_dcache_page_all(struct mm_struct *mm, struct page *page) { - void *pg_addr = page_address(page); cpumask_t mask = cpu_online_map; - u64 data0; + void *pg_addr; int this_cpu; + u64 data0; if (tlb_type == hypervisor) return; @@ -937,25 +932,24 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) #endif if (cpus_empty(mask)) goto flush_self; + data0 = 0; + pg_addr = page_address(page); if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); - spitfire_xcall_deliver(data0, - __pa(pg_addr), - (u64) pg_addr, - mask); } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { #ifdef DCACHE_ALIASING_POSSIBLE data0 = ((u64)&xcall_flush_dcache_page_cheetah); - cheetah_xcall_deliver(data0, - __pa(pg_addr), - 0, mask); #endif } + if (data0) { + xcall_deliver(data0, __pa(pg_addr), + (u64) pg_addr, mask); #ifdef CONFIG_DEBUG_DCFLUSH - atomic_inc(&dcpage_flushes_xcall); + atomic_inc(&dcpage_flushes_xcall); #endif + } flush_self: __local_flush_dcache_page(page); From cd5bc89debb4045d55eeffe325b97f2dfba4ddea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 3 Aug 2008 23:24:26 -0700 Subject: [PATCH 05/14] sparc64: Use cpumask_t pointers and for_each_cpu_mask_nr() in xcall_deliver. Signed-off-by: David S. 
Miller --- arch/sparc64/kernel/smp.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 063668feab1e..868625e3b661 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -459,13 +459,13 @@ again: } } -static inline void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask) +static inline void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) { u64 pstate; int i; __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate)); - for_each_cpu_mask(i, mask) + for_each_cpu_mask_nr(i, *mask) spitfire_xcall_helper(data0, data1, data2, pstate, i); } @@ -473,14 +473,17 @@ static inline void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, cpuma * packet, but we have no use for that. However we do take advantage of * the new pipelining feature (ie. dispatch to multiple cpus simultaneously). */ -static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask) +static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask_p) { u64 pstate, ver, busy_mask; int nack_busy_id, is_jbus, need_more; + cpumask_t mask; - if (cpus_empty(mask)) + if (cpus_empty(*mask_p)) return; + mask = *mask_p; + /* Unfortunately, someone at Sun had the brilliant idea to make the * busy/nack fields hard-coded by ITID number for this Ultra-III * derivative processor. @@ -511,7 +514,7 @@ retry: { int i; - for_each_cpu_mask(i, mask) { + for_each_cpu_mask_nr(i, mask) { u64 target = (i << 14) | 0x70; if (is_jbus) { @@ -550,7 +553,7 @@ retry: : : "r" (pstate)); if (unlikely(need_more)) { int i, cnt = 0; - for_each_cpu_mask(i, mask) { + for_each_cpu_mask_nr(i, mask) { cpu_clear(i, mask); cnt++; if (cnt == 32) @@ -584,7 +587,7 @@ retry: /* Clear out the mask bits for cpus which did not * NACK us. */ - for_each_cpu_mask(i, mask) { + for_each_cpu_mask_nr(i, mask) { u64 check_mask; if (is_jbus) @@ -605,16 +608,16 @@ retry: } /* Multi-cpu list version. */ -static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask) +static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) { + int cnt, retries, this_cpu, prev_sent, i; + unsigned long flags, status; + cpumask_t error_mask; struct trap_per_cpu *tb; u16 *cpu_list; u64 *mondo; - cpumask_t error_mask; - unsigned long flags, status; - int cnt, retries, this_cpu, prev_sent, i; - if (cpus_empty(mask)) + if (cpus_empty(*mask)) return; /* We have to do this whole thing with interrupts fully disabled. @@ -642,7 +645,7 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t /* Setup the initial cpu list. */ cnt = 0; - for_each_cpu_mask(i, mask) + for_each_cpu_mask_nr(i, *mask) cpu_list[cnt++] = i; cpus_clear(error_mask); @@ -729,7 +732,7 @@ fatal_mondo_cpu_error: "were in error state\n", this_cpu); printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu); - for_each_cpu_mask(i, error_mask) + for_each_cpu_mask_nr(i, error_mask) printk("%d ", i); printk("]\n"); return; @@ -756,7 +759,7 @@ dump_cpu_list_and_out: printk("]\n"); } -static void (*xcall_deliver)(u64, u64, u64, cpumask_t); +static void (*xcall_deliver)(u64, u64, u64, const cpumask_t *); /* Send cross call to all processors mentioned in MASK * except self. 
@@ -769,7 +772,7 @@ static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 d cpus_and(mask, mask, cpu_online_map); cpu_clear(this_cpu, mask); - xcall_deliver(data0, data1, data2, mask); + xcall_deliver(data0, data1, data2, &mask); /* NOTE: Caller runs local copy on master. */ put_cpu(); @@ -903,7 +906,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) } if (data0) { xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, mask); + (u64) pg_addr, &mask); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes_xcall); #endif @@ -945,7 +948,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) } if (data0) { xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, mask); + (u64) pg_addr, &mask); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes_xcall); #endif From 199266305311d060b6e057fa5c7de01f218bb911 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 3 Aug 2008 23:56:28 -0700 Subject: [PATCH 06/14] sparc64: Call xcall_deliver() directly in some cases. For these cases the callers make sure: 1) The cpus indicated are online. 2) The current cpu is not in the list of indicated cpus. Therefore we can pass a pointer to the mask directly. One of the motivations in this transformation is to make use of "&cpumask_of_cpu(cpu)" which evaluates to a pointer to constant data in the kernel and thus takes up no stack space. Hopefully someone in the future will change the interface of arch_send_call_function_ipi() such that it passes a const cpumask_t pointer so that this will optimize ever further. Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 868625e3b661..47b0738ea4be 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -792,16 +792,15 @@ extern unsigned long xcall_call_function; void arch_send_call_function_ipi(cpumask_t mask) { - smp_cross_call_masked(&xcall_call_function, 0, 0, 0, mask); + xcall_deliver((u64) &xcall_call_function, 0, 0, &mask); } extern unsigned long xcall_call_function_single; void arch_send_call_function_single_ipi(int cpu) { - cpumask_t mask = cpumask_of_cpu(cpu); - - smp_cross_call_masked(&xcall_call_function_single, 0, 0, 0, mask); + xcall_deliver((u64) &xcall_call_function_single, 0, 0, + &cpumask_of_cpu(cpu)); } /* Send cross call to all processors except self. */ @@ -959,24 +958,6 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) put_cpu(); } -static void __smp_receive_signal_mask(cpumask_t mask) -{ - smp_cross_call_masked(&xcall_receive_signal, 0, 0, 0, mask); -} - -void smp_receive_signal(int cpu) -{ - cpumask_t mask = cpumask_of_cpu(cpu); - - if (cpu_online(cpu)) - __smp_receive_signal_mask(mask); -} - -void smp_receive_signal_client(int irq, struct pt_regs *regs) -{ - clear_softint(1 << irq); -} - void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs) { struct mm_struct *mm; @@ -1374,7 +1355,13 @@ void __init smp_cpus_done(unsigned int max_cpus) void smp_send_reschedule(int cpu) { - smp_receive_signal(cpu); + xcall_deliver((u64) &xcall_receive_signal, 0, 0, + &cpumask_of_cpu(cpu)); +} + +void smp_receive_signal_client(int irq, struct pt_regs *regs) +{ + clear_softint(1 << irq); } /* This is a nop because we capture all other cpus From 24445a4ac9d3fdd3f96f0ad277cb2ba274470d94 Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Mon, 4 Aug 2008 00:02:31 -0700 Subject: [PATCH 07/14] sparc64: Directly call xcall_deliver() in smp_start_sync_tick_client. We know the cpu is online and not the current cpu here. Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 47b0738ea4be..8c9e75dc1e65 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -782,10 +782,8 @@ extern unsigned long xcall_sync_tick; static void smp_start_sync_tick_client(int cpu) { - cpumask_t mask = cpumask_of_cpu(cpu); - - smp_cross_call_masked(&xcall_sync_tick, - 0, 0, 0, mask); + xcall_deliver((u64) &xcall_sync_tick, 0, 0, + &cpumask_of_cpu(cpu)); } extern unsigned long xcall_call_function; From 91a4231cc2efb9134373bb2a93be96a284955607 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 4 Aug 2008 00:51:18 -0700 Subject: [PATCH 08/14] sparc64: Make smp_cross_call_masked() take a cpumask_t pointer. Ideally this could be simplified further such that we could pass the pointer down directly into the xcall_deliver() implementation. But if we do that we need to do the "cpu_online(cpu)" and "cpu != self" checks down in those functions. Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 8c9e75dc1e65..740259d89552 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -761,15 +761,19 @@ dump_cpu_list_and_out: static void (*xcall_deliver)(u64, u64, u64, const cpumask_t *); -/* Send cross call to all processors mentioned in MASK - * except self. +/* Send cross call to all processors mentioned in MASK_P + * except self. Really, there are only two cases currently, + * "&cpu_online_map" and "&mm->cpu_vm_mask". */ -static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, cpumask_t mask) +static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask_p) { u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff)); int this_cpu = get_cpu(); + cpumask_t mask; - cpus_and(mask, mask, cpu_online_map); + mask = *mask_p; + if (mask_p != &cpu_online_map) + cpus_and(mask, mask, cpu_online_map); cpu_clear(this_cpu, mask); xcall_deliver(data0, data1, data2, &mask); @@ -803,7 +807,7 @@ void arch_send_call_function_single_ipi(int cpu) /* Send cross call to all processors except self. */ #define smp_cross_call(func, ctx, data1, data2) \ - smp_cross_call_masked(func, ctx, data1, data2, cpu_online_map) + smp_cross_call_masked(func, ctx, data1, data2, &cpu_online_map) void smp_call_function_client(int irq, struct pt_regs *regs) { @@ -1056,7 +1060,7 @@ void smp_flush_tlb_mm(struct mm_struct *mm) smp_cross_call_masked(&xcall_flush_tlb_mm, ctx, 0, 0, - mm->cpu_vm_mask); + &mm->cpu_vm_mask); local_flush_and_out: __flush_tlb_mm(ctx, SECONDARY_CONTEXT); @@ -1074,7 +1078,7 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long else smp_cross_call_masked(&xcall_flush_tlb_pending, ctx, nr, (unsigned long) vaddrs, - mm->cpu_vm_mask); + &mm->cpu_vm_mask); __flush_tlb_pending(ctx, nr, vaddrs); From 43f589235e223418d5807ebcddca73ec8a45f52c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 4 Aug 2008 16:13:51 -0700 Subject: [PATCH 09/14] sparc64: Always allocate the send mondo blocks, even on non-sun4v. 
The idea is that we'll use this cpu list array and mondo block even for non-hypervisor platforms. Signed-off-by: David S. Miller --- arch/sparc64/kernel/irq.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index c481673d249c..ba43d85e8dde 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -915,12 +915,18 @@ static void __init sun4v_init_mondo_queues(void) alloc_one_mondo(&tb->nonresum_mondo_pa, tb->nonresum_qmask); alloc_one_kbuf(&tb->nonresum_kernel_buf_pa, tb->nonresum_qmask); + } +} + +static void __init init_send_mondo_info(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct trap_per_cpu *tb = &trap_block[cpu]; init_cpu_send_mondo_info(tb); } - - /* Load up the boot cpu's entries. */ - sun4v_register_mondo_queues(hard_smp_processor_id()); } static struct irqaction timer_irq_action = { @@ -949,6 +955,13 @@ void __init init_IRQ(void) if (tlb_type == hypervisor) sun4v_init_mondo_queues(); + init_send_mondo_info(); + + if (tlb_type == hypervisor) { + /* Load up the boot cpu's entries. */ + sun4v_register_mondo_queues(hard_smp_processor_id()); + } + /* We need to clear any IRQ's pending in the soft interrupt * registers, a spurious one could be left around from the * PROM timer which we just disabled. From deb16999e452b74011dac5b2fe0d6258df81a2a1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 4 Aug 2008 16:16:20 -0700 Subject: [PATCH 10/14] sparc64: Make all xcall_deliver's go through common helper function. This just facilitates the next changeset where we'll be building the cpu list and mondo block in this helper function. Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 740259d89552..20f4e291c74a 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -759,7 +759,12 @@ dump_cpu_list_and_out: printk("]\n"); } -static void (*xcall_deliver)(u64, u64, u64, const cpumask_t *); +static void (*xcall_deliver_impl)(u64, u64, u64, const cpumask_t *); + +static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) +{ + xcall_deliver_impl(data0, data1, data2, mask); +} /* Send cross call to all processors mentioned in MASK_P * except self. Really, there are only two cases currently, @@ -1182,11 +1187,11 @@ void __devinit smp_prepare_boot_cpu(void) void __init smp_setup_processor_id(void) { if (tlb_type == spitfire) - xcall_deliver = spitfire_xcall_deliver; + xcall_deliver_impl = spitfire_xcall_deliver; else if (tlb_type == cheetah || tlb_type == cheetah_plus) - xcall_deliver = cheetah_xcall_deliver; + xcall_deliver_impl = cheetah_xcall_deliver; else - xcall_deliver = hypervisor_xcall_deliver; + xcall_deliver_impl = hypervisor_xcall_deliver; } void __devinit smp_fill_in_sib_core_maps(void) From c02a5119e862dea9a1361182840d41ae1fe24227 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 4 Aug 2008 16:18:40 -0700 Subject: [PATCH 11/14] sparc64: Disable local interrupts around xcall_deliver_impl() invocation. Signed-off-by: David S. 
Miller --- arch/sparc64/kernel/smp.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 20f4e291c74a..6d458b35643c 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -611,7 +611,7 @@ retry: static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) { int cnt, retries, this_cpu, prev_sent, i; - unsigned long flags, status; + unsigned long status; cpumask_t error_mask; struct trap_per_cpu *tb; u16 *cpu_list; @@ -620,18 +620,6 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpum if (cpus_empty(*mask)) return; - /* We have to do this whole thing with interrupts fully disabled. - * Otherwise if we send an xcall from interrupt context it will - * corrupt both our mondo block and cpu list state. - * - * One consequence of this is that we cannot use timeout mechanisms - * that depend upon interrupts being delivered locally. So, for - * example, we cannot sample jiffies and expect it to advance. - * - * Fortunately, udelay() uses %stick/%tick so we can use that. - */ - local_irq_save(flags); - this_cpu = smp_processor_id(); tb = &trap_block[this_cpu]; @@ -720,8 +708,6 @@ static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpum } } while (1); - local_irq_restore(flags); - if (unlikely(!cpus_empty(error_mask))) goto fatal_mondo_cpu_error; @@ -738,14 +724,12 @@ fatal_mondo_cpu_error: return; fatal_mondo_timeout: - local_irq_restore(flags); printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward " " progress after %d retries.\n", this_cpu, retries); goto dump_cpu_list_and_out; fatal_mondo_error: - local_irq_restore(flags); printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n", this_cpu, status); printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) " @@ -763,7 +747,21 @@ static void (*xcall_deliver_impl)(u64, u64, u64, const cpumask_t *); static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) { + unsigned long flags; + + /* We have to do this whole thing with interrupts fully disabled. + * Otherwise if we send an xcall from interrupt context it will + * corrupt both our mondo block and cpu list state. + * + * One consequence of this is that we cannot use timeout mechanisms + * that depend upon interrupts being delivered locally. So, for + * example, we cannot sample jiffies and expect it to advance. + * + * Fortunately, udelay() uses %stick/%tick so we can use that. + */ + local_irq_save(flags); xcall_deliver_impl(data0, data1, data2, mask); + local_irq_restore(flags); } /* Send cross call to all processors mentioned in MASK_P From 90f7ae8a55190f5edfb9fda957e25c994ed39ec4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 4 Aug 2008 16:42:58 -0700 Subject: [PATCH 12/14] sparc64: Build cpu list and mondo block at top-level xcall_deliver(). Then modify all of the xcall dispatch implementations get passed and use this information. Now all of the xcall dispatch implementations do not need to be mindful of details such as "is current cpu in the list?" and "is cpu online?" Signed-off-by: David S. 
Miller --- arch/sparc64/kernel/smp.c | 113 +++++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 44 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 6d458b35643c..2387a9b81be7 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -459,30 +459,35 @@ again: } } -static inline void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) +static void spitfire_xcall_deliver(struct trap_per_cpu *tb, int cnt) { + u64 *mondo, data0, data1, data2; + u16 *cpu_list; u64 pstate; int i; __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate)); - for_each_cpu_mask_nr(i, *mask) - spitfire_xcall_helper(data0, data1, data2, pstate, i); + cpu_list = __va(tb->cpu_list_pa); + mondo = __va(tb->cpu_mondo_block_pa); + data0 = mondo[0]; + data1 = mondo[1]; + data2 = mondo[2]; + for (i = 0; i < cnt; i++) + spitfire_xcall_helper(data0, data1, data2, pstate, cpu_list[i]); } /* Cheetah now allows to send the whole 64-bytes of data in the interrupt * packet, but we have no use for that. However we do take advantage of * the new pipelining feature (ie. dispatch to multiple cpus simultaneously). */ -static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask_p) +static void cheetah_xcall_deliver(struct trap_per_cpu *tb, int cnt) { - u64 pstate, ver, busy_mask; int nack_busy_id, is_jbus, need_more; - cpumask_t mask; + u64 *mondo, pstate, ver, busy_mask; + u16 *cpu_list; - if (cpus_empty(*mask_p)) - return; - - mask = *mask_p; + cpu_list = __va(tb->cpu_list_pa); + mondo = __va(tb->cpu_mondo_block_pa); /* Unfortunately, someone at Sun had the brilliant idea to make the * busy/nack fields hard-coded by ITID number for this Ultra-III @@ -505,7 +510,7 @@ retry: "stxa %2, [%5] %6\n\t" "membar #Sync\n\t" : /* no outputs */ - : "r" (data0), "r" (data1), "r" (data2), + : "r" (mondo[0]), "r" (mondo[1]), "r" (mondo[2]), "r" (0x40), "r" (0x50), "r" (0x60), "i" (ASI_INTR_W)); @@ -514,11 +519,16 @@ retry: { int i; - for_each_cpu_mask_nr(i, mask) { - u64 target = (i << 14) | 0x70; + for (i = 0; i < cnt; i++) { + u64 target, nr; + nr = cpu_list[i]; + if (nr == 0xffff) + continue; + + target = (nr << 14) | 0x70; if (is_jbus) { - busy_mask |= (0x1UL << (i * 2)); + busy_mask |= (0x1UL << (nr * 2)); } else { target |= (nack_busy_id << 24); busy_mask |= (0x1UL << @@ -552,11 +562,13 @@ retry: __asm__ __volatile__("wrpr %0, 0x0, %%pstate" : : "r" (pstate)); if (unlikely(need_more)) { - int i, cnt = 0; - for_each_cpu_mask_nr(i, mask) { - cpu_clear(i, mask); - cnt++; - if (cnt == 32) + int i, this_cnt = 0; + for (i = 0; i < cnt; i++) { + if (cpu_list[i] == 0xffff) + continue; + cpu_list[i] = 0xffff; + this_cnt++; + if (this_cnt == 32) break; } goto retry; @@ -587,16 +599,20 @@ retry: /* Clear out the mask bits for cpus which did not * NACK us. */ - for_each_cpu_mask_nr(i, mask) { - u64 check_mask; + for (i = 0; i < cnt; i++) { + u64 check_mask, nr; + + nr = cpu_list[i]; + if (nr == 0xffff) + continue; if (is_jbus) - check_mask = (0x2UL << (2*i)); + check_mask = (0x2UL << (2*nr)); else check_mask = (0x2UL << this_busy_nack); if ((dispatch_stat & check_mask) == 0) - cpu_clear(i, mask); + cpu_list[i] = 0xffff; this_busy_nack += 2; if (this_busy_nack == 64) break; @@ -608,34 +624,17 @@ retry: } /* Multi-cpu list version. 
*/ -static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) +static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) { - int cnt, retries, this_cpu, prev_sent, i; + int retries, this_cpu, prev_sent, i; unsigned long status; cpumask_t error_mask; - struct trap_per_cpu *tb; u16 *cpu_list; - u64 *mondo; - - if (cpus_empty(*mask)) - return; this_cpu = smp_processor_id(); - tb = &trap_block[this_cpu]; - - mondo = __va(tb->cpu_mondo_block_pa); - mondo[0] = data0; - mondo[1] = data1; - mondo[2] = data2; - wmb(); cpu_list = __va(tb->cpu_list_pa); - /* Setup the initial cpu list. */ - cnt = 0; - for_each_cpu_mask_nr(i, *mask) - cpu_list[cnt++] = i; - cpus_clear(error_mask); retries = 0; prev_sent = 0; @@ -743,11 +742,15 @@ dump_cpu_list_and_out: printk("]\n"); } -static void (*xcall_deliver_impl)(u64, u64, u64, const cpumask_t *); +static void (*xcall_deliver_impl)(struct trap_per_cpu *, int); static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask) { + struct trap_per_cpu *tb; + int this_cpu, i, cnt; unsigned long flags; + u16 *cpu_list; + u64 *mondo; /* We have to do this whole thing with interrupts fully disabled. * Otherwise if we send an xcall from interrupt context it will @@ -760,7 +763,29 @@ static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask * Fortunately, udelay() uses %stick/%tick so we can use that. */ local_irq_save(flags); - xcall_deliver_impl(data0, data1, data2, mask); + + this_cpu = smp_processor_id(); + tb = &trap_block[this_cpu]; + + mondo = __va(tb->cpu_mondo_block_pa); + mondo[0] = data0; + mondo[1] = data1; + mondo[2] = data2; + wmb(); + + cpu_list = __va(tb->cpu_list_pa); + + /* Setup the initial cpu list. */ + cnt = 0; + for_each_cpu_mask_nr(i, *mask) { + if (i == this_cpu || !cpu_online(i)) + continue; + cpu_list[cnt++] = i; + } + + if (cnt) + xcall_deliver_impl(tb, cnt); + local_irq_restore(flags); } From ed4d9c66eb941a416c8cb9a0138c69d46d82fc4f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 4 Aug 2008 16:47:57 -0700 Subject: [PATCH 13/14] sparc64: Kill error_mask from hypervisor_xcall_deliver(). It can eat up a lot of stack space when NR_CPUS is large. We retain some of it's functionality by reporting at least one of the cpu's which are seen in error state. Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 2387a9b81be7..ac8996ec97be 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -626,16 +626,15 @@ retry: /* Multi-cpu list version. 
*/ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) { - int retries, this_cpu, prev_sent, i; + int retries, this_cpu, prev_sent, i, saw_cpu_error; unsigned long status; - cpumask_t error_mask; u16 *cpu_list; this_cpu = smp_processor_id(); cpu_list = __va(tb->cpu_list_pa); - cpus_clear(error_mask); + saw_cpu_error = 0; retries = 0; prev_sent = 0; do { @@ -680,10 +679,9 @@ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) continue; err = sun4v_cpu_state(cpu); - if (err >= 0 && - err == HV_CPU_STATE_ERROR) { + if (err == HV_CPU_STATE_ERROR) { + saw_cpu_error = (cpu + 1); cpu_list[i] = 0xffff; - cpu_set(cpu, error_mask); } } } else if (unlikely(status != HV_EWOULDBLOCK)) @@ -707,19 +705,15 @@ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) } } while (1); - if (unlikely(!cpus_empty(error_mask))) + if (unlikely(saw_cpu_error)) goto fatal_mondo_cpu_error; return; fatal_mondo_cpu_error: printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus " - "were in error state\n", - this_cpu); - printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu); - for_each_cpu_mask_nr(i, error_mask) - printk("%d ", i); - printk("]\n"); + "(including %d) were in error state\n", + this_cpu, saw_cpu_error - 1); return; fatal_mondo_timeout: From ae583885bfd07474789059cdef399289bd66c8d0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 4 Aug 2008 16:56:15 -0700 Subject: [PATCH 14/14] sparc64: Remove all cpumask_t local variables in xcall dispatch. All of the xcall delivery implementation is cpumask agnostic, so we can pass around pointers to const cpumask_t objects everywhere. The sad remaining case is the argument to arch_send_call_function_ipi(). Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index ac8996ec97be..27b81775a4de 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -787,21 +787,17 @@ static void xcall_deliver(u64 data0, u64 data1, u64 data2, const cpumask_t *mask * except self. Really, there are only two cases currently, * "&cpu_online_map" and "&mm->cpu_vm_mask". */ -static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask_p) +static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 data2, const cpumask_t *mask) { u64 data0 = (((u64)ctx)<<32 | (((u64)func) & 0xffffffff)); - int this_cpu = get_cpu(); - cpumask_t mask; - mask = *mask_p; - if (mask_p != &cpu_online_map) - cpus_and(mask, mask, cpu_online_map); - cpu_clear(this_cpu, mask); + xcall_deliver(data0, data1, data2, mask); +} - xcall_deliver(data0, data1, data2, &mask); - /* NOTE: Caller runs local copy on master. */ - - put_cpu(); +/* Send cross call to all processors except self. */ +static void smp_cross_call(unsigned long *func, u32 ctx, u64 data1, u64 data2) +{ + smp_cross_call_masked(func, ctx, data1, data2, &cpu_online_map); } extern unsigned long xcall_sync_tick; @@ -827,10 +823,6 @@ void arch_send_call_function_single_ipi(int cpu) &cpumask_of_cpu(cpu)); } -/* Send cross call to all processors except self. 
*/ -#define smp_cross_call(func, ctx, data1, data2) \ - smp_cross_call_masked(func, ctx, data1, data2, &cpu_online_map) - void smp_call_function_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); @@ -900,7 +892,6 @@ static inline void __local_flush_dcache_page(struct page *page) void smp_flush_dcache_page_impl(struct page *page, int cpu) { - cpumask_t mask = cpumask_of_cpu(cpu); int this_cpu; if (tlb_type == hypervisor) @@ -929,7 +920,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) } if (data0) { xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, &mask); + (u64) pg_addr, &cpumask_of_cpu(cpu)); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes_xcall); #endif @@ -941,7 +932,6 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) void flush_dcache_page_all(struct mm_struct *mm, struct page *page) { - cpumask_t mask = cpu_online_map; void *pg_addr; int this_cpu; u64 data0; @@ -951,13 +941,9 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) this_cpu = get_cpu(); - cpu_clear(this_cpu, mask); - #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes); #endif - if (cpus_empty(mask)) - goto flush_self; data0 = 0; pg_addr = page_address(page); if (tlb_type == spitfire) { @@ -971,12 +957,11 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) } if (data0) { xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, &mask); + (u64) pg_addr, &cpu_online_map); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes_xcall); #endif } - flush_self: __local_flush_dcache_page(page); put_cpu();
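
A note on the shape this series converges on: patches 03, 10 and 12 move every cross-call sender onto a single xcall_deliver() helper that selects the platform dispatcher once at boot through a function pointer, builds the cpu list and mondo block in one place, and only then calls the implementation. Below is a small, self-contained userspace C sketch of that pattern; all names, types and sizes are invented for illustration and are not the kernel's own code.

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_CPUS 8

    /* Toy stand-in for the per-cpu trap_per_cpu mondo block and cpu list. */
    struct mondo_block {
            uint64_t data[3];               /* data0, data1, data2 of the cross call */
            uint16_t cpu_list[MAX_CPUS];    /* targets, filled in by the top level   */
    };

    /* One dispatcher per pretend platform, analogous to the spitfire/cheetah/
     * hypervisor variants selected in smp_setup_processor_id(). */
    static void impl_one_by_one(struct mondo_block *mb, int cnt)
    {
            for (int i = 0; i < cnt; i++)
                    printf("send %#llx to cpu %d\n",
                           (unsigned long long)mb->data[0], mb->cpu_list[i]);
    }

    static void impl_batched(struct mondo_block *mb, int cnt)
    {
            printf("send %#llx to %d cpus in one go\n",
                   (unsigned long long)mb->data[0], cnt);
    }

    /* Chosen once at "boot", like xcall_deliver_impl in patch 10. */
    static void (*xcall_deliver_impl)(struct mondo_block *, int);

    /* Top-level helper in the spirit of patch 12: build the block and the cpu
     * list here, skip the sending cpu and anything not "online", then dispatch.
     * The real kernel also wraps this in local_irq_save()/restore() (patch 11)
     * so the per-cpu block cannot be reused from interrupt context mid-send. */
    static void xcall_deliver(uint64_t d0, uint64_t d1, uint64_t d2,
                              unsigned int mask, int this_cpu)
    {
            struct mondo_block mb = { .data = { d0, d1, d2 } };
            int cnt = 0;

            for (int cpu = 0; cpu < MAX_CPUS; cpu++) {
                    if (cpu == this_cpu || !(mask & (1u << cpu)))
                            continue;
                    mb.cpu_list[cnt++] = (uint16_t)cpu;
            }
            if (cnt)
                    xcall_deliver_impl(&mb, cnt);
    }

    int main(void)
    {
            xcall_deliver_impl = impl_one_by_one;   /* pick the platform variant once */
            xcall_deliver(0x1234, 0, 0, 0x0f, 0);   /* "cpus" 1-3 receive the call    */

            xcall_deliver_impl = impl_batched;
            xcall_deliver(0x5678, 0, 0, 0x0f, 0);
            return 0;
    }

The design point mirrored here is that the callers no longer need to know which dispatcher exists or worry about "is this cpu online?" and "am I in the target list?"; that bookkeeping lives in one spot.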
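
The other recurring change (patches 05, 06, 08 and 14) is to stop passing cpumask_t objects by value. With a large NR_CPUS each by-value mask is a sizeable on-stack copy, whereas a const pointer such as &cpu_online_map or &cpumask_of_cpu(cpu) costs only a pointer. A rough userspace comparison, again with invented types rather than the kernel's:

    #include <stdio.h>
    #include <string.h>

    #define NR_CPUS 4096
    #define BITS_PER_LONG (8 * (int)sizeof(unsigned long))

    /* Simplified mask: 512 bytes on a 64-bit build with NR_CPUS = 4096. */
    typedef struct {
            unsigned long bits[NR_CPUS / (8 * sizeof(unsigned long))];
    } cpumask_t;

    /* By value: the whole mask is copied onto the callee's stack. */
    static int count_by_value(cpumask_t mask)
    {
            int n = 0;
            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    if (mask.bits[cpu / BITS_PER_LONG] & (1UL << (cpu % BITS_PER_LONG)))
                            n++;
            return n;
    }

    /* By const pointer: only a pointer is passed, and the callee cannot
     * scribble on the caller's mask. */
    static int count_by_pointer(const cpumask_t *mask)
    {
            int n = 0;
            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    if (mask->bits[cpu / BITS_PER_LONG] & (1UL << (cpu % BITS_PER_LONG)))
                            n++;
            return n;
    }

    int main(void)
    {
            cpumask_t online;

            memset(&online, 0, sizeof(online));
            online.bits[0] = 0xfUL;         /* pretend cpus 0-3 are online */

            printf("by value: %d, by pointer: %d, sizeof(cpumask_t) = %zu bytes\n",
                   count_by_value(online), count_by_pointer(&online),
                   sizeof(cpumask_t));
            return 0;
    }

As patch 14's message notes, the one remaining by-value case in the series is the argument to arch_send_call_function_ipi(), which would need a generic-code interface change to convert.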
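
Patch 13 applies the same stack-frugality to error reporting: the on-stack error_mask cpumask in hypervisor_xcall_deliver() becomes a single int that remembers one offending cpu, stored as cpu + 1 so that 0 means "none seen". A trivial model of that bookkeeping, with hypothetical names:

    #include <stdio.h>

    /* Pretend per-cpu state probe; here "cpu" 5 is in error. */
    static int cpu_in_error(int cpu)
    {
            return cpu == 5;
    }

    int main(void)
    {
            int saw_cpu_error = 0;  /* 0 means no error seen; otherwise cpu + 1 */

            for (int cpu = 0; cpu < 16; cpu++)
                    if (cpu_in_error(cpu))
                            saw_cpu_error = cpu + 1;

            if (saw_cpu_error)
                    printf("some target cpus (including %d) were in error state\n",
                           saw_cpu_error - 1);
            return 0;
    }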