powerpc: Fix endian issues in VMX copy loops

Fix the permute loops for little endian.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Author: Anton Blanchard <anton@samba.org>, 2013-09-23 12:04:35 +10:00
Committed by: Benjamin Herrenschmidt
parent 8b5ede69d2
commit 32ee1e188e
2 changed files with 63 additions and 46 deletions

View File

@ -19,6 +19,14 @@
*/
#include <asm/ppc_asm.h>
/*
 * Unaligned VMX copy uses a permute control vector (loaded once) to
 * merge each pair of quadword loads into one aligned store.  The shift
 * direction and the vperm source-operand order both depend on byte
 * order, so hide them behind LVS/VPERM and select the correct form per
 * endianness (this is the little-endian fix this patch introduces).
 */
#ifdef __BIG_ENDIAN__
/* Big endian: shift-left control vector, sources in natural order. */
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
#else
/* Little endian: shift-right control vector, and the two source
 * vectors swapped to compensate for the reversed byte numbering. */
#define LVS(VRT,RA,RB) lvsr VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
#endif
.macro err1
100:
.section __ex_table,"a"
@ -552,13 +560,13 @@ err3; stw r7,4(r3)
li r10,32
li r11,48
lvsl vr16,0,r4 /* Setup permute control vector */
LVS(vr16,0,r4) /* Setup permute control vector */
err3; lvx vr0,0,r4
addi r4,r4,16
bf cr7*4+3,5f
err3; lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16
VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
err3; stvx vr8,r0,r3
addi r3,r3,16
@ -566,9 +574,9 @@ err3; stvx vr8,r0,r3
5: bf cr7*4+2,6f
err3; lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16
VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9
vperm vr9,vr1,vr0,vr16
VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
@ -576,13 +584,13 @@ err3; stvx vr9,r3,r9
6: bf cr7*4+1,7f
err3; lvx vr3,r0,r4
vperm vr8,vr0,vr3,vr16
VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9
vperm vr9,vr3,vr2,vr16
VPERM(vr9,vr3,vr2,vr16)
err3; lvx vr1,r4,r10
vperm vr10,vr2,vr1,vr16
VPERM(vr10,vr2,vr1,vr16)
err3; lvx vr0,r4,r11
vperm vr11,vr1,vr0,vr16
VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
@ -611,21 +619,21 @@ err3; stvx vr11,r3,r11
.align 5
8:
err4; lvx vr7,r0,r4
vperm vr8,vr0,vr7,vr16
VPERM(vr8,vr0,vr7,vr16)
err4; lvx vr6,r4,r9
vperm vr9,vr7,vr6,vr16
VPERM(vr9,vr7,vr6,vr16)
err4; lvx vr5,r4,r10
vperm vr10,vr6,vr5,vr16
VPERM(vr10,vr6,vr5,vr16)
err4; lvx vr4,r4,r11
vperm vr11,vr5,vr4,vr16
VPERM(vr11,vr5,vr4,vr16)
err4; lvx vr3,r4,r12
vperm vr12,vr4,vr3,vr16
VPERM(vr12,vr4,vr3,vr16)
err4; lvx vr2,r4,r14
vperm vr13,vr3,vr2,vr16
VPERM(vr13,vr3,vr2,vr16)
err4; lvx vr1,r4,r15
vperm vr14,vr2,vr1,vr16
VPERM(vr14,vr2,vr1,vr16)
err4; lvx vr0,r4,r16
vperm vr15,vr1,vr0,vr16
VPERM(vr15,vr1,vr0,vr16)
addi r4,r4,128
err4; stvx vr8,r0,r3
err4; stvx vr9,r3,r9
@ -649,13 +657,13 @@ err4; stvx vr15,r3,r16
bf cr7*4+1,9f
err3; lvx vr3,r0,r4
vperm vr8,vr0,vr3,vr16
VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9
vperm vr9,vr3,vr2,vr16
VPERM(vr9,vr3,vr2,vr16)
err3; lvx vr1,r4,r10
vperm vr10,vr2,vr1,vr16
VPERM(vr10,vr2,vr1,vr16)
err3; lvx vr0,r4,r11
vperm vr11,vr1,vr0,vr16
VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
@ -665,9 +673,9 @@ err3; stvx vr11,r3,r11
9: bf cr7*4+2,10f
err3; lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16
VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9
vperm vr9,vr1,vr0,vr16
VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
@ -675,7 +683,7 @@ err3; stvx vr9,r3,r9
10: bf cr7*4+3,11f
err3; lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16
VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
err3; stvx vr8,r0,r3
addi r3,r3,16

View File

@ -20,6 +20,15 @@
#include <asm/ppc_asm.h>
_GLOBAL(memcpy_power7)
/*
 * Endian-dependent helpers for the unaligned VMX copy path: LVS builds
 * the permute control vector and VPERM merges two quadword loads into
 * one aligned result.  On little endian the shift direction flips
 * (lvsr instead of lvsl) and the two vperm source operands must be
 * swapped; wrapping both in macros keeps the copy loops endian-clean.
 */
#ifdef __BIG_ENDIAN__
/* Big endian: lvsl control vector, sources in natural order. */
#define LVS(VRT,RA,RB) lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC
#else
/* Little endian: lvsr control vector, source vectors exchanged. */
#define LVS(VRT,RA,RB) lvsr VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC
#endif
#ifdef CONFIG_ALTIVEC
cmpldi r5,16
cmpldi cr1,r5,4096
@ -485,13 +494,13 @@ _GLOBAL(memcpy_power7)
li r10,32
li r11,48
lvsl vr16,0,r4 /* Setup permute control vector */
LVS(vr16,0,r4) /* Setup permute control vector */
lvx vr0,0,r4
addi r4,r4,16
bf cr7*4+3,5f
lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16
VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
stvx vr8,r0,r3
addi r3,r3,16
@ -499,9 +508,9 @@ _GLOBAL(memcpy_power7)
5: bf cr7*4+2,6f
lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16
VPERM(vr8,vr0,vr1,vr16)
lvx vr0,r4,r9
vperm vr9,vr1,vr0,vr16
VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
stvx vr8,r0,r3
stvx vr9,r3,r9
@ -509,13 +518,13 @@ _GLOBAL(memcpy_power7)
6: bf cr7*4+1,7f
lvx vr3,r0,r4
vperm vr8,vr0,vr3,vr16
VPERM(vr8,vr0,vr3,vr16)
lvx vr2,r4,r9
vperm vr9,vr3,vr2,vr16
VPERM(vr9,vr3,vr2,vr16)
lvx vr1,r4,r10
vperm vr10,vr2,vr1,vr16
VPERM(vr10,vr2,vr1,vr16)
lvx vr0,r4,r11
vperm vr11,vr1,vr0,vr16
VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
stvx vr8,r0,r3
stvx vr9,r3,r9
@ -544,21 +553,21 @@ _GLOBAL(memcpy_power7)
.align 5
8:
lvx vr7,r0,r4
vperm vr8,vr0,vr7,vr16
VPERM(vr8,vr0,vr7,vr16)
lvx vr6,r4,r9
vperm vr9,vr7,vr6,vr16
VPERM(vr9,vr7,vr6,vr16)
lvx vr5,r4,r10
vperm vr10,vr6,vr5,vr16
VPERM(vr10,vr6,vr5,vr16)
lvx vr4,r4,r11
vperm vr11,vr5,vr4,vr16
VPERM(vr11,vr5,vr4,vr16)
lvx vr3,r4,r12
vperm vr12,vr4,vr3,vr16
VPERM(vr12,vr4,vr3,vr16)
lvx vr2,r4,r14
vperm vr13,vr3,vr2,vr16
VPERM(vr13,vr3,vr2,vr16)
lvx vr1,r4,r15
vperm vr14,vr2,vr1,vr16
VPERM(vr14,vr2,vr1,vr16)
lvx vr0,r4,r16
vperm vr15,vr1,vr0,vr16
VPERM(vr15,vr1,vr0,vr16)
addi r4,r4,128
stvx vr8,r0,r3
stvx vr9,r3,r9
@ -582,13 +591,13 @@ _GLOBAL(memcpy_power7)
bf cr7*4+1,9f
lvx vr3,r0,r4
vperm vr8,vr0,vr3,vr16
VPERM(vr8,vr0,vr3,vr16)
lvx vr2,r4,r9
vperm vr9,vr3,vr2,vr16
VPERM(vr9,vr3,vr2,vr16)
lvx vr1,r4,r10
vperm vr10,vr2,vr1,vr16
VPERM(vr10,vr2,vr1,vr16)
lvx vr0,r4,r11
vperm vr11,vr1,vr0,vr16
VPERM(vr11,vr1,vr0,vr16)
addi r4,r4,64
stvx vr8,r0,r3
stvx vr9,r3,r9
@ -598,9 +607,9 @@ _GLOBAL(memcpy_power7)
9: bf cr7*4+2,10f
lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16
VPERM(vr8,vr0,vr1,vr16)
lvx vr0,r4,r9
vperm vr9,vr1,vr0,vr16
VPERM(vr9,vr1,vr0,vr16)
addi r4,r4,32
stvx vr8,r0,r3
stvx vr9,r3,r9
@ -608,7 +617,7 @@ _GLOBAL(memcpy_power7)
10: bf cr7*4+3,11f
lvx vr1,r0,r4
vperm vr8,vr0,vr1,vr16
VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
stvx vr8,r0,r3
addi r3,r3,16