linux/arch/ia64/lib/ip_fast_csum.S

/*
 * Optimized version of the ip_fast_csum() function
 * Used for calculating IP header checksum
 *
 * Return: 16bit checksum, complemented
 *
 * Inputs:
 *      in0: address of buffer to checksum (char *)
 *      in1: length of the buffer in 32-bit words (int)
 *
 * Copyright (C) 2002, 2006 Intel Corp.
 * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
 */

#include <asm/asmmacro.h>

/*
 * Since this function is most likely called with buf aligned on a 4-byte
 * boundary and 20 bytes in length, we can execute rather quickly versus
 * calling the generic version of do_csum, which has lots of overhead in
 * handling various alignments and sizes.  However, due to the lack of
 * constraints put on the function input arguments, cases with alignment
 * not on 4-byte or size not equal to 20 bytes will be handled by the
 * generic do_csum function.
 */

/*
 * Register aliases used by both routines in this file: in0-in4 name the
 * registers that carry the incoming arguments (r32-r36) and ret0 names
 * the register in which the result is returned (r8).
 * NOTE(review): presumably spelled out by hand because the leaf code
 * paths never issue an alloc -- confirm.
 */
#define in0	r32
#define in1	r33
#define in2	r34
#define in3	r35
#define in4	r36
#define ret0	r8

/*
 * ip_fast_csum: checksum an IP header.
 *
 *	in0: address of the header
 *	in1: header length in 32-bit words
 *
 * Returns in ret0 the complemented 16-bit checksum with bits 16 and up
 * cleared, on both the fast path and the do_csum fallback.
 *
 * Fast path (in1 == 5 and in0 4-byte aligned): the five words are loaded
 * through two interleaved pointers (in0 and r15 = in0 + 4), added up in
 * a 64-bit register and folded down to 16 bits.  Anything else branches
 * to .generic, which calls do_csum(in0, in1 * 4).
 *
 * Fast path scratch: r9, r14, r15, r20-r24, p6, p7.
 */
GLOBAL_ENTRY(ip_fast_csum)
	.prologue
	.body
	cmp.ne	p6,p7=5,in1	// size other than 20 byte?
	and	r14=3,in0	// is it aligned on 4-byte?
	add	r15=4,in0	// second source pointer
	;;
	cmp.ne.or.andcm p6,p7=r14,r0	// misaligned: force p6=1, p7=0
	;;
(p7)	ld4	r20=[in0],8	// word 0; in0 only advances on the fast path
(p7)	ld4	r21=[r15],8	// word 1
(p6)	br.spnt	.generic
	;;
	ld4	r22=[in0],8	// word 2
	ld4	r23=[r15],8	// word 3
	;;
	ld4	r24=[in0]	// word 4
	add	r20=r20,r21
	add	r22=r22,r23
	;;
	add	r20=r20,r22
	;;
	add	r20=r20,r24	// r20 = sum of all five words
	;;
	shr.u	ret0=r20,16	// now need to add the carry
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16	// add carry again
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16	// last fold: result now fits in 16 bits
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	mov	r9=0xffff
	;;
	andcm	ret0=r9,r20	// ret0 = ~r20 & 0xffff
	.restore sp		// reset frame state
	br.ret.sptk.many b0
	;;

	/*
	 * Slow path: hand the buffer to do_csum and complement its result.
	 * The result is masked to 16 bits so that this path returns the
	 * same register format as the fast path above.
	 */
.generic:
	.prologue
	.save ar.pfs, r35
	alloc	r35=ar.pfs,2,2,2,0	// 2 in, 2 local, 2 out; old ar.pfs in r35
	.save rp, r34
	mov	r34=b0			// keep the return address across the call
	.body
	dep.z	out1=in1,2,30		// out1 = in1 * 4: length in bytes
	mov	out0=in0		// in0 is unmodified: (p7) loads were skipped
	;;
	br.call.sptk.many b0=do_csum
	;;
	mov	r9=0xffff		// loaded after the call so do_csum cannot disturb it
	mov	ar.pfs=r35
	mov	b0=r34
	;;
	andcm	ret0=r9,ret0		// ret0 = ~ret0 & 0xffff
	br.ret.sptk.many b0
END(ip_fast_csum)

/*
 * csum_ipv6_magic: complemented 16-bit checksum over two 16-byte blocks
 * plus three scalar arguments.
 *
 *	in0, in1: addresses of the two 16-byte blocks, each read as four
 *		  32-bit words (NOTE(review): presumably the IPv6 source
 *		  and destination addresses -- confirm against the C
 *		  prototype)
 *	in2:	  32-bit value, byte-swapped before being added
 *	in3:	  only the low 16 bits are used, byte-swapped before being
 *		  added
 *	in4:	  32-bit value added as is
 *
 * Returns in r8 the complement of the sum folded to 16 bits, with bits 16
 * and up cleared.  in0-in2 and in4 are modified; r9-r11 and r15-r27 are
 * used as scratch.
 */
GLOBAL_ENTRY(csum_ipv6_magic)
	ld4	r20=[in0],4	// word 0 of each block
	ld4	r21=[in1],4
	zxt4	in2=in2		// upper 32 bits of the argument register are not guaranteed clear
	;;
	ld4	r22=[in0],4	// word 1
	ld4	r23=[in1],4
	dep	r15=in3,in2,32,16	// r15 = in2 with bits 32..47 replaced by in3[15:0]
	;;
	ld4	r24=[in0],4	// word 2
	ld4	r25=[in1],4
	mux1	r15=r15,@rev	// reverse the order of all eight bytes
	add	r16=r20,r21
	add	r17=r22,r23
	zxt4	in4=in4		// same clean-up as for in2
	;;
	ld4	r26=[in0],4	// word 3
	ld4	r27=[in1],4
	shr.u	r15=r15,16	// drop the two zero bytes: swapped in3 in bits 0..15, swapped in2 above
	add	r18=r24,r25
	add	r8=r16,r17
	;;
	add	r19=r26,r27
	add	r8=r8,r18
	;;
	add	r8=r8,r19	// r8 = sum of the eight loaded words
	add	r15=r15,in4
	;;
	add	r8=r8,r15	// add the swapped in2/in3 and in4
	;;
	shr.u	r10=r8,32	// now fold sum into short
	zxt4	r11=r8
	;;
	add	r8=r10,r11
	;;
	shr.u	r10=r8,16	// yeah, keep it rolling
	zxt2	r11=r8
	;;
	add	r8=r10,r11
	;;
	shr.u	r10=r8,16	// three times lucky
	zxt2	r11=r8
	;;
	add	r8=r10,r11
	mov	r9=0xffff
	;;
	andcm	r8=r9,r8	// r8 = ~r8 & 0xffff
	br.ret.sptk.many b0
END(csum_ipv6_magic)
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`/*`
			`* Optimized version of the ip_fast_csum() function`
			`* Used for calculating IP header checksum`
			`*`
			`* Return: 16bit checksum, complemented`
			`*`
			`* Inputs:`
			`* in0: address of buffer to checksum (char *)`
			`* in1: length of the buffer (int)`
			`*`
[IA64] implement csum_ipv6_magic for ia64. The asm version is 4.4 times faster than the generic C version and 10X smaller in code size. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-10 21:17:50 +00:00			`* Copyright (C) 2002, 2006 Intel Corp.`
			`* Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`*/`

			`#include <asm/asmmacro.h>`

			`/*`
			`* Since we know that most likely this function is called with buf aligned`
			`* on 4-byte boundary and 20 bytes in length, we can execute rather quickly`
			`* versus calling generic version of do_csum, which has lots of overhead in`
			`* handling various alignments and sizes. However, due to lack of constraints`
			`* put on the function input argument, cases with alignment not on 4-byte or`
			`* size not equal to 20 bytes will be handled by the generic do_csum function.`
			`*/`

			`#define in0 r32`
			`#define in1 r33`
[IA64] implement csum_ipv6_magic for ia64. The asm version is 4.4 times faster than the generic C version and 10X smaller in code size. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-10 21:17:50 +00:00			`#define in2 r34`
			`#define in3 r35`
			`#define in4 r36`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`#define ret0 r8`

			`GLOBAL_ENTRY(ip_fast_csum)`
			`.prologue`
			`.body`
			`cmp.ne p6,p7=5,in1 // size other than 20 byte?`
			`and r14=3,in0 // is it aligned on 4-byte?`
			`add r15=4,in0 // second source pointer`
			`;;`
			`cmp.ne.or.andcm p6,p7=r14,r0`
			`;;`
			`(p7) ld4 r20=[in0],8`
			`(p7) ld4 r21=[r15],8`
			`(p6) br.spnt .generic`
			`;;`
			`ld4 r22=[in0],8`
			`ld4 r23=[r15],8`
			`;;`
			`ld4 r24=[in0]`
			`add r20=r20,r21`
			`add r22=r22,r23`
			`;;`
			`add r20=r20,r22`
			`;;`
			`add r20=r20,r24`
			`;;`
			`shr.u ret0=r20,16 // now need to add the carry`
			`zxt2 r20=r20`
			`;;`
			`add r20=ret0,r20`
			`;;`
			`shr.u ret0=r20,16 // add carry again`
			`zxt2 r20=r20`
			`;;`
			`add r20=ret0,r20`
			`;;`
			`shr.u ret0=r20,16`
			`zxt2 r20=r20`
			`;;`
			`add r20=ret0,r20`
[IA64] tidy up return value of ip_fast_csum While working on implementing csum_ipv6_magic, I noticed that current version of ip_fast_csum will potentially return bits above "unsigned short" as 1. While no harm is done right now because all call sites will chop off the upper bits when it uses the return value. However, this is still dangerous and buggy. Here is a patch to enforce that the function really returns unsigned short in the native register format. The fix is free as there are plenty open slot to add one more asm instruction. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-09 00:29:25 +00:00			`mov r9=0xffff`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`;;`
[IA64] tidy up return value of ip_fast_csum While working on implementing csum_ipv6_magic, I noticed that current version of ip_fast_csum will potentially return bits above "unsigned short" as 1. While no harm is done right now because all call sites will chop off the upper bits when it uses the return value. However, this is still dangerous and buggy. Here is a patch to enforce that the function really returns unsigned short in the native register format. The fix is free as there are plenty open slot to add one more asm instruction. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-09 00:29:25 +00:00			`andcm ret0=r9,r20`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`.restore sp // reset frame state`
			`br.ret.sptk.many b0`
			`;;`

			`.generic:`
			`.prologue`
			`.save ar.pfs, r35`
			`alloc r35=ar.pfs,2,2,2,0`
			`.save rp, r34`
			`mov r34=b0`
			`.body`
			`dep.z out1=in1,2,30`
			`mov out0=in0`
			`;;`
			`br.call.sptk.many b0=do_csum`
			`;;`
			`andcm ret0=-1,ret0`
			`mov ar.pfs=r35`
			`mov b0=r34`
			`br.ret.sptk.many b0`
			`END(ip_fast_csum)`
[IA64] implement csum_ipv6_magic for ia64. The asm version is 4.4 times faster than the generic C version and 10X smaller in code size. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-10 21:17:50 +00:00
			`GLOBAL_ENTRY(csum_ipv6_magic)`
			`ld4 r20=[in0],4`
			`ld4 r21=[in1],4`
[IA64] fix csum_ipv6_magic() The 32-bit parameters (len and csum) of csum_ipv6_magic() are passed in 64-bit registers in2 and in4. The high order 32 bits of the registers were never cleared, and garbage was sometimes calculated into the checksum. Fix this by clearing the high order 32 bits of these registers. Signed-off-by: Jiri Bohac <jbohac@suse.cz> Signed-off-by: Tony Luck <tony.luck@intel.com> 2009-09-02 09:00:46 +00:00			`zxt4 in2=in2`
[IA64] implement csum_ipv6_magic for ia64. The asm version is 4.4 times faster than the generic C version and 10X smaller in code size. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-10 21:17:50 +00:00			`;;`
			`ld4 r22=[in0],4`
			`ld4 r23=[in1],4`
[IA64] fix csum_ipv6_magic() The 32-bit parameters (len and csum) of csum_ipv6_magic() are passed in 64-bit registers in2 and in4. The high order 32 bits of the registers were never cleared, and garbage was sometimes calculated into the checksum. Fix this by clearing the high order 32 bits of these registers. Signed-off-by: Jiri Bohac <jbohac@suse.cz> Signed-off-by: Tony Luck <tony.luck@intel.com> 2009-09-02 09:00:46 +00:00			`dep r15=in3,in2,32,16`
[IA64] implement csum_ipv6_magic for ia64. The asm version is 4.4 times faster than the generic C version and 10X smaller in code size. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-10 21:17:50 +00:00			`;;`
			`ld4 r24=[in0],4`
			`ld4 r25=[in1],4`
[IA64] fix csum_ipv6_magic() The 32-bit parameters (len and csum) of csum_ipv6_magic() are passed in 64-bit registers in2 and in4. The high order 32 bits of the registers were never cleared, and garbage was sometimes calculated into the checksum. Fix this by clearing the high order 32 bits of these registers. Signed-off-by: Jiri Bohac <jbohac@suse.cz> Signed-off-by: Tony Luck <tony.luck@intel.com> 2009-09-02 09:00:46 +00:00			`mux1 r15=r15,@rev`
[IA64] implement csum_ipv6_magic for ia64. The asm version is 4.4 times faster than the generic C version and 10X smaller in code size. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-10 21:17:50 +00:00			`add r16=r20,r21`
			`add r17=r22,r23`
[IA64] fix csum_ipv6_magic() The 32-bit parameters (len and csum) of csum_ipv6_magic() are passed in 64-bit registers in2 and in4. The high order 32 bits of the registers were never cleared, and garbage was sometimes calculated into the checksum. Fix this by clearing the high order 32 bits of these registers. Signed-off-by: Jiri Bohac <jbohac@suse.cz> Signed-off-by: Tony Luck <tony.luck@intel.com> 2009-09-02 09:00:46 +00:00			`zxt4 in4=in4`
[IA64] implement csum_ipv6_magic for ia64. The asm version is 4.4 times faster than the generic C version and 10X smaller in code size. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-10 21:17:50 +00:00			`;;`
			`ld4 r26=[in0],4`
			`ld4 r27=[in1],4`
[IA64] fix csum_ipv6_magic() The 32-bit parameters (len and csum) of csum_ipv6_magic() are passed in 64-bit registers in2 and in4. The high order 32 bits of the registers were never cleared, and garbage was sometimes calculated into the checksum. Fix this by clearing the high order 32 bits of these registers. Signed-off-by: Jiri Bohac <jbohac@suse.cz> Signed-off-by: Tony Luck <tony.luck@intel.com> 2009-09-02 09:00:46 +00:00			`shr.u r15=r15,16`
[IA64] implement csum_ipv6_magic for ia64. The asm version is 4.4 times faster than the generic C version and 10X smaller in code size. Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2006-11-10 21:17:50 +00:00			`add r18=r24,r25`
			`add r8=r16,r17`
			`;;`
			`add r19=r26,r27`
			`add r8=r8,r18`
			`;;`
			`add r8=r8,r19`
			`add r15=r15,in4`
			`;;`
			`add r8=r8,r15`
			`;;`
			`shr.u r10=r8,32 // now fold sum into short`
			`zxt4 r11=r8`
			`;;`
			`add r8=r10,r11`
			`;;`
			`shr.u r10=r8,16 // yeah, keep it rolling`
			`zxt2 r11=r8`
			`;;`
			`add r8=r10,r11`
			`;;`
			`shr.u r10=r8,16 // three times lucky`
			`zxt2 r11=r8`
			`;;`
			`add r8=r10,r11`
			`mov r9=0xffff`
			`;;`
			`andcm r8=r9,r8`
			`br.ret.sptk.many b0`
			`END(csum_ipv6_magic)`