arch/arm/include/asm/div64.h

#ifndef __ASM_ARM_DIV64
#define __ASM_ARM_DIV64

#include <linux/types.h>
#include <asm/compiler.h>

/*
 * The semantics of __div64_32() are:
 *
 * uint32_t __div64_32(uint64_t *n, uint32_t base)
 * {
 * 	uint32_t remainder = *n % base;
 * 	*n = *n / base;
 * 	return remainder;
 * }
 *
 * In other words, a 64-bit dividend with a 32-bit divisor producing
 * a 64-bit result and a 32-bit remainder.  To accomplish this optimally
 * we override the generic version in lib/div64.c to call our __do_div64
 * assembly implementation with completely non standard calling convention
 * for arguments and results (beware).
 */

/*
 * Register names for the two 32-bit halves of a 64-bit value held in the
 * r0/r1 pair.  The assignment flips with byte order: when __ARMEB__ is
 * defined (big-endian build) r0 is __xh and r1 is __xl, otherwise r0 is
 * __xl and r1 is __xh.
 *
 * NOTE(review): "xh"/"xl" presumably stand for the high/low word of the
 * dividend -- confirm against the __do_div64 assembly.  In the code visible
 * here only __xh is used (as the register the remainder comes back in).
 */
#ifdef __ARMEB__
#define __xh "r0"
#define __xl "r1"
#else
#define __xl "r0"
#define __xh "r1"
#endif

/*
 * __div64_32 - divide a 64-bit value by a 32-bit divisor via __do_div64
 * @n:    pointer to the 64-bit dividend; overwritten with the quotient
 * @base: 32-bit divisor
 *
 * Returns the 32-bit remainder.
 *
 * __do_div64 does not follow the normal procedure call convention, so each
 * operand is pinned to a fixed register with a local register variable:
 * the divisor in r4, the dividend in the 64-bit pair starting at r0, the
 * quotient in the 64-bit pair starting at r2 and the remainder in __xh.
 */
static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
{
	register unsigned int __base      asm("r4") = base;
	register unsigned long long __n   asm("r0") = *n;
	register unsigned long long __res asm("r2");
	register unsigned int __rem       asm(__xh);
	/*
	 * NOTE(review): __asmeq() presumably comes from <asm/compiler.h> and
	 * checks at assembly time that each operand really ended up in the
	 * named register -- confirm against that header.
	 *
	 * "bl" overwrites lr; ip and the condition flags are declared as
	 * clobbered as well.
	 */
	asm(	__asmeq("%0", __xh)
		__asmeq("%1", "r2")
		__asmeq("%2", "r0")
		__asmeq("%3", "r4")
		"bl	__do_div64"
		: "=r" (__rem), "=r" (__res)
		: "r" (__n), "r" (__base)
		: "ip", "lr", "cc");
	/* Hand the quotient back through the pointer, the remainder by value. */
	*n = __res;
	return __rem;
}
/*
 * Self-referential define: lets preprocessor checks detect that this
 * architecture supplies its own __div64_32() (presumably tested with
 * #ifndef by the generic code -- confirm).
 */
#define __div64_32 __div64_32

#if !defined(CONFIG_AEABI)

/*
 * In OABI configurations, some uses of the do_div() macro cause gcc to
 * run out of registers.  To work around that, force the use of the
 * __div64_32() helper for every division in configurations that build
 * an OABI kernel.
 *
 * The address of n is taken, so n must be an lvalue compatible with
 * uint64_t; it is replaced by the quotient and the macro evaluates to
 * the remainder.
 */
#define do_div(n, base) __div64_32(&(n), base)

#else

/*
 * gcc versions earlier than 4.0 are simply too problematic for the
 * __div64_const32() code in asm-generic/div64.h.  First, there is
 * gcc PR 15089, which tends to trigger on more complex constructs;
 * second, spurious .global __udivsi3 directives are inserted even if none
 * of those symbols are referenced in the generated code; and finally those
 * gcc versions are not able to do constant propagation on long long values
 * anyway.
 *
 * The macro below therefore evaluates to true only for gcc 4 and later.
 */

#define __div64_const32_is_OK (__GNUC__ >= 4)

/*
 * __arch_xprod_64 - ARM assembly 64x64-bit multiply keeping the top 64 bits
 * @m:    first 64-bit factor
 * @n:    second 64-bit factor
 * @bias: when true, m is additionally added to the low partial product
 *
 * NOTE(review): the intended contract appears to be
 *	retval = ((bias ? m : 0) + m * n) >> 64
 * i.e. the upper half of a 128-bit result -- confirm against
 * asm-generic/div64.h, which is included right after this definition.
 *
 * In the asm templates, %Qx names the register holding the least
 * significant 32 bits of 64-bit operand x and %Rx the register holding its
 * most significant 32 bits.  Below, m_lo/m_hi and n_lo/n_hi denote those
 * halves of m and n.
 */
static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)
{
	unsigned long long res;
	unsigned int tmp = 0;

	/*
	 * Step 1: low partial product m_lo * n_lo (plus m when bias is set).
	 * Only what spills above bit 31 is kept in res.
	 */
	if (!bias) {
		/*
		 * res = m_lo * n_lo, then the low word is cleared: the high
		 * word of res holds bits 63..32 of the product.
		 */
		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
			"mov	%Q0, #0"
			: "=&r" (res)
			: "r" (m), "r" (n)
			: "cc");
	} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
		/*
		 * m has neither bit 63 nor bit 31 set, so m + m_lo * n_lo fits
		 * in 64 bits: accumulate the product onto m with one umlal,
		 * then clear the low word.
		 */
		res = m;
		asm (	"umlal	%Q0, %R0, %Q1, %Q2\n\t"
			"mov	%Q0, #0"
			: "+&r" (res)
			: "r" (m), "r" (n)
			: "cc");
	} else {
		/*
		 * General case: res = m_lo * n_lo, then m is added with
		 * explicit carry handling.  cmn only produces the carry of the
		 * low-word addition, adcs adds m_hi plus that carry into the
		 * high word, and the final adc stores the resulting carry
		 * (tmp is 0) in the low word.  On exit the high word of res
		 * holds bits 63..32 of the sum and the low word holds the
		 * carry out of bit 63.
		 */
		asm (	"umull	%Q0, %R0, %Q1, %Q2\n\t"
			"cmn	%Q0, %Q1\n\t"
			"adcs	%R0, %R0, %R1\n\t"
			"adc	%Q0, %3, #0"
			: "=&r" (res)
			: "r" (m), "r" (n), "r" (tmp)
			: "cc");
	}

	/*
	 * Step 2: add the cross products m_hi * n_lo and m_lo * n_hi, then
	 * m_hi * n_hi.  Note the swapped %R0/%Q0 in the first umlal
	 * instructions: the high word of res serves as the low half of the
	 * accumulator and the low word of res as its high half.
	 */
	if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
		/*
		 * Both cross products are accumulated into the same swapped
		 * accumulator (with bits 63 and 31 of m clear the total stays
		 * below 2^64).  Its low half is then cleared and m_hi * n_hi
		 * is added with the words back in their natural order.
		 */
		asm (	"umlal	%R0, %Q0, %R1, %Q2\n\t"
			"umlal	%R0, %Q0, %Q1, %R2\n\t"
			"mov	%R0, #0\n\t"
			"umlal	%Q0, %R0, %R1, %R2"
			: "+&r" (res)
			: "r" (m), "r" (n)
			: "cc");
	} else {
		/*
		 * The second cross product gets its own upper half in tmp
		 * (still 0 on entry) so a carry cannot be lost.  After the
		 * shared low half is cleared, adds sums the two upper halves
		 * into the low word of res and adc records the carry in the
		 * high word, before m_hi * n_hi is added on top.
		 */
		asm (	"umlal	%R0, %Q0, %R2, %Q3\n\t"
			"umlal	%R0, %1, %Q2, %R3\n\t"
			"mov	%R0, #0\n\t"
			"adds	%Q0, %1, %Q0\n\t"
			"adc	%R0, %R0, #0\n\t"
			"umlal	%Q0, %R0, %R2, %R3"
			: "+&r" (res), "+&r" (tmp)
			: "r" (m), "r" (n)
			: "cc");
	}

	return res;
}
/*
 * Self-referential define placed ahead of the <asm-generic/div64.h>
 * include; presumably that header checks for it and then skips its own
 * C version of __arch_xprod_64() -- confirm.
 */
#define __arch_xprod_64 __arch_xprod_64

#include <asm-generic/div64.h>

#endif

#endif
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 15:20:36 -07:00			`#ifndef __ASM_ARM_DIV64`
			`#define __ASM_ARM_DIV64`

[NET]: div64_64 consolidate (rev3) Here is the current version of the 64 bit divide common code. Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org> Signed-off-by: David S. Miller <davem@davemloft.net> 2007-03-25 19:54:23 -07:00			`#include <linux/types.h>`
Disintegrate asm/system.h for ARM Disintegrate asm/system.h for ARM. Signed-off-by: David Howells <dhowells@redhat.com> cc: Russell King <linux@arm.linux.org.uk> cc: linux-arm-kernel@lists.infradead.org 2012-03-28 18:30:01 +01:00			`#include <asm/compiler.h>`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 15:20:36 -07:00
			`/*`
ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00			`* The semantics of __div64_32() are:`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 15:20:36 -07:00			`*`
ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00			`* uint32_t __div64_32(uint64_t *n, uint32_t base)`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 15:20:36 -07:00			`* {`
			`* uint32_t remainder = *n % base;`
			`* *n = *n / base;`
			`* return remainder;`
			`* }`
			`*`
			`* In other words, a 64-bit dividend with a 32-bit divisor producing`
			`* a 64-bit result and a 32-bit remainder. To accomplish this optimally`
ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00			`* we override the generic version in lib/div64.c to call our __do_div64`
			`* assembly implementation with completely non standard calling convention`
			`* for arguments and results (beware).`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 15:20:36 -07:00			`*/`

			`#ifdef __ARMEB__`
			`#define __xh "r0"`
			`#define __xl "r1"`
			`#else`
			`#define __xl "r0"`
			`#define __xh "r1"`
			`#endif`

ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00			`static inline uint32_t __div64_32(uint64_t *n, uint32_t base)`
			`{`
			`register unsigned int __base asm("r4") = base;`
			`register unsigned long long __n asm("r0") = *n;`
			`register unsigned long long __res asm("r2");`
			`register unsigned int __rem asm(__xh);`
			`asm( __asmeq("%0", __xh)`
			`__asmeq("%1", "r2")`
			`__asmeq("%2", "r0")`
			`__asmeq("%3", "r4")`
			`"bl __do_div64"`
			`: "=r" (__rem), "=r" (__res)`
			`: "r" (__n), "r" (__base)`
			`: "ip", "lr", "cc");`
			`*n = __res;`
			`return __rem;`
			`}`
			`#define __div64_32 __div64_32`

			`#if !defined(CONFIG_AEABI)`
[ARM] 3611/4: optimize do_div() when divisor is constant On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases. With 32-bit divisions gcc already use the recyprocal of constant divisors to perform a multiplication, but not with 64-bit divisions. Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time. For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1): adr r4, .L0 ldmia r4, {r4-r5} umull r2, r3, r4, r0 mov r2, #0 umlal r3, r2, r5, r0 umlal r3, r2, r4, r1 mov r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #11 orr r0, r0, r3, lsl #21 mov r1, r3, lsr #11 ... .L0: .word 948328779 .word 879609302 which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit). The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing. The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem. Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> 2006-12-06 04:13:18 +01:00
			`/*`
ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00			`* In OABI configurations, some uses of the do_div function`
			`* cause gcc to run out of registers. To work around that,`
			`* we can force the use of the out-of-line version for`
			`* configurations that build a OABI kernel.`
[ARM] 3611/4: optimize do_div() when divisor is constant On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases. With 32-bit divisions gcc already use the recyprocal of constant divisors to perform a multiplication, but not with 64-bit divisions. Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time. For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1): adr r4, .L0 ldmia r4, {r4-r5} umull r2, r3, r4, r0 mov r2, #0 umlal r3, r2, r5, r0 umlal r3, r2, r4, r1 mov r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #11 orr r0, r0, r3, lsl #21 mov r1, r3, lsr #11 ... .L0: .word 948328779 .word 879609302 which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit). The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing. The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem. Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> 2006-12-06 04:13:18 +01:00			`*/`
ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00			`#define do_div(n, base) __div64_32(&(n), base)`
[ARM] 3611/4: optimize do_div() when divisor is constant On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases. With 32-bit divisions gcc already use the recyprocal of constant divisors to perform a multiplication, but not with 64-bit divisions. Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time. For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1): adr r4, .L0 ldmia r4, {r4-r5} umull r2, r3, r4, r0 mov r2, #0 umlal r3, r2, r5, r0 umlal r3, r2, r4, r1 mov r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #11 orr r0, r0, r3, lsl #21 mov r1, r3, lsr #11 ... .L0: .word 948328779 .word 879609302 which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit). The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing. The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem. Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> 2006-12-06 04:13:18 +01:00
ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00			`#else`
[ARM] 3611/4: optimize do_div() when divisor is constant On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases. With 32-bit divisions gcc already use the recyprocal of constant divisors to perform a multiplication, but not with 64-bit divisions. Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time. For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1): adr r4, .L0 ldmia r4, {r4-r5} umull r2, r3, r4, r0 mov r2, #0 umlal r3, r2, r5, r0 umlal r3, r2, r4, r1 mov r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #11 orr r0, r0, r3, lsl #21 mov r1, r3, lsr #11 ... .L0: .word 948328779 .word 879609302 which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit). The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing. The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem. Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> 2006-12-06 04:13:18 +01:00
			`/*`
ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00			`* gcc versions earlier than 4.0 are simply too problematic for the`
			`* __div64_const32() code in asm-generic/div64.h. First there is`
			`* gcc PR 15089 that tend to trig on more complex constructs, spurious`
			`* .global __udivsi3 are inserted even if none of those symbols are`
			`* referenced in the generated code, and those gcc versions are not able`
			`* to do constant propagation on long long values anyway.`
[ARM] 3611/4: optimize do_div() when divisor is constant On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases. With 32-bit divisions gcc already use the recyprocal of constant divisors to perform a multiplication, but not with 64-bit divisions. Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time. For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1): adr r4, .L0 ldmia r4, {r4-r5} umull r2, r3, r4, r0 mov r2, #0 umlal r3, r2, r5, r0 umlal r3, r2, r4, r1 mov r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #11 orr r0, r0, r3, lsl #21 mov r1, r3, lsr #11 ... .L0: .word 948328779 .word 879609302 which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit). The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing. The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem. Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> 2006-12-06 04:13:18 +01:00			`*/`
ARM: asm/div64.h: adjust to generic codde Now that the constant divisor optimization is made generic, adapt the ARM case to it. Signed-off-by: Nicolas Pitre <nico@linaro.org> 2015-11-02 14:20:41 -05:00
			`#define __div64_const32_is_OK (__GNUC__ >= 4)`

			`static inline uint64_t __arch_xprod_64(uint64_t m, uint64_t n, bool bias)`
			`{`
			`unsigned long long res;`
			`unsigned int tmp = 0;`

			`if (!bias) {`
			`asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"`
			`"mov %Q0, #0"`
			`: "=&r" (res)`
			`: "r" (m), "r" (n)`
			`: "cc");`
			`} else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {`
			`res = m;`
			`asm ( "umlal %Q0, %R0, %Q1, %Q2\n\t"`
			`"mov %Q0, #0"`
			`: "+&r" (res)`
			`: "r" (m), "r" (n)`
			`: "cc");`
			`} else {`
			`asm ( "umull %Q0, %R0, %Q1, %Q2\n\t"`
			`"cmn %Q0, %Q1\n\t"`
			`"adcs %R0, %R0, %R1\n\t"`
			`"adc %Q0, %3, #0"`
			`: "=&r" (res)`
			`: "r" (m), "r" (n), "r" (tmp)`
			`: "cc");`
			`}`

			`if (!(m & ((1ULL << 63) | (1ULL << 31)))) {`
			`asm ( "umlal %R0, %Q0, %R1, %Q2\n\t"`
			`"umlal %R0, %Q0, %Q1, %R2\n\t"`
			`"mov %R0, #0\n\t"`
			`"umlal %Q0, %R0, %R1, %R2"`
			`: "+&r" (res)`
			`: "r" (m), "r" (n)`
			`: "cc");`
			`} else {`
			`asm ( "umlal %R0, %Q0, %R2, %Q3\n\t"`
			`"umlal %R0, %1, %Q2, %R3\n\t"`
			`"mov %R0, #0\n\t"`
			`"adds %Q0, %1, %Q0\n\t"`
			`"adc %R0, %R0, #0\n\t"`
			`"umlal %Q0, %R0, %R2, %R3"`
			`: "+&r" (res), "+&r" (tmp)`
			`: "r" (m), "r" (n)`
			`: "cc");`
			`}`

			`return res;`
			`}`
			`#define __arch_xprod_64 __arch_xprod_64`

			`#include <asm-generic/div64.h>`
[ARM] 3611/4: optimize do_div() when divisor is constant On ARM all divisions have to be performed "manually". For 64-bit divisions that may take more than a hundred cycles in many cases. With 32-bit divisions gcc already use the recyprocal of constant divisors to perform a multiplication, but not with 64-bit divisions. Since the kernel is increasingly relying upon 64-bit divisions it is worth optimizing at least those cases where the divisor is a constant. This is what this patch does using plain C code that gets optimized away at compile time. For example, despite the amount of added C code, do_div(x, 10000) now produces the following assembly code (where x is assigned to r0-r1): adr r4, .L0 ldmia r4, {r4-r5} umull r2, r3, r4, r0 mov r2, #0 umlal r3, r2, r5, r0 umlal r3, r2, r4, r1 mov r3, #0 umlal r2, r3, r5, r1 mov r0, r2, lsr #11 orr r0, r0, r3, lsl #21 mov r1, r3, lsr #11 ... .L0: .word 948328779 .word 879609302 which is the fastest that can be done for any value of x in that case, many times faster than the __do_div64 code (except for the small x value space for which the result ends up being zero or a single bit). The fact that this code is generated inline produces a tiny increase in .text size, but not significant compared to the needed code around each __do_div64 call site this code is replacing. The algorithm used has been validated on a 16-bit scale for all possible values, and then recodified for 64-bit values. Furthermore I've been running it with the final BUG_ON() uncommented for over two months now with no problem. Note that this new code is compiled with gcc versions 4.0 or later. Earlier gcc versions proved themselves too problematic and only the original code is used with them. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk> 2006-12-06 04:13:18 +01:00
			`#endif`

Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 15:20:36 -07:00			`#endif`