forked from Minki/linux
sparc64: Improve 64-bit constant loading in eBPF JIT.
Doing a full 64-bit decomposition is really stupid especially for simple values like 0 and -1. But if we are going to optimize this, go all the way and try for all 2 and 3 instruction sequences not requiring a temporary register as well. First we do the easy cases where it's a zero or sign extended 32-bit number (sethi+or, sethi+xor, respectively). Then we try to find a range of set bits we can load simply then shift up into place, in various ways. Then we try negating the constant and see if we can do a simple sequence using that with a xor at the end. (f.e. the range of set bits can't be loaded simply, but for the negated value it can) The final optimized strategy involves 4 instructions sequences not needing a temporary register. Otherwise we sadly fully decompose using a temp.. Example, from ALU64_XOR_K: 0x0000ffffffff0000 ^ 0x0 = 0x0000ffffffff0000: 0000000000000000 <foo>: 0: 9d e3 bf 50 save %sp, -176, %sp 4: 01 00 00 00 nop 8: 90 10 00 18 mov %i0, %o0 c: 13 3f ff ff sethi %hi(0xfffffc00), %o1 10: 92 12 63 ff or %o1, 0x3ff, %o1 ! ffffffff <foo+0xffffffff> 14: 93 2a 70 10 sllx %o1, 0x10, %o1 18: 15 3f ff ff sethi %hi(0xfffffc00), %o2 1c: 94 12 a3 ff or %o2, 0x3ff, %o2 ! ffffffff <foo+0xffffffff> 20: 95 2a b0 10 sllx %o2, 0x10, %o2 24: 92 1a 60 00 xor %o1, 0, %o1 28: 12 e2 40 8a cxbe %o1, %o2, 38 <foo+0x38> 2c: 9a 10 20 02 mov 2, %o5 30: 10 60 00 03 b,pn %xcc, 3c <foo+0x3c> 34: 01 00 00 00 nop 38: 9a 10 20 01 mov 1, %o5 ! 1 <foo+0x1> 3c: 81 c7 e0 08 ret 40: 91 eb 40 00 restore %o5, %g0, %o0 Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
e3a724edee
commit
14933dc8d9
@ -28,6 +28,11 @@ static inline bool is_simm5(unsigned int value)
|
||||
return value + 0x10 < 0x20;
|
||||
}
|
||||
|
||||
static inline bool is_sethi(unsigned int value)
|
||||
{
|
||||
return (value & ~0x3fffff) == 0;
|
||||
}
|
||||
|
||||
static void bpf_flush_icache(void *start_, void *end_)
|
||||
{
|
||||
/* Cheetah's I-cache is fully coherent. */
|
||||
@ -367,16 +372,252 @@ static void emit_loadimm_sext(s32 K, unsigned int dest, struct jit_ctx *ctx)
|
||||
}
|
||||
}
|
||||
|
||||
static void analyze_64bit_constant(u32 high_bits, u32 low_bits,
|
||||
int *hbsp, int *lbsp, int *abbasp)
|
||||
{
|
||||
int lowest_bit_set, highest_bit_set, all_bits_between_are_set;
|
||||
int i;
|
||||
|
||||
lowest_bit_set = highest_bit_set = -1;
|
||||
i = 0;
|
||||
do {
|
||||
if ((lowest_bit_set == -1) && ((low_bits >> i) & 1))
|
||||
lowest_bit_set = i;
|
||||
if ((highest_bit_set == -1) && ((high_bits >> (32 - i - 1)) & 1))
|
||||
highest_bit_set = (64 - i - 1);
|
||||
} while (++i < 32 && (highest_bit_set == -1 ||
|
||||
lowest_bit_set == -1));
|
||||
if (i == 32) {
|
||||
i = 0;
|
||||
do {
|
||||
if (lowest_bit_set == -1 && ((high_bits >> i) & 1))
|
||||
lowest_bit_set = i + 32;
|
||||
if (highest_bit_set == -1 &&
|
||||
((low_bits >> (32 - i - 1)) & 1))
|
||||
highest_bit_set = 32 - i - 1;
|
||||
} while (++i < 32 && (highest_bit_set == -1 ||
|
||||
lowest_bit_set == -1));
|
||||
}
|
||||
|
||||
all_bits_between_are_set = 1;
|
||||
for (i = lowest_bit_set; i <= highest_bit_set; i++) {
|
||||
if (i < 32) {
|
||||
if ((low_bits & (1 << i)) != 0)
|
||||
continue;
|
||||
} else {
|
||||
if ((high_bits & (1 << (i - 32))) != 0)
|
||||
continue;
|
||||
}
|
||||
all_bits_between_are_set = 0;
|
||||
break;
|
||||
}
|
||||
*hbsp = highest_bit_set;
|
||||
*lbsp = lowest_bit_set;
|
||||
*abbasp = all_bits_between_are_set;
|
||||
}
|
||||
|
||||
static unsigned long create_simple_focus_bits(unsigned long high_bits,
|
||||
unsigned long low_bits,
|
||||
int lowest_bit_set, int shift)
|
||||
{
|
||||
long hi, lo;
|
||||
|
||||
if (lowest_bit_set < 32) {
|
||||
lo = (low_bits >> lowest_bit_set) << shift;
|
||||
hi = ((high_bits << (32 - lowest_bit_set)) << shift);
|
||||
} else {
|
||||
lo = 0;
|
||||
hi = ((high_bits >> (lowest_bit_set - 32)) << shift);
|
||||
}
|
||||
return hi | lo;
|
||||
}
|
||||
|
||||
static bool const64_is_2insns(unsigned long high_bits,
|
||||
unsigned long low_bits)
|
||||
{
|
||||
int highest_bit_set, lowest_bit_set, all_bits_between_are_set;
|
||||
|
||||
if (high_bits == 0 || high_bits == 0xffffffff)
|
||||
return true;
|
||||
|
||||
analyze_64bit_constant(high_bits, low_bits,
|
||||
&highest_bit_set, &lowest_bit_set,
|
||||
&all_bits_between_are_set);
|
||||
|
||||
if ((highest_bit_set == 63 || lowest_bit_set == 0) &&
|
||||
all_bits_between_are_set != 0)
|
||||
return true;
|
||||
|
||||
if (highest_bit_set - lowest_bit_set < 21)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void sparc_emit_set_const64_quick2(unsigned long high_bits,
|
||||
unsigned long low_imm,
|
||||
unsigned int dest,
|
||||
int shift_count, struct jit_ctx *ctx)
|
||||
{
|
||||
emit_loadimm32(high_bits, dest, ctx);
|
||||
|
||||
/* Now shift it up into place. */
|
||||
emit_alu_K(SLLX, dest, shift_count, ctx);
|
||||
|
||||
/* If there is a low immediate part piece, finish up by
|
||||
* putting that in as well.
|
||||
*/
|
||||
if (low_imm != 0)
|
||||
emit(OR | IMMED | RS1(dest) | S13(low_imm) | RD(dest), ctx);
|
||||
}
|
||||
|
||||
static void emit_loadimm64(u64 K, unsigned int dest, struct jit_ctx *ctx)
|
||||
{
|
||||
int all_bits_between_are_set, lowest_bit_set, highest_bit_set;
|
||||
unsigned int tmp = bpf2sparc[TMP_REG_1];
|
||||
u32 high_part = (K >> 32);
|
||||
u32 low_part = (K & 0xffffffff);
|
||||
u32 low_bits = (K & 0xffffffff);
|
||||
u32 high_bits = (K >> 32);
|
||||
|
||||
/* These two tests also take care of all of the one
|
||||
* instruction cases.
|
||||
*/
|
||||
if (high_bits == 0xffffffff && (low_bits & 0x80000000))
|
||||
return emit_loadimm_sext(K, dest, ctx);
|
||||
if (high_bits == 0x00000000)
|
||||
return emit_loadimm32(K, dest, ctx);
|
||||
|
||||
analyze_64bit_constant(high_bits, low_bits, &highest_bit_set,
|
||||
&lowest_bit_set, &all_bits_between_are_set);
|
||||
|
||||
/* 1) mov -1, %reg
|
||||
* sllx %reg, shift, %reg
|
||||
* 2) mov -1, %reg
|
||||
* srlx %reg, shift, %reg
|
||||
* 3) mov some_small_const, %reg
|
||||
* sllx %reg, shift, %reg
|
||||
*/
|
||||
if (((highest_bit_set == 63 || lowest_bit_set == 0) &&
|
||||
all_bits_between_are_set != 0) ||
|
||||
((highest_bit_set - lowest_bit_set) < 12)) {
|
||||
int shift = lowest_bit_set;
|
||||
long the_const = -1;
|
||||
|
||||
if ((highest_bit_set != 63 && lowest_bit_set != 0) ||
|
||||
all_bits_between_are_set == 0) {
|
||||
the_const =
|
||||
create_simple_focus_bits(high_bits, low_bits,
|
||||
lowest_bit_set, 0);
|
||||
} else if (lowest_bit_set == 0)
|
||||
shift = -(63 - highest_bit_set);
|
||||
|
||||
emit(OR | IMMED | RS1(G0) | S13(the_const) | RD(dest), ctx);
|
||||
if (shift > 0)
|
||||
emit_alu_K(SLLX, dest, shift, ctx);
|
||||
else if (shift < 0)
|
||||
emit_alu_K(SRLX, dest, -shift, ctx);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* Now a range of 22 or less bits set somewhere.
|
||||
* 1) sethi %hi(focus_bits), %reg
|
||||
* sllx %reg, shift, %reg
|
||||
* 2) sethi %hi(focus_bits), %reg
|
||||
* srlx %reg, shift, %reg
|
||||
*/
|
||||
if ((highest_bit_set - lowest_bit_set) < 21) {
|
||||
unsigned long focus_bits =
|
||||
create_simple_focus_bits(high_bits, low_bits,
|
||||
lowest_bit_set, 10);
|
||||
|
||||
emit(SETHI(focus_bits, dest), ctx);
|
||||
|
||||
/* If lowest_bit_set == 10 then a sethi alone could
|
||||
* have done it.
|
||||
*/
|
||||
if (lowest_bit_set < 10)
|
||||
emit_alu_K(SRLX, dest, 10 - lowest_bit_set, ctx);
|
||||
else if (lowest_bit_set > 10)
|
||||
emit_alu_K(SLLX, dest, lowest_bit_set - 10, ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Ok, now 3 instruction sequences. */
|
||||
if (low_bits == 0) {
|
||||
emit_loadimm32(high_bits, dest, ctx);
|
||||
emit_alu_K(SLLX, dest, 32, ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
/* We may be able to do something quick
|
||||
* when the constant is negated, so try that.
|
||||
*/
|
||||
if (const64_is_2insns((~high_bits) & 0xffffffff,
|
||||
(~low_bits) & 0xfffffc00)) {
|
||||
/* NOTE: The trailing bits get XOR'd so we need the
|
||||
* non-negated bits, not the negated ones.
|
||||
*/
|
||||
unsigned long trailing_bits = low_bits & 0x3ff;
|
||||
|
||||
if ((((~high_bits) & 0xffffffff) == 0 &&
|
||||
((~low_bits) & 0x80000000) == 0) ||
|
||||
(((~high_bits) & 0xffffffff) == 0xffffffff &&
|
||||
((~low_bits) & 0x80000000) != 0)) {
|
||||
unsigned long fast_int = (~low_bits & 0xffffffff);
|
||||
|
||||
if ((is_sethi(fast_int) &&
|
||||
(~high_bits & 0xffffffff) == 0)) {
|
||||
emit(SETHI(fast_int, dest), ctx);
|
||||
} else if (is_simm13(fast_int)) {
|
||||
emit(OR | IMMED | RS1(G0) | S13(fast_int) | RD(dest), ctx);
|
||||
} else {
|
||||
emit_loadimm64(fast_int, dest, ctx);
|
||||
}
|
||||
} else {
|
||||
u64 n = ((~low_bits) & 0xfffffc00) |
|
||||
(((unsigned long)((~high_bits) & 0xffffffff))<<32);
|
||||
emit_loadimm64(n, dest, ctx);
|
||||
}
|
||||
|
||||
low_bits = -0x400 | trailing_bits;
|
||||
|
||||
emit(XOR | IMMED | RS1(dest) | S13(low_bits) | RD(dest), ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
/* 1) sethi %hi(xxx), %reg
|
||||
* or %reg, %lo(xxx), %reg
|
||||
* sllx %reg, yyy, %reg
|
||||
*/
|
||||
if ((highest_bit_set - lowest_bit_set) < 32) {
|
||||
unsigned long focus_bits =
|
||||
create_simple_focus_bits(high_bits, low_bits,
|
||||
lowest_bit_set, 0);
|
||||
|
||||
/* So what we know is that the set bits straddle the
|
||||
* middle of the 64-bit word.
|
||||
*/
|
||||
sparc_emit_set_const64_quick2(focus_bits, 0, dest,
|
||||
lowest_bit_set, ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
/* 1) sethi %hi(high_bits), %reg
|
||||
* or %reg, %lo(high_bits), %reg
|
||||
* sllx %reg, 32, %reg
|
||||
* or %reg, low_bits, %reg
|
||||
*/
|
||||
if (is_simm13(low_bits) && ((int)low_bits > 0)) {
|
||||
sparc_emit_set_const64_quick2(high_bits, low_bits,
|
||||
dest, 32, ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Oh well, we tried... Do a full 64-bit decomposition. */
|
||||
ctx->tmp_1_used = true;
|
||||
|
||||
emit_set_const(high_part, tmp, ctx);
|
||||
emit_set_const(low_part, dest, ctx);
|
||||
emit_loadimm32(high_bits, tmp, ctx);
|
||||
emit_loadimm32(low_bits, dest, ctx);
|
||||
emit_alu_K(SLLX, tmp, 32, ctx);
|
||||
emit(OR | RS1(dest) | RS2(tmp) | RD(dest), ctx);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user