crypto: x86/aes-ni-xts - use direct calls to and 4-way stride

The XTS asm helper arrangement is a bit odd: the 8-way stride helper
consists of back-to-back calls to the 4-way core transforms, which
are called indirectly, based on a boolean that indicates whether we
are performing encryption or decryption.

Given how costly indirect calls are on x86, let's switch to direct
calls, and given how the 8-way stride doesn't really add anything
substantial, use a 4-way stride instead, and make the asm core
routine deal with any multiple of 4 blocks. Since 512 byte sectors
or 4 KB blocks are the typical quantities XTS operates on, increase
the stride exported to the glue helper to 512 bytes as well.

As a result, the number of indirect calls is reduced from 3 per 64 bytes
of in/output to 1 per 512 bytes of in/output, which produces a 65% speedup
when operating on 1 KB blocks (measured on a Intel(R) Core(TM) i7-8650U CPU)

Fixes: 9697fa39ef ("x86/retpoline/crypto: Convert crypto assembler indirect jumps")
Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Ard Biesheuvel 2020-12-31 17:41:54 +01:00 committed by Herbert Xu
parent fecff3b931
commit 86ad60a65f
2 changed files with 93 additions and 65 deletions

View File

@ -2842,25 +2842,18 @@ SYM_FUNC_END(aesni_ctr_enc)
pxor CTR, IV;
/*
* void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
* const u8 *src, bool enc, le128 *iv)
* void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
* const u8 *src, unsigned int len, le128 *iv)
*/
SYM_FUNC_START(aesni_xts_crypt8)
SYM_FUNC_START(aesni_xts_encrypt)
FRAME_BEGIN
testb %cl, %cl
movl $0, %ecx
movl $240, %r10d
leaq _aesni_enc4, %r11
leaq _aesni_dec4, %rax
cmovel %r10d, %ecx
cmoveq %rax, %r11
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
movups (IVP), IV
mov 480(KEYP), KLEN
addq %rcx, KEYP
.Lxts_enc_loop4:
movdqa IV, STATE1
movdqu 0x00(INP), INC
pxor INC, STATE1
@ -2884,71 +2877,103 @@ SYM_FUNC_START(aesni_xts_crypt8)
pxor INC, STATE4
movdqu IV, 0x30(OUTP)
CALL_NOSPEC r11
call _aesni_enc4
movdqu 0x00(OUTP), INC
pxor INC, STATE1
movdqu STATE1, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE1
movdqu 0x40(INP), INC
pxor INC, STATE1
movdqu IV, 0x40(OUTP)
movdqu 0x10(OUTP), INC
pxor INC, STATE2
movdqu STATE2, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE2
movdqu 0x50(INP), INC
pxor INC, STATE2
movdqu IV, 0x50(OUTP)
movdqu 0x20(OUTP), INC
pxor INC, STATE3
movdqu STATE3, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE3
movdqu 0x60(INP), INC
pxor INC, STATE3
movdqu IV, 0x60(OUTP)
movdqu 0x30(OUTP), INC
pxor INC, STATE4
movdqu STATE4, 0x30(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
movdqu 0x70(INP), INC
pxor INC, STATE4
movdqu IV, 0x70(OUTP)
_aesni_gf128mul_x_ble()
add $64, INP
add $64, OUTP
sub $64, LEN
ja .Lxts_enc_loop4
movups IV, (IVP)
CALL_NOSPEC r11
movdqu 0x40(OUTP), INC
pxor INC, STATE1
movdqu STATE1, 0x40(OUTP)
movdqu 0x50(OUTP), INC
pxor INC, STATE2
movdqu STATE2, 0x50(OUTP)
movdqu 0x60(OUTP), INC
pxor INC, STATE3
movdqu STATE3, 0x60(OUTP)
movdqu 0x70(OUTP), INC
pxor INC, STATE4
movdqu STATE4, 0x70(OUTP)
FRAME_END
ret
SYM_FUNC_END(aesni_xts_crypt8)
SYM_FUNC_END(aesni_xts_encrypt)
/*
* void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
* const u8 *src, unsigned int len, le128 *iv)
*/
SYM_FUNC_START(aesni_xts_decrypt)
FRAME_BEGIN
movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
movups (IVP), IV
mov 480(KEYP), KLEN
add $240, KEYP
.Lxts_dec_loop4:
movdqa IV, STATE1
movdqu 0x00(INP), INC
pxor INC, STATE1
movdqu IV, 0x00(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE2
movdqu 0x10(INP), INC
pxor INC, STATE2
movdqu IV, 0x10(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE3
movdqu 0x20(INP), INC
pxor INC, STATE3
movdqu IV, 0x20(OUTP)
_aesni_gf128mul_x_ble()
movdqa IV, STATE4
movdqu 0x30(INP), INC
pxor INC, STATE4
movdqu IV, 0x30(OUTP)
call _aesni_dec4
movdqu 0x00(OUTP), INC
pxor INC, STATE1
movdqu STATE1, 0x00(OUTP)
movdqu 0x10(OUTP), INC
pxor INC, STATE2
movdqu STATE2, 0x10(OUTP)
movdqu 0x20(OUTP), INC
pxor INC, STATE3
movdqu STATE3, 0x20(OUTP)
movdqu 0x30(OUTP), INC
pxor INC, STATE4
movdqu STATE4, 0x30(OUTP)
_aesni_gf128mul_x_ble()
add $64, INP
add $64, OUTP
sub $64, LEN
ja .Lxts_dec_loop4
movups IV, (IVP)
FRAME_END
ret
SYM_FUNC_END(aesni_xts_decrypt)
#endif

View File

@ -101,6 +101,12 @@ asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
#define AVX_GEN2_OPTSIZE 640
#define AVX_GEN4_OPTSIZE 4096
asmlinkage void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
#ifdef CONFIG_X86_64
static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
@ -108,9 +114,6 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, bool enc, le128 *iv);
/* asmlinkage void aesni_gcm_enc()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
* struct gcm_context_data. May be uninitialized.
@ -663,14 +666,14 @@ static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
}
static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
aesni_xts_crypt8(ctx, dst, src, true, iv);
aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
}
static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
aesni_xts_crypt8(ctx, dst, src, false, iv);
aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
}
static const struct common_glue_ctx aesni_enc_xts = {
@ -678,8 +681,8 @@ static const struct common_glue_ctx aesni_enc_xts = {
.fpu_blocks_limit = 1,
.funcs = { {
.num_blocks = 8,
.fn_u = { .xts = aesni_xts_enc8 }
.num_blocks = 32,
.fn_u = { .xts = aesni_xts_enc32 }
}, {
.num_blocks = 1,
.fn_u = { .xts = aesni_xts_enc }
@ -691,8 +694,8 @@ static const struct common_glue_ctx aesni_dec_xts = {
.fpu_blocks_limit = 1,
.funcs = { {
.num_blocks = 8,
.fn_u = { .xts = aesni_xts_dec8 }
.num_blocks = 32,
.fn_u = { .xts = aesni_xts_dec32 }
}, {
.num_blocks = 1,
.fn_u = { .xts = aesni_xts_dec }