crypto: blake2s - x86_64 SIMD implementation

These implementations from Samuel Neves support AVX and AVX-512VL.
Originally this used AVX-512F, but Skylake thermal throttling made
AVX-512VL more attractive and possible to do with negligable difference.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
[ardb: move to arch/x86/crypto, wire into lib/crypto framework]
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Jason A. Donenfeld 2019-11-08 13:22:31 +01:00 committed by Herbert Xu
parent 7f9b088092
commit ed0356eda1
4 changed files with 499 additions and 0 deletions

View File

@ -48,6 +48,7 @@ ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
endif
# These modules require assembler to support AVX2.
@ -70,6 +71,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \

View File

@ -0,0 +1,258 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
*/
#include <linux/linkage.h>
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */
.text
#ifdef CONFIG_AS_SSSE3
ENTRY(blake2s_compress_ssse3)
testq %rdx,%rdx
je .Lendofloop
movdqu (%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqa ROT16(%rip),%xmm12
movdqa ROR328(%rip),%xmm13
movdqu 0x20(%rdi),%xmm14
movq %rcx,%xmm15
leaq SIGMA+0xa0(%rip),%r8
jmp .Lbeginofloop
.align 32
.Lbeginofloop:
movdqa %xmm0,%xmm10
movdqa %xmm1,%xmm11
paddq %xmm15,%xmm14
movdqa IV(%rip),%xmm2
movdqa %xmm14,%xmm3
pxor IV+0x10(%rip),%xmm3
leaq SIGMA(%rip),%rcx
.Lroundloop:
movzbl (%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0x1(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0x2(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x3(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
punpckldq %xmm5,%xmm4
punpckldq %xmm7,%xmm6
punpcklqdq %xmm6,%xmm4
paddd %xmm4,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0xc,%xmm1
pslld $0x14,%xmm8
por %xmm8,%xmm1
movzbl 0x4(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0x5(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x6(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0x7(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
punpckldq %xmm6,%xmm5
punpckldq %xmm4,%xmm7
punpcklqdq %xmm7,%xmm5
paddd %xmm5,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0x7,%xmm1
pslld $0x19,%xmm8
por %xmm8,%xmm1
pshufd $0x93,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
movzbl 0x8(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x9(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0xa(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0xb(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
punpckldq %xmm7,%xmm6
punpckldq %xmm5,%xmm4
punpcklqdq %xmm4,%xmm6
paddd %xmm6,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0xc,%xmm1
pslld $0x14,%xmm8
por %xmm8,%xmm1
movzbl 0xc(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0xd(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0xe(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0xf(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
punpckldq %xmm4,%xmm7
punpckldq %xmm6,%xmm5
punpcklqdq %xmm5,%xmm7
paddd %xmm7,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0x7,%xmm1
pslld $0x19,%xmm8
por %xmm8,%xmm1
pshufd $0x39,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
addq $0x10,%rcx
cmpq %r8,%rcx
jnz .Lroundloop
pxor %xmm2,%xmm0
pxor %xmm3,%xmm1
pxor %xmm10,%xmm0
pxor %xmm11,%xmm1
addq $0x40,%rsi
decq %rdx
jnz .Lbeginofloop
movdqu %xmm0,(%rdi)
movdqu %xmm1,0x10(%rdi)
movdqu %xmm14,0x20(%rdi)
.Lendofloop:
ret
ENDPROC(blake2s_compress_ssse3)
#endif /* CONFIG_AS_SSSE3 */
#ifdef CONFIG_AS_AVX512
ENTRY(blake2s_compress_avx512)
vmovdqu (%rdi),%xmm0
vmovdqu 0x10(%rdi),%xmm1
vmovdqu 0x20(%rdi),%xmm4
vmovq %rcx,%xmm5
vmovdqa IV(%rip),%xmm14
vmovdqa IV+16(%rip),%xmm15
jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
vmovdqa %xmm0,%xmm10
vmovdqa %xmm1,%xmm11
vpaddq %xmm5,%xmm4,%xmm4
vmovdqa %xmm14,%xmm2
vpxor %xmm15,%xmm4,%xmm3
vmovdqu (%rsi),%ymm6
vmovdqu 0x20(%rsi),%ymm7
addq $0x40,%rsi
leaq SIGMA2(%rip),%rax
movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
addq $0x40,%rax
vmovdqa -0x40(%rax),%ymm8
vmovdqa -0x20(%rax),%ymm9
vpermi2d %ymm7,%ymm6,%ymm8
vpermi2d %ymm7,%ymm6,%ymm9
vmovdqa %ymm8,%ymm6
vmovdqa %ymm9,%ymm7
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
vextracti128 $0x1,%ymm8,%xmm8
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
vpshufd $0x93,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
vextracti128 $0x1,%ymm9,%xmm9
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
vpshufd $0x39,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x93,%xmm2,%xmm2
decb %cl
jne .Lblake2s_compress_avx512_roundloop
vpxor %xmm10,%xmm0,%xmm0
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm2,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
decq %rdx
jne .Lblake2s_compress_avx512_mainloop
vmovdqu %xmm0,(%rdi)
vmovdqu %xmm1,0x10(%rdi)
vmovdqu %xmm4,0x20(%rdi)
vzeroupper
retq
ENDPROC(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */

View File

@ -0,0 +1,233 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/processor.h>
#include <asm/simd.h>
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
void blake2s_compress_arch(struct blake2s_state *state,
const u8 *block, size_t nblocks,
const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
for (;;) {
const size_t blocks = min_t(size_t, nblocks,
PAGE_SIZE / BLAKE2S_BLOCK_SIZE);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
static_branch_likely(&blake2s_use_avx512))
blake2s_compress_avx512(state, block, blocks, inc);
else
blake2s_compress_ssse3(state, block, blocks, inc);
kernel_fpu_end();
nblocks -= blocks;
if (!nblocks)
break;
block += blocks * BLAKE2S_BLOCK_SIZE;
}
}
EXPORT_SYMBOL(blake2s_compress_arch);
static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
unsigned int keylen)
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
memcpy(tctx->key, key, keylen);
tctx->keylen = keylen;
return 0;
}
static int crypto_blake2s_init(struct shash_desc *desc)
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct blake2s_state *state = shash_desc_ctx(desc);
const int outlen = crypto_shash_digestsize(desc->tfm);
if (tctx->keylen)
blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
else
blake2s_init(state, outlen);
return 0;
}
static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
unsigned int inlen)
{
struct blake2s_state *state = shash_desc_ctx(desc);
const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
if (unlikely(!inlen))
return 0;
if (inlen > fill) {
memcpy(state->buf + state->buflen, in, fill);
blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
state->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2S_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
/* Hash one less (full) block than strictly possible */
blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
}
memcpy(state->buf + state->buflen, in, inlen);
state->buflen += inlen;
return 0;
}
static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
{
struct blake2s_state *state = shash_desc_ctx(desc);
blake2s_set_lastblock(state);
memset(state->buf + state->buflen, 0,
BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
blake2s_compress_arch(state, state->buf, 1, state->buflen);
cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
memcpy(out, state->h, state->outlen);
memzero_explicit(state, sizeof(*state));
return 0;
}
static struct shash_alg blake2s_algs[] = {{
.base.cra_name = "blake2s-128",
.base.cra_driver_name = "blake2s-128-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_128_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-160",
.base.cra_driver_name = "blake2s-160-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_160_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-224",
.base.cra_driver_name = "blake2s-224-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_224_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-256",
.base.cra_driver_name = "blake2s-256-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_256_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}};
static int __init blake2s_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return 0;
static_branch_enable(&blake2s_use_ssse3);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512VL) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
static void __exit blake2s_mod_exit(void)
{
if (boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
module_init(blake2s_mod_init);
module_exit(blake2s_mod_exit);
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");

View File

@ -674,6 +674,12 @@ config CRYPTO_BLAKE2S
See https://blake2.net for further information.
config CRYPTO_BLAKE2S_X86
tristate "BLAKE2s digest algorithm (x86 accelerated version)"
depends on X86 && 64BIT
select CRYPTO_LIB_BLAKE2S_GENERIC
select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
config CRYPTO_CRCT10DIF
tristate "CRCT10DIF algorithm"
select CRYPTO_HASH