543ea178fb
x86_64 has the "interesting" property that the instruction size is generally a
bit shorter for instructions that operate on the 32-bit (or less) part of
registers, or registers that are in the original set of 8. This patch adjusts
the AES-XTS code to take advantage of that property by changing the LEN
parameter from size_t to unsigned int (which is all that's needed and is what
the non-AVX implementation uses) and using the %eax register for KEYLEN.

This decreases the size of aes-xts-avx-x86_64.o by 1.2%.

Note that changing the kmovq to kmovd was going to be needed anyway to make the
AVX10/256 code really work on CPUs that don't support 512-bit vectors (since
the AVX10 spec says that 64-bit opmask instructions will only be supported on
processors that support 512-bit vectors).

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
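For example, "sub $16, %ecx" encodes in 3 bytes, while the 64-bit form
"sub $16, %rcx" needs a REX.W prefix and takes 4 bytes; likewise, using %r9d
instead of %eax adds a REX byte even for a 32-bit operation.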
846 lines
26 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AES-XTS for modern x86_64 CPUs
 *
 * Copyright 2024 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

/*
 * This file implements AES-XTS for modern x86_64 CPUs. To handle the
 * complexities of coding for x86 SIMD, e.g. where every vector length needs
 * different code, it uses a macro to generate several implementations that
 * share similar source code but are targeted at different CPUs, listed below:
 *
 * AES-NI + AVX
 *    - 128-bit vectors (1 AES block per vector)
 *    - VEX-coded instructions
 *    - xmm0-xmm15
 *    - This is for older CPUs that lack VAES but do have AVX.
 *
 * VAES + VPCLMULQDQ + AVX2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - VEX-coded instructions
 *    - ymm0-ymm15
 *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
 *      e.g. Intel's Alder Lake and AMD's Zen 3.
 *
 * VAES + VPCLMULQDQ + AVX10/256 + BMI2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - EVEX-coded instructions
 *    - ymm0-ymm31
 *    - This is for CPUs that have AVX512 but where using zmm registers causes
 *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
 *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
 *      To avoid confusion with 512-bit, we just write AVX10/256.
 *
 * VAES + VPCLMULQDQ + AVX10/512 + BMI2
 *    - Same as the previous one, but upgrades to 512-bit vectors
 *      (4 AES blocks per vector) in zmm0-zmm31.
 *    - This is for CPUs that have good AVX512 or AVX10/512 support.
 *
 * This file doesn't have an implementation for AES-NI alone (without AVX), as
 * the lack of VEX would make all the assembly code different.
 *
 * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
 * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be
 * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might
 * need to start also providing an implementation using VAES alone.
 *
 * The AES-XTS implementations in this file support everything required by the
 * crypto API, including support for arbitrary input lengths and multi-part
 * processing. However, they are most heavily optimized for the common case of
 * power-of-2 length inputs that are processed in a single part (disk sectors).
 */
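
// As a quick orientation: each implementation listed above is produced by
// setting VL (the vector length in bytes) and USE_AVX10 and then expanding the
// shared _aes_xts_crypt macro.  For instance, the VAES + AVX2 variant at the
// end of this file is generated roughly like this:
//
//	.set	VL, 32
//	.set	USE_AVX10, 0
//	SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
//		_aes_xts_crypt	1
//	SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)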

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.section .rodata
.p2align 4
.Lgf_poly:
	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
	// + 1. It is the value that must be XOR'd into the low 64 bits of the
	// tweak each time a 1 is carried out of the high 64 bits.
	//
	// The high 64 bits of this value is just the internal carry bit that
	// exists when there's a carry out of the low 64 bits of the tweak.
	.quad	0x87, 1
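	// For example: multiplying a tweak with bit 127 set by x shifts that
	// bit out as x^128, and since x^128 == x^7 + x^2 + x + 1 (mod the XTS
	// reduction polynomial), it is folded back in by XOR'ing 0x87 into the
	// low 64 bits. Similarly, a 1 shifted out of the low 64 bits (bit 63)
	// reappears as bit 0 of the high 64 bits, which is what the high half
	// of this constant provides.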

	// This table contains constants for vpshufb and vpblendvb, used to
	// handle variable byte shifts and blending during ciphertext stealing
	// on CPUs that don't support AVX10-style masking.
.Lcts_permute_table:
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
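	// Worked example (LEN = 5): the 16 bytes at .Lcts_permute_table + 5
	// are [0x80 x 11, 0x00, 0x01, 0x02, 0x03, 0x04], so using them as a
	// vpshufb mask moves bytes 0-4 of the source to bytes 11-15 of the
	// result and zeroes everything else. The 16 bytes at
	// .Lcts_permute_table + 32 - 5 are [0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	// 0x80 x 11], which instead moves bytes 11-15 of the source down to
	// bytes 0-4. Both uses appear in the ciphertext stealing code below.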
.text

// Function parameters
.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
				// advanced to point to 7th-from-last round key
.set	SRC,		%rsi	// Pointer to next source data
.set	DST,		%rdx	// Pointer to next destination data
.set	LEN,		%ecx	// Remaining length in bytes
.set	LEN8,		%cl
.set	LEN64,		%rcx
.set	TWEAK,		%r8	// Pointer to next tweak

// %rax holds the AES key length in bytes.
.set	KEYLEN,		%eax
.set	KEYLEN64,	%rax

// %r9-r11 are available as temporaries.

.macro	_define_Vi	i
.if VL == 16
	.set	V\i,		%xmm\i
.elseif VL == 32
	.set	V\i,		%ymm\i
.elseif VL == 64
	.set	V\i,		%zmm\i
.else
	.error "Unsupported Vector Length (VL)"
.endif
.endm

.macro _define_aliases
	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
	// are available, that map to the xmm, ymm, or zmm registers according
	// to the selected Vector Length (VL).
	_define_Vi	0
	_define_Vi	1
	_define_Vi	2
	_define_Vi	3
	_define_Vi	4
	_define_Vi	5
	_define_Vi	6
	_define_Vi	7
	_define_Vi	8
	_define_Vi	9
	_define_Vi	10
	_define_Vi	11
	_define_Vi	12
	_define_Vi	13
	_define_Vi	14
	_define_Vi	15
.if USE_AVX10
	_define_Vi	16
	_define_Vi	17
	_define_Vi	18
	_define_Vi	19
	_define_Vi	20
	_define_Vi	21
	_define_Vi	22
	_define_Vi	23
	_define_Vi	24
	_define_Vi	25
	_define_Vi	26
	_define_Vi	27
	_define_Vi	28
	_define_Vi	29
	_define_Vi	30
	_define_Vi	31
.endif

	// V0-V3 hold the data blocks during the main loop, or temporary values
	// otherwise. V4-V5 hold temporary values.

	// V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
	.set	TWEAK0_XMM,	%xmm6
	.set	TWEAK0,		V6
	.set	TWEAK1_XMM,	%xmm7
	.set	TWEAK1,		V7
	.set	TWEAK2,		V8
	.set	TWEAK3,		V9

	// V10-V13 are used for computing the next values of TWEAK[0-3].
	.set	NEXT_TWEAK0,	V10
	.set	NEXT_TWEAK1,	V11
	.set	NEXT_TWEAK2,	V12
	.set	NEXT_TWEAK3,	V13

	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
	.set	GF_POLY_XMM,	%xmm14
	.set	GF_POLY,	V14

	// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
	.set	KEY0_XMM,	%xmm15
	.set	KEY0,		V15

	// If 32 SIMD registers are available, then V16-V29 hold the remaining
	// AES round keys, copied to all 128-bit lanes.
	//
	// AES-128, AES-192, and AES-256 use different numbers of round keys.
	// To allow handling all three variants efficiently, we align the round
	// keys to the *end* of this register range. I.e., AES-128 uses
	// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
	// (All also use KEY0 for the XOR-only "round" at the beginning.)
.if USE_AVX10
	.set	KEY1_XMM,	%xmm16
	.set	KEY1,		V16
	.set	KEY2_XMM,	%xmm17
	.set	KEY2,		V17
	.set	KEY3_XMM,	%xmm18
	.set	KEY3,		V18
	.set	KEY4_XMM,	%xmm19
	.set	KEY4,		V19
	.set	KEY5_XMM,	%xmm20
	.set	KEY5,		V20
	.set	KEY6_XMM,	%xmm21
	.set	KEY6,		V21
	.set	KEY7_XMM,	%xmm22
	.set	KEY7,		V22
	.set	KEY8_XMM,	%xmm23
	.set	KEY8,		V23
	.set	KEY9_XMM,	%xmm24
	.set	KEY9,		V24
	.set	KEY10_XMM,	%xmm25
	.set	KEY10,		V25
	.set	KEY11_XMM,	%xmm26
	.set	KEY11,		V26
	.set	KEY12_XMM,	%xmm27
	.set	KEY12,		V27
	.set	KEY13_XMM,	%xmm28
	.set	KEY13,		V28
	.set	KEY14_XMM,	%xmm29
	.set	KEY14,		V29
.endif
	// V30-V31 are currently unused.
.endm

// Move a vector between memory and a register.
.macro _vmovdqu	src, dst
.if VL < 64
	vmovdqu		\src, \dst
.else
	vmovdqu8	\src, \dst
.endif
.endm

// Broadcast a 128-bit value into a vector.
.macro _vbroadcast128	src, dst
.if VL == 16 && !USE_AVX10
	vmovdqu		\src, \dst
.elseif VL == 32 && !USE_AVX10
	vbroadcasti128	\src, \dst
.else
	vbroadcasti32x4	\src, \dst
.endif
.endm

// XOR two vectors together.
.macro _vpxor	src1, src2, dst
.if USE_AVX10
	vpxord		\src1, \src2, \dst
.else
	vpxor		\src1, \src2, \dst
.endif
.endm

// XOR three vectors together.
.macro _xor3	src1, src2, src3_and_dst
.if USE_AVX10
	// vpternlogd with immediate 0x96 is a three-argument XOR.
	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
.else
	vpxor		\src1, \src3_and_dst, \src3_and_dst
	vpxor		\src2, \src3_and_dst, \src3_and_dst
.endif
.endm

// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
// (by multiplying by the polynomial 'x') and write it to \dst.
.macro _next_tweak	src, tmp, dst
	vpshufd		$0x13, \src, \tmp
	vpaddq		\src, \src, \dst
	vpsrad		$31, \tmp, \tmp
	vpand		GF_POLY_XMM, \tmp, \tmp
	vpxor		\tmp, \dst, \dst
.endm
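// In rough C terms, with the tweak viewed as two little-endian 64-bit halves
// (hi:lo), the macro above computes (illustrative sketch only):
//
//	u64 mask = (u64)((s64)hi >> 63);  /* all-ones if bit 127 is set */
//	next_lo  = (lo << 1) ^ (mask & 0x87);
//	next_hi  = (hi << 1) ^ (lo >> 63);
//
// i.e. a doubling in GF(2^128) with the reduction constant from .Lgf_poly.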

// Given the XTS tweak(s) in the vector \src, compute the next vector of
// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
//
// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
// all tweaks in the vector in parallel. If VL=16, we just do the regular
// computation without vpclmulqdq, as it's the faster method for a single tweak.
.macro _next_tweakvec	src, tmp1, tmp2, dst
.if VL == 16
	_next_tweak	\src, \tmp1, \dst
.else
	vpsrlq		$64 - VL/16, \src, \tmp1
	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
	vpslldq		$8, \tmp1, \tmp1
	vpsllq		$VL/16, \src, \dst
	_xor3		\tmp1, \tmp2, \dst
.endif
.endm
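// For the VL > 16 case, each 128-bit lane is multiplied by x^N, where
// N = VL/16 is 2 or 4. A rough per-lane C sketch of the above, with clmul64()
// standing in for the 64x64 carryless multiply that vpclmulqdq performs:
//
//	u64 ovf   = hi >> (64 - N);	/* bits shifted out past x^127 */
//	u64 carry = lo >> (64 - N);	/* bits carried from lo into hi */
//	next_lo   = (lo << N) ^ clmul64(ovf, 0x87);
//	next_hi   = (hi << N) ^ carry;
//
// Since N <= 4 and 0x87 has degree 7, the reduction term fits entirely in the
// low 64 bits, so the high half only needs the carry.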

// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
.macro _compute_first_set_of_tweaks
	vmovdqu		(TWEAK), TWEAK0_XMM
	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
.if VL == 16
	// With VL=16, multiplying by x serially is fastest.
	_next_tweak	TWEAK0, %xmm0, TWEAK1
	_next_tweak	TWEAK1, %xmm0, TWEAK2
	_next_tweak	TWEAK2, %xmm0, TWEAK3
.else
.if VL == 32
	// Compute the second block of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
.elseif VL == 64
	// Compute the remaining blocks of TWEAK0.
	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
	_next_tweak	%xmm1, %xmm0, %xmm2
	_next_tweak	%xmm2, %xmm0, %xmm3
	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
.endif
	// Compute TWEAK[1-3] from TWEAK0.
	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
	vpclmulqdq	$0x01, GF_POLY, V0, V1
	vpclmulqdq	$0x01, GF_POLY, V2, V3
	vpclmulqdq	$0x01, GF_POLY, V4, V5
	vpslldq		$8, V0, V0
	vpslldq		$8, V2, V2
	vpslldq		$8, V4, V4
	vpsllq		$1*VL/16, TWEAK0, TWEAK1
	vpsllq		$2*VL/16, TWEAK0, TWEAK2
	vpsllq		$3*VL/16, TWEAK0, TWEAK3
.if USE_AVX10
	vpternlogd	$0x96, V0, V1, TWEAK1
	vpternlogd	$0x96, V2, V3, TWEAK2
	vpternlogd	$0x96, V4, V5, TWEAK3
.else
	vpxor		V0, TWEAK1, TWEAK1
	vpxor		V2, TWEAK2, TWEAK2
	vpxor		V4, TWEAK3, TWEAK3
	vpxor		V1, TWEAK1, TWEAK1
	vpxor		V3, TWEAK2, TWEAK2
	vpxor		V5, TWEAK3, TWEAK3
.endif
.endif
.endm

// Do one step in computing the next set of tweaks using the method of just
// multiplying by x repeatedly (the same method _next_tweak uses).
.macro _tweak_step_mulx	i
.if \i == 0
	.set PREV_TWEAK, TWEAK3
	.set NEXT_TWEAK, NEXT_TWEAK0
.elseif \i == 5
	.set PREV_TWEAK, NEXT_TWEAK0
	.set NEXT_TWEAK, NEXT_TWEAK1
.elseif \i == 10
	.set PREV_TWEAK, NEXT_TWEAK1
	.set NEXT_TWEAK, NEXT_TWEAK2
.elseif \i == 15
	.set PREV_TWEAK, NEXT_TWEAK2
	.set NEXT_TWEAK, NEXT_TWEAK3
.endif
.if \i >= 0 && \i < 20 && \i % 5 == 0
	vpshufd		$0x13, PREV_TWEAK, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
	vpsrad		$31, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
	vpand		GF_POLY, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
.elseif \i == 1000
	vmovdqa		NEXT_TWEAK0, TWEAK0
	vmovdqa		NEXT_TWEAK1, TWEAK1
	vmovdqa		NEXT_TWEAK2, TWEAK2
	vmovdqa		NEXT_TWEAK3, TWEAK3
.endif
.endm

// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
// (the same method _next_tweakvec uses for VL > 16). This means multiplying
// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8
// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
.macro _tweak_step_pclmul	i
.if \i == 0
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
.elseif \i == 2
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
.elseif \i == 4
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
.elseif \i == 6
	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
.elseif \i == 8
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
.elseif \i == 10
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
.elseif \i == 12
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
.elseif \i == 14
	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
.elseif \i == 1000
	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
	vpslldq		$(4*VL/16) / 8, TWEAK3, TWEAK3
	_vpxor		NEXT_TWEAK0, TWEAK0, TWEAK0
	_vpxor		NEXT_TWEAK1, TWEAK1, TWEAK1
	_vpxor		NEXT_TWEAK2, TWEAK2, TWEAK2
	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
.endif
.endm

// _tweak_step does one step of the computation of the next set of tweaks from
// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
// \i that include at least 0 through 19, then 1000 which signals the last step.
//
// This is used to interleave the computation of the next set of tweaks with the
// AES en/decryptions, which increases performance in some cases.
.macro _tweak_step	i
.if VL == 16
	_tweak_step_mulx	\i
.else
	_tweak_step_pclmul	\i
.endif
.endm
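// For reference, the interleaving works out as follows: _vaes_4x (below)
// invokes _tweak_step (2*(i-5)) and (2*(i-5) + 1) around AES round i, so
// rounds 5 through 14 of the main loop cover steps 0 through 19 (rounds 1-4,
// when used, pass negative step numbers, which match nothing above), and the
// main loop issues _tweak_step 1000 after the destination blocks are stored.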

.macro _setup_round_keys	enc

	// Select either the encryption round keys or the decryption round keys.
.if \enc
	.set	OFFS, 0
.else
	.set	OFFS, 240
.endif

	// Load the round key for "round 0".
	_vbroadcast128	OFFS(KEY), KEY0

	// Increment KEY to make it so that 7*16(KEY) is the last round key.
	// For AES-128, increment by 3*16, resulting in the 10 round keys (not
	// counting the zero-th round key which was just loaded into KEY0) being
	// -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
	// 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
	// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
	//
	// This rebasing provides two benefits. First, it makes the offset to
	// any round key be in the range [-96, 112], fitting in a signed byte.
	// This shortens VEX-encoded instructions that access the later round
	// keys which otherwise would need 4-byte offsets. Second, it makes it
	// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
	// beginning. Skipping rounds at the end doesn't work as well because
	// the last round needs different instructions.
	//
	// An alternative approach would be to roll up all the round loops. We
	// don't do that because it isn't compatible with caching the round keys
	// in registers which we do when possible (see below), and also because
	// it seems unwise to rely *too* heavily on the CPU's branch predictor.
	lea		OFFS-16(KEY, KEYLEN64, 4), KEY
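	// To check the arithmetic of the lea above: KEYLEN is 16, 24, or 32,
	// so the increment is 16*4 - 16 = 3*16, 24*4 - 16 = 5*16, or
	// 32*4 - 16 = 7*16 respectively, matching the description -- plus
	// OFFS, which selects the encryption or decryption key schedule.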

	// If all 32 SIMD registers are available, cache all the round keys.
.if USE_AVX10
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vbroadcast128	-6*16(KEY), KEY1
	_vbroadcast128	-5*16(KEY), KEY2
.Laes192\@:
	_vbroadcast128	-4*16(KEY), KEY3
	_vbroadcast128	-3*16(KEY), KEY4
.Laes128\@:
	_vbroadcast128	-2*16(KEY), KEY5
	_vbroadcast128	-1*16(KEY), KEY6
	_vbroadcast128	0*16(KEY), KEY7
	_vbroadcast128	1*16(KEY), KEY8
	_vbroadcast128	2*16(KEY), KEY9
	_vbroadcast128	3*16(KEY), KEY10
	_vbroadcast128	4*16(KEY), KEY11
	_vbroadcast128	5*16(KEY), KEY12
	_vbroadcast128	6*16(KEY), KEY13
	_vbroadcast128	7*16(KEY), KEY14
.endif
.endm

// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
// on the block(s) in \data using the round key(s) in \key. The register length
// determines the number of AES blocks en/decrypted.
.macro _vaes	enc, last, key, data
.if \enc
.if \last
	vaesenclast	\key, \data, \data
.else
	vaesenc		\key, \data, \data
.endif
.else
.if \last
	vaesdeclast	\key, \data, \data
.else
	vaesdec		\key, \data, \data
.endif
.endif
.endm

// Do a single round of AES en/decryption on the block(s) in \data, using the
// same key for all block(s). The round key is loaded from the appropriate
// register or memory location for round \i. May clobber V4.
.macro _vaes_1x		enc, last, i, xmm_suffix, data
.if USE_AVX10
	_vaes		\enc, \last, KEY\i\xmm_suffix, \data
.else
.ifnb \xmm_suffix
	_vaes		\enc, \last, (\i-7)*16(KEY), \data
.else
	_vbroadcast128	(\i-7)*16(KEY), V4
	_vaes		\enc, \last, V4, \data
.endif
.endif
.endm

// Do a single round of AES en/decryption on the blocks in registers V0-V3,
// using the same key for all blocks. The round key is loaded from the
// appropriate register or memory location for round \i. In addition, does two
// steps of the computation of the next set of tweaks. May clobber V4.
.macro _vaes_4x		enc, last, i
.if USE_AVX10
	_tweak_step	(2*(\i-5))
	_vaes		\enc, \last, KEY\i, V0
	_vaes		\enc, \last, KEY\i, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, \last, KEY\i, V2
	_vaes		\enc, \last, KEY\i, V3
.else
	_vbroadcast128	(\i-7)*16(KEY), V4
	_tweak_step	(2*(\i-5))
	_vaes		\enc, \last, V4, V0
	_vaes		\enc, \last, V4, V1
	_tweak_step	(2*(\i-5) + 1)
	_vaes		\enc, \last, V4, V2
	_vaes		\enc, \last, V4, V3
.endif
.endm

// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
// then XOR with \tweak again) of the block(s) in \data. To process a single
// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
.macro _aes_crypt	enc, xmm_suffix, tweak, data
	_xor3		KEY0\xmm_suffix, \tweak, \data
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
.Laes192\@:
	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
.Laes128\@:
	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
	_vpxor		\tweak, \data, \data
.endm

.macro _aes_xts_crypt	enc
	_define_aliases

.if !\enc
	// When decrypting a message whose length isn't a multiple of the AES
	// block length, exclude the last full block from the main loop by
	// subtracting 16 from LEN. This is needed because ciphertext stealing
	// decryption uses the last two tweaks in reverse order. We'll handle
	// the last full block and the partial block specially at the end.
	lea		-16(LEN), %eax
	test		$15, LEN8
	cmovnz		%eax, LEN
.endif

	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
	movl		480(KEY), KEYLEN
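	// (480 is the byte offset of the key_length field in struct
	// crypto_aes_ctx: the encryption and decryption key schedules each
	// take 15 * 16 = 240 bytes, and key_length follows them -- the same
	// layout that OFFS=0/240 in _setup_round_keys relies on.)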

	// Setup the pointer to the round keys and cache as many as possible.
	_setup_round_keys	\enc

	// Compute the first set of tweaks TWEAK[0-3].
	_compute_first_set_of_tweaks

	sub		$4*VL, LEN
	jl		.Lhandle_remainder\@

.Lmain_loop\@:
	// This is the main loop, en/decrypting 4*VL bytes per iteration.

	// XOR each source block with its tweak and the zero-th round key.
.if USE_AVX10
	vmovdqu8	0*VL(SRC), V0
	vmovdqu8	1*VL(SRC), V1
	vmovdqu8	2*VL(SRC), V2
	vmovdqu8	3*VL(SRC), V3
	vpternlogd	$0x96, TWEAK0, KEY0, V0
	vpternlogd	$0x96, TWEAK1, KEY0, V1
	vpternlogd	$0x96, TWEAK2, KEY0, V2
	vpternlogd	$0x96, TWEAK3, KEY0, V3
.else
	vpxor		0*VL(SRC), KEY0, V0
	vpxor		1*VL(SRC), KEY0, V1
	vpxor		2*VL(SRC), KEY0, V2
	vpxor		3*VL(SRC), KEY0, V3
	vpxor		TWEAK0, V0, V0
	vpxor		TWEAK1, V1, V1
	vpxor		TWEAK2, V2, V2
	vpxor		TWEAK3, V3, V3
.endif
	cmp		$24, KEYLEN
	jl		.Laes128\@
	je		.Laes192\@
	// Do all the AES rounds on the data blocks, interleaved with
	// the computation of the next set of tweaks.
	_vaes_4x	\enc, 0, 1
	_vaes_4x	\enc, 0, 2
.Laes192\@:
	_vaes_4x	\enc, 0, 3
	_vaes_4x	\enc, 0, 4
.Laes128\@:
	_vaes_4x	\enc, 0, 5
	_vaes_4x	\enc, 0, 6
	_vaes_4x	\enc, 0, 7
	_vaes_4x	\enc, 0, 8
	_vaes_4x	\enc, 0, 9
	_vaes_4x	\enc, 0, 10
	_vaes_4x	\enc, 0, 11
	_vaes_4x	\enc, 0, 12
	_vaes_4x	\enc, 0, 13
	_vaes_4x	\enc, 1, 14

	// XOR in the tweaks again.
	_vpxor		TWEAK0, V0, V0
	_vpxor		TWEAK1, V1, V1
	_vpxor		TWEAK2, V2, V2
	_vpxor		TWEAK3, V3, V3

	// Store the destination blocks.
	_vmovdqu	V0, 0*VL(DST)
	_vmovdqu	V1, 1*VL(DST)
	_vmovdqu	V2, 2*VL(DST)
	_vmovdqu	V3, 3*VL(DST)

	// Finish computing the next set of tweaks.
	_tweak_step	1000

	add		$4*VL, SRC
	add		$4*VL, DST
	sub		$4*VL, LEN
	jge		.Lmain_loop\@

	// Check for the uncommon case where the data length isn't a multiple of
	// 4*VL. Handle it out-of-line in order to optimize for the common
	// case. In the common case, just fall through to the ret.
	test		$4*VL-1, LEN8
	jnz		.Lhandle_remainder\@
.Ldone\@:
	// Store the next tweak back to *TWEAK to support continuation calls.
	vmovdqu		TWEAK0_XMM, (TWEAK)
.if VL > 16
	vzeroupper
.endif
	RET

.Lhandle_remainder\@:

	// En/decrypt any remaining full blocks, one vector at a time.
.if VL > 16
	add		$3*VL, LEN	// Undo extra sub of 4*VL, then sub VL.
	jl		.Lvec_at_a_time_done\@
.Lvec_at_a_time\@:
	_vmovdqu	(SRC), V0
	_aes_crypt	\enc, , TWEAK0, V0
	_vmovdqu	V0, (DST)
	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
	add		$VL, SRC
	add		$VL, DST
	sub		$VL, LEN
	jge		.Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
	add		$VL-16, LEN	// Undo extra sub of VL, then sub 16.
.else
	add		$4*VL-16, LEN	// Undo extra sub of 4*VL, then sub 16.
.endif

	// En/decrypt any remaining full blocks, one at a time.
	jl		.Lblock_at_a_time_done\@
.Lblock_at_a_time\@:
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
	vmovdqu		%xmm0, (DST)
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
	add		$16, SRC
	add		$16, DST
	sub		$16, LEN
	jge		.Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
	add		$16, LEN	// Undo the extra sub of 16.
	// Now 0 <= LEN <= 15. If LEN is zero, we're done.
	jz		.Ldone\@

	// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
	// Do ciphertext stealing to process the last 16 + LEN bytes.

.if \enc
	// If encrypting, the main loop already encrypted the last full block to
	// create the CTS intermediate ciphertext. Prepare for the rest of CTS
	// by rewinding the pointers and loading the intermediate ciphertext.
	sub		$16, SRC
	sub		$16, DST
	vmovdqu		(DST), %xmm0
.else
	// If decrypting, the main loop didn't decrypt the last full block
	// because CTS decryption uses the last two tweaks in reverse order.
	// Do it now by advancing the tweak and decrypting the last full block.
	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
	vmovdqu		(SRC), %xmm0
	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
.endif

.if USE_AVX10
	// Create a mask that has the first LEN bits set.
	mov		$-1, %r9d
	bzhi		LEN, %r9d, %r9d
	kmovd		%r9d, %k1
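	// (For example, LEN=5 yields %r9d = 0x1f, so %k1 selects bytes 0-4 of
	// the masked vmovdqu8 load and store below.)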

	// Swap the first LEN bytes of the en/decryption of the last full block
	// with the partial block. Note that to support in-place en/decryption,
	// the load from the src partial block must happen before the store to
	// the dst partial block.
	vmovdqa		%xmm0, %xmm1
	vmovdqu8	16(SRC), %xmm0{%k1}
	vmovdqu8	%xmm1, 16(DST){%k1}
.else
	lea		.Lcts_permute_table(%rip), %r9

	// Load the src partial block, left-aligned. Note that to support
	// in-place en/decryption, this must happen before the store to the dst
	// partial block.
	vmovdqu		(SRC, LEN64, 1), %xmm1

	// Shift the first LEN bytes of the en/decryption of the last full block
	// to the end of a register, then store it to DST+LEN. This stores the
	// dst partial block. It also writes to the second part of the dst last
	// full block, but that part is overwritten later.
	vpshufb		(%r9, LEN64, 1), %xmm0, %xmm2
	vmovdqu		%xmm2, (DST, LEN64, 1)

	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
	sub		LEN64, %r9
	vmovdqu		32(%r9), %xmm3

	// Shift the src partial block to the beginning of its register.
	vpshufb		%xmm3, %xmm1, %xmm1

	// Do a blend to generate the src partial block followed by the second
	// part of the en/decryption of the last full block.
	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
.endif
	// En/decrypt again and store the last full block.
	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
	vmovdqu		%xmm0, (DST)
	jmp		.Ldone\@
.endm

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//			   u8 iv[AES_BLOCK_SIZE]);
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
	vmovdqu		(%rsi), %xmm0
	vpxor		(%rdi), %xmm0, %xmm0
	movl		480(%rdi), %eax		// AES key length
	lea		-16(%rdi, %rax, 4), %rdi
	cmp		$24, %eax
	jl		.Lencrypt_iv_aes128
	je		.Lencrypt_iv_aes192
	vaesenc		-6*16(%rdi), %xmm0, %xmm0
	vaesenc		-5*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes192:
	vaesenc		-4*16(%rdi), %xmm0, %xmm0
	vaesenc		-3*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes128:
	vaesenc		-2*16(%rdi), %xmm0, %xmm0
	vaesenc		-1*16(%rdi), %xmm0, %xmm0
	vaesenc		0*16(%rdi), %xmm0, %xmm0
	vaesenc		1*16(%rdi), %xmm0, %xmm0
	vaesenc		2*16(%rdi), %xmm0, %xmm0
	vaesenc		3*16(%rdi), %xmm0, %xmm0
	vaesenc		4*16(%rdi), %xmm0, %xmm0
	vaesenc		5*16(%rdi), %xmm0, %xmm0
	vaesenc		6*16(%rdi), %xmm0, %xmm0
	vaesenclast	7*16(%rdi), %xmm0, %xmm0
	vmovdqu		%xmm0, (%rsi)
	RET
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro. They all have the following prototype:
//
// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
//			const u8 *src, u8 *dst, unsigned int len,
//			u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call. If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
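//
// Illustrative only -- roughly how the C glue code (aesni-intel_glue.c) drives
// these functions; the variable names here are just placeholders:
//
//	u8 tweak[AES_BLOCK_SIZE];
//
//	memcpy(tweak, iv, AES_BLOCK_SIZE);
//	aes_xts_encrypt_iv(tweak_key, tweak);
//	aes_xts_encrypt_vaes_avx10_512(crypt_key, src, dst, len, tweak);
//	/* If len was a multiple of 16, 'tweak' now holds the next tweak and a
//	 * later call can continue the same message where this one left off. */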

.set	VL, 16
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
.set	VL, 32
.set	USE_AVX10, 0
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)

.set	VL, 32
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)

.set	VL, 64
.set	USE_AVX10, 1
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
	_aes_xts_crypt	1
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
	_aes_xts_crypt	0
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */