linux/arch/x86/crypto/camellia-x86_64-asm_64.S

/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables used by the round macros. They are declared here as
 * external symbols and are accessed with a byte-sized index scaled by 8
 * (see xor2ror16), i.e. as arrays of 8-byte entries.
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the external table symbols, used by the round macros */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

/* Byte offset of key_length inside the context (key_table comes first) */
#define CAMELLIA_TABLE_BYTE_LEN 272

/*
 * struct camellia_ctx, as byte offsets from CTX:
 *   key_table  - subkeys, read as 8-byte words
 *   key_length - compared against 16 to select the short or long schedule
 */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * register macros
 *
 * CTX is the context pointer (first argument). RIO is the pointer that the
 * inpack and outunpack macros read from or write to. RIO shares %rsi with
 * RT0, so its value does not survive the round macros.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/* Cipher state: RAB0 and RCD0 hold block 0, RAB1 and RCD1 hold block 1 */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

/* 32-bit views of the state registers, selected by pasting a d suffix */
#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

/* Lowest byte of each state register, selected by pasting a bl suffix */
#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

/* Second-lowest byte of each state register, selected by a bh suffix */
#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/*
 * Scratch registers. RT0 aliases RIO. RT1 is %rbp, so every function in
 * this file copies the incoming %rbp to RRBP first and restores it before
 * returning.
 * NOTE(review): while RT1 is live %rbp does not hold a frame pointer, which
 * presumably confuses frame-pointer based unwinders - confirm whether that
 * matters for this build.
 */
#define RT0 %rsi
#define RT1 %rbp
#define RT2 %r8

#define RT0d %esi
#define RT1d %ebp
#define RT2d %r8d

#define RT2bl %r8b

/*
 * RXOR - the xor flag in the encrypt functions. The decrypt functions reuse
 *        it as a temporary (and the 2-way decrypt parks %rbx in it)
 * RRBP - copy of the incoming %rbp while RT1 is in use
 * RDST - copy of the dst pointer while %rsi serves as RIO and RT0
 */
#define RXOR %r9
#define RRBP %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16 - consume the two lowest bytes of ab as table indices.
 *
 *   T0, T1     - table symbols, indexed with an 8-byte scale
 *   tmp1, tmp2 - scratch registers, written through their 32-bit d views
 *   ab         - source register, rotated right by 16 bits as a side effect
 *   dst        - register that receives T0[low byte] xor T1[second byte]
 *
 * Both index bytes are extracted before ab is rotated, so the lookups use
 * the bytes ab held on entry. Clobbers tmp1, tmp2 and the flags.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $16,			ab; \
	xorq T0(, tmp2, 8),		dst; \
	xorq T1(, tmp1, 8),		dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/
/*
 * roundsm - one round on block 0.
 *
 *   ab     - register pair prefix whose block 0 register feeds the lookups
 *   subkey - index of the 8-byte subkey in key_table
 *   cd     - register pair prefix whose block 0 register is updated
 *
 * The subkey is loaded into RT2. The four xor2ror16 steps walk through all
 * eight bytes of ab0 and accumulate the table values alternately into cd0
 * and RT2, and RT2 is folded into cd0 at the end. The net effect is
 * cd0 ^= subkey ^ (xor of the eight table entries). ab0 is rotated by
 * 4 * 16 = 64 bits in total, so it holds its original value afterwards.
 * Clobbers RT0, RT1, RT2 and the flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;

/*
 * fls - key-dependent mixing layer applied to block 0 between groups of
 * rounds (presumably the Camellia FL and inverse FL functions - confirm
 * against the specification).
 *
 *   l, r   - register pair prefixes, the block 0 registers are updated
 *   kl, kr - indices of the 8-byte subkeys in key_table
 *
 * With lo and hi naming the low and high 32-bit halves, the four steps are,
 * in order, each one using the result of the previous:
 *   l.hi ^= rol32(l.lo & kl.lo, 1)
 *   r.lo ^= r.hi | kr.hi
 *   l.lo ^= l.hi | kl.hi
 *   r.hi ^= rol32(r.lo & kr.lo, 1)
 * Clobbers RT0, RT1, RT2 and the flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;

/*
 * enc_rounds - six consecutive rounds on block 0 using subkeys i + 2 up to
 * i + 7 in ascending order. The roles of RAB and RCD swap on every round.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* enc_fls - fls layer for encryption: kl is subkey i, kr is subkey i + 1 */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack - load one 16-byte block from RIO into RAB0 and RCD0.
 *
 * Each 8-byte word is byte-swapped and then rotated by 32 bits, which swaps
 * its 32-bit halves. The first word is additionally xored with the subkey
 * at key_table offset 0. RCD0 receives no subkey here.
 */
#define enc_inpack() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX),		RAB0;

/*
 * enc_outunpack - write block 0 back to the 16 bytes at RIO.
 *
 *   op  - instruction stem, pasted with a q suffix. The callers in this
 *         file pass mov (store) or xor (xor into the destination)
 *   max - register holding the index of the final subkey, scaled by 8
 *
 * RCD0 is xored with key_table[max] and goes to offset 0, RAB0 goes to
 * offset 8, so the two words leave in the opposite order to enc_inpack.
 * Both are rotated by 32 bits and byte-swapped before being written.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO);

/*
 * dec_rounds - six consecutive rounds on block 0 using subkeys i + 7 down
 * to i + 2, i.e. the reverse subkey order of enc_rounds.
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/*
 * dec_fls - fls layer for decryption: kl is subkey i + 1, kr is subkey i,
 * the two subkeys swapped relative to enc_fls
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack - load one 16-byte block from RIO into RAB0 and RCD0.
 *
 *   max - register holding the index of the last subkey, scaled by 8
 *
 * Same byte-swap and half-swap as enc_inpack, but the first word is xored
 * with key_table[max] instead of the subkey at offset 0.
 */
#define dec_inpack(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rolq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rorq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0;

/*
 * dec_outunpack - store block 0 to the 16 bytes at RIO.
 *
 * RCD0 is xored with the subkey at key_table offset 0 and stored at offset
 * 0, RAB0 is stored at offset 8. Both are rotated by 32 bits and
 * byte-swapped first. Unlike enc_outunpack this always stores with mov.
 */
#define dec_outunpack() \
	xorq key_table(CTX),		RCD0; \
	rorq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rolq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO);

/*
 * __camellia_enc_blk - encrypt one 16-byte block.
 *
 * Runs three groups of six rounds separated by fls layers. When key_length
 * is not 16 a fourth fls layer and round group follow. The result is either
 * stored to dst or xored into dst, depending on the xor argument.
 *
 * Clobbers %rax, %rcx, %rsi, %r8, %r9, %r10, %r11 and the flags. %rbp is
 * used as scratch (RT1) and restored from RRBP before returning.
 * NOTE(review): %rbp is not a valid frame pointer while this runs - confirm
 * that is acceptable for the unwinder in use.
 */
.global __camellia_enc_blk;
.type   __camellia_enc_blk,@function;

__camellia_enc_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %rbp, RRBP;

	/* %rcx becomes RCD0 and %rsi becomes RIO and RT0, so move both away */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/* 32-bit compare, matching the decrypt functions in this file */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	/* movq leaves the flags alone, so jnz still sees the testb result */
	testb RXORbl, RXORbl;
	movq RDST, RIO;

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RRBP, %rbp;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RRBP, %rbp;
	ret;
.size __camellia_enc_blk,.-__camellia_enc_blk;

/*
 * camellia_dec_blk - decrypt one 16-byte block.
 *
 * max (the index of the last subkey) is 24 when key_length is 16 and 32
 * otherwise. In the latter case an extra round group and fls layer run
 * first. The result is always stored to dst.
 *
 * Clobbers %rax, %rcx, %rsi, %r8, %r9, %r10, %r11 and the flags. %rbp is
 * used as scratch (RT1) and restored from RRBP before returning.
 * NOTE(review): %rbp is not a valid frame pointer while this runs - confirm
 * that is acceptable for the unwinder in use.
 */
.global camellia_dec_blk;
.type   camellia_dec_blk,@function;

camellia_dec_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* RXOR is only a temporary here, it supplies the cmov source operand */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbp, RRBP;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 is overwritten by the round macros, so test max before them */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RRBP, %rbp;
	ret;
.size camellia_dec_blk,.-camellia_dec_blk;

/**********************************************************************
  2-way camellia
 **********************************************************************/
/*
 * roundsm2 - one round on two blocks at once.
 *
 *   ab     - register pair prefix whose registers feed the lookups
 *   subkey - index of the 8-byte subkey in key_table
 *   cd     - register pair prefix whose registers are updated
 *
 * Block 0 follows the same scheme as roundsm: table values accumulate
 * alternately into cd0 and RT2, and RT2 (which started as the subkey) is
 * folded into cd0. For block 1 the subkey is xored into cd1 up front and
 * all four lookup pairs go straight into cd1. The fold of RT2 into cd0 is
 * placed after the first block 1 lookup pair, and the block 1 lines carry
 * an extra indent to tell the two streams apart.
 * Both ab registers are rotated by 64 bits in total, so they are unchanged
 * on exit. Clobbers RT0, RT1, RT2 and the flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX),	RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2 - the fls mixing layer applied to two blocks at once.
 *
 *   l, r   - register pair prefixes, registers of both blocks are updated
 *   kl, kr - indices of the 8-byte subkeys in key_table
 *
 * Each block goes through the same four dependent steps as in fls:
 *   l.hi ^= rol32(l.lo & kl.lo, 1)
 *   r.lo ^= r.hi | kr.hi
 *   l.lo ^= l.hi | kl.hi
 *   r.hi ^= rol32(r.lo & kr.lo, 1)
 * The first two steps are done for block 0, then for block 1 (extra
 * indent), then the last two steps for block 0, then for block 1. The
 * scratch registers rotate between RT0, RT1 and RT2 from group to group.
 * Clobbers RT0, RT1, RT2 and the flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
		movl (key_table + ((kl) * 2) * 4)(CTX),		RT2d; \
		andl l ## 1d,					RT2d; \
		roll $1,					RT2d; \
		shlq $32,					RT2; \
		xorq RT2,					l ## 1; \
		movq (key_table + ((kr) * 2) * 4)(CTX),		RT0; \
		orq r ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
		movq (key_table + ((kl) * 2) * 4)(CTX),		RT0; \
		orq l ## 1,					RT0; \
		shrq $32,					RT0; \
		xorq RT0,					l ## 1; \
		movl (key_table + ((kr) * 2) * 4)(CTX),		RT1d; \
		andl r ## 1d,					RT1d; \
		roll $1,					RT1d; \
		shlq $32,					RT1; \
		xorq RT1,					r ## 1;

/*
 * enc_rounds2 - six consecutive two-block rounds using subkeys i + 2 up to
 * i + 7 in ascending order. The roles of RAB and RCD swap on every round.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* enc_fls2 - two-block fls layer for encryption: kl is i, kr is i + 1 */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack2 - load two consecutive 16-byte blocks (32 bytes) from RIO.
 *
 * Block 0 goes to RAB0 and RCD0 from offsets 0 and 8, block 1 to RAB1 and
 * RCD1 from offsets 16 and 24. Every word is byte-swapped and rotated by
 * 32 bits. Both RAB registers are xored with the subkey at key_table
 * offset 0. Loading RCD1 overwrites %rdx, the incoming src argument.
 */
#define enc_inpack2() \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX),		RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX),		RAB1;

/*
 * enc_outunpack2 - write two blocks back to the 32 bytes at RIO.
 *
 *   op  - instruction stem, pasted with a q suffix. The callers in this
 *         file pass mov (store) or xor (xor into the destination)
 *   max - register holding the index of the final subkey, scaled by 8
 *
 * For each block the RCD register is xored with key_table[max] and written
 * first, followed by the RAB register. Block 0 uses offsets 0 and 8, block
 * 1 uses offsets 16 and 24. All words are rotated by 32 bits and
 * byte-swapped before being written.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),	RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	op ## q RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	op ## q RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX, max, 8),	RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		op ## q RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		op ## q RAB1,			12*2(RIO);

/*
 * dec_rounds2 - six consecutive two-block rounds using subkeys i + 7 down
 * to i + 2, i.e. the reverse subkey order of enc_rounds2.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/*
 * dec_fls2 - two-block fls layer for decryption: kl is subkey i + 1, kr is
 * subkey i, the two subkeys swapped relative to enc_fls2
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack2 - load two consecutive 16-byte blocks (32 bytes) from RIO.
 *
 *   max - register holding the index of the last subkey, scaled by 8
 *
 * Same layout and byte handling as enc_inpack2, but both RAB registers are
 * xored with key_table[max] instead of the subkey at offset 0. Loading
 * RCD1 overwrites %rdx, the incoming src argument.
 */
#define dec_inpack2(max) \
	movq (RIO),			RAB0; \
	bswapq				RAB0; \
	rorq $32,			RAB0; \
	movq 4*2(RIO),			RCD0; \
	bswapq				RCD0; \
	rolq $32,			RCD0; \
	xorq key_table(CTX, max, 8),	RAB0; \
	\
		movq 8*2(RIO),			RAB1; \
		bswapq				RAB1; \
		rorq $32,			RAB1; \
		movq 12*2(RIO),			RCD1; \
		bswapq				RCD1; \
		rolq $32,			RCD1; \
		xorq key_table(CTX, max, 8),	RAB1;

/*
 * dec_outunpack2 - store two blocks to the 32 bytes at RIO.
 *
 * For each block the RCD register is xored with the subkey at key_table
 * offset 0 and stored first, followed by the RAB register. Block 0 uses
 * offsets 0 and 8, block 1 uses offsets 16 and 24. All words are rotated
 * by 32 bits and byte-swapped before the store.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),		RCD0; \
	rolq $32,			RCD0; \
	bswapq				RCD0; \
	movq RCD0,			(RIO); \
	rorq $32,			RAB0; \
	bswapq				RAB0; \
	movq RAB0,			4*2(RIO); \
	\
		xorq key_table(CTX),		RCD1; \
		rolq $32,			RCD1; \
		bswapq				RCD1; \
		movq RCD1,			8*2(RIO); \
		rorq $32,			RAB1; \
		bswapq				RAB1; \
		movq RAB1,			12*2(RIO);

/*
 * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src and writes 32 bytes to dst. The two blocks are
 * processed in interleaved fashion by the 2-way macros. When key_length is
 * not 16 a fourth fls layer and round group follow the first three. The
 * result is either stored to dst or xored into dst, depending on the xor
 * argument.
 *
 * Clobbers %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11 and the flags.
 * %rbx (RAB1) is saved on the stack. %rbp is used as scratch (RT1) and
 * restored from RRBP before returning.
 * NOTE(review): %rbp is not a valid frame pointer while this runs - confirm
 * that is acceptable for the unwinder in use.
 */
.global __camellia_enc_blk_2way;
.type   __camellia_enc_blk_2way,@function;

__camellia_enc_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;

	/* %rcx becomes RCD0 and %rsi becomes RIO and RT0, so move both away */
	movq %rbp, RRBP;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/* 32-bit compare, matching the decrypt functions in this file */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	/* movq leaves the flags alone, so jnz still sees the testb result */
	testb RXORbl, RXORbl;
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;
.size __camellia_enc_blk_2way,.-__camellia_enc_blk_2way;

/*
 * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src and stores 32 bytes to dst. max (the index of the
 * last subkey) is 24 when key_length is 16 and 32 otherwise. In the latter
 * case an extra round group and fls layer run first.
 *
 * Clobbers %rax, %rcx, %rdx, %rsi, %r8, %r9, %r10, %r11 and the flags.
 * %rbx (RAB1) is parked in RXOR and %rbp (RT1) in RRBP. Both are restored
 * before returning.
 * NOTE(review): %rbp is not a valid frame pointer while this runs - confirm
 * that is acceptable for the unwinder in use.
 */
.global camellia_dec_blk_2way;
.type   camellia_dec_blk_2way,@function;

camellia_dec_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* RXOR first supplies the cmov source operand, then holds %rbx */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %rbx, RXOR;
	movq %rbp, RRBP;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 is overwritten by the round macros, so test max before them */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RRBP, %rbp;
	movq RXOR, %rbx;
	ret;
.size camellia_dec_blk_2way,.-camellia_dec_blk_2way;
crypto: camellia - add assembler implementation for x86_64 Patch adds x86_64 assembler implementation of Camellia block cipher. Two set of functions are provided. First set is regular 'one-block at time' encrypt/decrypt functions. Second is 'two-block at time' functions that gain performance increase on out-of-order CPUs. Performance of 2-way functions should be equal to 1-way functions with in-order CPUs. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: AMD Phenom II 1055T (fam:16, model:10): camellia-asm vs camellia_generic: 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.27x 1.22x 1.30x 1.42x 1.30x 1.34x 1.19x 1.05x 1.23x 1.24x 64B 1.74x 1.79x 1.43x 1.87x 1.81x 1.87x 1.48x 1.38x 1.55x 1.62x 256B 1.90x 1.87x 1.43x 1.94x 1.94x 1.95x 1.63x 1.62x 1.67x 1.70x 1024B 1.96x 1.93x 1.43x 1.95x 1.98x 2.01x 1.67x 1.69x 1.74x 1.80x 8192B 1.96x 1.96x 1.39x 1.93x 2.01x 2.03x 1.72x 1.64x 1.71x 1.76x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.23x 1.23x 1.33x 1.39x 1.34x 1.38x 1.04x 1.18x 1.21x 1.29x 64B 1.72x 1.69x 1.42x 1.78x 1.81x 1.89x 1.57x 1.52x 1.56x 1.65x 256B 1.85x 1.88x 1.42x 1.86x 1.93x 1.96x 1.69x 1.65x 1.70x 1.75x 1024B 1.88x 1.86x 1.45x 1.95x 1.96x 1.95x 1.77x 1.71x 1.77x 1.78x 8192B 1.91x 1.86x 1.42x 1.91x 2.03x 1.98x 1.73x 1.71x 1.78x 1.76x camellia-asm vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.15x 1.22x ecb-dec 1.16x 1.16x cbc-enc 0.85x 0.90x cbc-dec 1.20x 1.23x ctr-enc 1.28x 1.30x ctr-dec 1.27x 1.28x lrw-enc 1.12x 1.16x lrw-dec 1.08x 1.10x xts-enc 1.11x 1.15x xts-dec 1.14x 1.15x Intel Core2 T8100 (fam:6, model:23, step:6): camellia-asm vs camellia_generic: 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.10x 1.12x 1.14x 1.16x 1.16x 1.15x 1.02x 1.02x 1.08x 1.08x 64B 1.61x 
1.60x 1.17x 1.68x 1.67x 1.66x 1.43x 1.42x 1.44x 1.42x 256B 1.65x 1.73x 1.17x 1.77x 1.81x 1.80x 1.54x 1.53x 1.58x 1.54x 1024B 1.76x 1.74x 1.18x 1.80x 1.85x 1.85x 1.60x 1.59x 1.65x 1.60x 8192B 1.77x 1.75x 1.19x 1.81x 1.85x 1.86x 1.63x 1.61x 1.66x 1.62x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 1.10x 1.07x 1.13x 1.16x 1.11x 1.16x 1.03x 1.02x 1.08x 1.07x 64B 1.61x 1.62x 1.15x 1.66x 1.63x 1.68x 1.47x 1.46x 1.47x 1.44x 256B 1.71x 1.70x 1.16x 1.75x 1.69x 1.79x 1.58x 1.57x 1.59x 1.55x 1024B 1.78x 1.72x 1.17x 1.75x 1.80x 1.80x 1.63x 1.62x 1.65x 1.62x 8192B 1.76x 1.73x 1.17x 1.78x 1.80x 1.81x 1.64x 1.62x 1.68x 1.64x camellia-asm vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.17x 1.21x ecb-dec 1.17x 1.20x cbc-enc 0.80x 0.82x cbc-dec 1.22x 1.24x ctr-enc 1.25x 1.26x ctr-dec 1.25x 1.26x lrw-enc 1.14x 1.18x lrw-dec 1.13x 1.17x xts-enc 1.14x 1.18x xts-dec 1.14x 1.17x Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2012-03-05 18:26:47 +00:00			`/*`
			`* Camellia Cipher Algorithm (x86_64)`
			`*`
			`* Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>`
			`*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program; if not, write to the Free Software`
			`* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307`
			`* USA`
			`*`
			`*/`

			`.file "camellia-x86_64-asm_64.S"`
			`.text`

			`.extern camellia_sp10011110;`
			`.extern camellia_sp22000222;`
			`.extern camellia_sp03303033;`
			`.extern camellia_sp00444404;`
			`.extern camellia_sp02220222;`
			`.extern camellia_sp30333033;`
			`.extern camellia_sp44044404;`
			`.extern camellia_sp11101110;`

			`#define sp10011110 camellia_sp10011110`
			`#define sp22000222 camellia_sp22000222`
			`#define sp03303033 camellia_sp03303033`
			`#define sp00444404 camellia_sp00444404`
			`#define sp02220222 camellia_sp02220222`
			`#define sp30333033 camellia_sp30333033`
			`#define sp44044404 camellia_sp44044404`
			`#define sp11101110 camellia_sp11101110`

			`#define CAMELLIA_TABLE_BYTE_LEN 272`

			`/* struct camellia_ctx: */`
			`#define key_table 0`
			`#define key_length CAMELLIA_TABLE_BYTE_LEN`

			`/* register macros */`
			`#define CTX %rdi`
			`#define RIO %rsi`
			`#define RIOd %esi`

			`#define RAB0 %rax`
			`#define RCD0 %rcx`
			`#define RAB1 %rbx`
			`#define RCD1 %rdx`

			`#define RAB0d %eax`
			`#define RCD0d %ecx`
			`#define RAB1d %ebx`
			`#define RCD1d %edx`

			`#define RAB0bl %al`
			`#define RCD0bl %cl`
			`#define RAB1bl %bl`
			`#define RCD1bl %dl`

			`#define RAB0bh %ah`
			`#define RCD0bh %ch`
			`#define RAB1bh %bh`
			`#define RCD1bh %dh`

			`#define RT0 %rsi`
			`#define RT1 %rbp`
			`#define RT2 %r8`

			`#define RT0d %esi`
			`#define RT1d %ebp`
			`#define RT2d %r8d`

			`#define RT2bl %r8b`

			`#define RXOR %r9`
			`#define RRBP %r10`
			`#define RDST %r11`

			`#define RXORd %r9d`
			`#define RXORbl %r9b`

			`#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \`
			`movzbl ab ## bl, tmp2 ## d; \`
			`movzbl ab ## bh, tmp1 ## d; \`
			`rorq $16, ab; \`
			`xorq T0(, tmp2, 8), dst; \`
			`xorq T1(, tmp1, 8), dst;`

			`/**********************************************************************`
			`1-way camellia`
			`**********************************************************************/`
			`#define roundsm(ab, subkey, cd) \`
			`movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \`
			`\`
			`xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \`
			`xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \`
			`xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \`
			`xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \`
			`\`
			`xorq RT2, cd ## 0;`

			`#define fls(l, r, kl, kr) \`
			`movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \`
			`andl l ## 0d, RT0d; \`
			`roll $1, RT0d; \`
			`shlq $32, RT0; \`
			`xorq RT0, l ## 0; \`
			`movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \`
			`orq r ## 0, RT1; \`
			`shrq $32, RT1; \`
			`xorq RT1, r ## 0; \`
			`\`
			`movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \`
			`orq l ## 0, RT2; \`
			`shrq $32, RT2; \`
			`xorq RT2, l ## 0; \`
			`movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \`
			`andl r ## 0d, RT0d; \`
			`roll $1, RT0d; \`
			`shlq $32, RT0; \`
			`xorq RT0, r ## 0;`

			`#define enc_rounds(i) \`
			`roundsm(RAB, i + 2, RCD); \`
			`roundsm(RCD, i + 3, RAB); \`
			`roundsm(RAB, i + 4, RCD); \`
			`roundsm(RCD, i + 5, RAB); \`
			`roundsm(RAB, i + 6, RCD); \`
			`roundsm(RCD, i + 7, RAB);`

			`#define enc_fls(i) \`
			`fls(RAB, RCD, i + 0, i + 1);`

			`#define enc_inpack() \`
			`movq (RIO), RAB0; \`
			`bswapq RAB0; \`
			`rolq $32, RAB0; \`
			`movq 4*2(RIO), RCD0; \`
			`bswapq RCD0; \`
			`rorq $32, RCD0; \`
			`xorq key_table(CTX), RAB0;`

			`#define enc_outunpack(op, max) \`
			`xorq key_table(CTX, max, 8), RCD0; \`
			`rorq $32, RCD0; \`
			`bswapq RCD0; \`
			`op ## q RCD0, (RIO); \`
			`rolq $32, RAB0; \`
			`bswapq RAB0; \`
			`op ## q RAB0, 4*2(RIO);`

			`#define dec_rounds(i) \`
			`roundsm(RAB, i + 7, RCD); \`
			`roundsm(RCD, i + 6, RAB); \`
			`roundsm(RAB, i + 5, RCD); \`
			`roundsm(RCD, i + 4, RAB); \`
			`roundsm(RAB, i + 3, RCD); \`
			`roundsm(RCD, i + 2, RAB);`

			`#define dec_fls(i) \`
			`fls(RAB, RCD, i + 1, i + 0);`

			`#define dec_inpack(max) \`
			`movq (RIO), RAB0; \`
			`bswapq RAB0; \`
			`rolq $32, RAB0; \`
			`movq 4*2(RIO), RCD0; \`
			`bswapq RCD0; \`
			`rorq $32, RCD0; \`
			`xorq key_table(CTX, max, 8), RAB0;`

			`#define dec_outunpack() \`
			`xorq key_table(CTX), RCD0; \`
			`rorq $32, RCD0; \`
			`bswapq RCD0; \`
			`movq RCD0, (RIO); \`
			`rolq $32, RAB0; \`
			`bswapq RAB0; \`
			`movq RAB0, 4*2(RIO);`

			`.global __camellia_enc_blk;`
			`.type __camellia_enc_blk,@function;`

			`__camellia_enc_blk:`
			`/* input:`
			`* %rdi: ctx, CTX`
			`* %rsi: dst`
			`* %rdx: src`
			`* %rcx: bool xor`
			`*/`
			`movq %rbp, RRBP;`

			`movq %rcx, RXOR;`
			`movq %rsi, RDST;`
			`movq %rdx, RIO;`

			`enc_inpack();`

			`enc_rounds(0);`
			`enc_fls(8);`
			`enc_rounds(8);`
			`enc_fls(16);`
			`enc_rounds(16);`
			`movl $24, RT1d; /* max */`

			`cmpb $16, key_length(CTX);`
			`je __enc_done;`

			`enc_fls(24);`
			`enc_rounds(24);`
			`movl $32, RT1d; /* max */`

			`__enc_done:`
			`testb RXORbl, RXORbl;`
			`movq RDST, RIO;`

			`jnz __enc_xor;`

			`enc_outunpack(mov, RT1);`

			`movq RRBP, %rbp;`
			`ret;`

			`__enc_xor:`
			`enc_outunpack(xor, RT1);`

			`movq RRBP, %rbp;`
			`ret;`

			`.global camellia_dec_blk;`
			`.type camellia_dec_blk,@function;`

			`camellia_dec_blk:`
			`/* input:`
			`* %rdi: ctx, CTX`
			`* %rsi: dst`
			`* %rdx: src`
			`*/`
			`cmpl $16, key_length(CTX);`
			`movl $32, RT2d;`
			`movl $24, RXORd;`
			`cmovel RXORd, RT2d; /* max */`

			`movq %rbp, RRBP;`
			`movq %rsi, RDST;`
			`movq %rdx, RIO;`

			`dec_inpack(RT2);`

			`cmpb $24, RT2bl;`
			`je __dec_rounds16;`

			`dec_rounds(24);`
			`dec_fls(24);`

			`__dec_rounds16:`
			`dec_rounds(16);`
			`dec_fls(16);`
			`dec_rounds(8);`
			`dec_fls(8);`
			`dec_rounds(0);`

			`movq RDST, RIO;`

			`dec_outunpack();`

			`movq RRBP, %rbp;`
			`ret;`

			`/**********************************************************************`
			`2-way camellia`
			`**********************************************************************/`
			`#define roundsm2(ab, subkey, cd) \`
			`movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \`
			`xorq RT2, cd ## 1; \`
			`\`
			`xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \`
			`xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \`
			`xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \`
			`xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \`
			`\`
			`xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \`
			`xorq RT2, cd ## 0; \`
			`xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \`
			`xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \`
			`xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);`

			`#define fls2(l, r, kl, kr) \`
			`movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \`
			`andl l ## 0d, RT0d; \`
			`roll $1, RT0d; \`
			`shlq $32, RT0; \`
			`xorq RT0, l ## 0; \`
			`movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \`
			`orq r ## 0, RT1; \`
			`shrq $32, RT1; \`
			`xorq RT1, r ## 0; \`
			`\`
			`movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \`
			`andl l ## 1d, RT2d; \`
			`roll $1, RT2d; \`
			`shlq $32, RT2; \`
			`xorq RT2, l ## 1; \`
			`movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \`
			`orq r ## 1, RT0; \`
			`shrq $32, RT0; \`
			`xorq RT0, r ## 1; \`
			`\`
			`movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \`
			`orq l ## 0, RT1; \`
			`shrq $32, RT1; \`
			`xorq RT1, l ## 0; \`
			`movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \`
			`andl r ## 0d, RT2d; \`
			`roll $1, RT2d; \`
			`shlq $32, RT2; \`
			`xorq RT2, r ## 0; \`
			`\`
			`movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \`
			`orq l ## 1, RT0; \`
			`shrq $32, RT0; \`
			`xorq RT0, l ## 1; \`
			`movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \`
			`andl r ## 1d, RT1d; \`
			`roll $1, RT1d; \`
			`shlq $32, RT1; \`
			`xorq RT1, r ## 1;`

			`#define enc_rounds2(i) \`
			`roundsm2(RAB, i + 2, RCD); \`
			`roundsm2(RCD, i + 3, RAB); \`
			`roundsm2(RAB, i + 4, RCD); \`
			`roundsm2(RCD, i + 5, RAB); \`
			`roundsm2(RAB, i + 6, RCD); \`
			`roundsm2(RCD, i + 7, RAB);`

			`#define enc_fls2(i) \`
			`fls2(RAB, RCD, i + 0, i + 1);`

			`#define enc_inpack2() \`
			`movq (RIO), RAB0; \`
			`bswapq RAB0; \`
			`rorq $32, RAB0; \`
			`movq 4*2(RIO), RCD0; \`
			`bswapq RCD0; \`
			`rolq $32, RCD0; \`
			`xorq key_table(CTX), RAB0; \`
			`\`
			`movq 8*2(RIO), RAB1; \`
			`bswapq RAB1; \`
			`rorq $32, RAB1; \`
			`movq 12*2(RIO), RCD1; \`
			`bswapq RCD1; \`
			`rolq $32, RCD1; \`
			`xorq key_table(CTX), RAB1;`

			`#define enc_outunpack2(op, max) \`
			`xorq key_table(CTX, max, 8), RCD0; \`
			`rolq $32, RCD0; \`
			`bswapq RCD0; \`
			`op ## q RCD0, (RIO); \`
			`rorq $32, RAB0; \`
			`bswapq RAB0; \`
			`op ## q RAB0, 4*2(RIO); \`
			`\`
			`xorq key_table(CTX, max, 8), RCD1; \`
			`rolq $32, RCD1; \`
			`bswapq RCD1; \`
			`op ## q RCD1, 8*2(RIO); \`
			`rorq $32, RAB1; \`
			`bswapq RAB1; \`
			`op ## q RAB1, 12*2(RIO);`

			`#define dec_rounds2(i) \`
			`roundsm2(RAB, i + 7, RCD); \`
			`roundsm2(RCD, i + 6, RAB); \`
			`roundsm2(RAB, i + 5, RCD); \`
			`roundsm2(RCD, i + 4, RAB); \`
			`roundsm2(RAB, i + 3, RCD); \`
			`roundsm2(RCD, i + 2, RAB);`

			`#define dec_fls2(i) \`
			`fls2(RAB, RCD, i + 1, i + 0);`

			`#define dec_inpack2(max) \`
			`movq (RIO), RAB0; \`
			`bswapq RAB0; \`
			`rorq $32, RAB0; \`
			`movq 4*2(RIO), RCD0; \`
			`bswapq RCD0; \`
			`rolq $32, RCD0; \`
			`xorq key_table(CTX, max, 8), RAB0; \`
			`\`
			`movq 8*2(RIO), RAB1; \`
			`bswapq RAB1; \`
			`rorq $32, RAB1; \`
			`movq 12*2(RIO), RCD1; \`
			`bswapq RCD1; \`
			`rolq $32, RCD1; \`
			`xorq key_table(CTX, max, 8), RAB1;`

			`#define dec_outunpack2() \`
			`xorq key_table(CTX), RCD0; \`
			`rolq $32, RCD0; \`
			`bswapq RCD0; \`
			`movq RCD0, (RIO); \`
			`rorq $32, RAB0; \`
			`bswapq RAB0; \`
			`movq RAB0, 4*2(RIO); \`
			`\`
			`xorq key_table(CTX), RCD1; \`
			`rolq $32, RCD1; \`
			`bswapq RCD1; \`
			`movq RCD1, 8*2(RIO); \`
			`rorq $32, RAB1; \`
			`bswapq RAB1; \`
			`movq RAB1, 12*2(RIO);`

			`.global __camellia_enc_blk_2way;`
			`.type __camellia_enc_blk_2way,@function;`

			`__camellia_enc_blk_2way:`
			`/* input:`
			`* %rdi: ctx, CTX`
			`* %rsi: dst`
			`* %rdx: src`
			`* %rcx: bool xor`
			`*/`
			`pushq %rbx;`

			`movq %rbp, RRBP;`
			`movq %rcx, RXOR;`
			`movq %rsi, RDST;`
			`movq %rdx, RIO;`

			`enc_inpack2();`

			`enc_rounds2(0);`
			`enc_fls2(8);`
			`enc_rounds2(8);`
			`enc_fls2(16);`
			`enc_rounds2(16);`
			`movl $24, RT2d; /* max */`

			`cmpb $16, key_length(CTX);`
			`je __enc2_done;`

			`enc_fls2(24);`
			`enc_rounds2(24);`
			`movl $32, RT2d; /* max */`

			`__enc2_done:`
			`test RXORbl, RXORbl;`
			`movq RDST, RIO;`
			`jnz __enc2_xor;`

			`enc_outunpack2(mov, RT2);`

			`movq RRBP, %rbp;`
			`popq %rbx;`
			`ret;`

			`__enc2_xor:`
			`enc_outunpack2(xor, RT2);`

			`movq RRBP, %rbp;`
			`popq %rbx;`
			`ret;`

			`.global camellia_dec_blk_2way;`
			`.type camellia_dec_blk_2way,@function;`

			`camellia_dec_blk_2way:`
			`/* input:`
			`* %rdi: ctx, CTX`
			`* %rsi: dst`
			`* %rdx: src`
			`*/`
			`cmpl $16, key_length(CTX);`
			`movl $32, RT2d;`
			`movl $24, RXORd;`
			`cmovel RXORd, RT2d; /* max */`

			`movq %rbx, RXOR;`
			`movq %rbp, RRBP;`
			`movq %rsi, RDST;`
			`movq %rdx, RIO;`

			`dec_inpack2(RT2);`

			`cmpb $24, RT2bl;`
			`je __dec2_rounds16;`

			`dec_rounds2(24);`
			`dec_fls2(24);`

			`__dec2_rounds16:`
			`dec_rounds2(16);`
			`dec_fls2(16);`
			`dec_rounds2(8);`
			`dec_fls2(8);`
			`dec_rounds2(0);`

			`movq RDST, RIO;`

			`dec_outunpack2();`

			`movq RRBP, %rbp;`
			`movq RXOR, %rbx;`
			`ret;`