/* sm4-armv8-aarch64-ce.S - ARMv8/AArch64/CE accelerated SM4 cipher
 *
 * Copyright (C) 2022 Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "asm-common-aarch64.h"

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && \
    defined(USE_SM4)

.cpu generic+simd+crypto

#define vecnum_v0 0
#define vecnum_v1 1
#define vecnum_v2 2
#define vecnum_v3 3
#define vecnum_v4 4
#define vecnum_v5 5
#define vecnum_v6 6
#define vecnum_v7 7
#define vecnum_v16 16
#define vecnum_v24 24
#define vecnum_v25 25
#define vecnum_v26 26
#define vecnum_v27 27
#define vecnum_v28 28
#define vecnum_v29 29
#define vecnum_v30 30
#define vecnum_v31 31

#define sm4e(vd, vn) \
    .inst (0xcec08400 | (vecnum_##vn << 5) | vecnum_##vd)

#define sm4ekey(vd, vn, vm) \
    .inst (0xce60c800 | (vecnum_##vm << 16) | (vecnum_##vn << 5) | vecnum_##vd)
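
/* sm4e/sm4ekey emit the raw encodings of the ARMv8 Crypto Extension SM4E and
 * SM4EKEY instructions via .inst, presumably so the file also assembles with
 * toolchains that lack the SM4 mnemonics; the vecnum_* defines map the vector
 * register names used with these macros onto the Vd/Vn/Vm bit fields of the
 * encodings.  SM4E runs four SM4 rounds on the data in vd using the four
 * round keys in vn; SM4EKEY derives the next four round keys from the
 * previous four (vn) and four CK constants (vm). */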

.text

/* Register macros */

#define RTMP0 v16
#define RTMP1 v17
#define RTMP2 v18
#define RTMP3 v19

#define RIV v20
#define RMASK v21

/* Helper macros. */

#define load_rkey(ptr) \
    ld1 {v24.16b-v27.16b}, [ptr], #64; \
    ld1 {v28.16b-v31.16b}, [ptr];

#define SM4_CRYPT_BLK(b0) \
    rev32 b0.16b, b0.16b; \
    sm4e(b0, v24); \
    sm4e(b0, v25); \
    sm4e(b0, v26); \
    sm4e(b0, v27); \
    sm4e(b0, v28); \
    sm4e(b0, v29); \
    sm4e(b0, v30); \
    sm4e(b0, v31); \
    rev64 b0.4s, b0.4s; \
    ext b0.16b, b0.16b, b0.16b, #8; \
    rev32 b0.16b, b0.16b;
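
/* SM4_CRYPT_BLK: the leading rev32 byte-swaps each 32-bit word so the
 * big-endian input maps onto the native word layout SM4E expects, the eight
 * SM4E instructions run the 32 rounds (four per instruction), and the
 * trailing rev64/ext/rev32 sequence reverses the order of the four state
 * words (SM4's final reverse transform) and restores big-endian byte order
 * for storing. */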

#define crypt_blk4(b0, b1, b2, b3) \
    rev32 b0.16b, b0.16b; \
    rev32 b1.16b, b1.16b; \
    rev32 b2.16b, b2.16b; \
    rev32 b3.16b, b3.16b; \
    sm4e(b0, v24); \
    sm4e(b1, v24); \
    sm4e(b2, v24); \
    sm4e(b3, v24); \
    sm4e(b0, v25); \
    sm4e(b1, v25); \
    sm4e(b2, v25); \
    sm4e(b3, v25); \
    sm4e(b0, v26); \
    sm4e(b1, v26); \
    sm4e(b2, v26); \
    sm4e(b3, v26); \
    sm4e(b0, v27); \
    sm4e(b1, v27); \
    sm4e(b2, v27); \
    sm4e(b3, v27); \
    sm4e(b0, v28); \
    sm4e(b1, v28); \
    sm4e(b2, v28); \
    sm4e(b3, v28); \
    sm4e(b0, v29); \
    sm4e(b1, v29); \
    sm4e(b2, v29); \
    sm4e(b3, v29); \
    sm4e(b0, v30); \
    sm4e(b1, v30); \
    sm4e(b2, v30); \
    sm4e(b3, v30); \
    sm4e(b0, v31); \
    sm4e(b1, v31); \
    sm4e(b2, v31); \
    sm4e(b3, v31); \
    rev64 b0.4s, b0.4s; \
    rev64 b1.4s, b1.4s; \
    rev64 b2.4s, b2.4s; \
    rev64 b3.4s, b3.4s; \
    ext b0.16b, b0.16b, b0.16b, #8; \
    ext b1.16b, b1.16b, b1.16b, #8; \
    ext b2.16b, b2.16b, b2.16b, #8; \
    ext b3.16b, b3.16b, b3.16b, #8; \
    rev32 b0.16b, b0.16b; \
    rev32 b1.16b, b1.16b; \
    rev32 b2.16b, b2.16b; \
    rev32 b3.16b, b3.16b;

#define crypt_blk8(b0, b1, b2, b3, b4, b5, b6, b7) \
    rev32 b0.16b, b0.16b; \
    rev32 b1.16b, b1.16b; \
    rev32 b2.16b, b2.16b; \
    rev32 b3.16b, b3.16b; \
    rev32 b4.16b, b4.16b; \
    rev32 b5.16b, b5.16b; \
    rev32 b6.16b, b6.16b; \
    rev32 b7.16b, b7.16b; \
    sm4e(b0, v24); \
    sm4e(b1, v24); \
    sm4e(b2, v24); \
    sm4e(b3, v24); \
    sm4e(b4, v24); \
    sm4e(b5, v24); \
    sm4e(b6, v24); \
    sm4e(b7, v24); \
    sm4e(b0, v25); \
    sm4e(b1, v25); \
    sm4e(b2, v25); \
    sm4e(b3, v25); \
    sm4e(b4, v25); \
    sm4e(b5, v25); \
    sm4e(b6, v25); \
    sm4e(b7, v25); \
    sm4e(b0, v26); \
    sm4e(b1, v26); \
    sm4e(b2, v26); \
    sm4e(b3, v26); \
    sm4e(b4, v26); \
    sm4e(b5, v26); \
    sm4e(b6, v26); \
    sm4e(b7, v26); \
    sm4e(b0, v27); \
    sm4e(b1, v27); \
    sm4e(b2, v27); \
    sm4e(b3, v27); \
    sm4e(b4, v27); \
    sm4e(b5, v27); \
    sm4e(b6, v27); \
    sm4e(b7, v27); \
    sm4e(b0, v28); \
    sm4e(b1, v28); \
    sm4e(b2, v28); \
    sm4e(b3, v28); \
    sm4e(b4, v28); \
    sm4e(b5, v28); \
    sm4e(b6, v28); \
    sm4e(b7, v28); \
    sm4e(b0, v29); \
    sm4e(b1, v29); \
    sm4e(b2, v29); \
    sm4e(b3, v29); \
    sm4e(b4, v29); \
    sm4e(b5, v29); \
    sm4e(b6, v29); \
    sm4e(b7, v29); \
    sm4e(b0, v30); \
    sm4e(b1, v30); \
    sm4e(b2, v30); \
    sm4e(b3, v30); \
    sm4e(b4, v30); \
    sm4e(b5, v30); \
    sm4e(b6, v30); \
    sm4e(b7, v30); \
    sm4e(b0, v31); \
    sm4e(b1, v31); \
    sm4e(b2, v31); \
    sm4e(b3, v31); \
    sm4e(b4, v31); \
    sm4e(b5, v31); \
    sm4e(b6, v31); \
    sm4e(b7, v31); \
    rev64 b0.4s, b0.4s; \
    rev64 b1.4s, b1.4s; \
    rev64 b2.4s, b2.4s; \
    rev64 b3.4s, b3.4s; \
    rev64 b4.4s, b4.4s; \
    rev64 b5.4s, b5.4s; \
    rev64 b6.4s, b6.4s; \
    rev64 b7.4s, b7.4s; \
    ext b0.16b, b0.16b, b0.16b, #8; \
    ext b1.16b, b1.16b, b1.16b, #8; \
    ext b2.16b, b2.16b, b2.16b, #8; \
    ext b3.16b, b3.16b, b3.16b, #8; \
    ext b4.16b, b4.16b, b4.16b, #8; \
    ext b5.16b, b5.16b, b5.16b, #8; \
    ext b6.16b, b6.16b, b6.16b, #8; \
    ext b7.16b, b7.16b, b7.16b, #8; \
    rev32 b0.16b, b0.16b; \
    rev32 b1.16b, b1.16b; \
    rev32 b2.16b, b2.16b; \
    rev32 b3.16b, b3.16b; \
    rev32 b4.16b, b4.16b; \
    rev32 b5.16b, b5.16b; \
    rev32 b6.16b, b6.16b; \
    rev32 b7.16b, b7.16b;
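
/* crypt_blk4/crypt_blk8 apply the same round sequence as SM4_CRYPT_BLK to
 * four or eight blocks at a time, with each round's SM4E instructions
 * interleaved across the blocks, presumably so that independent operations
 * can overlap in the pipeline. */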


.align 4
.global _gcry_sm4_armv8_ce_expand_key
ELF(.type _gcry_sm4_armv8_ce_expand_key,%function;)
_gcry_sm4_armv8_ce_expand_key:
    /* input:
     *   x0: 128-bit key
     *   x1: rkey_enc
     *   x2: rkey_dec
     *   x3: fk array
     *   x4: ck array
     */
    CFI_STARTPROC();

    ld1 {v0.16b}, [x0];
    rev32 v0.16b, v0.16b;
    ld1 {v1.16b}, [x3];
    load_rkey(x4);

    /* input ^ fk */
    eor v0.16b, v0.16b, v1.16b;
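
    /* Each SM4EKEY below derives the next four round keys from the previous
     * four keys and the next four CK constants, so the chain of eight
     * instructions yields all 32 encryption round keys in v0-v7. */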

    sm4ekey(v0, v0, v24);
    sm4ekey(v1, v0, v25);
    sm4ekey(v2, v1, v26);
    sm4ekey(v3, v2, v27);
    sm4ekey(v4, v3, v28);
    sm4ekey(v5, v4, v29);
    sm4ekey(v6, v5, v30);
    sm4ekey(v7, v6, v31);

    st1 {v0.16b-v3.16b}, [x1], #64;
    st1 {v4.16b-v7.16b}, [x1];
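
    /* The decryption round keys are the encryption round keys in reverse
     * order: reverse the word order within each vector (rev64 + ext) and
     * store the vectors last to first. */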
    rev64 v7.4s, v7.4s;
    rev64 v6.4s, v6.4s;
    rev64 v5.4s, v5.4s;
    rev64 v4.4s, v4.4s;
    rev64 v3.4s, v3.4s;
    rev64 v2.4s, v2.4s;
    rev64 v1.4s, v1.4s;
    rev64 v0.4s, v0.4s;
    ext v7.16b, v7.16b, v7.16b, #8;
    ext v6.16b, v6.16b, v6.16b, #8;
    ext v5.16b, v5.16b, v5.16b, #8;
    ext v4.16b, v4.16b, v4.16b, #8;
    ext v3.16b, v3.16b, v3.16b, #8;
    ext v2.16b, v2.16b, v2.16b, #8;
    ext v1.16b, v1.16b, v1.16b, #8;
    ext v0.16b, v0.16b, v0.16b, #8;
    st1 {v7.16b}, [x2], #16;
    st1 {v6.16b}, [x2], #16;
    st1 {v5.16b}, [x2], #16;
    st1 {v4.16b}, [x2], #16;
    st1 {v3.16b}, [x2], #16;
    st1 {v2.16b}, [x2], #16;
    st1 {v1.16b}, [x2], #16;
    st1 {v0.16b}, [x2];

    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_expand_key,.-_gcry_sm4_armv8_ce_expand_key;)

.align 4
ELF(.type sm4_armv8_ce_crypt_blk1_4,%function;)
sm4_armv8_ce_crypt_blk1_4:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: num blocks (1..4)
     */
    CFI_STARTPROC();

    load_rkey(x0);
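
    /* Load 1..4 input blocks; lanes that were not requested are filled with
     * copies of the first block so crypt_blk4 can always run on four
     * registers, and only the requested number of results is stored. */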

    ld1 {v0.16b}, [x2], #16;
    mov v1.16b, v0.16b;
    mov v2.16b, v0.16b;
    mov v3.16b, v0.16b;
    cmp x3, #2;
    blt .Lblk4_load_input_done;
    ld1 {v1.16b}, [x2], #16;
    beq .Lblk4_load_input_done;
    ld1 {v2.16b}, [x2], #16;
    cmp x3, #3;
    beq .Lblk4_load_input_done;
    ld1 {v3.16b}, [x2];

.Lblk4_load_input_done:
    crypt_blk4(v0, v1, v2, v3);

    st1 {v0.16b}, [x1], #16;
    cmp x3, #2;
    blt .Lblk4_store_output_done;
    st1 {v1.16b}, [x1], #16;
    beq .Lblk4_store_output_done;
    st1 {v2.16b}, [x1], #16;
    cmp x3, #3;
    beq .Lblk4_store_output_done;
    st1 {v3.16b}, [x1];

.Lblk4_store_output_done:
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size sm4_armv8_ce_crypt_blk1_4,.-sm4_armv8_ce_crypt_blk1_4;)

.align 4
.global _gcry_sm4_armv8_ce_crypt_blk1_8
ELF(.type _gcry_sm4_armv8_ce_crypt_blk1_8,%function;)
_gcry_sm4_armv8_ce_crypt_blk1_8:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: num blocks (1..8)
     */
    CFI_STARTPROC();

    cmp x3, #5;
    blt sm4_armv8_ce_crypt_blk1_4;

    load_rkey(x0);
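
    /* At least five blocks remain (smaller counts branch to
     * sm4_armv8_ce_crypt_blk1_4 above); the first five are loaded
     * unconditionally and any missing inputs are filled with copies of the
     * fifth block so crypt_blk8 always runs on eight registers. */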

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b}, [x2], #16;
    mov v5.16b, v4.16b;
    mov v6.16b, v4.16b;
    mov v7.16b, v4.16b;
    beq .Lblk8_load_input_done;
    ld1 {v5.16b}, [x2], #16;
    cmp x3, #7;
    blt .Lblk8_load_input_done;
    ld1 {v6.16b}, [x2], #16;
    beq .Lblk8_load_input_done;
    ld1 {v7.16b}, [x2];

.Lblk8_load_input_done:
    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    cmp x3, #6;
    st1 {v0.16b-v3.16b}, [x1], #64;
    st1 {v4.16b}, [x1], #16;
    blt .Lblk8_store_output_done;
    st1 {v5.16b}, [x1], #16;
    beq .Lblk8_store_output_done;
    st1 {v6.16b}, [x1], #16;
    cmp x3, #7;
    beq .Lblk8_store_output_done;
    st1 {v7.16b}, [x1];

.Lblk8_store_output_done:
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_crypt_blk1_8,.-_gcry_sm4_armv8_ce_crypt_blk1_8;)

.align 4
.global _gcry_sm4_armv8_ce_crypt
ELF(.type _gcry_sm4_armv8_ce_crypt,%function;)
_gcry_sm4_armv8_ce_crypt:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: nblocks (multiples of 8)
     */
    CFI_STARTPROC();

    load_rkey(x0);
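
    /* Plain block-by-block processing: eight blocks per iteration; the loop
     * exits once fewer than eight remain, since the caller passes a multiple
     * of eight (see the header comment above). */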

.Lcrypt_loop_blk:
    subs x3, x3, #8;
    bmi .Lcrypt_end;

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b-v7.16b}, [x2], #64;

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    st1 {v0.16b-v3.16b}, [x1], #64;
    st1 {v4.16b-v7.16b}, [x1], #64;

    b .Lcrypt_loop_blk;

.Lcrypt_end:
    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_crypt,.-_gcry_sm4_armv8_ce_crypt;)

.align 4
.global _gcry_sm4_armv8_ce_cbc_dec
ELF(.type _gcry_sm4_armv8_ce_cbc_dec,%function;)
_gcry_sm4_armv8_ce_cbc_dec:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: iv (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
    CFI_STARTPROC();

    load_rkey(x0);
    ld1 {RIV.16b}, [x3];
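
    /* CBC decryption: each batch of eight ciphertext blocks is decrypted and
     * then XORed with the preceding ciphertext block (the IV for the first
     * block of the batch).  The source pointer is rewound so the ciphertexts
     * can be reloaded for the XOR, and RIV is advanced to the last
     * ciphertext of the batch. */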

.Lcbc_loop_blk:
    subs x4, x4, #8;
    bmi .Lcbc_end;

    ld1 {v0.16b-v3.16b}, [x2], #64;
    ld1 {v4.16b-v7.16b}, [x2];

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    sub x2, x2, #64;
    eor v0.16b, v0.16b, RIV.16b;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v1.16b, v1.16b, RTMP0.16b;
    eor v2.16b, v2.16b, RTMP1.16b;
    eor v3.16b, v3.16b, RTMP2.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    eor v4.16b, v4.16b, RTMP3.16b;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v5.16b, v5.16b, RTMP0.16b;
    eor v6.16b, v6.16b, RTMP1.16b;
    eor v7.16b, v7.16b, RTMP2.16b;

    mov RIV.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    b .Lcbc_loop_blk;

.Lcbc_end:
    /* store new IV */
    st1 {RIV.16b}, [x3];

    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_cbc_dec,.-_gcry_sm4_armv8_ce_cbc_dec;)

.align 4
.global _gcry_sm4_armv8_ce_cfb_dec
ELF(.type _gcry_sm4_armv8_ce_cfb_dec,%function;)
_gcry_sm4_armv8_ce_cfb_dec:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: iv (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
    CFI_STARTPROC();

    load_rkey(x0);
    ld1 {v0.16b}, [x3];
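
    /* CFB decryption: v0 holds the IV and v1..v7 the first seven ciphertext
     * blocks, so after crypt_blk8 the eight registers contain the keystream
     * for the batch; each is then XORed with the corresponding ciphertext,
     * and the last ciphertext block becomes the new IV. */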

.Lcfb_loop_blk:
    subs x4, x4, #8;
    bmi .Lcfb_end;

    ld1 {v1.16b, v2.16b, v3.16b}, [x2], #48;
    ld1 {v4.16b-v7.16b}, [x2];

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    sub x2, x2, #48;
    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v0.16b, v0.16b, RTMP0.16b;
    eor v1.16b, v1.16b, RTMP1.16b;
    eor v2.16b, v2.16b, RTMP2.16b;
    eor v3.16b, v3.16b, RTMP3.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v4.16b, v4.16b, RTMP0.16b;
    eor v5.16b, v5.16b, RTMP1.16b;
    eor v6.16b, v6.16b, RTMP2.16b;
    eor v7.16b, v7.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    mov v0.16b, RTMP3.16b;

    b .Lcfb_loop_blk;

.Lcfb_end:
    /* store new IV */
    st1 {v0.16b}, [x3];

    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_cfb_dec,.-_gcry_sm4_armv8_ce_cfb_dec;)

.align 4
.global _gcry_sm4_armv8_ce_ctr_enc
ELF(.type _gcry_sm4_armv8_ce_ctr_enc,%function;)
_gcry_sm4_armv8_ce_ctr_enc:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: ctr (big endian, 128 bit)
     *   x4: nblocks (multiples of 8)
     */
    CFI_STARTPROC();

    load_rkey(x0);

    ldp x7, x8, [x3];
    rev x7, x7;
    rev x8, x8;

.Lctr_loop_blk:
    subs x4, x4, #8;
    bmi .Lctr_end;

#define inc_le128(vctr) \
    mov vctr.d[1], x8; \
    mov vctr.d[0], x7; \
    adds x8, x8, #1; \
    adc x7, x7, xzr; \
    rev64 vctr.16b, vctr.16b;
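
    /* inc_le128 materialises the current 128-bit big-endian counter in vctr
     * (x7 holds the high and x8 the low 64 bits in native byte order; rev64
     * restores big-endian bytes within each lane) and then increments the
     * counter in x7:x8 with carry. */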

    /* construct CTRs */
    inc_le128(v0); /* +0 */
    inc_le128(v1); /* +1 */
    inc_le128(v2); /* +2 */
    inc_le128(v3); /* +3 */
    inc_le128(v4); /* +4 */
    inc_le128(v5); /* +5 */
    inc_le128(v6); /* +6 */
    inc_le128(v7); /* +7 */

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7);

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v0.16b, v0.16b, RTMP0.16b;
    eor v1.16b, v1.16b, RTMP1.16b;
    eor v2.16b, v2.16b, RTMP2.16b;
    eor v3.16b, v3.16b, RTMP3.16b;
    st1 {v0.16b-v3.16b}, [x1], #64;

    ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64;
    eor v4.16b, v4.16b, RTMP0.16b;
    eor v5.16b, v5.16b, RTMP1.16b;
    eor v6.16b, v6.16b, RTMP2.16b;
    eor v7.16b, v7.16b, RTMP3.16b;
    st1 {v4.16b-v7.16b}, [x1], #64;

    b .Lctr_loop_blk;

.Lctr_end:
    /* store new CTR */
    rev x7, x7;
    rev x8, x8;
    stp x7, x8, [x3];

    ret_spec_stop;
    CFI_ENDPROC();
ELF(.size _gcry_sm4_armv8_ce_ctr_enc,.-_gcry_sm4_armv8_ce_ctr_enc;)

.align 4
.global _gcry_sm4_armv8_ce_xts_crypt
ELF(.type _gcry_sm4_armv8_ce_xts_crypt,%function;)
_gcry_sm4_armv8_ce_xts_crypt:
    /* input:
     *   x0: round key array, CTX
     *   x1: dst
     *   x2: src
     *   x3: tweak (big endian, 128 bit)
     *   x4: nblocks
     */
    CFI_STARTPROC()
    VPUSH_ABI

    load_rkey(x0)

    mov x7, #0x87
    mov x8, #0x1
    mov RMASK.d[0], x7
    mov RMASK.d[1], x8

    ld1 {RIV.16b}, [x3]
    mov v8.16b, RIV.16b
    ext RIV.16b, RIV.16b, RIV.16b, #8

.Lxts_loop_blk:
    sub x4, x4, #8
    tbnz x4, #63, .Lxts_tail8

#define tweak_next(vt, vin, RTMP) \
    sshr RTMP.2d, RIV.2d, #63; \
    add vt.2d, vin.2d, vin.2d; \
    and RTMP.16b, RTMP.16b, RMASK.16b; \
    add RIV.2d, RIV.2d, RIV.2d; \
    eor vt.16b, vt.16b, RTMP.16b;
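
    /* tweak_next advances the XTS tweak: vt = vin * x in GF(2^128).  RIV is
     * a halves-swapped shadow of the tweak used to extract the per-half sign
     * bits, which line up with RMASK = {0x87, 1}: the reduction constant for
     * the low half and the carry bit for the high half. */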

    tweak_next( v9, v8, RTMP0)
    tweak_next(v10, v9, RTMP1)
    tweak_next(v11, v10, RTMP2)
    tweak_next(v12, v11, RTMP3)
    tweak_next(v13, v12, RTMP0)
    tweak_next(v14, v13, RTMP1)
    tweak_next(v15, v14, RTMP2)

    ld1 {v0.16b-v3.16b}, [x2], #64
    eor v0.16b, v0.16b, v8.16b
    eor v1.16b, v1.16b, v9.16b
    eor v2.16b, v2.16b, v10.16b
    eor v3.16b, v3.16b, v11.16b
    ld1 {v4.16b-v7.16b}, [x2], #64
    eor v4.16b, v4.16b, v12.16b
    eor v5.16b, v5.16b, v13.16b
    eor v6.16b, v6.16b, v14.16b
    eor v7.16b, v7.16b, v15.16b

    crypt_blk8(v0, v1, v2, v3, v4, v5, v6, v7)

    eor v0.16b, v0.16b, v8.16b
    eor v1.16b, v1.16b, v9.16b
    eor v2.16b, v2.16b, v10.16b
    eor v3.16b, v3.16b, v11.16b
    st1 {v0.16b-v3.16b}, [x1], #64
    eor v4.16b, v4.16b, v12.16b
    eor v5.16b, v5.16b, v13.16b
    eor v6.16b, v6.16b, v14.16b
    eor v7.16b, v7.16b, v15.16b
    st1 {v4.16b-v7.16b}, [x1], #64

    tweak_next(v8, v15, RTMP3)

    cbz x4, .Lxts_end
    b .Lxts_loop_blk

.Lxts_tail8:
    add x4, x4, #8
    cmp x4, #4
    blt .Lxts_tail4

    sub x4, x4, #4

    tweak_next( v9, v8, RTMP0)
    tweak_next(v10, v9, RTMP1)
    tweak_next(v11, v10, RTMP2)

    ld1 {v0.16b-v3.16b}, [x2], #64
    eor v0.16b, v0.16b, v8.16b
    eor v1.16b, v1.16b, v9.16b
    eor v2.16b, v2.16b, v10.16b
    eor v3.16b, v3.16b, v11.16b

    crypt_blk4(v0, v1, v2, v3);

    eor v0.16b, v0.16b, v8.16b
    eor v1.16b, v1.16b, v9.16b
    eor v2.16b, v2.16b, v10.16b
    eor v3.16b, v3.16b, v11.16b
    st1 {v0.16b-v3.16b}, [x1], #64

    tweak_next(v8, v11, RTMP3)

    cbz x4, .Lxts_end

.Lxts_tail4:
    sub x4, x4, #1

    ld1 {v0.16b}, [x2], #16
    eor v0.16b, v0.16b, v8.16b

    SM4_CRYPT_BLK(v0)

    eor v0.16b, v0.16b, v8.16b
    st1 {v0.16b}, [x1], #16

    tweak_next(v8, v8, RTMP0)

    cbnz x4, .Lxts_tail4

.Lxts_end:
    /* store new tweak */
    st1 {v8.16b}, [x3]

    CLEAR_REG(v8)
    CLEAR_REG(v9)
    CLEAR_REG(v10)
    CLEAR_REG(v11)
    CLEAR_REG(v12)
    CLEAR_REG(v13)
    CLEAR_REG(v14)
    CLEAR_REG(v15)
    CLEAR_REG(RIV)

    VPOP_ABI
    ret_spec_stop
    CFI_ENDPROC()
ELF(.size _gcry_sm4_armv8_ce_xts_crypt,.-_gcry_sm4_armv8_ce_xts_crypt;)

#endif