vaseboot/VasEBoot-core/lib/libgcrypt/cipher/serpent-armv7-neon.S

/* serpent-armv7-neon.S - ARM/NEON assembly implementation of Serpent cipher
 *
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
#include <config.h>
#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_NEON)
.text
.syntax unified
.fpu neon
.arm
/* ARM registers */
#define RROUND r0
/* NEON vector registers */
#define RA0 q0
#define RA1 q1
#define RA2 q2
#define RA3 q3
#define RA4 q4
#define RB0 q5
#define RB1 q6
#define RB2 q7
#define RB3 q8
#define RB4 q9
#define RT0 q10
#define RT1 q11
#define RT2 q12
#define RT3 q13
#define RA0d0 d0
#define RA0d1 d1
#define RA1d0 d2
#define RA1d1 d3
#define RA2d0 d4
#define RA2d1 d5
#define RA3d0 d6
#define RA3d1 d7
#define RA4d0 d8
#define RA4d1 d9
#define RB0d0 d10
#define RB0d1 d11
#define RB1d0 d12
#define RB1d1 d13
#define RB2d0 d14
#define RB2d1 d15
#define RB3d0 d16
#define RB3d1 d17
#define RB4d0 d18
#define RB4d1 d19
#define RT0d0 d20
#define RT0d1 d21
#define RT1d0 d22
#define RT1d1 d23
#define RT2d0 d24
#define RT2d1 d25
/**********************************************************************
helper macros
**********************************************************************/
#define transpose_4x4(_q0, _q1, _q2, _q3) \
vtrn.32 _q0, _q1; \
vtrn.32 _q2, _q3; \
vswp _q0##d1, _q2##d0; \
vswp _q1##d1, _q3##d0;
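/* Illustrative example (comment only, not assembled): viewing q0..q3 as a
 * 4x4 matrix of 32-bit words, the two vtrn.32 steps transpose the 2x2
 * sub-blocks and the two vswp steps exchange the off-diagonal d-halves:
 *
 *   q0: a0 a1 a2 a3         q0: a0 b0 c0 d0
 *   q1: b0 b1 b2 b3   ==>   q1: a1 b1 c1 d1
 *   q2: c0 c1 c2 c3         q2: a2 b2 c2 d2
 *   q3: d0 d1 d2 d3         q3: a3 b3 c3 d3
 *
 * Afterwards each q register holds the same word index from four different
 * blocks, which is the layout the bitsliced rounds below expect. */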
/**********************************************************************
8-way serpent
**********************************************************************/
/*
 * These are the S-Boxes of Serpent from the following research paper.
 *
 * D. A. Osvik, Speeding up Serpent, in Third AES Candidate Conference,
 * (New York, New York, USA), pp. 317-329, National Institute of Standards
 * and Technology, 2000.
 *
 * Paper is also available at: http://www.ii.uib.no/~osvik/pub/aes3.pdf
 *
 */
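/* Each SBOX macro below applies one 4-bit S-box bitwise: after the 4x4
 * transpose, bit j of (a0,a1,a2,a3) forms one S-box input, so every
 * veor/vand/vorr/vmvn acts on 32 bit positions of four blocks at once.
 * The a0..a4 and b0..b4 columns are the same instruction stream run twice,
 * interleaved so both four-block halves progress in parallel. As a rough C
 * sketch (illustration only), the first line of SBOX0 computes:
 *
 *   a3 ^= a0; b3 ^= b0;   a4 = a1; b4 = b1;
 */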
#define SBOX0(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
veor a3, a3, a0; veor b3, b3, b0; vmov a4, a1; vmov b4, b1; \
vand a1, a1, a3; vand b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
veor a0, a0, a4; veor b0, b0, b4; veor a4, a4, a3; veor b4, b4, b3; \
veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
veor a2, a2, a4; veor b2, b2, b4; vmvn a4, a4; vmvn b4, b4; \
vorr a4, a4, a1; vorr b4, b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
veor a1, a1, a4; veor b1, b1, b4; vorr a3, a3, a0; vorr b3, b3, b0; \
veor a1, a1, a3; veor b1, b1, b3; veor a4, a3; veor b4, b3;
#define SBOX0_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmvn a2, a2; vmvn b2, b2; vmov a4, a1; vmov b4, b1; \
vorr a1, a1, a0; vorr b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
veor a1, a1, a3; veor b1, b1, b3; veor a0, a0, a4; veor b0, b0, b4; \
veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a3; vand b0, b0, b3; \
veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
veor a2, a2, a1; veor b2, b2, b1; veor a3, a3, a0; veor b3, b3, b0; \
veor a3, a3, a1; veor b3, b3, b1;\
vand a2, a2, a3; vand b2, b2, b3;\
veor a4, a2; veor b4, b2;
#define SBOX1(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmvn a0, a0; vmvn b0, b0; vmvn a2, a2; vmvn b2, b2; \
vmov a4, a0; vmov b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
veor a2, a2, a0; veor b2, b2, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
veor a3, a3, a2; veor b3, b3, b2; veor a1, a1, a0; veor b1, b1, b0; \
veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a1; vorr b4, b4, b1; \
veor a1, a1, a3; veor b1, b1, b3; vorr a2, a2, a0; vorr b2, b2, b0; \
vand a2, a2, a4; vand b2, b2, b4; veor a0, a0, a1; veor b0, b0, b1; \
vand a1, a1, a2; vand b1, b1, b2;\
veor a1, a1, a0; veor b1, b1, b0; vand a0, a0, a2; vand b0, b0, b2; \
veor a0, a4; veor b0, b4;
#define SBOX1_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmov a4, a1; vmov b4, b1; veor a1, a1, a3; veor b1, b1, b3; \
vand a3, a3, a1; vand b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \
veor a3, a3, a0; veor b3, b3, b0; vorr a0, a0, a1; vorr b0, b0, b1; \
veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a4; veor b0, b0, b4; \
vorr a0, a0, a2; vorr b0, b0, b2; veor a1, a1, a3; veor b1, b1, b3; \
veor a0, a0, a1; veor b0, b0, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
veor a1, a1, a0; veor b1, b1, b0; vmvn a4, a4; vmvn b4, b4; \
veor a4, a4, a1; veor b4, b4, b1; vorr a1, a1, a0; vorr b1, b1, b0; \
veor a1, a1, a0; veor b1, b1, b0;\
vorr a1, a1, a4; vorr b1, b1, b4;\
veor a3, a1; veor b3, b1;
#define SBOX2(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmov a4, a0; vmov b4, b0; vand a0, a0, a2; vand b0, b0, b2; \
veor a0, a0, a3; veor b0, b0, b3; veor a2, a2, a1; veor b2, b2, b1; \
veor a2, a2, a0; veor b2, b2, b0; vorr a3, a3, a4; vorr b3, b3, b4; \
veor a3, a3, a1; veor b3, b3, b1; veor a4, a4, a2; veor b4, b4, b2; \
vmov a1, a3; vmov b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \
veor a3, a3, a0; veor b3, b3, b0; vand a0, a0, a1; vand b0, b0, b1; \
veor a4, a4, a0; veor b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \
veor a1, a1, a4; veor b1, b1, b4; vmvn a4, a4; vmvn b4, b4;
#define SBOX2_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
vmov a4, a3; vmov b4, b3; vand a3, a3, a2; vand b3, b3, b2; \
veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a2; vorr b1, b1, b2; \
veor a1, a1, a4; veor b1, b1, b4; vand a4, a4, a3; vand b4, b4, b3; \
veor a2, a2, a3; veor b2, b2, b3; vand a4, a4, a0; vand b4, b4, b0; \
veor a4, a4, a2; veor b4, b4, b2; vand a2, a2, a1; vand b2, b2, b1; \
vorr a2, a2, a0; vorr b2, b2, b0; vmvn a3, a3; vmvn b3, b3; \
veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \
vand a0, a0, a1; vand b0, b0, b1; veor a3, a3, a4; veor b3, b3, b4; \
veor a3, a0; veor b3, b0;
#define SBOX3(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmov a4, a0; vmov b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
veor a3, a3, a1; veor b3, b3, b1; vand a1, a1, a4; vand b1, b1, b4; \
veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a3; veor b2, b2, b3; \
vand a3, a3, a0; vand b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \
veor a3, a3, a4; veor b3, b3, b4; veor a0, a0, a1; veor b0, b0, b1; \
vand a4, a4, a0; vand b4, b4, b0; veor a1, a1, a3; veor b1, b1, b3; \
veor a4, a4, a2; veor b4, b4, b2; vorr a1, a1, a0; vorr b1, b1, b0; \
veor a1, a1, a2; veor b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \
vmov a2, a1; vmov b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
veor a1, a0; veor b1, b0;
#define SBOX3_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmov a4, a2; vmov b4, b2; veor a2, a2, a1; veor b2, b2, b1; \
veor a0, a0, a2; veor b0, b0, b2; vand a4, a4, a2; vand b4, b4, b2; \
veor a4, a4, a0; veor b4, b4, b0; vand a0, a0, a1; vand b0, b0, b1; \
veor a1, a1, a3; veor b1, b1, b3; vorr a3, a3, a4; vorr b3, b3, b4; \
veor a2, a2, a3; veor b2, b2, b3; veor a0, a0, a3; veor b0, b0, b3; \
veor a1, a1, a4; veor b1, b1, b4; vand a3, a3, a2; vand b3, b3, b2; \
veor a3, a3, a1; veor b3, b3, b1; veor a1, a1, a0; veor b1, b1, b0; \
vorr a1, a1, a2; vorr b1, b1, b2; veor a0, a0, a3; veor b0, b0, b3; \
veor a1, a1, a4; veor b1, b1, b4;\
veor a0, a1; veor b0, b1;
#define SBOX4(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
veor a1, a1, a3; veor b1, b1, b3; vmvn a3, a3; vmvn b3, b3; \
veor a2, a2, a3; veor b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
vmov a4, a1; vmov b4, b1; vand a1, a1, a3; vand b1, b1, b3; \
veor a1, a1, a2; veor b1, b1, b2; veor a4, a4, a3; veor b4, b4, b3; \
veor a0, a0, a4; veor b0, b0, b4; vand a2, a2, a4; vand b2, b2, b4; \
veor a2, a2, a0; veor b2, b2, b0; vand a0, a0, a1; vand b0, b0, b1; \
veor a3, a3, a0; veor b3, b3, b0; vorr a4, a4, a1; vorr b4, b4, b1; \
veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
veor a0, a0, a2; veor b0, b0, b2; vand a2, a2, a3; vand b2, b2, b3; \
vmvn a0, a0; vmvn b0, b0; veor a4, a2; veor b4, b2;
#define SBOX4_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmov a4, a2; vmov b4, b2; vand a2, a2, a3; vand b2, b2, b3; \
veor a2, a2, a1; veor b2, b2, b1; vorr a1, a1, a3; vorr b1, b1, b3; \
vand a1, a1, a0; vand b1, b1, b0; veor a4, a4, a2; veor b4, b4, b2; \
veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \
vmvn a0, a0; vmvn b0, b0; veor a3, a3, a4; veor b3, b3, b4; \
veor a1, a1, a3; veor b1, b1, b3; vand a3, a3, a0; vand b3, b3, b0; \
veor a3, a3, a2; veor b3, b3, b2; veor a0, a0, a1; veor b0, b0, b1; \
vand a2, a2, a0; vand b2, b2, b0; veor a3, a3, a0; veor b3, b3, b0; \
veor a2, a2, a4; veor b2, b2, b4;\
vorr a2, a2, a3; vorr b2, b2, b3; veor a3, a3, a0; veor b3, b3, b0; \
veor a2, a1; veor b2, b1;
#define SBOX5(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
veor a0, a0, a1; veor b0, b0, b1; veor a1, a1, a3; veor b1, b1, b3; \
vmvn a3, a3; vmvn b3, b3; vmov a4, a1; vmov b4, b1; \
vand a1, a1, a0; vand b1, b1, b0; veor a2, a2, a3; veor b2, b2, b3; \
veor a1, a1, a2; veor b1, b1, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
veor a4, a4, a3; veor b4, b4, b3; vand a3, a3, a1; vand b3, b3, b1; \
veor a3, a3, a0; veor b3, b3, b0; veor a4, a4, a1; veor b4, b4, b1; \
veor a4, a4, a2; veor b4, b4, b2; veor a2, a2, a0; veor b2, b2, b0; \
vand a0, a0, a3; vand b0, b0, b3; vmvn a2, a2; vmvn b2, b2; \
veor a0, a0, a4; veor b0, b0, b4; vorr a4, a4, a3; vorr b4, b4, b3; \
veor a2, a4; veor b2, b4;
#define SBOX5_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmvn a1, a1; vmvn b1, b1; vmov a4, a3; vmov b4, b3; \
veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a0; vorr b3, b3, b0; \
veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a1; vorr b2, b2, b1; \
vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \
veor a2, a2, a4; veor b2, b2, b4; vorr a4, a4, a0; vorr b4, b4, b0; \
veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a2; vand b1, b1, b2; \
veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
vand a3, a3, a4; vand b3, b3, b4; veor a4, a4, a1; veor b4, b4, b1; \
veor a3, a3, a4; veor b3, b3, b4; vmvn a4, a4; vmvn b4, b4; \
veor a3, a0; veor b3, b0;
#define SBOX6(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmvn a2, a2; vmvn b2, b2; vmov a4, a3; vmov b4, b3; \
vand a3, a3, a0; vand b3, b3, b0; veor a0, a0, a4; veor b0, b0, b4; \
veor a3, a3, a2; veor b3, b3, b2; vorr a2, a2, a4; vorr b2, b2, b4; \
veor a1, a1, a3; veor b1, b1, b3; veor a2, a2, a0; veor b2, b2, b0; \
vorr a0, a0, a1; vorr b0, b0, b1; veor a2, a2, a1; veor b2, b2, b1; \
veor a4, a4, a0; veor b4, b4, b0; vorr a0, a0, a3; vorr b0, b0, b3; \
veor a0, a0, a2; veor b0, b0, b2; veor a4, a4, a3; veor b4, b4, b3; \
veor a4, a4, a0; veor b4, b4, b0; vmvn a3, a3; vmvn b3, b3; \
vand a2, a2, a4; vand b2, b2, b4;\
veor a2, a3; veor b2, b3;
#define SBOX6_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
veor a0, a0, a2; veor b0, b0, b2; vmov a4, a2; vmov b4, b2; \
vand a2, a2, a0; vand b2, b2, b0; veor a4, a4, a3; veor b4, b4, b3; \
vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \
veor a2, a2, a3; veor b2, b2, b3; vorr a4, a4, a0; vorr b4, b4, b0; \
veor a0, a0, a2; veor b0, b0, b2; veor a3, a3, a4; veor b3, b3, b4; \
veor a4, a4, a1; veor b4, b4, b1; vand a1, a1, a3; vand b1, b1, b3; \
veor a1, a1, a0; veor b1, b1, b0; veor a0, a0, a3; veor b0, b0, b3; \
vorr a0, a0, a2; vorr b0, b0, b2; veor a3, a3, a1; veor b3, b3, b1; \
veor a4, a0; veor b4, b0;
#define SBOX7(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmov a4, a1; vmov b4, b1; vorr a1, a1, a2; vorr b1, b1, b2; \
veor a1, a1, a3; veor b1, b1, b3; veor a4, a4, a2; veor b4, b4, b2; \
veor a2, a2, a1; veor b2, b2, b1; vorr a3, a3, a4; vorr b3, b3, b4; \
vand a3, a3, a0; vand b3, b3, b0; veor a4, a4, a2; veor b4, b4, b2; \
veor a3, a3, a1; veor b3, b3, b1; vorr a1, a1, a4; vorr b1, b1, b4; \
veor a1, a1, a0; veor b1, b1, b0; vorr a0, a0, a4; vorr b0, b0, b4; \
veor a0, a0, a2; veor b0, b0, b2; veor a1, a1, a4; veor b1, b1, b4; \
veor a2, a2, a1; veor b2, b2, b1; vand a1, a1, a0; vand b1, b1, b0; \
veor a1, a1, a4; veor b1, b1, b4; vmvn a2, a2; vmvn b2, b2; \
vorr a2, a2, a0; vorr b2, b2, b0;\
veor a4, a2; veor b4, b2;
#define SBOX7_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vmov a4, a2; vmov b4, b2; veor a2, a2, a0; veor b2, b2, b0; \
vand a0, a0, a3; vand b0, b0, b3; vorr a4, a4, a3; vorr b4, b4, b3; \
vmvn a2, a2; vmvn b2, b2; veor a3, a3, a1; veor b3, b3, b1; \
vorr a1, a1, a0; vorr b1, b1, b0; veor a0, a0, a2; veor b0, b0, b2; \
vand a2, a2, a4; vand b2, b2, b4; vand a3, a3, a4; vand b3, b3, b4; \
veor a1, a1, a2; veor b1, b1, b2; veor a2, a2, a0; veor b2, b2, b0; \
vorr a0, a0, a2; vorr b0, b0, b2; veor a4, a4, a1; veor b4, b4, b1; \
veor a0, a0, a3; veor b0, b0, b3; veor a3, a3, a4; veor b3, b3, b4; \
vorr a4, a4, a0; vorr b4, b4, b0; veor a3, a3, a2; veor b3, b3, b2; \
veor a4, a2; veor b4, b2;
/* Apply SBOX number WHICH to the block. */
#define SBOX(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
SBOX##which (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4)
/* Apply inverse SBOX number WHICH to the block. */
#define SBOX_INVERSE(which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
SBOX##which##_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4)
/* XOR round key into block state in a0,a1,a2,a3 (and b0..b3); RT1-RT3 are
 * used as temporaries for the broadcast key words. */
#define BLOCK_XOR_KEY(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vdup.32 RT3, RT0d0[0]; \
vdup.32 RT1, RT0d0[1]; \
vdup.32 RT2, RT0d1[0]; \
vdup.32 RT0, RT0d1[1]; \
veor a0, a0, RT3; veor b0, b0, RT3; \
veor a1, a1, RT1; veor b1, b1, RT1; \
veor a2, a2, RT2; veor b2, b2, RT2; \
veor a3, a3, RT0; veor b3, b3, RT0;
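/* RT0 arrives holding one 128-bit subkey (words k0..k3); each vdup.32
 * broadcasts one key word across a full q register so a single veor can
 * XOR it into the matching word of all four blocks. In C terms
 * (illustration only):
 *
 *   for (i = 0; i < 4; i++) { a[i] ^= k[i]; b[i] ^= k[i]; }
 *
 * RT0 is overwritten last (with the k3 broadcast), which is why the
 * broadcasts are ordered RT3, RT1, RT2, RT0. */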
#define BLOCK_LOAD_KEY_ENC() \
vld1.8 {RT0d0, RT0d1}, [RROUND]!;
#define BLOCK_LOAD_KEY_DEC() \
vld1.8 {RT0d0, RT0d1}, [RROUND]; \
sub RROUND, RROUND, #16;
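/* RROUND walks the 33 16-byte subkeys: the encryption loader post-increments
 * past the subkey just fetched, while the decryption loader reads the current
 * subkey and then steps RROUND back 16 bytes, consuming the schedule in
 * reverse. */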
/* Apply the linear transformation to BLOCK. */
#define LINEAR_TRANSFORMATION(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vshl.u32 a4, a0, #13; vshl.u32 b4, b0, #13; \
vshr.u32 a0, a0, #(32-13); vshr.u32 b0, b0, #(32-13); \
veor a0, a0, a4; veor b0, b0, b4; \
vshl.u32 a4, a2, #3; vshl.u32 b4, b2, #3; \
vshr.u32 a2, a2, #(32-3); vshr.u32 b2, b2, #(32-3); \
veor a2, a2, a4; veor b2, b2, b4; \
veor a1, a0, a1; veor b1, b0, b1; \
veor a1, a2, a1; veor b1, b2, b1; \
vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \
veor a3, a2, a3; veor b3, b2, b3; \
veor a3, a4, a3; veor b3, b4, b3; \
vshl.u32 a4, a1, #1; vshl.u32 b4, b1, #1; \
vshr.u32 a1, a1, #(32-1); vshr.u32 b1, b1, #(32-1); \
veor a1, a1, a4; veor b1, b1, b4; \
vshl.u32 a4, a3, #7; vshl.u32 b4, b3, #7; \
vshr.u32 a3, a3, #(32-7); vshr.u32 b3, b3, #(32-7); \
veor a3, a3, a4; veor b3, b3, b4; \
veor a0, a1, a0; veor b0, b1, b0; \
veor a0, a3, a0; veor b0, b3, b0; \
vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \
veor a2, a3, a2; veor b2, b3, b2; \
veor a2, a4, a2; veor b2, b4, b2; \
vshl.u32 a4, a0, #5; vshl.u32 b4, b0, #5; \
vshr.u32 a0, a0, #(32-5); vshr.u32 b0, b0, #(32-5); \
veor a0, a0, a4; veor b0, b0, b4; \
vshl.u32 a4, a2, #22; vshl.u32 b4, b2, #22; \
vshr.u32 a2, a2, #(32-22); vshr.u32 b2, b2, #(32-22); \
veor a2, a2, a4; veor b2, b2, b4;
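/* This is the standard Serpent linear transformation. NEON has no vector
 * rotate, so each rol(x,n) above is composed from vshl #n, vshr #(32-n) and
 * a veor. In C rotate notation (illustration only):
 *
 *   x0 = rol(x0, 13);  x2 = rol(x2, 3);
 *   x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
 *   x1 = rol(x1, 1);   x3 = rol(x3, 7);
 *   x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
 *   x0 = rol(x0, 5);   x2 = rol(x2, 22);
 */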
/* Apply the inverse linear transformation to BLOCK. */
#define LINEAR_TRANSFORMATION_INVERSE(a0, a1, a2, a3, a4, b0, b1, b2, b3, b4) \
vshr.u32 a4, a2, #22; vshr.u32 b4, b2, #22; \
vshl.u32 a2, a2, #(32-22); vshl.u32 b2, b2, #(32-22); \
veor a2, a2, a4; veor b2, b2, b4; \
vshr.u32 a4, a0, #5; vshr.u32 b4, b0, #5; \
vshl.u32 a0, a0, #(32-5); vshl.u32 b0, b0, #(32-5); \
veor a0, a0, a4; veor b0, b0, b4; \
vshl.u32 a4, a1, #7; vshl.u32 b4, b1, #7; \
veor a2, a3, a2; veor b2, b3, b2; \
veor a2, a4, a2; veor b2, b4, b2; \
veor a0, a1, a0; veor b0, b1, b0; \
veor a0, a3, a0; veor b0, b3, b0; \
vshr.u32 a4, a3, #7; vshr.u32 b4, b3, #7; \
vshl.u32 a3, a3, #(32-7); vshl.u32 b3, b3, #(32-7); \
veor a3, a3, a4; veor b3, b3, b4; \
vshr.u32 a4, a1, #1; vshr.u32 b4, b1, #1; \
vshl.u32 a1, a1, #(32-1); vshl.u32 b1, b1, #(32-1); \
veor a1, a1, a4; veor b1, b1, b4; \
vshl.u32 a4, a0, #3; vshl.u32 b4, b0, #3; \
veor a3, a2, a3; veor b3, b2, b3; \
veor a3, a4, a3; veor b3, b4, b3; \
veor a1, a0, a1; veor b1, b0, b1; \
veor a1, a2, a1; veor b1, b2, b1; \
vshr.u32 a4, a2, #3; vshr.u32 b4, b2, #3; \
vshl.u32 a2, a2, #(32-3); vshl.u32 b2, b2, #(32-3); \
veor a2, a2, a4; veor b2, b2, b4; \
vshr.u32 a4, a0, #13; vshr.u32 b4, b0, #13; \
vshl.u32 a0, a0, #(32-13); vshl.u32 b0, b0, #(32-13); \
veor a0, a0, a4; veor b0, b0, b4;
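/* The same steps as above, undone in reverse order; the right-rotates
 * ror(x,n) are likewise composed from vshr #n, vshl #(32-n) and a veor. */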
/* Apply a Serpent round to eight parallel blocks. The `round' argument is
 * documentation only; the round key pointer (RROUND) advances instead. */
#define ROUND(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
BLOCK_LOAD_KEY_ENC (); \
SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
LINEAR_TRANSFORMATION (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4);
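/* The na0..na4/nb0..nb4 arguments name the registers in which the S-box
 * leaves its outputs: each SBOX implementation permutes its registers rather
 * than moving results back, so passing the permuted names lets the linear
 * transformation run in place without extra vmov shuffling. */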
/* Apply the last Serpent round to eight parallel blocks. Instead of the
 * linear transformation, the output is XORed with the final round key. */
#define ROUND_LAST(round, which, a0, a1, a2, a3, a4, na0, na1, na2, na3, na4, \
b0, b1, b2, b3, b4, nb0, nb1, nb2, nb3, nb4) \
BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
BLOCK_LOAD_KEY_ENC (); \
SBOX (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4);
/* Apply an inverse Serpent round to eight parallel blocks. The `round'
 * argument is documentation only; RROUND steps backwards instead. */
#define ROUND_INVERSE(round, which, a0, a1, a2, a3, a4, \
na0, na1, na2, na3, na4, \
b0, b1, b2, b3, b4, \
nb0, nb1, nb2, nb3, nb4) \
LINEAR_TRANSFORMATION_INVERSE (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \
BLOCK_LOAD_KEY_DEC ();
/* Apply the first inverse Serpent round to eight parallel blocks; it
 * consumes two round keys (the extra final key and the S-box round key). */
#define ROUND_FIRST_INVERSE(round, which, a0, a1, a2, a3, a4, \
na0, na1, na2, na3, na4, \
b0, b1, b2, b3, b4, \
nb0, nb1, nb2, nb3, nb4) \
BLOCK_XOR_KEY (a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
BLOCK_LOAD_KEY_DEC (); \
SBOX_INVERSE (which, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4); \
BLOCK_XOR_KEY (na0, na1, na2, na3, na4, nb0, nb1, nb2, nb3, nb4); \
BLOCK_LOAD_KEY_DEC ();
.align 3
.type __serpent_enc_blk8,%function;
__serpent_enc_blk8:
/* input:
 *   r0: round key pointer
 *   RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext blocks
 * output:
 *   RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel ciphertext blocks
 */
transpose_4x4(RA0, RA1, RA2, RA3);
BLOCK_LOAD_KEY_ENC ();
transpose_4x4(RB0, RB1, RB2, RB3);
ROUND (0, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
ROUND (1, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
ROUND (2, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
ROUND (3, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
ROUND (4, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
ROUND (5, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
ROUND (6, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
ROUND (7, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
ROUND (8, 0, RA4, RA1, RA2, RA0, RA3, RA1, RA3, RA2, RA4, RA0,
RB4, RB1, RB2, RB0, RB3, RB1, RB3, RB2, RB4, RB0);
ROUND (9, 1, RA1, RA3, RA2, RA4, RA0, RA2, RA1, RA4, RA3, RA0,
RB1, RB3, RB2, RB4, RB0, RB2, RB1, RB4, RB3, RB0);
ROUND (10, 2, RA2, RA1, RA4, RA3, RA0, RA4, RA3, RA1, RA0, RA2,
RB2, RB1, RB4, RB3, RB0, RB4, RB3, RB1, RB0, RB2);
ROUND (11, 3, RA4, RA3, RA1, RA0, RA2, RA3, RA1, RA0, RA2, RA4,
RB4, RB3, RB1, RB0, RB2, RB3, RB1, RB0, RB2, RB4);
ROUND (12, 4, RA3, RA1, RA0, RA2, RA4, RA1, RA4, RA3, RA2, RA0,
RB3, RB1, RB0, RB2, RB4, RB1, RB4, RB3, RB2, RB0);
ROUND (13, 5, RA1, RA4, RA3, RA2, RA0, RA4, RA2, RA1, RA3, RA0,
RB1, RB4, RB3, RB2, RB0, RB4, RB2, RB1, RB3, RB0);
ROUND (14, 6, RA4, RA2, RA1, RA3, RA0, RA4, RA2, RA0, RA1, RA3,
RB4, RB2, RB1, RB3, RB0, RB4, RB2, RB0, RB1, RB3);
ROUND (15, 7, RA4, RA2, RA0, RA1, RA3, RA3, RA1, RA2, RA4, RA0,
RB4, RB2, RB0, RB1, RB3, RB3, RB1, RB2, RB4, RB0);
ROUND (16, 0, RA3, RA1, RA2, RA4, RA0, RA1, RA0, RA2, RA3, RA4,
RB3, RB1, RB2, RB4, RB0, RB1, RB0, RB2, RB3, RB4);
ROUND (17, 1, RA1, RA0, RA2, RA3, RA4, RA2, RA1, RA3, RA0, RA4,
RB1, RB0, RB2, RB3, RB4, RB2, RB1, RB3, RB0, RB4);
ROUND (18, 2, RA2, RA1, RA3, RA0, RA4, RA3, RA0, RA1, RA4, RA2,
RB2, RB1, RB3, RB0, RB4, RB3, RB0, RB1, RB4, RB2);
ROUND (19, 3, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA4, RA2, RA3,
RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB4, RB2, RB3);
ROUND (20, 4, RA0, RA1, RA4, RA2, RA3, RA1, RA3, RA0, RA2, RA4,
RB0, RB1, RB4, RB2, RB3, RB1, RB3, RB0, RB2, RB4);
ROUND (21, 5, RA1, RA3, RA0, RA2, RA4, RA3, RA2, RA1, RA0, RA4,
RB1, RB3, RB0, RB2, RB4, RB3, RB2, RB1, RB0, RB4);
ROUND (22, 6, RA3, RA2, RA1, RA0, RA4, RA3, RA2, RA4, RA1, RA0,
RB3, RB2, RB1, RB0, RB4, RB3, RB2, RB4, RB1, RB0);
ROUND (23, 7, RA3, RA2, RA4, RA1, RA0, RA0, RA1, RA2, RA3, RA4,
RB3, RB2, RB4, RB1, RB0, RB0, RB1, RB2, RB3, RB4);
ROUND (24, 0, RA0, RA1, RA2, RA3, RA4, RA1, RA4, RA2, RA0, RA3,
RB0, RB1, RB2, RB3, RB4, RB1, RB4, RB2, RB0, RB3);
ROUND (25, 1, RA1, RA4, RA2, RA0, RA3, RA2, RA1, RA0, RA4, RA3,
RB1, RB4, RB2, RB0, RB3, RB2, RB1, RB0, RB4, RB3);
ROUND (26, 2, RA2, RA1, RA0, RA4, RA3, RA0, RA4, RA1, RA3, RA2,
RB2, RB1, RB0, RB4, RB3, RB0, RB4, RB1, RB3, RB2);
ROUND (27, 3, RA0, RA4, RA1, RA3, RA2, RA4, RA1, RA3, RA2, RA0,
RB0, RB4, RB1, RB3, RB2, RB4, RB1, RB3, RB2, RB0);
ROUND (28, 4, RA4, RA1, RA3, RA2, RA0, RA1, RA0, RA4, RA2, RA3,
RB4, RB1, RB3, RB2, RB0, RB1, RB0, RB4, RB2, RB3);
ROUND (29, 5, RA1, RA0, RA4, RA2, RA3, RA0, RA2, RA1, RA4, RA3,
RB1, RB0, RB4, RB2, RB3, RB0, RB2, RB1, RB4, RB3);
ROUND (30, 6, RA0, RA2, RA1, RA4, RA3, RA0, RA2, RA3, RA1, RA4,
RB0, RB2, RB1, RB4, RB3, RB0, RB2, RB3, RB1, RB4);
ROUND_LAST (31, 7, RA0, RA2, RA3, RA1, RA4, RA4, RA1, RA2, RA0, RA3,
RB0, RB2, RB3, RB1, RB4, RB4, RB1, RB2, RB0, RB3);
transpose_4x4(RA4, RA1, RA2, RA0);
transpose_4x4(RB4, RB1, RB2, RB0);
bx lr;
.size __serpent_enc_blk8,.-__serpent_enc_blk8;
.align 3
.type __serpent_dec_blk8,%function;
__serpent_dec_blk8:
/* input:
 *   r0: round key pointer
 *   RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel ciphertext blocks
 * output:
 *   RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext blocks
 */
add RROUND, RROUND, #(32*16);
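/* RROUND now points at the last of the 33 subkeys; decryption consumes the
 * schedule backwards via BLOCK_LOAD_KEY_DEC. */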
transpose_4x4(RA0, RA1, RA2, RA3);
BLOCK_LOAD_KEY_DEC ();
transpose_4x4(RB0, RB1, RB2, RB3);
ROUND_FIRST_INVERSE (31, 7, RA0, RA1, RA2, RA3, RA4,
RA3, RA0, RA1, RA4, RA2,
RB0, RB1, RB2, RB3, RB4,
RB3, RB0, RB1, RB4, RB2);
ROUND_INVERSE (30, 6, RA3, RA0, RA1, RA4, RA2, RA0, RA1, RA2, RA4, RA3,
RB3, RB0, RB1, RB4, RB2, RB0, RB1, RB2, RB4, RB3);
ROUND_INVERSE (29, 5, RA0, RA1, RA2, RA4, RA3, RA1, RA3, RA4, RA2, RA0,
RB0, RB1, RB2, RB4, RB3, RB1, RB3, RB4, RB2, RB0);
ROUND_INVERSE (28, 4, RA1, RA3, RA4, RA2, RA0, RA1, RA2, RA4, RA0, RA3,
RB1, RB3, RB4, RB2, RB0, RB1, RB2, RB4, RB0, RB3);
ROUND_INVERSE (27, 3, RA1, RA2, RA4, RA0, RA3, RA4, RA2, RA0, RA1, RA3,
RB1, RB2, RB4, RB0, RB3, RB4, RB2, RB0, RB1, RB3);
ROUND_INVERSE (26, 2, RA4, RA2, RA0, RA1, RA3, RA2, RA3, RA0, RA1, RA4,
RB4, RB2, RB0, RB1, RB3, RB2, RB3, RB0, RB1, RB4);
ROUND_INVERSE (25, 1, RA2, RA3, RA0, RA1, RA4, RA4, RA2, RA1, RA0, RA3,
RB2, RB3, RB0, RB1, RB4, RB4, RB2, RB1, RB0, RB3);
ROUND_INVERSE (24, 0, RA4, RA2, RA1, RA0, RA3, RA4, RA3, RA2, RA0, RA1,
RB4, RB2, RB1, RB0, RB3, RB4, RB3, RB2, RB0, RB1);
ROUND_INVERSE (23, 7, RA4, RA3, RA2, RA0, RA1, RA0, RA4, RA3, RA1, RA2,
RB4, RB3, RB2, RB0, RB1, RB0, RB4, RB3, RB1, RB2);
ROUND_INVERSE (22, 6, RA0, RA4, RA3, RA1, RA2, RA4, RA3, RA2, RA1, RA0,
RB0, RB4, RB3, RB1, RB2, RB4, RB3, RB2, RB1, RB0);
ROUND_INVERSE (21, 5, RA4, RA3, RA2, RA1, RA0, RA3, RA0, RA1, RA2, RA4,
RB4, RB3, RB2, RB1, RB0, RB3, RB0, RB1, RB2, RB4);
ROUND_INVERSE (20, 4, RA3, RA0, RA1, RA2, RA4, RA3, RA2, RA1, RA4, RA0,
RB3, RB0, RB1, RB2, RB4, RB3, RB2, RB1, RB4, RB0);
ROUND_INVERSE (19, 3, RA3, RA2, RA1, RA4, RA0, RA1, RA2, RA4, RA3, RA0,
RB3, RB2, RB1, RB4, RB0, RB1, RB2, RB4, RB3, RB0);
ROUND_INVERSE (18, 2, RA1, RA2, RA4, RA3, RA0, RA2, RA0, RA4, RA3, RA1,
RB1, RB2, RB4, RB3, RB0, RB2, RB0, RB4, RB3, RB1);
ROUND_INVERSE (17, 1, RA2, RA0, RA4, RA3, RA1, RA1, RA2, RA3, RA4, RA0,
RB2, RB0, RB4, RB3, RB1, RB1, RB2, RB3, RB4, RB0);
ROUND_INVERSE (16, 0, RA1, RA2, RA3, RA4, RA0, RA1, RA0, RA2, RA4, RA3,
RB1, RB2, RB3, RB4, RB0, RB1, RB0, RB2, RB4, RB3);
ROUND_INVERSE (15, 7, RA1, RA0, RA2, RA4, RA3, RA4, RA1, RA0, RA3, RA2,
RB1, RB0, RB2, RB4, RB3, RB4, RB1, RB0, RB3, RB2);
ROUND_INVERSE (14, 6, RA4, RA1, RA0, RA3, RA2, RA1, RA0, RA2, RA3, RA4,
RB4, RB1, RB0, RB3, RB2, RB1, RB0, RB2, RB3, RB4);
ROUND_INVERSE (13, 5, RA1, RA0, RA2, RA3, RA4, RA0, RA4, RA3, RA2, RA1,
RB1, RB0, RB2, RB3, RB4, RB0, RB4, RB3, RB2, RB1);
ROUND_INVERSE (12, 4, RA0, RA4, RA3, RA2, RA1, RA0, RA2, RA3, RA1, RA4,
RB0, RB4, RB3, RB2, RB1, RB0, RB2, RB3, RB1, RB4);
ROUND_INVERSE (11, 3, RA0, RA2, RA3, RA1, RA4, RA3, RA2, RA1, RA0, RA4,
RB0, RB2, RB3, RB1, RB4, RB3, RB2, RB1, RB0, RB4);
ROUND_INVERSE (10, 2, RA3, RA2, RA1, RA0, RA4, RA2, RA4, RA1, RA0, RA3,
RB3, RB2, RB1, RB0, RB4, RB2, RB4, RB1, RB0, RB3);
ROUND_INVERSE (9, 1, RA2, RA4, RA1, RA0, RA3, RA3, RA2, RA0, RA1, RA4,
RB2, RB4, RB1, RB0, RB3, RB3, RB2, RB0, RB1, RB4);
ROUND_INVERSE (8, 0, RA3, RA2, RA0, RA1, RA4, RA3, RA4, RA2, RA1, RA0,
RB3, RB2, RB0, RB1, RB4, RB3, RB4, RB2, RB1, RB0);
ROUND_INVERSE (7, 7, RA3, RA4, RA2, RA1, RA0, RA1, RA3, RA4, RA0, RA2,
RB3, RB4, RB2, RB1, RB0, RB1, RB3, RB4, RB0, RB2);
ROUND_INVERSE (6, 6, RA1, RA3, RA4, RA0, RA2, RA3, RA4, RA2, RA0, RA1,
RB1, RB3, RB4, RB0, RB2, RB3, RB4, RB2, RB0, RB1);
ROUND_INVERSE (5, 5, RA3, RA4, RA2, RA0, RA1, RA4, RA1, RA0, RA2, RA3,
RB3, RB4, RB2, RB0, RB1, RB4, RB1, RB0, RB2, RB3);
ROUND_INVERSE (4, 4, RA4, RA1, RA0, RA2, RA3, RA4, RA2, RA0, RA3, RA1,
RB4, RB1, RB0, RB2, RB3, RB4, RB2, RB0, RB3, RB1);
ROUND_INVERSE (3, 3, RA4, RA2, RA0, RA3, RA1, RA0, RA2, RA3, RA4, RA1,
RB4, RB2, RB0, RB3, RB1, RB0, RB2, RB3, RB4, RB1);
ROUND_INVERSE (2, 2, RA0, RA2, RA3, RA4, RA1, RA2, RA1, RA3, RA4, RA0,
RB0, RB2, RB3, RB4, RB1, RB2, RB1, RB3, RB4, RB0);
ROUND_INVERSE (1, 1, RA2, RA1, RA3, RA4, RA0, RA0, RA2, RA4, RA3, RA1,
RB2, RB1, RB3, RB4, RB0, RB0, RB2, RB4, RB3, RB1);
ROUND_INVERSE (0, 0, RA0, RA2, RA4, RA3, RA1, RA0, RA1, RA2, RA3, RA4,
RB0, RB2, RB4, RB3, RB1, RB0, RB1, RB2, RB3, RB4);
transpose_4x4(RA0, RA1, RA2, RA3);
transpose_4x4(RB0, RB1, RB2, RB3);
bx lr;
.size __serpent_dec_blk8,.-__serpent_dec_blk8;
.align 3
.globl _gcry_serpent_neon_blk8
.type _gcry_serpent_neon_blk8,%function;
_gcry_serpent_neon_blk8:
/* input:
 *   r0: ctx, CTX
 *   r1: dst (8 blocks)
 *   r2: src (8 blocks)
 *   r3: encrypt
 */
push {lr};
vpush {RA4-RB2};
cmp r3, #0;
vld1.8 {RA0, RA1}, [r2]!;
vld1.8 {RA2, RA3}, [r2]!;
vld1.8 {RB0, RB1}, [r2]!;
vld1.8 {RB2, RB3}, [r2]!;
beq .Lblk8_dec;
bl __serpent_enc_blk8;
vst1.8 {RA4}, [r1]!;
vst1.8 {RA1, RA2}, [r1]!;
vst1.8 {RA0}, [r1]!;
vst1.8 {RB4}, [r1]!;
vst1.8 {RB1, RB2}, [r1]!;
vst1.8 {RB0}, [r1]!;
b .Lblk8_end;
.Lblk8_dec:
bl __serpent_dec_blk8;
vst1.8 {RA0, RA1}, [r1]!;
vst1.8 {RA2, RA3}, [r1]!;
vst1.8 {RB0, RB1}, [r1]!;
vst1.8 {RB2, RB3}, [r1]!;
.Lblk8_end:
/* clear the used registers */
veor RA0, RA0;
veor RA1, RA1;
veor RA2, RA2;
veor RA3, RA3;
vpop {RA4-RB2};
veor RB3, RB3;
veor RB4, RB4;
veor RT0, RT0;
veor RT1, RT1;
veor RT2, RT2;
veor RT3, RT3;
pop {pc};
.size _gcry_serpent_neon_blk8,.-_gcry_serpent_neon_blk8;
.align 3
.globl _gcry_serpent_neon_ctr_enc
.type _gcry_serpent_neon_ctr_enc,%function;
_gcry_serpent_neon_ctr_enc:
/* input:
 *   r0: ctx, CTX
 *   r1: dst (8 blocks)
 *   r2: src (8 blocks)
 *   r3: iv
 */
vmov.u8 RT1d0, #0xff; /* u64: -1 */
push {r4,lr};
vadd.u64 RT2d0, RT1d0, RT1d0; /* u64: -2 */
vpush {RA4-RB2};
/* load IV and byteswap */
vld1.8 {RA0}, [r3];
vrev64.u8 RT0, RA0; /* be => le */
ldr r4, [r3, #8];
/* construct IVs */
vsub.u64 RA2d1, RT0d1, RT2d0; /* +2 */
vsub.u64 RA1d1, RT0d1, RT1d0; /* +1 */
cmp r4, #-1;
vsub.u64 RB0d1, RA2d1, RT2d0; /* +4 */
vsub.u64 RA3d1, RA2d1, RT1d0; /* +3 */
ldr r4, [r3, #12];
vsub.u64 RB2d1, RB0d1, RT2d0; /* +6 */
vsub.u64 RB1d1, RB0d1, RT1d0; /* +5 */
vsub.u64 RT2d1, RB2d1, RT2d0; /* +8 */
vsub.u64 RB3d1, RB2d1, RT1d0; /* +7 */
vmov RA1d0, RT0d0;
vmov RA2d0, RT0d0;
vmov RA3d0, RT0d0;
vmov RB0d0, RT0d0;
rev r4, r4;
vmov RB1d0, RT0d0;
vmov RB2d0, RT0d0;
vmov RB3d0, RT0d0;
vmov RT2d0, RT0d0;
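/* The CTR block is big-endian, so the low quadword was byteswapped (vrev64)
 * before arithmetic. RT1d0/RT2d0 hold the u64 constants -1/-2, and since
 * x - (-n) == x + n, plain vsub.u64 yields counter+1 .. counter+8; the
 * byteswapped high quadword (RT0d0) is copied unchanged, with any carry out
 * of the low 64 bits patched up below. */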
/* check need for handling 64-bit overflow and carry */
beq .Ldo_ctr_carry;
.Lctr_carry_done:
/* le => be */
vrev64.u8 RA1, RA1;
vrev64.u8 RA2, RA2;
vrev64.u8 RA3, RA3;
vrev64.u8 RB0, RB0;
vrev64.u8 RT2, RT2;
vrev64.u8 RB1, RB1;
vrev64.u8 RB2, RB2;
vrev64.u8 RB3, RB3;
/* store new IV */
vst1.8 {RT2}, [r3];
bl __serpent_enc_blk8;
vld1.8 {RT0, RT1}, [r2]!;
vld1.8 {RT2, RT3}, [r2]!;
veor RA4, RA4, RT0;
veor RA1, RA1, RT1;
vld1.8 {RT0, RT1}, [r2]!;
veor RA2, RA2, RT2;
veor RA0, RA0, RT3;
vld1.8 {RT2, RT3}, [r2]!;
veor RB4, RB4, RT0;
veor RT0, RT0;
veor RB1, RB1, RT1;
veor RT1, RT1;
veor RB2, RB2, RT2;
veor RT2, RT2;
veor RB0, RB0, RT3;
veor RT3, RT3;
vst1.8 {RA4}, [r1]!;
vst1.8 {RA1}, [r1]!;
veor RA1, RA1;
vst1.8 {RA2}, [r1]!;
veor RA2, RA2;
vst1.8 {RA0}, [r1]!;
veor RA0, RA0;
vst1.8 {RB4}, [r1]!;
veor RB4, RB4;
vst1.8 {RB1}, [r1]!;
vst1.8 {RB2}, [r1]!;
vst1.8 {RB0}, [r1]!;
vpop {RA4-RB2};
/* clear the used registers */
veor RA3, RA3;
veor RB3, RB3;
pop {r4,pc};
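/* Carry handling for low-64-bit counter wraparound: this path is taken only
 * when bytes 8..11 of the counter are all ones, and r4 now holds the
 * byteswapped low word, so comparing it against -1..-8 tells how many of the
 * eight successor counters overflow 64 bits. The labels fall through on
 * purpose: entering the chain at a given point adds one (vsub of the -1
 * constant RT1d0) to the high quadword of that counter and of every later
 * one. */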
.Ldo_ctr_carry:
cmp r4, #-8;
blo .Lctr_carry_done;
beq .Lcarry_RT2;
cmp r4, #-6;
blo .Lcarry_RB3;
beq .Lcarry_RB2;
cmp r4, #-4;
blo .Lcarry_RB1;
beq .Lcarry_RB0;
cmp r4, #-2;
blo .Lcarry_RA3;
beq .Lcarry_RA2;
vsub.u64 RA1d0, RT1d0;
.Lcarry_RA2:
vsub.u64 RA2d0, RT1d0;
.Lcarry_RA3:
vsub.u64 RA3d0, RT1d0;
.Lcarry_RB0:
vsub.u64 RB0d0, RT1d0;
.Lcarry_RB1:
vsub.u64 RB1d0, RT1d0;
.Lcarry_RB2:
vsub.u64 RB2d0, RT1d0;
.Lcarry_RB3:
vsub.u64 RB3d0, RT1d0;
.Lcarry_RT2:
vsub.u64 RT2d0, RT1d0;
b .Lctr_carry_done;
.size _gcry_serpent_neon_ctr_enc,.-_gcry_serpent_neon_ctr_enc;
.align 3
.globl _gcry_serpent_neon_cfb_dec
.type _gcry_serpent_neon_cfb_dec,%function;
_gcry_serpent_neon_cfb_dec:
/* input:
 *   r0: ctx, CTX
 *   r1: dst (8 blocks)
 *   r2: src (8 blocks)
 *   r3: iv
 */
push {lr};
vpush {RA4-RB2};
/* Load input */
vld1.8 {RA0}, [r3];
vld1.8 {RA1, RA2}, [r2]!;
vld1.8 {RA3}, [r2]!;
vld1.8 {RB0}, [r2]!;
vld1.8 {RB1, RB2}, [r2]!;
vld1.8 {RB3}, [r2]!;
/* Update IV */
vld1.8 {RT0}, [r2]!;
vst1.8 {RT0}, [r3];
mov r3, lr;
sub r2, r2, #(8*16);
bl __serpent_enc_blk8;
vld1.8 {RT0, RT1}, [r2]!;
vld1.8 {RT2, RT3}, [r2]!;
veor RA4, RA4, RT0;
veor RA1, RA1, RT1;
vld1.8 {RT0, RT1}, [r2]!;
veor RA2, RA2, RT2;
veor RA0, RA0, RT3;
vld1.8 {RT2, RT3}, [r2]!;
veor RB4, RB4, RT0;
veor RT0, RT0;
veor RB1, RB1, RT1;
veor RT1, RT1;
veor RB2, RB2, RT2;
veor RT2, RT2;
veor RB0, RB0, RT3;
veor RT3, RT3;
vst1.8 {RA4}, [r1]!;
vst1.8 {RA1}, [r1]!;
veor RA1, RA1;
vst1.8 {RA2}, [r1]!;
veor RA2, RA2;
vst1.8 {RA0}, [r1]!;
veor RA0, RA0;
vst1.8 {RB4}, [r1]!;
veor RB4, RB4;
vst1.8 {RB1}, [r1]!;
vst1.8 {RB2}, [r1]!;
vst1.8 {RB0}, [r1]!;
vpop {RA4-RB2};
/* clear the used registers */
veor RA3, RA3;
veor RB3, RB3;
pop {pc};
.size _gcry_serpent_neon_cfb_dec,.-_gcry_serpent_neon_cfb_dec;
.align 3
.globl _gcry_serpent_neon_cbc_dec
.type _gcry_serpent_neon_cbc_dec,%function;
_gcry_serpent_neon_cbc_dec:
/* input:
 *   r0: ctx, CTX
 *   r1: dst (8 blocks)
 *   r2: src (8 blocks)
 *   r3: iv
 */
push {lr};
vpush {RA4-RB2};
vld1.8 {RA0, RA1}, [r2]!;
vld1.8 {RA2, RA3}, [r2]!;
vld1.8 {RB0, RB1}, [r2]!;
vld1.8 {RB2, RB3}, [r2]!;
sub r2, r2, #(8*16);
bl __serpent_dec_blk8;
vld1.8 {RB4}, [r3];
vld1.8 {RT0, RT1}, [r2]!;
vld1.8 {RT2, RT3}, [r2]!;
veor RA0, RA0, RB4;
veor RA1, RA1, RT0;
veor RA2, RA2, RT1;
vld1.8 {RT0, RT1}, [r2]!;
veor RA3, RA3, RT2;
veor RB0, RB0, RT3;
vld1.8 {RT2, RT3}, [r2]!;
veor RB1, RB1, RT0;
veor RT0, RT0;
veor RB2, RB2, RT1;
veor RT1, RT1;
veor RB3, RB3, RT2;
veor RT2, RT2;
vst1.8 {RT3}, [r3]; /* store new IV */
veor RT3, RT3;
vst1.8 {RA0, RA1}, [r1]!;
veor RA0, RA0;
veor RA1, RA1;
vst1.8 {RA2, RA3}, [r1]!;
veor RA2, RA2;
vst1.8 {RB0, RB1}, [r1]!;
veor RA3, RA3;
vst1.8 {RB2, RB3}, [r1]!;
veor RB3, RB3;
vpop {RA4-RB2};
/* clear the used registers */
veor RB4, RB4;
pop {pc};
.size _gcry_serpent_neon_cbc_dec,.-_gcry_serpent_neon_cbc_dec;
.align 3
.globl _gcry_serpent_neon_ocb_enc
.type _gcry_serpent_neon_ocb_enc,%function;
_gcry_serpent_neon_ocb_enc:
/* input:
 *   r0 : ctx, CTX
 *   r1 : dst (8 blocks)
 *   r2 : src (8 blocks)
 *   r3 : offset
 *   sp+0: checksum
 *   sp+4: L pointers (void *L[8])
 */
push {r4-r11, ip, lr};
add ip, sp, #(10*4);
vpush {RA4-RB2};
ldm ip, {r4, lr};
vld1.8 {RT0}, [r3];
vld1.8 {RT1}, [r4];
/* Load L pointers */
ldm lr!, {r5, r6, r7, r8};
ldm lr, {r9, r10, r11, ip};
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Checksum_i = Checksum_{i-1} xor P_i */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
vld1.8 {RA0, RA1}, [r2]!;
vld1.8 {RA2, RA3}, [r2]!;
vld1.8 {RB0, RB1}, [r2]!;
vld1.8 {RB2, RB3}, [r2];
#define OCB_INPUT(lreg, vreg) \
vld1.8 {RT3}, [lreg]; \
veor RT0, RT3; \
veor RT1, vreg; \
veor vreg, RT0; \
vst1.8 {RT0}, [r1]!;
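/* Per block: RT0 accumulates the running offset (Offset_i ^= L_{ntz(i)}),
 * RT1 accumulates the plaintext checksum, and the block register becomes
 * P_i xor Offset_i, ready for encryption. Each Offset_i is parked in dst
 * and re-XORed into the cipher output after __serpent_enc_blk8 returns. */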
OCB_INPUT(r5, RA0);
OCB_INPUT(r6, RA1);
OCB_INPUT(r7, RA2);
OCB_INPUT(r8, RA3);
OCB_INPUT(r9, RB0);
OCB_INPUT(r10, RB1);
OCB_INPUT(r11, RB2);
OCB_INPUT(ip, RB3);
#undef OCB_INPUT
sub r1, r1, #(8*16);
vst1.8 {RT0}, [r3];
vst1.8 {RT1}, [r4];
mov r2, r1;
bl __serpent_enc_blk8;
vld1.8 {RT0, RT1}, [r1]!;
veor RT0, RA4, RT0;
veor RT1, RA1, RT1;
vld1.8 {RT2, RT3}, [r1]!;
vst1.8 {RT0, RT1}, [r2]!;
veor RT2, RA2, RT2;
veor RT3, RA0, RT3;
vld1.8 {RT0, RT1}, [r1]!;
vst1.8 {RT2, RT3}, [r2]!;
veor RT0, RB4, RT0;
veor RT1, RB1, RT1;
vld1.8 {RT2, RT3}, [r1]!;
vst1.8 {RT0, RT1}, [r2]!;
veor RT2, RB2, RT2;
veor RT3, RB0, RT3;
vst1.8 {RT2, RT3}, [r2]!;
vpop {RA4-RB2};
/* clear the used registers */
veor RA3, RA3;
veor RB3, RB3;
pop {r4-r11, ip, pc};
.size _gcry_serpent_neon_ocb_enc,.-_gcry_serpent_neon_ocb_enc;
.align 3
.globl _gcry_serpent_neon_ocb_dec
.type _gcry_serpent_neon_ocb_dec,%function;
_gcry_serpent_neon_ocb_dec:
/* input:
 *   r0 : ctx, CTX
 *   r1 : dst (8 blocks)
 *   r2 : src (8 blocks)
 *   r3 : offset
 *   sp+0: checksum
 *   sp+4: L pointers (void *L[8])
 */
push {r4-r11, ip, lr};
add ip, sp, #(10*4);
vpush {RA4-RB2};
ldm ip, {r4, lr};
vld1.8 {RT0}, [r3];
/* Load L pointers */
ldm lr!, {r5, r6, r7, r8};
ldm lr, {r9, r10, r11, ip};
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
vld1.8 {RA0, RA1}, [r2]!;
vld1.8 {RA2, RA3}, [r2]!;
vld1.8 {RB0, RB1}, [r2]!;
vld1.8 {RB2, RB3}, [r2];
#define OCB_INPUT(lreg, vreg) \
vld1.8 {RT3}, [lreg]; \
veor RT0, RT3; \
veor vreg, RT0; \
vst1.8 {RT0}, [r1]!;
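/* Same offset chaining as in ocb_enc, but with no checksum term: for
 * decryption the checksum is taken over the recovered plaintext, so it is
 * accumulated only after __serpent_dec_blk8 returns. */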
OCB_INPUT(r5, RA0);
OCB_INPUT(r6, RA1);
OCB_INPUT(r7, RA2);
OCB_INPUT(r8, RA3);
OCB_INPUT(r9, RB0);
OCB_INPUT(r10, RB1);
OCB_INPUT(r11, RB2);
OCB_INPUT(ip, RB3);
#undef OCB_INPUT
sub r1, r1, #(8*16);
vst1.8 {RT0}, [r3];
mov r2, r1;
bl __serpent_dec_blk8;
/* Checksum_i = Checksum_{i-1} xor P_i */
vld1.8 {RA4}, [r4];
vld1.8 {RT0, RT1}, [r1]!;
veor RA0, RA0, RT0;
veor RA1, RA1, RT1;
vld1.8 {RT2, RT3}, [r1]!;
veor RA4, RA4, RA0;
vst1.8 {RA0, RA1}, [r2]!;
veor RA4, RA4, RA1;
veor RA2, RA2, RT2;
veor RA3, RA3, RT3;
vld1.8 {RT0, RT1}, [r1]!;
veor RA4, RA4, RA2;
vst1.8 {RA2, RA3}, [r2]!;
veor RA4, RA4, RA3;
veor RB0, RB0, RT0;
veor RB1, RB1, RT1;
vld1.8 {RT2, RT3}, [r1]!;
veor RA4, RA4, RB0;
vst1.8 {RB0, RB1}, [r2]!;
veor RA4, RA4, RB1;
veor RB2, RB2, RT2;
veor RB3, RB3, RT3;
veor RA4, RA4, RB2;
vst1.8 {RB2, RB3}, [r2]!;
veor RA4, RA4, RB3;
vst1.8 {RA4}, [r4];
vpop {RA4-RB2};
/* clear the used registers */
veor RB4, RB4;
pop {r4-r11, ip, pc};
.size _gcry_serpent_neon_ocb_dec,.-_gcry_serpent_neon_ocb_dec;
.align 3
.globl _gcry_serpent_neon_ocb_auth
.type _gcry_serpent_neon_ocb_auth,%function;
_gcry_serpent_neon_ocb_auth:
/* input:
 *   r0 : ctx, CTX
 *   r1 : abuf (8 blocks)
 *   r2 : offset
 *   r3 : checksum
 *   sp+0: L pointers (void *L[8])
 */
push {r5-r11, ip, lr};
ldr lr, [sp, #(9*4)];
vpush {RA4-RB2};
vld1.8 {RT0}, [r2];
/* Load L pointers */
ldm lr!, {r5, r6, r7, r8};
ldm lr, {r9, r10, r11, ip};
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
vld1.8 {RA0, RA1}, [r1]!;
vld1.8 {RA2, RA3}, [r1]!;
vld1.8 {RB0, RB1}, [r1]!;
vld1.8 {RB2, RB3}, [r1];
#define OCB_INPUT(lreg, vreg) \
vld1.8 {RT3}, [lreg]; \
veor RT0, RT3; \
veor vreg, RT0;
OCB_INPUT(r5, RA0);
OCB_INPUT(r6, RA1);
OCB_INPUT(r7, RA2);
OCB_INPUT(r8, RA3);
OCB_INPUT(r9, RB0);
OCB_INPUT(r10, RB1);
OCB_INPUT(r11, RB2);
OCB_INPUT(ip, RB3);
#undef OCB_INPUT
vst1.8 {RT0}, [r2];
bl __serpent_enc_blk8;
/* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
vld1.8 {RT0}, [r3];
veor RA4, RB4;
veor RA1, RB1;
veor RA2, RB2;
veor RA0, RB0;
veor RA2, RT0;
veor RA1, RA4;
veor RA0, RA2;
veor RA0, RA1;
vst1.8 {RA0}, [r3];
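/* The eight ENCIPHER outputs (RA4, RA1, RA2, RA0 and their RB counterparts)
 * were XOR-folded pairwise above, combined with the previous Sum loaded into
 * RT0, and the new Sum written back through r3. */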
vpop {RA4-RB2};
/* clear the used registers */
veor RA3, RA3;
veor RB3, RB3;
pop {r5-r11, ip, pc};
.size _gcry_serpent_neon_ocb_auth,.-_gcry_serpent_neon_ocb_auth;
#endif