From: Richard Henderson
Subject: [PATCH 2/2] tests/tcg/s390x: Import linux tools/testing/crypto/chacha20-s390
Date: Thu, 18 Jan 2024 08:36:46 +1100

Modify and simplify the driver, as we're really only interested
in correctness of translation of chacha-vx.S.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
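A note for reviewers less familiar with ChaCha: each vaf/vx/verllf group
in chacha-vx.S is one step of the standard ChaCha quarter-round, applied
to all four 32-bit lanes of a vector register at once.  The scalar
equivalent, which chacha_permute() in the new driver simply unrolls, looks
roughly like this (a sketch for reference only; the helper names are
illustrative and not part of the patch):

    #include <stdint.h>

    static inline uint32_t qr_rol32(uint32_t v, int n)
    {
        return (v << n) | (v >> (32 - n));
    }

    static inline void quarter_round(uint32_t *a, uint32_t *b,
                                     uint32_t *c, uint32_t *d)
    {
        *a += *b; *d = qr_rol32(*d ^ *a, 16);   /* vaf ; vx ; verllf ...,16 */
        *c += *d; *b = qr_rol32(*b ^ *c, 12);   /* vaf ; vx ; verllf ...,12 */
        *a += *b; *d = qr_rol32(*d ^ *a,  8);   /* vaf ; vx ; verllf ...,8  */
        *c += *d; *b = qr_rol32(*b ^ *c,  7);   /* vaf ; vx ; verllf ...,7  */
    }

The test below checks that chacha20_vx produces byte-identical output to
this generic path for a range of buffer sizes.
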
 tests/tcg/s390x/chacha.c        | 341 ++++++++++++
 tests/tcg/s390x/Makefile.target |   4 +
 tests/tcg/s390x/chacha-vx.S     | 914 ++++++++++++++++++++++++++++++++
 3 files changed, 1259 insertions(+)
 create mode 100644 tests/tcg/s390x/chacha.c
 create mode 100644 tests/tcg/s390x/chacha-vx.S

diff --git a/tests/tcg/s390x/chacha.c b/tests/tcg/s390x/chacha.c
new file mode 100644
index 0000000000..ca9e4c1959
--- /dev/null
+++ b/tests/tcg/s390x/chacha.c
@@ -0,0 +1,341 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Derived from linux kernel sources:
+ *   ./include/crypto/chacha.h
+ *   ./crypto/chacha_generic.c
+ *   ./arch/s390/crypto/chacha-glue.c
+ *   ./tools/testing/crypto/chacha20-s390/test-cipher.c
+ *   ./tools/testing/crypto/chacha20-s390/run-tests.sh
+ */
+
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/random.h>
+
+typedef uint8_t u8;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+static unsigned data_size;
+static bool debug;
+
+#define CHACHA_IV_SIZE          16
+#define CHACHA_KEY_SIZE         32
+#define CHACHA_BLOCK_SIZE       64
+#define CHACHAPOLY_IV_SIZE      12
+#define CHACHA_STATE_WORDS      (CHACHA_BLOCK_SIZE / sizeof(u32))
+
+static u32 rol32(u32 val, u32 sh)
+{
+    return (val << (sh & 31)) | (val >> (-sh & 31));
+}
+
+static u32 get_unaligned_le32(const void *ptr)
+{
+    u32 val;
+    memcpy(&val, ptr, 4);
+    return __builtin_bswap32(val);
+}
+
+static void put_unaligned_le32(u32 val, void *ptr)
+{
+    val = __builtin_bswap32(val);
+    memcpy(ptr, &val, 4);
+}
+
+static void chacha_permute(u32 *x, int nrounds)
+{
+    for (int i = 0; i < nrounds; i += 2) {
+        x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],  16);
+        x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],  16);
+        x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],  16);
+        x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],  16);
+
+        x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],  12);
+        x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],  12);
+        x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10], 12);
+        x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11], 12);
+
+        x[0]  += x[4];    x[12] = rol32(x[12] ^ x[0],   8);
+        x[1]  += x[5];    x[13] = rol32(x[13] ^ x[1],   8);
+        x[2]  += x[6];    x[14] = rol32(x[14] ^ x[2],   8);
+        x[3]  += x[7];    x[15] = rol32(x[15] ^ x[3],   8);
+
+        x[8]  += x[12];   x[4]  = rol32(x[4]  ^ x[8],   7);
+        x[9]  += x[13];   x[5]  = rol32(x[5]  ^ x[9],   7);
+        x[10] += x[14];   x[6]  = rol32(x[6]  ^ x[10],  7);
+        x[11] += x[15];   x[7]  = rol32(x[7]  ^ x[11],  7);
+
+        x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],  16);
+        x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],  16);
+        x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],  16);
+        x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],  16);
+
+        x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10], 12);
+        x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11], 12);
+        x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],  12);
+        x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],  12);
+
+        x[0]  += x[5];    x[15] = rol32(x[15] ^ x[0],   8);
+        x[1]  += x[6];    x[12] = rol32(x[12] ^ x[1],   8);
+        x[2]  += x[7];    x[13] = rol32(x[13] ^ x[2],   8);
+        x[3]  += x[4];    x[14] = rol32(x[14] ^ x[3],   8);
+
+        x[10] += x[15];   x[5]  = rol32(x[5]  ^ x[10],  7);
+        x[11] += x[12];   x[6]  = rol32(x[6]  ^ x[11],  7);
+        x[8]  += x[13];   x[7]  = rol32(x[7]  ^ x[8],   7);
+        x[9]  += x[14];   x[4]  = rol32(x[4]  ^ x[9],   7);
+    }
+}
+
+static void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
+{
+    u32 x[16];
+
+    memcpy(x, state, 64);
+    chacha_permute(x, nrounds);
+
+    for (int i = 0; i < 16; i++) {
+        put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]);
+    }
+    state[12]++;
+}
+
+static void crypto_xor_cpy(u8 *dst, const u8 *src1,
+                           const u8 *src2, unsigned len)
+{
+    while (len--) {
+        *dst++ = *src1++ ^ *src2++;
+    }
+}
+
+static void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
+                                 unsigned int bytes, int nrounds)
+{
+    u8 stream[CHACHA_BLOCK_SIZE];
+
+    while (bytes >= CHACHA_BLOCK_SIZE) {
+        chacha_block_generic(state, stream, nrounds);
+        crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE);
+        bytes -= CHACHA_BLOCK_SIZE;
+        dst += CHACHA_BLOCK_SIZE;
+        src += CHACHA_BLOCK_SIZE;
+    }
+    if (bytes) {
+        chacha_block_generic(state, stream, nrounds);
+        crypto_xor_cpy(dst, src, stream, bytes);
+    }
+}
+
+enum chacha_constants { /* expand 32-byte k */
+    CHACHA_CONSTANT_EXPA = 0x61707865U,
+    CHACHA_CONSTANT_ND_3 = 0x3320646eU,
+    CHACHA_CONSTANT_2_BY = 0x79622d32U,
+    CHACHA_CONSTANT_TE_K = 0x6b206574U
+};
+
+static void chacha_init_generic(u32 *state, const u32 *key, const u8 *iv)
+{
+    state[0]  = CHACHA_CONSTANT_EXPA;
+    state[1]  = CHACHA_CONSTANT_ND_3;
+    state[2]  = CHACHA_CONSTANT_2_BY;
+    state[3]  = CHACHA_CONSTANT_TE_K;
+    state[4]  = key[0];
+    state[5]  = key[1];
+    state[6]  = key[2];
+    state[7]  = key[3];
+    state[8]  = key[4];
+    state[9]  = key[5];
+    state[10] = key[6];
+    state[11] = key[7];
+    state[12] = get_unaligned_le32(iv +  0);
+    state[13] = get_unaligned_le32(iv +  4);
+    state[14] = get_unaligned_le32(iv +  8);
+    state[15] = get_unaligned_le32(iv + 12);
+}
+
+void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
+                 const u32 *counter);
+
+static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
+                                unsigned int nbytes, const u32 *key,
+                                u32 *counter)
+{
+    chacha20_vx(dst, src, nbytes, key, counter);
+    *counter += (nbytes + CHACHA_BLOCK_SIZE - 1) / CHACHA_BLOCK_SIZE;
+}
+
+static void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
+                              unsigned int bytes, int nrounds)
+{
+    /*
+     * The s390 chacha20 implementation has 20 rounds hard-coded;
+     * it cannot handle a block of data or less, but otherwise it
+     * can handle data of arbitrary size.
+     */
+    if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20) {
+        chacha_crypt_generic(state, dst, src, bytes, nrounds);
+    } else {
+        chacha20_crypt_s390(state, dst, src, bytes, &state[4], &state[12]);
+    }
+}
+
+static void print_hex_dump(const char *prefix_str, const void *buf, int len)
+{
+    for (int i = 0; i < len; i += 16) {
+        printf("%s%.8x: ", prefix_str, i);
+        for (int j = 0; j < 16; ++j) {
+            printf("%02x%c", *(u8 *)(buf + i + j), j == 15 ? '\n' : ' ');
+        }
+    }
+}
+
+/* Perform cipher operations with the chacha lib */
+static int test_lib_chacha(u8 *revert, u8 *cipher, u8 *plain, bool generic)
+{
+    u32 chacha_state[CHACHA_STATE_WORDS];
+    u8 iv[16], key[32];
+
+    memset(key, 'X', sizeof(key));
+    memset(iv, 'I', sizeof(iv));
+
+    if (debug) {
+        print_hex_dump("key: ", key, 32);
+        print_hex_dump("iv:  ", iv, 16);
+    }
+
+    /* Encrypt */
+    chacha_init_generic(chacha_state, (u32*)key, iv);
+
+    if (generic) {
+        chacha_crypt_generic(chacha_state, cipher, plain, data_size, 20);
+    } else {
+        chacha_crypt_arch(chacha_state, cipher, plain, data_size, 20);
+    }
+
+    if (debug) {
+        print_hex_dump("encr:", cipher,
+                       (data_size > 64 ? 64 : data_size));
+    }
+
+    /* Decrypt */
+    chacha_init_generic(chacha_state, (u32 *)key, iv);
+
+    if (generic) {
+        chacha_crypt_generic(chacha_state, revert, cipher, data_size, 20);
+    } else {
+        chacha_crypt_arch(chacha_state, revert, cipher, data_size, 20);
+    }
+
+    if (debug) {
+        print_hex_dump("decr:", revert,
+                       (data_size > 64 ? 64 : data_size));
+    }
+    return 0;
+}
+
+static int chacha_s390_test_init(void)
+{
+    u8 *plain = NULL, *revert = NULL;
+    u8 *cipher_generic = NULL, *cipher_s390 = NULL;
+    int ret = -1;
+
+    printf("s390 ChaCha20 test module: size=%d debug=%d\n",
+           data_size, debug);
+
+    /* Allocate and fill buffers */
+    plain = malloc(data_size);
+    if (!plain) {
+        printf("could not allocate plain buffer\n");
+        ret = -2;
+        goto out;
+    }
+
+    memset(plain, 'a', data_size);
+    for (unsigned i = 0, n = data_size > 256 ? 256 : data_size; i < n; ) {
+        ssize_t t = getrandom(plain + i, n - i, 0);
+        if (t < 0) {
+            break;
+        }
+        i += t;
+    }
+
+    cipher_generic = calloc(1, data_size);
+    if (!cipher_generic) {
+        printf("could not allocate cipher_generic buffer\n");
+        ret = -2;
+        goto out;
+    }
+
+    cipher_s390 = calloc(1, data_size);
+    if (!cipher_s390) {
+        printf("could not allocate cipher_s390 buffer\n");
+        ret = -2;
+        goto out;
+    }
+
+    revert = calloc(1, data_size);
+    if (!revert) {
+        printf("could not allocate revert buffer\n");
+        ret = -2;
+        goto out;
+    }
+
+    if (debug) {
+        print_hex_dump("src: ", plain,
+                       (data_size > 64 ? 64 : data_size));
+    }
+
+    /* Use chacha20 lib */
+    test_lib_chacha(revert, cipher_generic, plain, true);
+    if (memcmp(plain, revert, data_size)) {
+        printf("generic en/decryption check FAILED\n");
+        ret = -2;
+        goto out;
+    }
+    printf("generic en/decryption check OK\n");
+
+    test_lib_chacha(revert, cipher_s390, plain, false);
+    if (memcmp(plain, revert, data_size)) {
+        printf("lib en/decryption check FAILED\n");
+        ret = -2;
+        goto out;
+    }
+    printf("lib en/decryption check OK\n");
+
+    if (memcmp(cipher_generic, cipher_s390, data_size)) {
+        printf("lib vs generic check FAILED\n");
+        ret = -2;
+        goto out;
+    }
+    printf("lib vs generic check OK\n");
+
+    printf("--- chacha20 s390 test end ---\n");
+
+out:
+    free(plain);
+    free(cipher_generic);
+    free(cipher_s390);
+    free(revert);
+    return ret;
+}
+
+int main(int ac, char **av)
+{
+    static const unsigned sizes[] = {
+        63, 64, 65, 127, 128, 129, 511, 512, 513, 4096, 65611,
+        /* too slow for tcg: 6291456, 62914560 */
+    };
+
+    debug = ac >= 2;
+    for (int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) {
+        data_size = sizes[i];
+        if (chacha_s390_test_init() != -1) {
+            return 1;
+        }
+    }
+    return 0;
+}
diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 30994dcf9c..28f19a3176 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -66,9 +66,13 @@ Z13_TESTS+=vcksm
 Z13_TESTS+=vstl
 Z13_TESTS+=vrep
 Z13_TESTS+=precise-smc-user
+Z13_TESTS+=chacha
 $(Z13_TESTS): CFLAGS+=-march=z13 -O2
 TESTS+=$(Z13_TESTS)
 
+chacha: chacha.c chacha-vx.S
+       $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $^ -o $@
+
 ifneq ($(CROSS_CC_HAS_Z14),)
 Z14_TESTS=vfminmax
 vfminmax: LDFLAGS+=-lm
diff --git a/tests/tcg/s390x/chacha-vx.S b/tests/tcg/s390x/chacha-vx.S
new file mode 100644
index 0000000000..eee6275368
--- /dev/null
+++ b/tests/tcg/s390x/chacha-vx.S
@@ -0,0 +1,914 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * For qemu testing, drop <asm/vx-insn-asm.h> and assume assembler support.
+ */
+
+#define SP     %r15
+#define FRAME  (16 * 8 + 4 * 8)
+
+       .data
+       .balign 32
+
+sigma:
+       .long   0x61707865,0x3320646e,0x79622d32,0x6b206574     # endian-neutral
+       .long   1,0,0,0
+       .long   2,0,0,0
+       .long   3,0,0,0
+       .long   0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c     # byte swap
+
+       .long   0,1,2,3
+       .long   0x61707865,0x61707865,0x61707865,0x61707865     # smashed sigma
+       .long   0x3320646e,0x3320646e,0x3320646e,0x3320646e
+       .long   0x79622d32,0x79622d32,0x79622d32,0x79622d32
+       .long   0x6b206574,0x6b206574,0x6b206574,0x6b206574
+
+       .type   sigma, @object
+       .size   sigma, . - sigma
+
+       .previous
+
+       .text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
+#                    const u32 *key, const u32 *counter)
+
+#define        OUT             %r2
+#define        INP             %r3
+#define        LEN             %r4
+#define        KEY             %r5
+#define        COUNTER         %r6
+
+#define BEPERM         %v31
+#define CTR            %v26
+
+#define K0             %v16
+#define K1             %v17
+#define K2             %v18
+#define K3             %v19
+
+#define XA0            %v0
+#define XA1            %v1
+#define XA2            %v2
+#define XA3            %v3
+
+#define XB0            %v4
+#define XB1            %v5
+#define XB2            %v6
+#define XB3            %v7
+
+#define XC0            %v8
+#define XC1            %v9
+#define XC2            %v10
+#define XC3            %v11
+
+#define XD0            %v12
+#define XD1            %v13
+#define XD2            %v14
+#define XD3            %v15
+
+#define XT0            %v27
+#define XT1            %v28
+#define XT2            %v29
+#define XT3            %v30
+
+       .balign 32
+chacha20_vx_4x:
+       stmg    %r6,%r7,6*8(SP)
+
+       larl    %r7,sigma
+       lhi     %r0,10
+       lhi     %r1,0
+
+       vl      K0,0(%r7)               # load sigma
+       vl      K1,0(KEY)               # load key
+       vl      K2,16(KEY)
+       vl      K3,0(COUNTER)           # load counter
+
+       vl      BEPERM,0x40(%r7)
+       vl      CTR,0x50(%r7)
+
+       vlm     XA0,XA3,0x60(%r7),4     # load [smashed] sigma
+
+       vrepf   XB0,K1,0                # smash the key
+       vrepf   XB1,K1,1
+       vrepf   XB2,K1,2
+       vrepf   XB3,K1,3
+
+       vrepf   XD0,K3,0
+       vrepf   XD1,K3,1
+       vrepf   XD2,K3,2
+       vrepf   XD3,K3,3
+       vaf     XD0,XD0,CTR
+
+       vrepf   XC0,K2,0
+       vrepf   XC1,K2,1
+       vrepf   XC2,K2,2
+       vrepf   XC3,K2,3
+
+.Loop_4x:
+       vaf     XA0,XA0,XB0
+       vx      XD0,XD0,XA0
+       verllf  XD0,XD0,16
+
+       vaf     XA1,XA1,XB1
+       vx      XD1,XD1,XA1
+       verllf  XD1,XD1,16
+
+       vaf     XA2,XA2,XB2
+       vx      XD2,XD2,XA2
+       verllf  XD2,XD2,16
+
+       vaf     XA3,XA3,XB3
+       vx      XD3,XD3,XA3
+       verllf  XD3,XD3,16
+
+       vaf     XC0,XC0,XD0
+       vx      XB0,XB0,XC0
+       verllf  XB0,XB0,12
+
+       vaf     XC1,XC1,XD1
+       vx      XB1,XB1,XC1
+       verllf  XB1,XB1,12
+
+       vaf     XC2,XC2,XD2
+       vx      XB2,XB2,XC2
+       verllf  XB2,XB2,12
+
+       vaf     XC3,XC3,XD3
+       vx      XB3,XB3,XC3
+       verllf  XB3,XB3,12
+
+       vaf     XA0,XA0,XB0
+       vx      XD0,XD0,XA0
+       verllf  XD0,XD0,8
+
+       vaf     XA1,XA1,XB1
+       vx      XD1,XD1,XA1
+       verllf  XD1,XD1,8
+
+       vaf     XA2,XA2,XB2
+       vx      XD2,XD2,XA2
+       verllf  XD2,XD2,8
+
+       vaf     XA3,XA3,XB3
+       vx      XD3,XD3,XA3
+       verllf  XD3,XD3,8
+
+       vaf     XC0,XC0,XD0
+       vx      XB0,XB0,XC0
+       verllf  XB0,XB0,7
+
+       vaf     XC1,XC1,XD1
+       vx      XB1,XB1,XC1
+       verllf  XB1,XB1,7
+
+       vaf     XC2,XC2,XD2
+       vx      XB2,XB2,XC2
+       verllf  XB2,XB2,7
+
+       vaf     XC3,XC3,XD3
+       vx      XB3,XB3,XC3
+       verllf  XB3,XB3,7
+
+       vaf     XA0,XA0,XB1
+       vx      XD3,XD3,XA0
+       verllf  XD3,XD3,16
+
+       vaf     XA1,XA1,XB2
+       vx      XD0,XD0,XA1
+       verllf  XD0,XD0,16
+
+       vaf     XA2,XA2,XB3
+       vx      XD1,XD1,XA2
+       verllf  XD1,XD1,16
+
+       vaf     XA3,XA3,XB0
+       vx      XD2,XD2,XA3
+       verllf  XD2,XD2,16
+
+       vaf     XC2,XC2,XD3
+       vx      XB1,XB1,XC2
+       verllf  XB1,XB1,12
+
+       vaf     XC3,XC3,XD0
+       vx      XB2,XB2,XC3
+       verllf  XB2,XB2,12
+
+       vaf     XC0,XC0,XD1
+       vx      XB3,XB3,XC0
+       verllf  XB3,XB3,12
+
+       vaf     XC1,XC1,XD2
+       vx      XB0,XB0,XC1
+       verllf  XB0,XB0,12
+
+       vaf     XA0,XA0,XB1
+       vx      XD3,XD3,XA0
+       verllf  XD3,XD3,8
+
+       vaf     XA1,XA1,XB2
+       vx      XD0,XD0,XA1
+       verllf  XD0,XD0,8
+
+       vaf     XA2,XA2,XB3
+       vx      XD1,XD1,XA2
+       verllf  XD1,XD1,8
+
+       vaf     XA3,XA3,XB0
+       vx      XD2,XD2,XA3
+       verllf  XD2,XD2,8
+
+       vaf     XC2,XC2,XD3
+       vx      XB1,XB1,XC2
+       verllf  XB1,XB1,7
+
+       vaf     XC3,XC3,XD0
+       vx      XB2,XB2,XC3
+       verllf  XB2,XB2,7
+
+       vaf     XC0,XC0,XD1
+       vx      XB3,XB3,XC0
+       verllf  XB3,XB3,7
+
+       vaf     XC1,XC1,XD2
+       vx      XB0,XB0,XC1
+       verllf  XB0,XB0,7
+       brct    %r0,.Loop_4x
+
+       vaf     XD0,XD0,CTR
+
+       vmrhf   XT0,XA0,XA1             # transpose data
+       vmrhf   XT1,XA2,XA3
+       vmrlf   XT2,XA0,XA1
+       vmrlf   XT3,XA2,XA3
+       vpdi    XA0,XT0,XT1,0b0000
+       vpdi    XA1,XT0,XT1,0b0101
+       vpdi    XA2,XT2,XT3,0b0000
+       vpdi    XA3,XT2,XT3,0b0101
+
+       vmrhf   XT0,XB0,XB1
+       vmrhf   XT1,XB2,XB3
+       vmrlf   XT2,XB0,XB1
+       vmrlf   XT3,XB2,XB3
+       vpdi    XB0,XT0,XT1,0b0000
+       vpdi    XB1,XT0,XT1,0b0101
+       vpdi    XB2,XT2,XT3,0b0000
+       vpdi    XB3,XT2,XT3,0b0101
+
+       vmrhf   XT0,XC0,XC1
+       vmrhf   XT1,XC2,XC3
+       vmrlf   XT2,XC0,XC1
+       vmrlf   XT3,XC2,XC3
+       vpdi    XC0,XT0,XT1,0b0000
+       vpdi    XC1,XT0,XT1,0b0101
+       vpdi    XC2,XT2,XT3,0b0000
+       vpdi    XC3,XT2,XT3,0b0101
+
+       vmrhf   XT0,XD0,XD1
+       vmrhf   XT1,XD2,XD3
+       vmrlf   XT2,XD0,XD1
+       vmrlf   XT3,XD2,XD3
+       vpdi    XD0,XT0,XT1,0b0000
+       vpdi    XD1,XT0,XT1,0b0101
+       vpdi    XD2,XT2,XT3,0b0000
+       vpdi    XD3,XT2,XT3,0b0101
+
+       vaf     XA0,XA0,K0
+       vaf     XB0,XB0,K1
+       vaf     XC0,XC0,K2
+       vaf     XD0,XD0,K3
+
+       vperm   XA0,XA0,XA0,BEPERM
+       vperm   XB0,XB0,XB0,BEPERM
+       vperm   XC0,XC0,XC0,BEPERM
+       vperm   XD0,XD0,XD0,BEPERM
+
+       vlm     XT0,XT3,0(INP),0
+
+       vx      XT0,XT0,XA0
+       vx      XT1,XT1,XB0
+       vx      XT2,XT2,XC0
+       vx      XT3,XT3,XD0
+
+       vstm    XT0,XT3,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+
+       vaf     XA0,XA1,K0
+       vaf     XB0,XB1,K1
+       vaf     XC0,XC1,K2
+       vaf     XD0,XD1,K3
+
+       vperm   XA0,XA0,XA0,BEPERM
+       vperm   XB0,XB0,XB0,BEPERM
+       vperm   XC0,XC0,XC0,BEPERM
+       vperm   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       vlm     XT0,XT3,0(INP),0
+
+       vx      XT0,XT0,XA0
+       vx      XT1,XT1,XB0
+       vx      XT2,XT2,XC0
+       vx      XT3,XT3,XD0
+
+       vstm    XT0,XT3,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_4x
+
+       vaf     XA0,XA2,K0
+       vaf     XB0,XB2,K1
+       vaf     XC0,XC2,K2
+       vaf     XD0,XD2,K3
+
+       vperm   XA0,XA0,XA0,BEPERM
+       vperm   XB0,XB0,XB0,BEPERM
+       vperm   XC0,XC0,XC0,BEPERM
+       vperm   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       vlm     XT0,XT3,0(INP),0
+
+       vx      XT0,XT0,XA0
+       vx      XT1,XT1,XB0
+       vx      XT2,XT2,XC0
+       vx      XT3,XT3,XD0
+
+       vstm    XT0,XT3,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_4x
+
+       vaf     XA0,XA3,K0
+       vaf     XB0,XB3,K1
+       vaf     XC0,XC3,K2
+       vaf     XD0,XD3,K3
+
+       vperm   XA0,XA0,XA0,BEPERM
+       vperm   XB0,XB0,XB0,BEPERM
+       vperm   XC0,XC0,XC0,BEPERM
+       vperm   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       vlm     XT0,XT3,0(INP),0
+
+       vx      XT0,XT0,XA0
+       vx      XT1,XT1,XB0
+       vx      XT2,XT2,XC0
+       vx      XT3,XT3,XD0
+
+       vstm    XT0,XT3,0(OUT),0
+
+.Ldone_4x:
+       lmg     %r6,%r7,6*8(SP)
+       br      %r14
+
+.Ltail_4x:
+       vlr     XT0,XC0
+       vlr     XT1,XD0
+
+       vst     XA0,8*8+0x00(SP)
+       vst     XB0,8*8+0x10(SP)
+       vst     XT0,8*8+0x20(SP)
+       vst     XT1,8*8+0x30(SP)
+
+       lghi    %r1,0
+
+.Loop_tail_4x:
+       llgc    %r5,0(%r1,INP)
+       llgc    %r6,8*8(%r1,SP)
+       xr      %r6,%r5
+       stc     %r6,0(%r1,OUT)
+       la      %r1,1(%r1)
+       brct    LEN,.Loop_tail_4x
+
+       lmg     %r6,%r7,6*8(SP)
+       br      %r14
+
+       .type   chacha20_vx_4x, @function
+       .size   chacha20_vx_4x, . - chacha20_vx_4x
+
+#undef OUT
+#undef INP
+#undef LEN
+#undef KEY
+#undef COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
+#                 const u32 *key, const u32 *counter)
+
+#define        OUT             %r2
+#define        INP             %r3
+#define        LEN             %r4
+#define        KEY             %r5
+#define        COUNTER         %r6
+
+#define BEPERM         %v31
+
+#define K0             %v27
+#define K1             %v24
+#define K2             %v25
+#define K3             %v26
+
+#define A0             %v0
+#define B0             %v1
+#define C0             %v2
+#define D0             %v3
+
+#define A1             %v4
+#define B1             %v5
+#define C1             %v6
+#define D1             %v7
+
+#define A2             %v8
+#define B2             %v9
+#define C2             %v10
+#define D2             %v11
+
+#define A3             %v12
+#define B3             %v13
+#define C3             %v14
+#define D3             %v15
+
+#define A4             %v16
+#define B4             %v17
+#define C4             %v18
+#define D4             %v19
+
+#define A5             %v20
+#define B5             %v21
+#define C5             %v22
+#define D5             %v23
+
+#define T0             %v27
+#define T1             %v28
+#define T2             %v29
+#define T3             %v30
+
+       .balign 32
+chacha20_vx:
+       clgfi   LEN,256
+       jle     chacha20_vx_4x
+       stmg    %r6,%r7,6*8(SP)
+
+       lghi    %r1,-FRAME
+       lgr     %r0,SP
+       la      SP,0(%r1,SP)
+       stg     %r0,0(SP)               # back-chain
+
+       larl    %r7,sigma
+       lhi     %r0,10
+
+       vlm     K1,K2,0(KEY),0          # load key
+       vl      K3,0(COUNTER)           # load counter
+
+       vlm     K0,BEPERM,0(%r7),4      # load sigma, increments, ...
+
+.Loop_outer_vx:
+       vlr     A0,K0
+       vlr     B0,K1
+       vlr     A1,K0
+       vlr     B1,K1
+       vlr     A2,K0
+       vlr     B2,K1
+       vlr     A3,K0
+       vlr     B3,K1
+       vlr     A4,K0
+       vlr     B4,K1
+       vlr     A5,K0
+       vlr     B5,K1
+
+       vlr     D0,K3
+       vaf     D1,K3,T1                # K[3]+1
+       vaf     D2,K3,T2                # K[3]+2
+       vaf     D3,K3,T3                # K[3]+3
+       vaf     D4,D2,T2                # K[3]+4
+       vaf     D5,D2,T3                # K[3]+5
+
+       vlr     C0,K2
+       vlr     C1,K2
+       vlr     C2,K2
+       vlr     C3,K2
+       vlr     C4,K2
+       vlr     C5,K2
+
+       vlr     T1,D1
+       vlr     T2,D2
+       vlr     T3,D3
+
+.Loop_vx:
+       vaf     A0,A0,B0
+       vaf     A1,A1,B1
+       vaf     A2,A2,B2
+       vaf     A3,A3,B3
+       vaf     A4,A4,B4
+       vaf     A5,A5,B5
+       vx      D0,D0,A0
+       vx      D1,D1,A1
+       vx      D2,D2,A2
+       vx      D3,D3,A3
+       vx      D4,D4,A4
+       vx      D5,D5,A5
+       verllf  D0,D0,16
+       verllf  D1,D1,16
+       verllf  D2,D2,16
+       verllf  D3,D3,16
+       verllf  D4,D4,16
+       verllf  D5,D5,16
+
+       vaf     C0,C0,D0
+       vaf     C1,C1,D1
+       vaf     C2,C2,D2
+       vaf     C3,C3,D3
+       vaf     C4,C4,D4
+       vaf     C5,C5,D5
+       vx      B0,B0,C0
+       vx      B1,B1,C1
+       vx      B2,B2,C2
+       vx      B3,B3,C3
+       vx      B4,B4,C4
+       vx      B5,B5,C5
+       verllf  B0,B0,12
+       verllf  B1,B1,12
+       verllf  B2,B2,12
+       verllf  B3,B3,12
+       verllf  B4,B4,12
+       verllf  B5,B5,12
+
+       vaf     A0,A0,B0
+       vaf     A1,A1,B1
+       vaf     A2,A2,B2
+       vaf     A3,A3,B3
+       vaf     A4,A4,B4
+       vaf     A5,A5,B5
+       vx      D0,D0,A0
+       vx      D1,D1,A1
+       vx      D2,D2,A2
+       vx      D3,D3,A3
+       vx      D4,D4,A4
+       vx      D5,D5,A5
+       verllf  D0,D0,8
+       verllf  D1,D1,8
+       verllf  D2,D2,8
+       verllf  D3,D3,8
+       verllf  D4,D4,8
+       verllf  D5,D5,8
+
+       vaf     C0,C0,D0
+       vaf     C1,C1,D1
+       vaf     C2,C2,D2
+       vaf     C3,C3,D3
+       vaf     C4,C4,D4
+       vaf     C5,C5,D5
+       vx      B0,B0,C0
+       vx      B1,B1,C1
+       vx      B2,B2,C2
+       vx      B3,B3,C3
+       vx      B4,B4,C4
+       vx      B5,B5,C5
+       verllf  B0,B0,7
+       verllf  B1,B1,7
+       verllf  B2,B2,7
+       verllf  B3,B3,7
+       verllf  B4,B4,7
+       verllf  B5,B5,7
+
+       vsldb   C0,C0,C0,8
+       vsldb   C1,C1,C1,8
+       vsldb   C2,C2,C2,8
+       vsldb   C3,C3,C3,8
+       vsldb   C4,C4,C4,8
+       vsldb   C5,C5,C5,8
+       vsldb   B0,B0,B0,4
+       vsldb   B1,B1,B1,4
+       vsldb   B2,B2,B2,4
+       vsldb   B3,B3,B3,4
+       vsldb   B4,B4,B4,4
+       vsldb   B5,B5,B5,4
+       vsldb   D0,D0,D0,12
+       vsldb   D1,D1,D1,12
+       vsldb   D2,D2,D2,12
+       vsldb   D3,D3,D3,12
+       vsldb   D4,D4,D4,12
+       vsldb   D5,D5,D5,12
+
+       vaf     A0,A0,B0
+       vaf     A1,A1,B1
+       vaf     A2,A2,B2
+       vaf     A3,A3,B3
+       vaf     A4,A4,B4
+       vaf     A5,A5,B5
+       vx      D0,D0,A0
+       vx      D1,D1,A1
+       vx      D2,D2,A2
+       vx      D3,D3,A3
+       vx      D4,D4,A4
+       vx      D5,D5,A5
+       verllf  D0,D0,16
+       verllf  D1,D1,16
+       verllf  D2,D2,16
+       verllf  D3,D3,16
+       verllf  D4,D4,16
+       verllf  D5,D5,16
+
+       vaf     C0,C0,D0
+       vaf     C1,C1,D1
+       vaf     C2,C2,D2
+       vaf     C3,C3,D3
+       vaf     C4,C4,D4
+       vaf     C5,C5,D5
+       vx      B0,B0,C0
+       vx      B1,B1,C1
+       vx      B2,B2,C2
+       vx      B3,B3,C3
+       vx      B4,B4,C4
+       vx      B5,B5,C5
+       verllf  B0,B0,12
+       verllf  B1,B1,12
+       verllf  B2,B2,12
+       verllf  B3,B3,12
+       verllf  B4,B4,12
+       verllf  B5,B5,12
+
+       vaf     A0,A0,B0
+       vaf     A1,A1,B1
+       vaf     A2,A2,B2
+       vaf     A3,A3,B3
+       vaf     A4,A4,B4
+       vaf     A5,A5,B5
+       vx      D0,D0,A0
+       vx      D1,D1,A1
+       vx      D2,D2,A2
+       vx      D3,D3,A3
+       vx      D4,D4,A4
+       vx      D5,D5,A5
+       verllf  D0,D0,8
+       verllf  D1,D1,8
+       verllf  D2,D2,8
+       verllf  D3,D3,8
+       verllf  D4,D4,8
+       verllf  D5,D5,8
+
+       vaf     C0,C0,D0
+       vaf     C1,C1,D1
+       vaf     C2,C2,D2
+       vaf     C3,C3,D3
+       vaf     C4,C4,D4
+       vaf     C5,C5,D5
+       vx      B0,B0,C0
+       vx      B1,B1,C1
+       vx      B2,B2,C2
+       vx      B3,B3,C3
+       vx      B4,B4,C4
+       vx      B5,B5,C5
+       verllf  B0,B0,7
+       verllf  B1,B1,7
+       verllf  B2,B2,7
+       verllf  B3,B3,7
+       verllf  B4,B4,7
+       verllf  B5,B5,7
+
+       vsldb   C0,C0,C0,8
+       vsldb   C1,C1,C1,8
+       vsldb   C2,C2,C2,8
+       vsldb   C3,C3,C3,8
+       vsldb   C4,C4,C4,8
+       vsldb   C5,C5,C5,8
+       vsldb   B0,B0,B0,12
+       vsldb   B1,B1,B1,12
+       vsldb   B2,B2,B2,12
+       vsldb   B3,B3,B3,12
+       vsldb   B4,B4,B4,12
+       vsldb   B5,B5,B5,12
+       vsldb   D0,D0,D0,4
+       vsldb   D1,D1,D1,4
+       vsldb   D2,D2,D2,4
+       vsldb   D3,D3,D3,4
+       vsldb   D4,D4,D4,4
+       vsldb   D5,D5,D5,4
+       brct    %r0,.Loop_vx
+
+       vaf     A0,A0,K0
+       vaf     B0,B0,K1
+       vaf     C0,C0,K2
+       vaf     D0,D0,K3
+       vaf     A1,A1,K0
+       vaf     D1,D1,T1                # +K[3]+1
+
+       vperm   A0,A0,A0,BEPERM
+       vperm   B0,B0,B0,BEPERM
+       vperm   C0,C0,C0,BEPERM
+       vperm   D0,D0,D0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       vaf     D2,D2,T2                # +K[3]+2
+       vaf     D3,D3,T3                # +K[3]+3
+       vlm     T0,T3,0(INP),0
+
+       vx      A0,A0,T0
+       vx      B0,B0,T1
+       vx      C0,C0,T2
+       vx      D0,D0,T3
+
+       vlm     K0,T3,0(%r7),4          # re-load sigma and increments
+
+       vstm    A0,D0,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       vaf     B1,B1,K1
+       vaf     C1,C1,K2
+
+       vperm   A0,A1,A1,BEPERM
+       vperm   B0,B1,B1,BEPERM
+       vperm   C0,C1,C1,BEPERM
+       vperm   D0,D1,D1,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       vlm     A1,D1,0(INP),0
+
+       vx      A0,A0,A1
+       vx      B0,B0,B1
+       vx      C0,C0,C1
+       vx      D0,D0,D1
+
+       vstm    A0,D0,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       vaf     A2,A2,K0
+       vaf     B2,B2,K1
+       vaf     C2,C2,K2
+
+       vperm   A0,A2,A2,BEPERM
+       vperm   B0,B2,B2,BEPERM
+       vperm   C0,C2,C2,BEPERM
+       vperm   D0,D2,D2,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       vlm     A1,D1,0(INP),0
+
+       vx      A0,A0,A1
+       vx      B0,B0,B1
+       vx      C0,C0,C1
+       vx      D0,D0,D1
+
+       vstm    A0,D0,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       vaf     A3,A3,K0
+       vaf     B3,B3,K1
+       vaf     C3,C3,K2
+       vaf     D2,K3,T3                # K[3]+3
+
+       vperm   A0,A3,A3,BEPERM
+       vperm   B0,B3,B3,BEPERM
+       vperm   C0,C3,C3,BEPERM
+       vperm   D0,D3,D3,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       vaf     D3,D2,T1                # K[3]+4
+       vlm     A1,D1,0(INP),0
+
+       vx      A0,A0,A1
+       vx      B0,B0,B1
+       vx      C0,C0,C1
+       vx      D0,D0,D1
+
+       vstm    A0,D0,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       vaf     A4,A4,K0
+       vaf     B4,B4,K1
+       vaf     C4,C4,K2
+       vaf     D4,D4,D3                # +K[3]+4
+       vaf     D3,D3,T1                # K[3]+5
+       vaf     K3,D2,T3                # K[3]+=6
+
+       vperm   A0,A4,A4,BEPERM
+       vperm   B0,B4,B4,BEPERM
+       vperm   C0,C4,C4,BEPERM
+       vperm   D0,D4,D4,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       vlm     A1,D1,0(INP),0
+
+       vx      A0,A0,A1
+       vx      B0,B0,B1
+       vx      C0,C0,C1
+       vx      D0,D0,D1
+
+       vstm    A0,D0,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       vaf     A5,A5,K0
+       vaf     B5,B5,K1
+       vaf     C5,C5,K2
+       vaf     D5,D5,D3                # +K[3]+5
+
+       vperm   A0,A5,A5,BEPERM
+       vperm   B0,B5,B5,BEPERM
+       vperm   C0,C5,C5,BEPERM
+       vperm   D0,D5,D5,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       vlm     A1,D1,0(INP),0
+
+       vx      A0,A0,A1
+       vx      B0,B0,B1
+       vx      C0,C0,C1
+       vx      D0,D0,D1
+
+       vstm    A0,D0,0(OUT),0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       lhi     %r0,10
+       aghi    LEN,-0x40
+       jne     .Loop_outer_vx
+
+.Ldone_vx:
+       lmg     %r6,%r7,FRAME+6*8(SP)
+       la      SP,FRAME(SP)
+       br      %r14
+
+.Ltail_vx:
+       vstm    A0,D0,8*8(SP),3
+       lghi    %r1,0
+
+.Loop_tail_vx:
+       llgc    %r5,0(%r1,INP)
+       llgc    %r6,8*8(%r1,SP)
+       xr      %r6,%r5
+       stc     %r6,0(%r1,OUT)
+       la      %r1,1(%r1)
+       brct    LEN,.Loop_tail_vx
+
+       lmg     %r6,%r7,FRAME+6*8(SP)
+       la      SP,FRAME(SP)
+       br      %r14
+
+       .type   chacha20_vx, @function
+       .size   chacha20_vx, . - chacha20_vx
+       .globl  chacha20_vx
+
+.previous
+.section .note.GNU-stack,"",%progbits
-- 
2.34.1



