You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
141 lines
5.2 KiB
141 lines
5.2 KiB
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "blake2.h"
|
|
#include "blake2b.h"
|
|
#include "blake2b-common.h"
|
|
|
|
/*
 * BLAKE2b initialization vector: eight 64-bit words.
 *
 * The table is read in two halves elsewhere in this file, words 0-3 and
 * words 4-7, each through LOAD() into a 256-bit vector. ALIGN(64) comes from
 * a project header; presumably it requests 64-byte alignment so both halves
 * are suitably aligned for those loads.
 */
ALIGN(64) static const uint64_t blake2b_IV[8] = {
  UINT64_C(0x6A09E667F3BCC908),
  UINT64_C(0xBB67AE8584CAA73B),
  UINT64_C(0x3C6EF372FE94F82B),
  UINT64_C(0xA54FF53A5F1D36F1),
  UINT64_C(0x510E527FADE682D1),
  UINT64_C(0x9B05688C2B3E6C1F),
  UINT64_C(0x1F83D9ABFB41BD6B),
  UINT64_C(0x5BE0CD19137E2179),
};
|
|
|
|
/*
 * First half of the BLAKE2b G mixing step, applied to whole row vectors.
 *
 * All four row arguments are updated in place, so each must be a modifiable
 * lvalue; msg is only read. In order:
 *   va += msg; va += vb; vd ^= va; vd = ROT32(vd);
 *   vc += vd;  vb ^= vc; vb = ROT24(vb);
 * ADD, XOR, ROT32 and ROT24 are supplied by a project header (not visible
 * here); each operation is kept as its own statement so that no operand
 * expression is nested inside another macro call.
 */
#define BLAKE2B_G1_V1(va, vb, vc, vd, msg) do { \
  va = ADD(va, msg); \
  va = ADD(va, vb); \
  vd = XOR(vd, va); \
  vd = ROT32(vd); \
  vc = ADD(vc, vd); \
  vb = XOR(vb, vc); \
  vb = ROT24(vb); \
} while(0)
|
|
|
|
/*
 * Second half of the BLAKE2b G mixing step, applied to whole row vectors.
 *
 * Same shape as the first half but with the other pair of rotations. All
 * four row arguments are updated in place (they must be modifiable lvalues);
 * msg is only read. In order:
 *   va += msg; va += vb; vd ^= va; vd = ROT16(vd);
 *   vc += vd;  vb ^= vc; vb = ROT63(vb);
 * ADD, XOR, ROT16 and ROT63 are supplied by a project header (not visible
 * here).
 */
#define BLAKE2B_G2_V1(va, vb, vc, vd, msg) do { \
  va = ADD(va, msg); \
  va = ADD(va, vb); \
  vd = XOR(vd, va); \
  vd = ROT16(vd); \
  vc = ADD(vc, vd); \
  vb = XOR(vb, vc); \
  vb = ROT63(vb); \
} while(0)
|
|
|
|
/*
 * Re-align the state rows between the two halves of a round by permuting the
 * four 64-bit lanes of three of them. Writing lanes as (0,1,2,3):
 *   va: new lanes = old lanes (3,0,1,2)
 *   vd: new lanes = old lanes (2,3,0,1)
 *   vc: new lanes = old lanes (1,2,3,0)
 * vb is accepted for symmetry with the other macros but is left untouched.
 * va, vc and vd are updated in place and must be modifiable __m256i lvalues.
 * BLAKE2B_UNDIAG_V1 applies the inverse permutations.
 */
#define BLAKE2B_DIAG_V1(va, vb, vc, vd) do { \
  va = _mm256_permute4x64_epi64(va, _MM_SHUFFLE(2,1,0,3)); \
  vd = _mm256_permute4x64_epi64(vd, _MM_SHUFFLE(1,0,3,2)); \
  vc = _mm256_permute4x64_epi64(vc, _MM_SHUFFLE(0,3,2,1)); \
} while(0)
|
|
|
|
/*
 * Undo BLAKE2B_DIAG_V1: permute the 64-bit lanes of three rows back to their
 * original positions. Writing lanes as (0,1,2,3):
 *   va: new lanes = old lanes (1,2,3,0)
 *   vd: new lanes = old lanes (2,3,0,1)
 *   vc: new lanes = old lanes (3,0,1,2)
 * vb is accepted for symmetry with the other macros but is left untouched.
 * va, vc and vd are updated in place and must be modifiable __m256i lvalues.
 */
#define BLAKE2B_UNDIAG_V1(va, vb, vc, vd) do { \
  va = _mm256_permute4x64_epi64(va, _MM_SHUFFLE(0,3,2,1)); \
  vd = _mm256_permute4x64_epi64(vd, _MM_SHUFFLE(1,0,3,2)); \
  vc = _mm256_permute4x64_epi64(vc, _MM_SHUFFLE(2,1,0,3)); \
} while(0)
|
|
|
|
#include "blake2b-load-avx2.h"
|
|
|
|
/*
 * One BLAKE2b round over the state rows a, b, c, d (updated in place).
 *
 * r must be a literal round number: it is token-pasted to select the four
 * message loaders BLAKE2B_LOAD_MSG_<r>_1 .. BLAKE2B_LOAD_MSG_<r>_4, which
 * come from blake2b-load-avx2.h. Each loader fills the local vector b0,
 * which is then mixed in by G1 or G2.
 *
 * m is not referenced by this macro. The loaders presumably pick up the
 * message words (and scratch registers) by name from the enclosing scope,
 * as declared by DECLARE_MESSAGE_WORDS -- confirm against
 * blake2b-load-avx2.h.
 */
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
  __m256i b0; \
  /* first half: G1 + G2 on the rows as they stand */ \
  BLAKE2B_LOAD_MSG_ ##r ##_1(b0); \
  BLAKE2B_G1_V1(a, b, c, d, b0); \
  BLAKE2B_LOAD_MSG_ ##r ##_2(b0); \
  BLAKE2B_G2_V1(a, b, c, d, b0); \
  /* second half: same mixing on the lane-permuted rows */ \
  BLAKE2B_DIAG_V1(a, b, c, d); \
  BLAKE2B_LOAD_MSG_ ##r ##_3(b0); \
  BLAKE2B_G1_V1(a, b, c, d, b0); \
  BLAKE2B_LOAD_MSG_ ##r ##_4(b0); \
  BLAKE2B_G2_V1(a, b, c, d, b0); \
  BLAKE2B_UNDIAG_V1(a, b, c, d); \
} while(0)
|
|
|
|
/*
 * Run the twelve BLAKE2b rounds, numbered 0 through 11, in order over the
 * state rows a, b, c, d (updated in place). The round number is passed as a
 * literal because BLAKE2B_ROUND_V1 token-pastes it into the names of the
 * message-loading macros; this is why the rounds are written out rather
 * than looped. m is forwarded unchanged to every round.
 */
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \
  BLAKE2B_ROUND_V1(a, b, c, d,  0, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  1, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  2, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  3, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  4, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  5, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  6, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  7, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  8, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d,  9, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
  BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
} while(0)
|
|
|
|
/*
 * Declare the per-block locals used while compressing one 128-byte block.
 *
 * block is a byte pointer to the message block (offsets below are added to
 * it directly, in bytes). For k = 0..7, m<k> holds the 16 bytes at
 * block + 16*k, read with LOADU128 and duplicated into both 128-bit halves
 * of a 256-bit vector. t0 and t1 are declared uninitialized as scratch.
 *
 * The identifiers m0..m7, t0 and t1 are part of this macro's contract: they
 * are presumably referenced by name from the BLAKE2B_LOAD_MSG_* macros in
 * blake2b-load-avx2.h (not visible here), so they must not be renamed.
 *
 * This expands to plain declarations (no do/while wrapper, and it supplies
 * its own final semicolon), so it must appear where declarations are
 * allowed and is written without a trailing ';' at the use site.
 */
#define DECLARE_MESSAGE_WORDS(block) \
  const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((block) +   0)); \
  const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((block) +  16)); \
  const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((block) +  32)); \
  const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((block) +  48)); \
  const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((block) +  64)); \
  const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((block) +  80)); \
  const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((block) +  96)); \
  const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((block) + 112)); \
  __m256i t0, t1;
|
|
|
|
/*
 * Compress one 128-byte message block into the running state.
 *
 *   a, b            state halves (__m256i lvalues), updated in place
 *   m               byte pointer to the 128-byte block
 *   ctr_lo, ctr_hi  64-bit counter words
 *   f0, f1          64-bit flag words
 *
 * The working rows c and d start from blake2b_IV[0..3] and blake2b_IV[4..7];
 * d is additionally XORed with the vector whose 64-bit lanes, from lane 0
 * upward, are (ctr_lo, ctr_hi, f0, f1). After the twelve rounds the new
 * state is
 *   a = a ^ c ^ (a on entry),   b = b ^ d ^ (b on entry).
 *
 * The counter parameters are deliberately NOT called t0/t1: the expansion of
 * DECLARE_MESSAGE_WORDS introduces unrelated __m256i locals with those
 * names. Arguments are expanded inside a scope that also declares m0..m7,
 * t0, t1, iv0, iv1, c and d, so callers must not pass expressions that use
 * those identifiers.
 */
#define BLAKE2B_COMPRESS_V1(a, b, m, ctr_lo, ctr_hi, f0, f1) do { \
  DECLARE_MESSAGE_WORDS(m) \
  /* state on entry, kept for the final feed-forward XOR */ \
  const __m256i iv0 = a; \
  const __m256i iv1 = b; \
  __m256i c = LOAD(&blake2b_IV[0]); \
  __m256i d = XOR( \
    LOAD(&blake2b_IV[4]), \
    _mm256_set_epi64x(f1, f0, ctr_hi, ctr_lo) \
  ); \
  BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
  a = XOR(a, c); \
  b = XOR(b, d); \
  a = XOR(a, iv0); \
  b = XOR(b, iv1); \
} while(0)
|
|
|
|
int blake2b_avx2(void* out_ptr, size_t outlen, const void* in_ptr, size_t inlen) {
|
|
const __m256i parameter_block = _mm256_set_epi64x(0, 0, 0, 0x01010000UL | (uint32_t)outlen);
|
|
ALIGN(64) uint8_t buffer[BLAKE2B_BLOCKBYTES];
|
|
__m256i a = XOR(LOAD(&blake2b_IV[0]), parameter_block);
|
|
__m256i b = LOAD(&blake2b_IV[4]);
|
|
uint64_t counter = 0;
|
|
const uint8_t* in = (const uint8_t*)in_ptr;
|
|
do {
|
|
const uint64_t flag = (inlen <= BLAKE2B_BLOCKBYTES) ? -1 : 0;
|
|
size_t block_size = BLAKE2B_BLOCKBYTES;
|
|
if(inlen < BLAKE2B_BLOCKBYTES) {
|
|
memcpy(buffer, in, inlen);
|
|
memset(buffer + inlen, 0, BLAKE2B_BLOCKBYTES - inlen);
|
|
block_size = inlen;
|
|
in = buffer;
|
|
}
|
|
counter += block_size;
|
|
BLAKE2B_COMPRESS_V1(a, b, in, counter, 0, flag, 0);
|
|
inlen -= block_size;
|
|
in += block_size;
|
|
} while(inlen > 0);
|
|
|
|
uint8_t* out = (uint8_t*)out_ptr;
|
|
|
|
switch (outlen) {
|
|
case 64:
|
|
STOREU(out + 32, b);
|
|
// Fall through
|
|
|
|
case 32:
|
|
STOREU(out, a);
|
|
break;
|
|
|
|
default:
|
|
STOREU(buffer, a);
|
|
STOREU(buffer + 32, b);
|
|
memcpy(out, buffer, outlen);
|
|
break;
|
|
}
|
|
|
|
_mm256_zeroupper();
|
|
return 0;
|
|
}
|
|
|