![github@pureftpd.org](/assets/img/avatar_default.png)
11 changed files with 631 additions and 35 deletions
@ -0,0 +1,45 @@ |
|||
|
|||
#define BLAKE2_USE_SSSE3 |
|||
#define BLAKE2_USE_SSE41 |
|||
#define BLAKE2_USE_AVX2 |
|||
|
|||
#include <stdint.h> |
|||
#include <string.h> |
|||
|
|||
#if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \ |
|||
(defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86))) |
|||
|
|||
#pragma GCC target("sse2") |
|||
#pragma GCC target("ssse3") |
|||
#pragma GCC target("sse4.1") |
|||
#pragma GCC target("avx2") |
|||
|
|||
#include <emmintrin.h> |
|||
#include <tmmintrin.h> |
|||
#include <smmintrin.h> |
|||
#include <immintrin.h> |
|||
|
|||
#include "blake2.h" |
|||
#include "blake2-impl.h" |
|||
#include "blake2b-compress-avx2.h" |
|||
|
|||
CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] = |
|||
{ |
|||
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, |
|||
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, |
|||
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, |
|||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL |
|||
}; |
|||
|
|||
int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] ) |
|||
{ |
|||
__m256i a = LOADU(&S->h[0]); |
|||
__m256i b = LOADU(&S->h[4]); |
|||
BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]); |
|||
STOREU(&S->h[0], a); |
|||
STOREU(&S->h[4], b); |
|||
|
|||
return 0; |
|||
} |
|||
|
|||
#endif |
@ -0,0 +1,123 @@ |
|||
|
|||
#ifndef blake2b_compress_avx2_H |
|||
#define blake2b_compress_avx2_H |
|||
|
|||
#define LOAD128(p) _mm_load_si128((__m128i *)(p)) |
|||
#define STORE128(p, r) _mm_store_si128((__m128i *)(p), r) |
|||
|
|||
#define LOADU128(p) _mm_loadu_si128((__m128i *)(p)) |
|||
#define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r) |
|||
|
|||
#define LOAD(p) _mm256_load_si256((__m256i *)(p)) |
|||
#define STORE(p, r) _mm256_store_si256((__m256i *)(p), r) |
|||
|
|||
#define LOADU(p) _mm256_loadu_si256((__m256i *)(p)) |
|||
#define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r) |
|||
|
|||
static inline uint64_t LOADU64(const void *p) { |
|||
uint64_t v; |
|||
memcpy(&v, p, sizeof v); |
|||
return v; |
|||
} |
|||
|
|||
#define ROTATE16 _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, \ |
|||
2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9) |
|||
|
|||
#define ROTATE24 _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, \ |
|||
3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10) |
|||
|
|||
#define ADD(a, b) _mm256_add_epi64(a, b) |
|||
#define SUB(a, b) _mm256_sub_epi64(a, b) |
|||
|
|||
#define XOR(a, b) _mm256_xor_si256(a, b) |
|||
#define AND(a, b) _mm256_and_si256(a, b) |
|||
#define OR(a, b) _mm256_or_si256(a, b) |
|||
|
|||
#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) |
|||
#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24) |
|||
#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16) |
|||
#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x))) |
|||
|
|||
#define BLAKE2B_G1_V1(a, b, c, d, m) do { \ |
|||
a = ADD(a, m); \ |
|||
a = ADD(a, b); d = XOR(d, a); d = ROT32(d); \ |
|||
c = ADD(c, d); b = XOR(b, c); b = ROT24(b); \ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_G2_V1(a, b, c, d, m) do { \ |
|||
a = ADD(a, m); \ |
|||
a = ADD(a, b); d = XOR(d, a); d = ROT16(d); \ |
|||
c = ADD(c, d); b = XOR(b, c); b = ROT63(b); \ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_DIAG_V1(a, b, c, d) do { \ |
|||
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2,1,0,3)); \ |
|||
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \ |
|||
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0,3,2,1)); \ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_UNDIAG_V1(a, b, c, d) do { \ |
|||
d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0,3,2,1)); \ |
|||
c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \ |
|||
b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2,1,0,3)); \ |
|||
} while(0) |
|||
|
|||
#include "blake2b-load-avx2.h" |
|||
|
|||
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \ |
|||
__m256i b0; \ |
|||
BLAKE2B_LOAD_MSG_ ##r ##_1(b0); \ |
|||
BLAKE2B_G1_V1(a, b, c, d, b0); \ |
|||
BLAKE2B_LOAD_MSG_ ##r ##_2(b0); \ |
|||
BLAKE2B_G2_V1(a, b, c, d, b0); \ |
|||
BLAKE2B_DIAG_V1(a, b, c, d); \ |
|||
BLAKE2B_LOAD_MSG_ ##r ##_3(b0); \ |
|||
BLAKE2B_G1_V1(a, b, c, d, b0); \ |
|||
BLAKE2B_LOAD_MSG_ ##r ##_4(b0); \ |
|||
BLAKE2B_G2_V1(a, b, c, d, b0); \ |
|||
BLAKE2B_UNDIAG_V1(a, b, c, d); \ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \ |
|||
BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \ |
|||
} while(0) |
|||
|
|||
#define DECLARE_MESSAGE_WORDS(m) \ |
|||
const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \ |
|||
const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \ |
|||
const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \ |
|||
const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \ |
|||
const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \ |
|||
const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \ |
|||
const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \ |
|||
const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \ |
|||
__m256i t0, t1; |
|||
|
|||
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) do { \ |
|||
DECLARE_MESSAGE_WORDS(m) \ |
|||
const __m256i iv0 = a; \ |
|||
const __m256i iv1 = b; \ |
|||
__m256i c = LOAD(&blake2b_IV[0]); \ |
|||
__m256i d = XOR( \ |
|||
LOAD(&blake2b_IV[4]), \ |
|||
_mm256_set_epi64x(f1, f0, t1, t0) \ |
|||
); \ |
|||
BLAKE2B_ROUNDS_V1(a, b, c, d, m); \ |
|||
a = XOR(a, c); \ |
|||
b = XOR(b, d); \ |
|||
a = XOR(a, iv0); \ |
|||
b = XOR(b, iv1); \ |
|||
} while(0) |
|||
|
|||
#endif |
@ -0,0 +1,97 @@ |
|||
|
|||
#ifndef blake2b_compress_ssse3_H |
|||
#define blake2b_compress_ssse3_H |
|||
|
|||
#define LOADU(p) _mm_loadu_si128( (const __m128i *)(const void *)(p) ) |
|||
#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r) |
|||
|
|||
#define _mm_roti_epi64(x, c) \ |
|||
(-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ |
|||
: (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ |
|||
: (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ |
|||
: (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ |
|||
: _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) |
|||
|
|||
#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
|||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ |
|||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ |
|||
\ |
|||
row4l = _mm_xor_si128(row4l, row1l); \ |
|||
row4h = _mm_xor_si128(row4h, row1h); \ |
|||
\ |
|||
row4l = _mm_roti_epi64(row4l, -32); \ |
|||
row4h = _mm_roti_epi64(row4h, -32); \ |
|||
\ |
|||
row3l = _mm_add_epi64(row3l, row4l); \ |
|||
row3h = _mm_add_epi64(row3h, row4h); \ |
|||
\ |
|||
row2l = _mm_xor_si128(row2l, row3l); \ |
|||
row2h = _mm_xor_si128(row2h, row3h); \ |
|||
\ |
|||
row2l = _mm_roti_epi64(row2l, -24); \ |
|||
row2h = _mm_roti_epi64(row2h, -24); \ |
|||
|
|||
#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ |
|||
row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ |
|||
row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ |
|||
\ |
|||
row4l = _mm_xor_si128(row4l, row1l); \ |
|||
row4h = _mm_xor_si128(row4h, row1h); \ |
|||
\ |
|||
row4l = _mm_roti_epi64(row4l, -16); \ |
|||
row4h = _mm_roti_epi64(row4h, -16); \ |
|||
\ |
|||
row3l = _mm_add_epi64(row3l, row4l); \ |
|||
row3h = _mm_add_epi64(row3h, row4h); \ |
|||
\ |
|||
row2l = _mm_xor_si128(row2l, row3l); \ |
|||
row2h = _mm_xor_si128(row2h, row3h); \ |
|||
\ |
|||
row2l = _mm_roti_epi64(row2l, -63); \ |
|||
row2h = _mm_roti_epi64(row2h, -63); \ |
|||
|
|||
#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
|||
t0 = _mm_alignr_epi8(row2h, row2l, 8); \ |
|||
t1 = _mm_alignr_epi8(row2l, row2h, 8); \ |
|||
row2l = t0; \ |
|||
row2h = t1; \ |
|||
\ |
|||
t0 = row3l; \ |
|||
row3l = row3h; \ |
|||
row3h = t0; \ |
|||
\ |
|||
t0 = _mm_alignr_epi8(row4h, row4l, 8); \ |
|||
t1 = _mm_alignr_epi8(row4l, row4h, 8); \ |
|||
row4l = t1; \ |
|||
row4h = t0; |
|||
|
|||
#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ |
|||
t0 = _mm_alignr_epi8(row2l, row2h, 8); \ |
|||
t1 = _mm_alignr_epi8(row2h, row2l, 8); \ |
|||
row2l = t0; \ |
|||
row2h = t1; \ |
|||
\ |
|||
t0 = row3l; \ |
|||
row3l = row3h; \ |
|||
row3h = t0; \ |
|||
\ |
|||
t0 = _mm_alignr_epi8(row4l, row4h, 8); \ |
|||
t1 = _mm_alignr_epi8(row4h, row4l, 8); \ |
|||
row4l = t1; \ |
|||
row4h = t0; |
|||
|
|||
#include "blake2b-load-sse2.h" |
|||
|
|||
#define ROUND(r) \ |
|||
LOAD_MSG_ ##r ##_1(b0, b1); \ |
|||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
|||
LOAD_MSG_ ##r ##_2(b0, b1); \ |
|||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
|||
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ |
|||
LOAD_MSG_ ##r ##_3(b0, b1); \ |
|||
G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
|||
LOAD_MSG_ ##r ##_4(b0, b1); \ |
|||
G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ |
|||
UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); |
|||
|
|||
#endif |
@ -0,0 +1,339 @@ |
|||
#ifndef blake2b_load_avx2_H |
|||
#define blake2b_load_avx2_H |
|||
|
|||
#define BLAKE2B_LOAD_MSG_0_1(b0) do { \ |
|||
t0 = _mm256_unpacklo_epi64(m0, m1); \ |
|||
t1 = _mm256_unpacklo_epi64(m2, m3); \ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0); \ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_0_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m0, m1);\ |
|||
t1 = _mm256_unpackhi_epi64(m2, m3);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_0_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m4, m5);\ |
|||
t1 = _mm256_unpacklo_epi64(m6, m7);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_0_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m4, m5);\ |
|||
t1 = _mm256_unpackhi_epi64(m6, m7);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_1_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m7, m2);\ |
|||
t1 = _mm256_unpackhi_epi64(m4, m6);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_1_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m5, m4);\ |
|||
t1 = _mm256_alignr_epi8(m3, m7, 8);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_1_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\ |
|||
t1 = _mm256_unpackhi_epi64(m5, m2);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_1_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m6, m1);\ |
|||
t1 = _mm256_unpackhi_epi64(m3, m1);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_2_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_alignr_epi8(m6, m5, 8);\ |
|||
t1 = _mm256_unpackhi_epi64(m2, m7);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_2_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m4, m0);\ |
|||
t1 = _mm256_blend_epi32(m6, m1, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_2_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_blend_epi32(m1, m5, 0x33);\ |
|||
t1 = _mm256_unpackhi_epi64(m3, m4);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_2_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m7, m3);\ |
|||
t1 = _mm256_alignr_epi8(m2, m0, 8);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_3_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m3, m1);\ |
|||
t1 = _mm256_unpackhi_epi64(m6, m5);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_3_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m4, m0);\ |
|||
t1 = _mm256_unpacklo_epi64(m6, m7);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_3_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_blend_epi32(m2, m1, 0x33);\ |
|||
t1 = _mm256_blend_epi32(m7, m2, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_3_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m3, m5);\ |
|||
t1 = _mm256_unpacklo_epi64(m0, m4);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_4_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m4, m2);\ |
|||
t1 = _mm256_unpacklo_epi64(m1, m5);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_4_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_blend_epi32(m3, m0, 0x33);\ |
|||
t1 = _mm256_blend_epi32(m7, m2, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_4_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_blend_epi32(m5, m7, 0x33);\ |
|||
t1 = _mm256_blend_epi32(m1, m3, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_4_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_alignr_epi8(m6, m0, 8);\ |
|||
t1 = _mm256_blend_epi32(m6, m4, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_5_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m1, m3);\ |
|||
t1 = _mm256_unpacklo_epi64(m0, m4);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_5_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m6, m5);\ |
|||
t1 = _mm256_unpackhi_epi64(m5, m1);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_5_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_blend_epi32(m3, m2, 0x33);\ |
|||
t1 = _mm256_unpackhi_epi64(m7, m0);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_5_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m6, m2);\ |
|||
t1 = _mm256_blend_epi32(m4, m7, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_6_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_blend_epi32(m0, m6, 0x33);\ |
|||
t1 = _mm256_unpacklo_epi64(m7, m2);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_6_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m2, m7);\ |
|||
t1 = _mm256_alignr_epi8(m5, m6, 8);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_6_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m0, m3);\ |
|||
t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_6_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m3, m1);\ |
|||
t1 = _mm256_blend_epi32(m5, m1, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_7_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m6, m3);\ |
|||
t1 = _mm256_blend_epi32(m1, m6, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_7_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_alignr_epi8(m7, m5, 8);\ |
|||
t1 = _mm256_unpackhi_epi64(m0, m4);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_7_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m2, m7);\ |
|||
t1 = _mm256_unpacklo_epi64(m4, m1);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_7_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m0, m2);\ |
|||
t1 = _mm256_unpacklo_epi64(m3, m5);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_8_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m3, m7);\ |
|||
t1 = _mm256_alignr_epi8(m0, m5, 8);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_8_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m7, m4);\ |
|||
t1 = _mm256_alignr_epi8(m4, m1, 8);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_8_3(b0) \ |
|||
do { \ |
|||
t0 = m6;\ |
|||
t1 = _mm256_alignr_epi8(m5, m0, 8);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_8_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_blend_epi32(m3, m1, 0x33);\ |
|||
t1 = m2;\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_9_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m5, m4);\ |
|||
t1 = _mm256_unpackhi_epi64(m3, m0);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_9_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m1, m2);\ |
|||
t1 = _mm256_blend_epi32(m2, m3, 0x33);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_9_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m7, m4);\ |
|||
t1 = _mm256_unpackhi_epi64(m1, m6);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_9_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_alignr_epi8(m7, m5, 8);\ |
|||
t1 = _mm256_unpacklo_epi64(m6, m0);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_10_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m0, m1);\ |
|||
t1 = _mm256_unpacklo_epi64(m2, m3);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_10_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m0, m1);\ |
|||
t1 = _mm256_unpackhi_epi64(m2, m3);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_10_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m4, m5);\ |
|||
t1 = _mm256_unpacklo_epi64(m6, m7);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_10_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpackhi_epi64(m4, m5);\ |
|||
t1 = _mm256_unpackhi_epi64(m6, m7);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_11_1(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m7, m2);\ |
|||
t1 = _mm256_unpackhi_epi64(m4, m6);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_11_2(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m5, m4);\ |
|||
t1 = _mm256_alignr_epi8(m3, m7, 8);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_11_3(b0) \ |
|||
do { \ |
|||
t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\ |
|||
t1 = _mm256_unpackhi_epi64(m5, m2);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#define BLAKE2B_LOAD_MSG_11_4(b0) \ |
|||
do { \ |
|||
t0 = _mm256_unpacklo_epi64(m6, m1);\ |
|||
t1 = _mm256_unpackhi_epi64(m3, m1);\ |
|||
b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ |
|||
} while(0) |
|||
|
|||
#endif |
Loading…
Reference in new issue