BLAKE2b AVX2 implementation

By the marvellous Samuel Neves - https://github.com/sneves/blake2-avx2
8 years ago · 0131a72082
11 changed files with 631 additions and 35 deletions
--- a/configure.ac
+++ b/configure.ac
@ -394,6 +394,8 @@ AC_SUBST(CFLAGS_SSE2)
 AC_SUBST(CFLAGS_SSE3)
 AC_SUBST(CFLAGS_SSSE3)
 AC_SUBST(CFLAGS_SSE41)
+AC_SUBST(CFLAGS_AVX)
+AC_SUBST(CFLAGS_AVX2)
 AC_SUBST(CFLAGS_AESNI)
 AC_SUBST(CFLAGS_PCLMUL)

--- a/src/libsodium/Makefile.am
+++ b/src/libsodium/Makefile.am
@ -36,6 +36,7 @@ libsodium_la_SOURCES = \
 	crypto_generichash/blake2/ref/blake2b-compress-ref.c \
 	crypto_generichash/blake2/ref/blake2b-load-sse2.h \
 	crypto_generichash/blake2/ref/blake2b-load-sse41.h \
+	crypto_generichash/blake2/ref/blake2b-load-avx2.h \
 	crypto_generichash/blake2/ref/blake2b-ref.c \
 	crypto_generichash/blake2/ref/blake2b-round.h \
 	crypto_generichash/blake2/ref/generichash_blake2b.c \
@ -216,8 +217,8 @@ endif
 SUBDIRS = \
 	include

-libsodium_la_LIBADD = libaesni.la libsse2.la libssse3.la libsse41.la
-noinst_LTLIBRARIES =  libaesni.la libsse2.la libssse3.la libsse41.la
+libsodium_la_LIBADD = libaesni.la libsse2.la libssse3.la libsse41.la libavx2.la
+noinst_LTLIBRARIES =  libaesni.la libsse2.la libssse3.la libsse41.la libavx2.la

 libaesni_la_LDFLAGS = $(libsodium_la_LDFLAGS)
 libaesni_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
@ -238,6 +239,7 @@ libssse3_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
 	@CFLAGS_SSE2@ @CFLAGS_SSSE3@
 libssse3_la_SOURCES = \
 	crypto_generichash/blake2/ref/blake2b-compress-ssse3.c \
+	crypto_generichash/blake2/ref/blake2b-compress-ssse3.h \
 	crypto_pwhash/argon2/argon2-fill-block-ssse3.c \
 	crypto_pwhash/argon2/blamka-round-ssse3.h \
 	crypto_stream/chacha20/vec/stream_chacha20_vec.h \
@ -247,4 +249,12 @@ libsse41_la_LDFLAGS = $(libsodium_la_LDFLAGS)
 libsse41_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
 	@CFLAGS_SSE2@ @CFLAGS_SSSE3@ @CFLAGS_SSE41@
 libsse41_la_SOURCES = \
-	crypto_generichash/blake2/ref/blake2b-compress-sse41.c
+	crypto_generichash/blake2/ref/blake2b-compress-sse41.c \
+	crypto_generichash/blake2/ref/blake2b-compress-sse41.h
+
+libavx2_la_LDFLAGS = $(libsodium_la_LDFLAGS)
+libavx2_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
+	@CFLAGS_SSE2@ @CFLAGS_SSSE3@ @CFLAGS_SSE41@ @CFLAGS_AVX@ @CFLAGS_AVX2@
+libavx2_la_SOURCES = \
+	crypto_generichash/blake2/ref/blake2b-compress-avx2.c \
+	crypto_generichash/blake2/ref/blake2b-compress-avx2.h
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2.h
@ -183,6 +183,7 @@ CRYPTO_ALIGN( 64 ) typedef struct blake2b_state_
  int blake2b_compress_ref( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
  int blake2b_compress_ssse3( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
  int blake2b_compress_sse41( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );
+  int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] );

 #if defined(__cplusplus)
 }
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.c
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.c
@ -0,0 +1,45 @@
+
+#define BLAKE2_USE_SSSE3
+#define BLAKE2_USE_SSE41
+#define BLAKE2_USE_AVX2
+
+#include <stdint.h>
+#include <string.h>
+
+#if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
+    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
+
+#pragma GCC target("sse2")
+#pragma GCC target("ssse3")
+#pragma GCC target("sse4.1")
+#pragma GCC target("avx2")
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+#include "blake2b-compress-avx2.h"
+
+CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
+{
+    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
+    0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+    0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
+    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+};
+
+int blake2b_compress_avx2( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+{
+    __m256i a = LOADU(&S->h[0]);
+    __m256i b = LOADU(&S->h[4]);
+    BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]);
+    STOREU(&S->h[0], a);
+    STOREU(&S->h[4], b);
+
+    return 0;
+}
+
+#endif
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-avx2.h
@ -0,0 +1,123 @@
+
+#ifndef blake2b_compress_avx2_H
+#define blake2b_compress_avx2_H
+
+#define LOAD128(p)     _mm_load_si128((__m128i *)(p))
+#define STORE128(p, r) _mm_store_si128((__m128i *)(p), r)
+
+#define LOADU128(p)     _mm_loadu_si128((__m128i *)(p))
+#define STOREU128(p, r) _mm_storeu_si128((__m128i *)(p), r)
+
+#define LOAD(p)     _mm256_load_si256((__m256i *)(p))
+#define STORE(p, r) _mm256_store_si256((__m256i *)(p), r)
+
+#define LOADU(p)     _mm256_loadu_si256((__m256i *)(p))
+#define STOREU(p, r) _mm256_storeu_si256((__m256i *)(p), r)
+
+static inline uint64_t LOADU64(const void *p) {
+    uint64_t v;
+    memcpy(&v, p, sizeof v);
+    return v;
+}
+
+#define ROTATE16 _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, \
+                                  2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
+
+#define ROTATE24 _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, \
+                                  3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
+
+#define ADD(a, b) _mm256_add_epi64(a, b)
+#define SUB(a, b) _mm256_sub_epi64(a, b)
+
+#define XOR(a, b) _mm256_xor_si256(a, b)
+#define AND(a, b) _mm256_and_si256(a, b)
+#define  OR(a, b) _mm256_or_si256(a, b)
+
+#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
+#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
+#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
+#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
+
+#define BLAKE2B_G1_V1(a, b, c, d, m) do {       \
+    a = ADD(a, m);                              \
+    a = ADD(a, b); d = XOR(d, a); d = ROT32(d); \
+    c = ADD(c, d); b = XOR(b, c); b = ROT24(b); \
+} while(0)
+
+#define BLAKE2B_G2_V1(a, b, c, d, m) do {       \
+    a = ADD(a, m);                              \
+    a = ADD(a, b); d = XOR(d, a); d = ROT16(d); \
+    c = ADD(c, d); b = XOR(b, c); b = ROT63(b); \
+} while(0)
+
+#define BLAKE2B_DIAG_V1(a, b, c, d) do {                   \
+    d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2,1,0,3)); \
+    c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \
+    b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0,3,2,1)); \
+} while(0)
+
+#define BLAKE2B_UNDIAG_V1(a, b, c, d) do {                 \
+    d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0,3,2,1)); \
+    c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1,0,3,2)); \
+    b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2,1,0,3)); \
+} while(0)
+
+#include "blake2b-load-avx2.h"
+
+#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
+    __m256i b0;                                 \
+    BLAKE2B_LOAD_MSG_ ##r ##_1(b0);             \
+    BLAKE2B_G1_V1(a, b, c, d, b0);              \
+    BLAKE2B_LOAD_MSG_ ##r ##_2(b0);             \
+    BLAKE2B_G2_V1(a, b, c, d, b0);              \
+    BLAKE2B_DIAG_V1(a, b, c, d);                \
+    BLAKE2B_LOAD_MSG_ ##r ##_3(b0);             \
+    BLAKE2B_G1_V1(a, b, c, d, b0);              \
+    BLAKE2B_LOAD_MSG_ ##r ##_4(b0);             \
+    BLAKE2B_G2_V1(a, b, c, d, b0);              \
+    BLAKE2B_UNDIAG_V1(a, b, c, d);              \
+} while(0)
+
+#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \
+    BLAKE2B_ROUND_V1(a, b, c, d,  0, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  1, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  2, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  3, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  4, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  5, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  6, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  7, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  8, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d,  9, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d, 10, (m));    \
+    BLAKE2B_ROUND_V1(a, b, c, d, 11, (m));    \
+} while(0)
+
+#define DECLARE_MESSAGE_WORDS(m)                                         \
+    const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) +   0)); \
+    const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) +  16)); \
+    const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) +  32)); \
+    const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) +  48)); \
+    const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) +  64)); \
+    const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) +  80)); \
+    const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) +  96)); \
+    const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
+    __m256i t0, t1;
+
+#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) do { \
+    DECLARE_MESSAGE_WORDS(m)                              \
+    const __m256i iv0 = a;                                \
+    const __m256i iv1 = b;                                \
+    __m256i c = LOAD(&blake2b_IV[0]);                     \
+    __m256i d = XOR(                                      \
+      LOAD(&blake2b_IV[4]),                               \
+      _mm256_set_epi64x(f1, f0, t1, t0)                   \
+    );                                                    \
+    BLAKE2B_ROUNDS_V1(a, b, c, d, m);                     \
+    a = XOR(a, c);                                        \
+    b = XOR(b, d);                                        \
+    a = XOR(a, iv0);                                      \
+    b = XOR(b, iv1);                                      \
+} while(0)
+
+#endif
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.c
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.c
@ -18,7 +18,7 @@

 #include "blake2.h"
 #include "blake2-impl.h"
-#include "blake2b-round.h"
+#include "blake2b-compress-sse41.h"

 CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
 {
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-sse41.h
@ -1,31 +1,10 @@
-/*
-   BLAKE2 reference source code package - optimized C implementations

-   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
-
-   To the extent possible under law, the author(s) have dedicated all copyright
-   and related and neighboring rights to this software to the public domain
-   worldwide. This software is distributed without any warranty.
-
-   You should have received a copy of the CC0 Public Domain Dedication along with
-   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
-*/
-
-#ifndef blake2b_round_H
-#define blake2b_round_H
-
-#ifndef BLAKE2_USE_SSSE3
-# error BLAKE2_USE_SSSE3 must be defined in order to use this file
-#endif
+#ifndef blake2b_compress_sse41_H
+#define blake2b_compress_sse41_H

 #define LOADU(p)  _mm_loadu_si128( (const __m128i *)(const void *)(p) )
 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r)

-#define TOF(reg) _mm_castsi128_ps((reg))
-#define TOI(reg) _mm_castps_si128((reg))
-
-
-/* Microarchitecture-specific macros */
 #define _mm_roti_epi64(x, c) \
    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
@ -33,7 +12,6 @@
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))

-
 #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
@ -102,11 +80,7 @@
  row4l = t1; \
  row4h = t0;

-#if defined(BLAKE2_USE_SSE41)
 #include "blake2b-load-sse41.h"
-#else
-#include "blake2b-load-sse2.h"
-#endif

 #define ROUND(r) \
  LOAD_MSG_ ##r ##_1(b0, b1); \
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.c
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.c
@ -1,6 +1,4 @@

-#define BLAKE2_USE_SSSE3
-
 #include <stdint.h>
 #include <string.h>

@ -18,7 +16,7 @@

 #include "blake2.h"
 #include "blake2-impl.h"
-#include "blake2b-round.h"
+#include "blake2b-compress-ssse3.h"

 CRYPTO_ALIGN(64) static const uint64_t blake2b_IV[8] =
 {
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-compress-ssse3.h
@ -0,0 +1,97 @@
+
+#ifndef blake2b_compress_ssse3_H
+#define blake2b_compress_ssse3_H
+
+#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(const void *)(p) )
+#define STOREU(p,r) _mm_storeu_si128((__m128i *)(void *)(p), r)
+
+#define _mm_roti_epi64(x, c) \
+    (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
+    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
+    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
+    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
+    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
+
+#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  \
+  row4l = _mm_roti_epi64(row4l, -32); \
+  row4h = _mm_roti_epi64(row4h, -32); \
+  \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  \
+  row2l = _mm_roti_epi64(row2l, -24); \
+  row2h = _mm_roti_epi64(row2h, -24); \
+
+#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
+  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
+  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
+  \
+  row4l = _mm_xor_si128(row4l, row1l); \
+  row4h = _mm_xor_si128(row4h, row1h); \
+  \
+  row4l = _mm_roti_epi64(row4l, -16); \
+  row4h = _mm_roti_epi64(row4h, -16); \
+  \
+  row3l = _mm_add_epi64(row3l, row4l); \
+  row3h = _mm_add_epi64(row3h, row4h); \
+  \
+  row2l = _mm_xor_si128(row2l, row3l); \
+  row2h = _mm_xor_si128(row2h, row3h); \
+  \
+  row2l = _mm_roti_epi64(row2l, -63); \
+  row2h = _mm_roti_epi64(row2h, -63); \
+
+#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = _mm_alignr_epi8(row2h, row2l, 8); \
+  t1 = _mm_alignr_epi8(row2l, row2h, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0;    \
+  \
+  t0 = _mm_alignr_epi8(row4h, row4l, 8); \
+  t1 = _mm_alignr_epi8(row4l, row4h, 8); \
+  row4l = t1; \
+  row4h = t0;
+
+#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
+  t0 = _mm_alignr_epi8(row2l, row2h, 8); \
+  t1 = _mm_alignr_epi8(row2h, row2l, 8); \
+  row2l = t0; \
+  row2h = t1; \
+  \
+  t0 = row3l; \
+  row3l = row3h; \
+  row3h = t0; \
+  \
+  t0 = _mm_alignr_epi8(row4l, row4h, 8); \
+  t1 = _mm_alignr_epi8(row4h, row4l, 8); \
+  row4l = t1; \
+  row4h = t0;
+
+#include "blake2b-load-sse2.h"
+
+#define ROUND(r) \
+  LOAD_MSG_ ##r ##_1(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_2(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
+  LOAD_MSG_ ##r ##_3(b0, b1); \
+  G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  LOAD_MSG_ ##r ##_4(b0, b1); \
+  G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
+  UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
+
+#endif
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-load-avx2.h
@ -0,0 +1,339 @@
+#ifndef blake2b_load_avx2_H
+#define blake2b_load_avx2_H
+
+#define BLAKE2B_LOAD_MSG_0_1(b0) do { \
+  t0 = _mm256_unpacklo_epi64(m0, m1); \
+  t1 = _mm256_unpacklo_epi64(m2, m3); \
+  b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_0_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m0, m1);\
+t1 = _mm256_unpackhi_epi64(m2, m3);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_0_3(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m4, m5);\
+t1 = _mm256_unpacklo_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_0_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m4, m5);\
+t1 = _mm256_unpackhi_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_1_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m7, m2);\
+t1 = _mm256_unpackhi_epi64(m4, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_1_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m5, m4);\
+t1 = _mm256_alignr_epi8(m3, m7, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_1_3(b0) \
+do { \
+t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\
+t1 = _mm256_unpackhi_epi64(m5, m2);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_1_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m6, m1);\
+t1 = _mm256_unpackhi_epi64(m3, m1);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_2_1(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m6, m5, 8);\
+t1 = _mm256_unpackhi_epi64(m2, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_2_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m4, m0);\
+t1 = _mm256_blend_epi32(m6, m1, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_2_3(b0) \
+do { \
+t0 = _mm256_blend_epi32(m1, m5, 0x33);\
+t1 = _mm256_unpackhi_epi64(m3, m4);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_2_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m7, m3);\
+t1 = _mm256_alignr_epi8(m2, m0, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_3_1(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m3, m1);\
+t1 = _mm256_unpackhi_epi64(m6, m5);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_3_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m4, m0);\
+t1 = _mm256_unpacklo_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_3_3(b0) \
+do { \
+t0 = _mm256_blend_epi32(m2, m1, 0x33);\
+t1 = _mm256_blend_epi32(m7, m2, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_3_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m3, m5);\
+t1 = _mm256_unpacklo_epi64(m0, m4);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_4_1(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m4, m2);\
+t1 = _mm256_unpacklo_epi64(m1, m5);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_4_2(b0) \
+do { \
+t0 = _mm256_blend_epi32(m3, m0, 0x33);\
+t1 = _mm256_blend_epi32(m7, m2, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_4_3(b0) \
+do { \
+t0 = _mm256_blend_epi32(m5, m7, 0x33);\
+t1 = _mm256_blend_epi32(m1, m3, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_4_4(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m6, m0, 8);\
+t1 = _mm256_blend_epi32(m6, m4, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_5_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m1, m3);\
+t1 = _mm256_unpacklo_epi64(m0, m4);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_5_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m6, m5);\
+t1 = _mm256_unpackhi_epi64(m5, m1);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_5_3(b0) \
+do { \
+t0 = _mm256_blend_epi32(m3, m2, 0x33);\
+t1 = _mm256_unpackhi_epi64(m7, m0);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_5_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m6, m2);\
+t1 = _mm256_blend_epi32(m4, m7, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_6_1(b0) \
+do { \
+t0 = _mm256_blend_epi32(m0, m6, 0x33);\
+t1 = _mm256_unpacklo_epi64(m7, m2);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_6_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m2, m7);\
+t1 = _mm256_alignr_epi8(m5, m6, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_6_3(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m0, m3);\
+t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_6_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m3, m1);\
+t1 = _mm256_blend_epi32(m5, m1, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_7_1(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m6, m3);\
+t1 = _mm256_blend_epi32(m1, m6, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_7_2(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m7, m5, 8);\
+t1 = _mm256_unpackhi_epi64(m0, m4);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_7_3(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m2, m7);\
+t1 = _mm256_unpacklo_epi64(m4, m1);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_7_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m0, m2);\
+t1 = _mm256_unpacklo_epi64(m3, m5);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_8_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m3, m7);\
+t1 = _mm256_alignr_epi8(m0, m5, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_8_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m7, m4);\
+t1 = _mm256_alignr_epi8(m4, m1, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_8_3(b0) \
+do { \
+t0 = m6;\
+t1 = _mm256_alignr_epi8(m5, m0, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_8_4(b0) \
+do { \
+t0 = _mm256_blend_epi32(m3, m1, 0x33);\
+t1 = m2;\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_9_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m5, m4);\
+t1 = _mm256_unpackhi_epi64(m3, m0);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_9_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m1, m2);\
+t1 = _mm256_blend_epi32(m2, m3, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_9_3(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m7, m4);\
+t1 = _mm256_unpackhi_epi64(m1, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_9_4(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m7, m5, 8);\
+t1 = _mm256_unpacklo_epi64(m6, m0);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m0, m1);\
+t1 = _mm256_unpacklo_epi64(m2, m3);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m0, m1);\
+t1 = _mm256_unpackhi_epi64(m2, m3);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_3(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m4, m5);\
+t1 = _mm256_unpacklo_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m4, m5);\
+t1 = _mm256_unpackhi_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m7, m2);\
+t1 = _mm256_unpackhi_epi64(m4, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m5, m4);\
+t1 = _mm256_alignr_epi8(m3, m7, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_3(b0) \
+do { \
+t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\
+t1 = _mm256_unpackhi_epi64(m5, m2);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m6, m1);\
+t1 = _mm256_unpackhi_epi64(m3, m1);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#endif
--- a/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c
+++ b/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c
@ -416,6 +416,13 @@ int blake2b_salt_personal( uint8_t *out, const void *in, const void *key, const
 int
 blake2b_pick_best_implementation(void)
 {
+#if (defined(HAVE_AVX2INTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
+    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
+  if (sodium_runtime_has_avx2()) {
+    blake2b_compress = blake2b_compress_avx2;
+    return 0;
+  }
+#endif
 #if (defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)) || \
    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
  if (sodium_runtime_has_sse41()) {